From 746f6bf9fe5261a3c9bcc5f83c47c26f148e6dd7 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 1 Aug 2024 05:24:15 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 92876 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 93271 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..19438e09 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-07-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.17469v1","updated":"2024-07-24T17:59:07Z","published":"2024-07-24T17:59:07Z","title":"I Could've Asked That: Reformulating Unanswerable Questions","summary":" When seeking information from unfamiliar documents, users frequently pose\nquestions that cannot be answered by the documents. While existing large\nlanguage models (LLMs) identify these unanswerable questions, they do not\nassist users in reformulating their questions, thereby reducing their overall\nutility. We curate CouldAsk, an evaluation benchmark composed of existing and\nnew datasets for document-grounded question answering, specifically designed to\nstudy reformulating unanswerable questions. We evaluate state-of-the-art\nopen-source and proprietary LLMs on CouldAsk. The results demonstrate the\nlimited capabilities of these models in reformulating questions. Specifically,\nGPT-4 and Llama2-7B successfully reformulate questions only 26% and 12% of the\ntime, respectively. Error analysis shows that 62% of the unsuccessful\nreformulations stem from the models merely rephrasing the questions or even\ngenerating identical questions. We publicly release the benchmark and the code\nto reproduce the experiments.\n","authors":["Wenting Zhao","Ge Gao","Claire Cardie","Alexander M. Rush"],"pdf_url":"https://arxiv.org/pdf/2407.17469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17468v1","updated":"2024-07-24T17:59:05Z","published":"2024-07-24T17:59:05Z","title":"WildHallucinations: Evaluating Long-form Factuality in LLMs with\n Real-World Entity Queries","summary":" While hallucinations of large language models (LLMs) prevail as a major\nchallenge, existing evaluation benchmarks on factuality do not cover the\ndiverse domains of knowledge that the real-world users of LLMs seek information\nabout. To bridge this gap, we introduce WildHallucinations, a benchmark that\nevaluates factuality. It does so by prompting LLMs to generate information\nabout entities mined from user-chatbot conversations in the wild. These\ngenerations are then automatically fact-checked against a systematically\ncurated knowledge source collected from web search. Notably, half of these\nreal-world entities do not have associated Wikipedia pages. We evaluate 118,785\ngenerations from 15 LLMs on 7,919 entities. We find that LLMs consistently\nhallucinate more on entities without Wikipedia pages and exhibit varying\nhallucination rates across different domains. 
Finally, given the same base\nmodels, adding a retrieval component only slightly reduces hallucinations but\ndoes not eliminate hallucinations.\n","authors":["Wenting Zhao","Tanya Goyal","Yu Ying Chiu","Liwei Jiang","Benjamin Newman","Abhilasha Ravichander","Khyathi Chandu","Ronan Le Bras","Claire Cardie","Yuntian Deng","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2407.17468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17467v1","updated":"2024-07-24T17:59:02Z","published":"2024-07-24T17:59:02Z","title":"CMR Scaling Law: Predicting Critical Mixture Ratios for Continual\n Pre-training of Language Models","summary":" Large Language Models (LLMs) excel in diverse tasks but often underperform in\nspecialized fields due to limited domain-specific or proprietary corpus.\nContinual pre-training (CPT) enhances LLM capabilities by imbuing new\ndomain-specific or proprietary knowledge while replaying general corpus to\nprevent catastrophic forgetting. The data mixture ratio of general corpus and\ndomain-specific corpus, however, has been chosen heuristically, leading to\nsub-optimal training efficiency in practice. In this context, we attempt to\nre-visit the scaling behavior of LLMs under the hood of CPT, and discover a\npower-law relationship between loss, mixture ratio, and training tokens scale.\nWe formalize the trade-off between general and domain-specific capabilities,\nleading to a well-defined Critical Mixture Ratio (CMR) of general and domain\ndata. By striking the balance, CMR maintains the model's general ability and\nachieves the desired domain transfer, ensuring the highest utilization of\navailable resources. Therefore, if we value the balance between efficiency and\neffectiveness, CMR can be consider as the optimal mixture ratio.Through\nextensive experiments, we ascertain the predictability of CMR, and propose CMR\nscaling law and have substantiated its generalization. These findings offer\npractical guidelines for optimizing LLM training in specialized domains,\nensuring both general and domain-specific performance while efficiently\nmanaging training resources.\n","authors":["Jiawei Gu","Zacc Yang","Chuanghao Ding","Rui Zhao","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2407.17467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14236v3","updated":"2024-07-24T17:56:32Z","published":"2024-03-21T08:54:24Z","title":"A Unified Framework for Model Editing","summary":" ROME and MEMIT are largely believed to be two different model editing\nalgorithms, with the major difference between them being the ability to perform\nbatched edits. In this paper, we unify these two algorithms under a single\nconceptual umbrella, optimizing for the same goal, which we call the\npreservation-memorization objective. ROME uses an equality constraint to\noptimize this objective to perform one edit at a time, whereas MEMIT employs a\nmore flexible least-square constraint that allows for batched edits. We\ngeneralize ROME and enable batched editing with equality constraint in the form\nof EMMET - an Equality-constrained Mass Model Editing algorithm for\nTransformers, a new batched memory-editing algorithm. EMMET can perform\nbatched-edits up to a batch-size of 10,000, with very similar performance to\nMEMIT across multiple dimensions. 
With the introduction of EMMET, we truly\nunify ROME and MEMIT and show that both algorithms are equivalent in terms of\ntheir optimization objective, their abilities (singular and batched editing),\ntheir model editing performance and their limitations.\n","authors":["Akshat Gupta","Dev Sajnani","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2403.14236v3.pdf","comment":"Under review. To appear as poster at KnowledgeableLM Workshop\n co-located with ACL 2024"},{"id":"http://arxiv.org/abs/2407.17447v1","updated":"2024-07-24T17:23:18Z","published":"2024-07-24T17:23:18Z","title":"Fluent Student-Teacher Redteaming","summary":" Many publicly available language models have been safety tuned to reduce the\nlikelihood of toxic or liability-inducing text. Users or security analysts\nattempt to jailbreak or redteam these models with adversarial prompts which\ncause compliance with requests. One attack method is to apply discrete\noptimization techniques to the prompt. However, the resulting attack strings\nare often gibberish text, easily filtered by defenders due to high measured\nperplexity, and may fail for unseen tasks and/or well-tuned models. In this\nwork, we improve existing algorithms (primarily GCG and BEAST) to develop\npowerful and fluent attacks on safety-tuned models like Llama-2 and Phi-3. Our\ntechnique centers around a new distillation-based approach that encourages the\nvictim model to emulate a toxified finetune, either in terms of output\nprobabilities or internal activations. To encourage human-fluent attacks, we\nadd a multi-model perplexity penalty and a repetition penalty to the objective.\nWe also enhance optimizer strength by allowing token insertions, token swaps,\nand token deletions and by using longer attack sequences. The resulting process\nis able to reliably jailbreak the most difficult target models with prompts\nthat appear similar to human-written prompts. On Advbench we achieve attack\nsuccess rates $>93$% for Llama-2-7B, Llama-3-8B, and Vicuna-7B, while\nmaintaining model-measured perplexity $<33$; we achieve $95$% attack success\nfor Phi-3, though with higher perplexity. We also find a universally-optimized\nsingle fluent prompt that induces $>88$% compliance on previously unseen tasks\nacross Llama-2-7B, Phi-3-mini and Vicuna-7B and transfers to other black-box\nmodels.\n","authors":["T. Ben Thompson","Michael Sklar"],"pdf_url":"https://arxiv.org/pdf/2407.17447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01267v2","updated":"2024-07-24T17:13:55Z","published":"2024-03-02T17:10:44Z","title":"Dissecting Language Models: Machine Unlearning via Selective Pruning","summary":" Understanding and shaping the behaviour of Large Language Models (LLMs) is\nincreasingly important as applications become more powerful and more frequently\nadopted. This paper introduces a machine unlearning method specifically\ndesigned for LLMs. We introduce a selective pruning method for LLMs that\nremoves neurons based on their relative importance on a targeted capability\ncompared to overall network performance. This approach is a compute- and\ndata-efficient method for identifying and removing neurons that enable specific\nbehaviours. Our findings reveal that both feed-forward and attention neurons in\nLLMs are specialized; that is, for specific tasks, certain neurons are more\ncrucial than others. 
Code from all experiments is available at\nhttps://github.com/nickypro/selective-pruning\n","authors":["Nicholas Pochinkov","Nandi Schoots"],"pdf_url":"https://arxiv.org/pdf/2403.01267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14933v2","updated":"2024-07-24T16:52:51Z","published":"2024-07-20T16:50:18Z","title":"Consent in Crisis: The Rapid Decline of the AI Data Commons","summary":" General-purpose artificial intelligence (AI) systems are built on massive\nswathes of public web data, assembled into corpora such as C4, RefinedWeb, and\nDolma. To our knowledge, we conduct the first, large-scale, longitudinal audit\nof the consent protocols for the web domains underlying AI training corpora.\nOur audit of 14,000 web domains provides an expansive view of crawlable web\ndata and how codified data use preferences are changing over time. We observe a\nproliferation of AI-specific clauses to limit use, acute differences in\nrestrictions on AI developers, as well as general inconsistencies between\nwebsites' expressed intentions in their Terms of Service and their robots.txt.\nWe diagnose these as symptoms of ineffective web protocols, not designed to\ncope with the widespread re-purposing of the internet for AI. Our longitudinal\nanalyses show that in a single year (2023-2024) there has been a rapid\ncrescendo of data restrictions from web sources, rendering ~5%+ of all tokens\nin C4, or 28%+ of the most actively maintained, critical sources in C4, fully\nrestricted from use. For Terms of Service crawling restrictions, a full 45% of\nC4 is now restricted. If respected or enforced, these restrictions are rapidly\nbiasing the diversity, freshness, and scaling laws for general-purpose AI\nsystems. We hope to illustrate the emerging crises in data consent, for both\ndevelopers and creators. The foreclosure of much of the open web will impact\nnot only commercial AI, but also non-commercial AI and academic research.\n","authors":["Shayne Longpre","Robert Mahari","Ariel Lee","Campbell Lund","Hamidah Oderinwale","William Brannon","Nayan Saxena","Naana Obeng-Marnu","Tobin South","Cole Hunter","Kevin Klyman","Christopher Klamm","Hailey Schoelkopf","Nikhil Singh","Manuel Cherep","Ahmad Anis","An Dinh","Caroline Chitongo","Da Yin","Damien Sileo","Deividas Mataciunas","Diganta Misra","Emad Alghamdi","Enrico Shippole","Jianguo Zhang","Joanna Materzynska","Kun Qian","Kush Tiwary","Lester Miranda","Manan Dey","Minnie Liang","Mohammed Hamdy","Niklas Muennighoff","Seonghyeon Ye","Seungone Kim","Shrestha Mohanty","Vipul Gupta","Vivek Sharma","Vu Minh Chien","Xuhui Zhou","Yizhi Li","Caiming Xiong","Luis Villa","Stella Biderman","Hanlin Li","Daphne Ippolito","Sara Hooker","Jad Kabbara","Sandy Pentland"],"pdf_url":"https://arxiv.org/pdf/2407.14933v2.pdf","comment":"41 pages (13 main), 5 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.17406v1","updated":"2024-07-24T16:38:38Z","published":"2024-07-24T16:38:38Z","title":"Dependency Transformer Grammars: Integrating Dependency Structures into\n Transformer Language Models","summary":" Syntactic Transformer language models aim to achieve better generalization\nthrough simultaneously modeling syntax trees and sentences. While prior work\nhas been focusing on adding constituency-based structures to Transformers, we\nintroduce Dependency Transformer Grammars (DTGs), a new class of Transformer\nlanguage model with explicit dependency-based inductive bias. 
DTGs simulate\ndependency transition systems with constrained attention patterns by modifying\nattention masks, incorporate the stack information through relative positional\nencoding, and augment dependency arc representation with a combination of token\nembeddings and operation embeddings. When trained on a dataset of sentences\nannotated with dependency trees, DTGs achieve better generalization while\nmaintaining comparable perplexity with Transformer language model baselines.\nDTGs also outperform recent constituency-based models, showing that dependency\ncan better guide Transformer language models. Our code is released at\nhttps://github.com/zhaoyd1/Dep_Transformer_Grammars.\n","authors":["Yida Zhao","Chao Lou","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2407.17406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17390v1","updated":"2024-07-24T16:14:15Z","published":"2024-07-24T16:14:15Z","title":"CovScore: Evaluation of Multi-Document Abstractive Title Set Generation","summary":" This paper introduces CovScore, an automatic reference-less methodology for\nevaluating thematic title sets, extracted from a corpus of documents. While\nsuch extraction methods are widely used, evaluating their effectiveness remains\nan open question. Moreover, some existing practices heavily rely on slow and\nlaborious human annotation procedures. Inspired by recently introduced\nLLM-based judge methods, we propose a novel methodology that decomposes quality\ninto five main metrics along different aspects of evaluation. This framing\nsimplifies and expedites the manual evaluation process and enables automatic\nand independent LLM-based evaluation. As a test case, we apply our approach to\na corpus of Holocaust survivor testimonies, motivated both by its relevance to\ntitle set extraction and by the moral significance of this pursuit. We validate\nthe methodology by experimenting with naturalistic and synthetic title set\ngeneration systems and compare their performance with the methodology.\n","authors":["Itamar Trainin","Omri Abend"],"pdf_url":"https://arxiv.org/pdf/2407.17390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17387v1","updated":"2024-07-24T16:11:39Z","published":"2024-07-24T16:11:39Z","title":"PERSONA: A Reproducible Testbed for Pluralistic Alignment","summary":" The rapid advancement of language models (LMs) necessitates robust alignment\nwith diverse user values. However, current preference optimization approaches\noften fail to capture the plurality of user opinions, instead reinforcing\nmajority viewpoints and marginalizing minority perspectives. We introduce\nPERSONA, a reproducible test bed designed to evaluate and improve pluralistic\nalignment of LMs. We procedurally generate diverse user profiles from US census\ndata, resulting in 1,586 synthetic personas with varied demographic and\nidiosyncratic attributes. We then generate a large-scale evaluation dataset\ncontaining 3,868 prompts and 317,200 feedback pairs obtained from our synthetic\npersonas. 
Leveraging this dataset, we systematically evaluate LM capabilities\nin role-playing diverse users, verified through human judges, and the\nestablishment of both a benchmark, PERSONA Bench, for pluralistic alignment\napproaches as well as an extensive dataset to create new and future benchmarks.\nThe full dataset and benchmarks are available here:\nhttps://www.synthlabs.ai/research/persona.\n","authors":["Louis Castricato","Nathan Lile","Rafael Rafailov","Jan-Philipp Fränken","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2407.17387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17383v1","updated":"2024-07-24T16:07:11Z","published":"2024-07-24T16:07:11Z","title":"A Comprehensive Approach to Misspelling Correction with BERT and\n Levenshtein Distance","summary":" Writing, as an omnipresent form of human communication, permeates nearly\nevery aspect of contemporary life. Consequently, inaccuracies or errors in\nwritten communication can lead to profound consequences, ranging from financial\nlosses to potentially life-threatening situations. Spelling mistakes, among the\nmost prevalent writing errors, are frequently encountered due to various\nfactors. This research aims to identify and rectify diverse spelling errors in\ntext using neural networks, specifically leveraging the Bidirectional Encoder\nRepresentations from Transformers (BERT) masked language model. To achieve this\ngoal, we compiled a comprehensive dataset encompassing both non-real-word and\nreal-word errors after categorizing different types of spelling mistakes.\nSubsequently, multiple pre-trained BERT models were employed. To ensure optimal\nperformance in correcting misspelling errors, we propose a combined approach\nutilizing the BERT masked language model and Levenshtein distance. The results\nfrom our evaluation data demonstrate that the system presented herein exhibits\nremarkable capabilities in identifying and rectifying spelling mistakes, often\nsurpassing existing systems tailored for the Persian language.\n","authors":["Amirreza Naziri","Hossein Zeinali"],"pdf_url":"https://arxiv.org/pdf/2407.17383v1.pdf","comment":"12 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.17379v1","updated":"2024-07-24T15:59:01Z","published":"2024-07-24T15:59:01Z","title":"MMRA: A Benchmark for Multi-granularity Multi-image Relational\n Association","summary":" Given the remarkable success that large visual language models (LVLMs) have\nachieved in image perception tasks, the endeavor to make LVMLs perceive the\nworld like humans is drawing increasing attention. Current multi-modal\nbenchmarks mainly focus on the objective fact or certain topic related\npotential knowledge within a image, but overlook the associative relations\nbetween multiple images. Therefore, we define a multi-image relation\nassociation task, and meticulously curate \\textbf{MMRA} benchmark, a\n\\textbf{M}ulti-granularity \\textbf{M}ulti-image \\textbf{R}elational\n\\textbf{A}ssociation benchmark, consisted of \\textbf{1026} samples. In order to\nsystematically and comprehensively evaluate mainstream LVLMs, we establish an\nassociational relation system among images that contain \\textbf{11 subtasks}\n(e.g, UsageSimilarity, SubEvent, etc.) at two granularity levels (i.e.,\n\"\\textbf{image}\" and \"\\textbf{entity}\") according to the relations in\nConceptNet. Our experiments demonstrate that, on our MMRA benchmark, current\nmainstream LVLMs all have their own advantages and disadvantages across\ndifferent subtasks. 
It is worth noting that, at the entity level, the\nperformance of all models is worse than that of them at the image level,\nindicating that the fine-grained multi-image perception task is still\nchallenging for LVLMs. The tasks related to spatial perception are relatively\ndifficult for LVLMs to handle. Furthermore, we find that LVMLs exhibit a good\nability to perceive image details, and the key to enhancing their multi-image\nassociation capability is to strengthen the reasoning ability of their language\nmodel component. All our codes and data are released at\nhtt\\url{https://github.com/Wusiwei0410/MMRA}.\n","authors":["Siwei Wu","Kang Zhu","Yu Bai","Yiming Liang","Yizhi Li","Haoning Wu","Jiaheng Liu","Ruibo Liu","Xingwei Qu","Xuxin Cheng","Ge Zhang","Wenhao Huang","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17379v1.pdf","comment":"VLMS, Multi-Image Association"},{"id":"http://arxiv.org/abs/2404.03302v3","updated":"2024-07-24T15:51:08Z","published":"2024-04-04T08:52:30Z","title":"How Easily do Irrelevant Inputs Skew the Responses of Large Language\n Models?","summary":" By leveraging the retrieval of information from external knowledge databases,\nLarge Language Models (LLMs) exhibit enhanced capabilities for accomplishing\nmany knowledge-intensive tasks. However, due to the inherent flaws of current\nretrieval systems, there might exist irrelevant information within those\nretrieving top-ranked passages. In this work, we present a comprehensive\ninvestigation into the robustness of LLMs to different types of irrelevant\ninformation under various conditions. We initially introduce a framework to\nconstruct high-quality irrelevant information that ranges from semantically\nunrelated, partially related, and related to questions. Furthermore, our\nanalysis demonstrates that the constructed irrelevant information not only\nscores highly on similarity metrics, being highly retrieved by existing\nsystems, but also bears semantic connections to the context. Our investigation\nreveals that current LLMs still face challenges in discriminating highly\nsemantically related information and can be easily distracted by these\nirrelevant yet misleading content. Besides, we also find that current solutions\nfor handling irrelevant information have limitations in improving the\nrobustness of LLMs to such distractions. All the resources are available on\nGitHub at https://github.com/Di-viner/LLM-Robustness-to-Irrelevant-Information.\n","authors":["Siye Wu","Jian Xie","Jiangjie Chen","Tinghui Zhu","Kai Zhang","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.03302v3.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2402.14154v2","updated":"2024-07-24T15:19:20Z","published":"2024-02-21T22:27:40Z","title":"MM-Soc: Benchmarking Multimodal Large Language Models in Social Media\n Platforms","summary":" Social media platforms are hubs for multimodal information exchange,\nencompassing text, images, and videos, making it challenging for machines to\ncomprehend the information or emotions associated with interactions in online\nspaces. Multimodal Large Language Models (MLLMs) have emerged as a promising\nsolution to these challenges, yet they struggle to accurately interpret human\nemotions and complex content such as misinformation. This paper introduces\nMM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of\nmultimodal social media content. 
MM-Soc compiles prominent multimodal datasets\nand incorporates a novel large-scale YouTube tagging dataset, targeting a range\nof tasks from misinformation detection, hate speech detection, and social\ncontext generation. Through our exhaustive evaluation on ten size-variants of\nfour open-source MLLMs, we have identified significant performance disparities,\nhighlighting the need for advancements in models' social understanding\ncapabilities. Our analysis reveals that, in a zero-shot setting, various types\nof MLLMs generally exhibit difficulties in handling social media tasks.\nHowever, MLLMs demonstrate performance improvements post fine-tuning,\nsuggesting potential pathways for improvement. Our code and data are available\nat https://github.com/claws-lab/MMSoc.git.\n","authors":["Yiqiao Jin","Minje Choi","Gaurav Verma","Jindong Wang","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.14154v2.pdf","comment":"In Proceedings of ACL 2024"},{"id":"http://arxiv.org/abs/2407.17349v1","updated":"2024-07-24T15:18:17Z","published":"2024-07-24T15:18:17Z","title":"Boosting Large Language Models with Socratic Method for Conversational\n Mathematics Teaching","summary":" With the introduction of large language models (LLMs), automatic math\nreasoning has seen tremendous success. However, current methods primarily focus\non providing solutions or using techniques like Chain-of-Thought to enhance\nproblem-solving accuracy. In this paper, we focus on improving the capability\nof mathematics teaching via a Socratic teaching-based LLM\n(\\texttt{SocraticLLM}), which guides learners toward profound thinking with\nclarity and self-discovery via conversation. We collect and release a\nhigh-quality mathematical teaching dataset, named \\texttt{SocraticMATH}, which\nprovides Socratic-style conversations of problems with extra knowledge. Also,\nwe propose a knowledge-enhanced LLM as a strong baseline to generate reliable\nresponses with review, guidance/heuristic, rectification, and summarization.\nExperimental results show the great advantages of \\texttt{SocraticLLM} by\ncomparing it with several strong generative models. The codes and datasets are\navailable on \\url{https://github.com/ECNU-ICALK/SocraticMath}.\n","authors":["Yuyang Ding","Hanglei Hu","Jie Zhou","Qin Chen","Bo Jiang","Liang He"],"pdf_url":"https://arxiv.org/pdf/2407.17349v1.pdf","comment":"Accepted By CIKM 2024"},{"id":"http://arxiv.org/abs/2407.17344v1","updated":"2024-07-24T15:13:12Z","published":"2024-07-24T15:13:12Z","title":"Label Alignment and Reassignment with Generalist Large Language Model\n for Enhanced Cross-Domain Named Entity Recognition","summary":" Named entity recognition on the in-domain supervised and few-shot settings\nhave been extensively discussed in the NLP community and made significant\nprogress. However, cross-domain NER, a more common task in practical scenarios,\nstill poses a challenge for most NER methods. Previous research efforts in that\narea primarily focus on knowledge transfer such as correlate label information\nfrom source to target domains but few works pay attention to the problem of\nlabel conflict. In this study, we introduce a label alignment and reassignment\napproach, namely LAR, to address this issue for enhanced cross-domain named\nentity recognition, which includes two core procedures: label alignment between\nsource and target domains and label reassignment for type inference. 
The\nprocess of label reassignment can significantly be enhanced by integrating with\nan advanced large-scale language model such as ChatGPT. We conduct an extensive\nrange of experiments on NER datasets involving both supervised and zero-shot\nscenarios. Empirical experimental results demonstrate the validation of our\nmethod with remarkable performance under the supervised and zero-shot\nout-of-domain settings compared to SOTA methods.\n","authors":["Ke Bao","Chonghuan Yang"],"pdf_url":"https://arxiv.org/pdf/2407.17344v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.16521v2","updated":"2024-07-24T15:12:09Z","published":"2024-07-23T14:34:38Z","title":"AMONGAGENTS: Evaluating Large Language Models in the Interactive\n Text-Based Social Deduction Game","summary":" Strategic social deduction games serve as valuable testbeds for evaluating\nthe understanding and inference skills of language models, offering crucial\ninsights into social science, artificial intelligence, and strategic gaming.\nThis paper focuses on creating proxies of human behavior in simulated\nenvironments, with Among Us utilized as a tool for studying simulated human\nbehavior. The study introduces a text-based game environment, named\nAmongAgents, that mirrors the dynamics of Among Us. Players act as crew members\naboard a spaceship, tasked with identifying impostors who are sabotaging the\nship and eliminating the crew. Within this environment, the behavior of\nsimulated language agents is analyzed. The experiments involve diverse game\nsequences featuring different configurations of Crewmates and Impostor\npersonality archetypes. Our work demonstrates that state-of-the-art large\nlanguage models (LLMs) can effectively grasp the game rules and make decisions\nbased on the current context. This work aims to promote further exploration of\nLLMs in goal-oriented games with incomplete information and complex action\nspaces, as these settings offer valuable opportunities to assess language model\nperformance in socially driven scenarios.\n","authors":["Yizhou Chi","Lingjun Mao","Zineng Tang"],"pdf_url":"https://arxiv.org/pdf/2407.16521v2.pdf","comment":"Wordplay @ ACL 2024"},{"id":"http://arxiv.org/abs/2305.12517v5","updated":"2024-07-24T15:10:41Z","published":"2023-05-21T17:14:31Z","title":"Description-Based Text Similarity","summary":" Identifying texts with a given semantics is central for many information\nseeking scenarios. Similarity search over vector embeddings appear to be\ncentral to this ability, yet the similarity reflected in current text\nembeddings is corpus-driven, and is inconsistent and sub-optimal for many use\ncases. What, then, is a good notion of similarity for effective retrieval of\ntext?\n We identify the need to search for texts based on abstract descriptions of\ntheir content, and the corresponding notion of \\emph{description based\nsimilarity}. We demonstrate the inadequacy of current text embeddings and\npropose an alternative model that significantly improves when used in standard\nnearest neighbor search. 
The model is trained using positive and negative pairs\nsourced through prompting a LLM, demonstrating how data from LLMs can be used\nfor creating new capabilities not immediately possible using the original\nmodel.\n","authors":["Shauli Ravfogel","Valentina Pyatkin","Amir DN Cohen","Avshalom Manevich","Yoav Goldberg"],"pdf_url":"https://arxiv.org/pdf/2305.12517v5.pdf","comment":"Accepted in COLM 2024"},{"id":"http://arxiv.org/abs/2407.14829v2","updated":"2024-07-24T15:09:29Z","published":"2024-07-20T10:13:54Z","title":"Overview of AI-Debater 2023: The Challenges of Argument Generation Tasks","summary":" In this paper we present the results of the AI-Debater 2023 Challenge held by\nthe Chinese Conference on Affect Computing (CCAC 2023), and introduce the\nrelated datasets. We organize two tracks to handle the argumentative generation\ntasks in different scenarios, namely, Counter-Argument Generation (Track 1) and\nClaim-based Argument Generation (Track 2). Each track is equipped with its\ndistinct dataset and baseline model respectively. In total, 32 competing teams\nregister for the challenge, from which we received 11 successful submissions.\nIn this paper, we will present the results of the challenge and a summary of\nthe systems, highlighting commonalities and innovations among participating\nsystems. Datasets and baseline models of the AI-Debater 2023 Challenge have\nbeen already released and can be accessed through the official website of the\nchallenge.\n","authors":["Jiayu Lin","Guanrong Chen","Bojun Jin","Chenyang Li","Shutong Jia","Wancong Lin","Yang Sun","Yuhang He","Caihua Yang","Jianzhu Bao","Jipeng Wu","Wen Su","Jinglu Chen","Xinyi Li","Tianyu Chen","Mingjie Han","Shuaiwen Du","Zijian Wang","Jiyin Li","Fuzhong Suo","Hao Wang","Nuanchen Lin","Xuanjing Huang","Changjian Jiang","RuiFeng Xu","Long Zhang","Jiuxin Cao","Ting Jin","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2407.14829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10969v3","updated":"2024-07-24T14:57:48Z","published":"2024-07-15T17:59:29Z","title":"Q-Sparse: All Large Language Models can be Fully Sparsely-Activated","summary":" We introduce, Q-Sparse, a simple yet effective approach to training\nsparsely-activated large language models (LLMs). Q-Sparse enables full sparsity\nof activations in LLMs which can bring significant efficiency gains in\ninference. This is achieved by applying top-K sparsification to the activations\nand the straight-through-estimator to the training. We also introduce Block\nQ-Sparse for batch training and inference. The key results from this work are,\n(1) Q-Sparse can achieve results comparable to those of baseline LLMs while\nbeing much more efficient at inference time; (2) We present an\ninference-optimal scaling law for sparsely-activated LLMs; (3) Q-Sparse is\neffective in different settings, including training-from-scratch,\ncontinue-training of off-the-shelf LLMs, and finetuning; (4) Q-Sparse works for\nboth full-precision and 1-bit LLMs (e.g., BitNet b1.58). 
Particularly, the\nsynergy of BitNet b1.58 and Q-Sparse (can be equipped with MoE) provides the\ncornerstone and a clear path to revolutionize the efficiency, including cost\nand energy consumption, of future LLMs.\n","authors":["Hongyu Wang","Shuming Ma","Ruiping Wang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2407.10969v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.17291v1","updated":"2024-07-24T14:02:20Z","published":"2024-07-24T14:02:20Z","title":"How Good (Or Bad) Are LLMs at Detecting Misleading Visualizations?","summary":" In this study, we address the growing issue of misleading charts, a prevalent\nproblem that undermines the integrity of information dissemination. Misleading\ncharts can distort the viewer's perception of data, leading to\nmisinterpretations and decisions based on false information. The development of\neffective automatic detection methods for misleading charts is an urgent field\nof research. The recent advancement of multimodal Large Language Models (LLMs)\nhas introduced a promising direction for addressing this challenge. We explored\nthe capabilities of these models in analyzing complex charts and assessing the\nimpact of different prompting strategies on the models' analyses. We utilized a\ndataset of misleading charts collected from the internet by prior research and\ncrafted nine distinct prompts, ranging from simple to complex, to test the\nability of four different multimodal LLMs in detecting over 21 different chart\nissues. Through three experiments--from initial exploration to detailed\nanalysis--we progressively gained insights into how to effectively prompt LLMs\nto identify misleading charts and developed strategies to address the\nscalability challenges encountered as we expanded our detection range from the\ninitial five issues to 21 issues in the final experiment. Our findings reveal\nthat multimodal LLMs possess a strong capability for chart comprehension and\ncritical thinking in data interpretation. There is significant potential in\nemploying multimodal LLMs to counter misleading information by supporting\ncritical thinking and enhancing visualization literacy. This study demonstrates\nthe applicability of LLMs in addressing the pressing concern of misleading\ncharts.\n","authors":["Leo Yu-Ho Lo","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2407.17291v1.pdf","comment":"To be presented at IEEE VIS 2024"},{"id":"http://arxiv.org/abs/2311.14324v2","updated":"2024-07-24T13:34:14Z","published":"2023-11-24T07:53:48Z","title":"Large Language Models as Topological Structure Enhancers for\n Text-Attributed Graphs","summary":" The latest advancements in large language models (LLMs) have revolutionized\nthe field of natural language processing (NLP). Inspired by the success of LLMs\nin NLP tasks, some recent work has begun investigating the potential of\napplying LLMs in graph learning tasks. However, most of the existing work\nfocuses on utilizing LLMs as powerful node feature augmenters, leaving\nemploying LLMs to enhance graph topological structures an understudied problem.\nIn this work, we explore how to leverage the information retrieval and text\ngeneration capabilities of LLMs to refine/enhance the topological structure of\ntext-attributed graphs (TAGs) under the node classification setting. First, we\npropose using LLMs to help remove unreliable edges and add reliable ones in the\nTAG. 
Specifically, we first let the LLM output the semantic similarity between\nnode attributes through delicate prompt designs, and then perform edge deletion\nand edge addition based on the similarity. Second, we propose using\npseudo-labels generated by the LLM to improve graph topology, that is, we\nintroduce the pseudo-label propagation as a regularization to guide the graph\nneural network (GNN) in learning proper edge weights. Finally, we incorporate\nthe two aforementioned LLM-based methods for graph topological refinement into\nthe process of GNN training, and perform extensive experiments on four\nreal-world datasets. The experimental results demonstrate the effectiveness of\nLLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain\non public benchmarks).\n","authors":["Shengyin Sun","Yuxiang Ren","Chen Ma","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.14324v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2401.17505v4","updated":"2024-07-24T12:57:56Z","published":"2024-01-30T23:46:35Z","title":"Arrows of Time for Large Language Models","summary":" We study the probabilistic modeling performed by Autoregressive Large\nLanguage Models (LLMs) through the angle of time directionality, addressing a\nquestion first raised in (Shannon, 1951). For large enough models, we\nempirically find a time asymmetry in their ability to learn natural language: a\ndifference in the average log-perplexity when trying to predict the next token\nversus when trying to predict the previous one. This difference is at the same\ntime subtle and very consistent across various modalities (language, model\nsize, training time, ...). Theoretically, this is surprising: from an\ninformation-theoretic point of view, there should be no such difference. We\nprovide a theoretical framework to explain how such an asymmetry can appear\nfrom sparsity and computational complexity considerations, and outline a number\nof perspectives opened by our results.\n","authors":["Vassilis Papadopoulos","Jérémie Wenger","Clément Hongler"],"pdf_url":"https://arxiv.org/pdf/2401.17505v4.pdf","comment":"Corrected typos in Table 2. Added links. 12 figures, 20 pages"},{"id":"http://arxiv.org/abs/2407.09835v2","updated":"2024-07-24T12:43:33Z","published":"2024-07-13T10:08:55Z","title":"Investigating Low-Rank Training in Transformer Language Models:\n Efficiency and Scaling Analysis","summary":" State-of-the-art LLMs often rely on scale with high computational costs,\nwhich has sparked a research agenda to reduce parameter counts and costs\nwithout significantly impacting performance. Our study focuses on\nTransformer-based LLMs, specifically applying low-rank parametrization to the\ncomputationally intensive feedforward networks (FFNs), which are less studied\nthan attention blocks. In contrast to previous works, (i) we explore low-rank\nparametrization at scale, up to 1.3B parameters; (ii) within Transformer\nlanguage models rather than convolutional architectures; and (iii) starting\nfrom training from scratch. Experiments on the large RefinedWeb dataset show\nthat low-rank parametrization is both efficient (e.g., 2.6$\\times$ FFN speed-up\nwith 32\\% parameters) and effective during training. Interestingly, these\nstructured FFNs exhibit steeper scaling curves than the original models.\nMotivated by this finding, we develop the wide and structured networks\nsurpassing the current medium-sized and large-sized Transformer in perplexity\nand throughput performance. 
Our code is available at\nhttps://github.com/CLAIRE-Labo/StructuredFFN/tree/main.\n","authors":["Xiuying Wei","Skander Moalla","Razvan Pascanu","Caglar Gulcehre"],"pdf_url":"https://arxiv.org/pdf/2407.09835v2.pdf","comment":"Accepted by ICML 2024 Next Generation of Sequence Modeling\n Architectures Workshop. Short version of arXiv:2406.16450"},{"id":"http://arxiv.org/abs/2407.17230v1","updated":"2024-07-24T12:34:23Z","published":"2024-07-24T12:34:23Z","title":"Improving ICD coding using Chapter based Named Entities and Attentional\n Models","summary":" Recent advancements in natural language processing (NLP) have led to\nautomation in various domains. However, clinical NLP often relies on benchmark\ndatasets that may not reflect real-world scenarios accurately. Automatic ICD\ncoding, a vital NLP task, typically uses outdated and imbalanced datasets like\nMIMIC-III, with existing methods yielding micro-averaged F1 scores between 0.4\nand 0.7 due to many false positives. Our research introduces an enhanced\napproach to ICD coding that improves F1 scores by using chapter-based named\nentities and attentional models. This method categorizes discharge summaries\ninto ICD-9 Chapters and develops attentional models with chapter-specific data,\neliminating the need to consider external data for code identification. For\ncategorization, we use Chapter-IV to de-bias and influence key entities and\nweights without neural networks, creating accurate thresholds and providing\ninterpretability for human validation. Post-validation, we develop attentional\nmodels for three frequent and three non-frequent codes from Chapter-IV using\nBidirectional-Gated Recurrent Units (GRUs) with Attention and Transformer with\nMulti-head Attention architectures. The average Micro-F1 scores of 0.79 and\n0.81 from these models demonstrate significant performance improvements in ICD\ncoding.\n","authors":["Abhijith R. Beeravolu","Mirjam Jonkman","Sami Azam","Friso De Boer"],"pdf_url":"https://arxiv.org/pdf/2407.17230v1.pdf","comment":"10 Pages"},{"id":"http://arxiv.org/abs/2407.17227v1","updated":"2024-07-24T12:28:03Z","published":"2024-07-24T12:28:03Z","title":"LEAN-GitHub: Compiling GitHub LEAN repositories for a versatile LEAN\n prover","summary":" Recently, large language models have presented promising results in aiding\nformal mathematical reasoning. However, their performance is restricted due to\nthe scarcity of formal theorem-proving data, which requires additional effort\nto be extracted from raw formal language corpora. Meanwhile, a significant\namount of human-written formal language corpora remains underutilized. To\naddress this issue, we propose LEAN-GitHub, a dataset consisting of large-scale\nformal data extracted from almost all Lean 4 repositories on GitHub. After\nfine-tuning InternLM-math-plus on this dataset, our model achieved accuracies\nof 48.8% with a single pass and 54.5% with 64 passes on the Lean 4 miniF2F\ntest, surpassing state-of-the-art method at 52%. And it also achieves\nstate-of-the-art on two other Lean 4 benchmarks (ProofNet and Putnam) targeting\ndifferent fields/levels of math. These results demonstrate that our proposed\ndataset is beneficial for formal reasoning on a wide range of math topics. We\nopen-source our model at https://GitHub. 
com/InternLM/InternLM-Math and our\ndata at https://huggingface.co/ datasets/InternLM/Lean-GitHub\n","authors":["Zijian Wu","Jiayu Wang","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08582v2","updated":"2024-07-24T12:25:17Z","published":"2023-10-12T17:59:50Z","title":"Tree-Planner: Efficient Close-loop Task Planning with Large Language\n Models","summary":" This paper studies close-loop task planning, which refers to the process of\ngenerating a sequence of skills (a plan) to accomplish a specific goal while\nadapting the plan based on real-time observations. Recently, prompting Large\nLanguage Models (LLMs) to generate actions iteratively has become a prevalent\nparadigm due to its superior performance and user-friendliness. However, this\nparadigm is plagued by two inefficiencies: high token consumption and redundant\nerror correction, both of which hinder its scalability for large-scale testing\nand applications. To address these issues, we propose Tree-Planner, which\nreframes task planning with LLMs into three distinct phases: plan sampling,\naction tree construction, and grounded deciding. Tree-Planner starts by using\nan LLM to sample a set of potential plans before execution, followed by the\naggregation of them to form an action tree. Finally, the LLM performs a\ntop-down decision-making process on the tree, taking into account real-time\nenvironmental information. Experiments show that Tree-Planner achieves\nstate-of-the-art performance while maintaining high efficiency. By decomposing\nLLM queries into a single plan-sampling call and multiple grounded-deciding\ncalls, a considerable part of the prompt are less likely to be repeatedly\nconsumed. As a result, token consumption is reduced by 92.2% compared to the\npreviously best-performing model. Additionally, by enabling backtracking on the\naction tree as needed, the correction process becomes more flexible, leading to\na 40.5% decrease in error corrections.\n","authors":["Mengkang Hu","Yao Mu","Xinmiao Yu","Mingyu Ding","Shiguang Wu","Wenqi Shao","Qiguang Chen","Bin Wang","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2310.08582v2.pdf","comment":"Published in ICLR 2024"},{"id":"http://arxiv.org/abs/2407.01976v2","updated":"2024-07-24T11:45:48Z","published":"2024-07-02T06:29:05Z","title":"A Bounding Box is Worth One Token: Interleaving Layout and Text in a\n Large Language Model for Document Understanding","summary":" Recently, many studies have demonstrated that exclusively incorporating\nOCR-derived text and spatial layouts with large language models (LLMs) can be\nhighly effective for document understanding tasks. However, existing methods\nthat integrate spatial layouts with text have limitations, such as producing\noverly long text sequences or failing to fully leverage the autoregressive\ntraits of LLMs. In this work, we introduce Interleaving Layout and Text in a\nLarge Language Model (LayTextLLM)} for document understanding. In particular,\nLayTextLLM projects each bounding box to a single embedding and interleaves it\nwith text, efficiently avoiding long sequence issues while leveraging\nautoregressive traits of LLMs. LayTextLLM not only streamlines the interaction\nof layout and textual data but also shows enhanced performance in Key\nInformation Extraction (KIE) and Visual Question Answering (VQA). 
Comprehensive\nbenchmark evaluations reveal significant improvements, with a 27.2% increase on\nKIE tasks and 12.0% on VQA tasks compared to previous state-of-the-art document\nunderstanding MLLMs, as well as a 15.1% improvement over other SOTA OCR-based\nLLMs on KIE tasks.\n","authors":["Jinghui Lu","Haiyang Yu","Yanjie Wang","Yongjie Ye","Jingqun Tang","Ziwei Yang","Binghong Wu","Qi Liu","Hao Feng","Han Wang","Hao Liu","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2407.01976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17174v1","updated":"2024-07-24T11:24:25Z","published":"2024-07-24T11:24:25Z","title":"NarrationDep: Narratives on Social Media For Automatic Depression\n Detection","summary":" Social media posts provide valuable insight into the narrative of users and\ntheir intentions, including providing an opportunity to automatically model\nwhether a social media user is depressed or not. The challenge lies in\nfaithfully modelling user narratives from their online social media posts,\nwhich could potentially be useful in several different applications. We have\ndeveloped a novel and effective model called \\texttt{NarrationDep}, which\nfocuses on detecting narratives associated with depression. By analyzing a\nuser's tweets, \\texttt{NarrationDep} accurately identifies crucial narratives.\n\\texttt{NarrationDep} is a deep learning framework that jointly models\nindividual user tweet representations and clusters of users' tweets. As a\nresult, \\texttt{NarrationDep} is characterized by a novel two-layer deep\nlearning model: the first layer models using social media text posts, and the\nsecond layer learns semantic representations of tweets associated with a\ncluster. To faithfully model these cluster representations, the second layer\nincorporates a novel component that hierarchically learns from users' posts.\nThe results demonstrate that our framework outperforms other comparative models\nincluding recently developed models on a variety of datasets.\n","authors":["Hamad Zogan","Imran Razzak","Shoaib Jameel","Guandong Xu"],"pdf_url":"https://arxiv.org/pdf/2407.17174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17172v1","updated":"2024-07-24T11:22:57Z","published":"2024-07-24T11:22:57Z","title":"Speech Editing -- a Summary","summary":" With the rise of video production and social media, speech editing has become\ncrucial for creators to address issues like mispronunciations, missing words,\nor stuttering in audio recordings. This paper explores text-based speech\nediting methods that modify audio via text transcripts without manual waveform\nediting. These approaches ensure edited audio is indistinguishable from the\noriginal by altering the mel-spectrogram. Recent advancements, such as\ncontext-aware prosody correction and advanced attention mechanisms, have\nimproved speech editing quality. This paper reviews state-of-the-art methods,\ncompares key metrics, and examines widely used datasets. The aim is to\nhighlight ongoing issues and inspire further research and innovation in speech\nediting.\n","authors":["Tobias Kässmann","Yining Liu","Danni Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17167v1","updated":"2024-07-24T11:14:06Z","published":"2024-07-24T11:14:06Z","title":"Zero-Shot vs. Few-Shot Multi-Speaker TTS Using Pre-trained Czech\n SpeechT5 Model","summary":" In this paper, we experimented with the SpeechT5 model pre-trained on\nlarge-scale datasets. 
We pre-trained the foundation model from scratch and\nfine-tuned it on a large-scale robust multi-speaker text-to-speech (TTS) task.\nWe tested the model capabilities in a zero- and few-shot scenario. Based on two\nlistening tests, we evaluated the synthetic audio quality and the similarity of\nhow synthetic voices resemble real voices. Our results showed that the SpeechT5\nmodel can generate a synthetic voice for any speaker using only one minute of\nthe target speaker's data. We successfully demonstrated the high quality and\nsimilarity of our synthetic voices on publicly known Czech politicians and\ncelebrities.\n","authors":["Jan Lehečka","Zdeněk Hanzlíček","Jindřich Matoušek","Daniel Tihelka"],"pdf_url":"https://arxiv.org/pdf/2407.17167v1.pdf","comment":"Accepted to TSD2024"},{"id":"http://arxiv.org/abs/2407.17160v1","updated":"2024-07-24T11:03:47Z","published":"2024-07-24T11:03:47Z","title":"A Comparative Analysis of Bilingual and Trilingual Wav2Vec Models for\n Automatic Speech Recognition in Multilingual Oral History Archives","summary":" In this paper, we are comparing monolingual Wav2Vec 2.0 models with various\nmultilingual models to see whether we could improve speech recognition\nperformance on a unique oral history archive containing a lot of mixed-language\nsentences. Our main goal is to push forward research on this unique dataset,\nwhich is an extremely valuable part of our cultural heritage. Our results\nsuggest that monolingual speech recognition models are, in most cases, superior\nto multilingual models, even when processing the oral history archive full of\nmixed-language sentences from non-native speakers. We also performed the same\nexperiments on the public CommonVoice dataset to verify our results. We are\ncontributing to the research community by releasing our pre-trained models to\nthe public.\n","authors":["Jan Lehečka","Josef V. Psutka","Luboš Šmídl","Pavel Ircing","Josef Psutka"],"pdf_url":"https://arxiv.org/pdf/2407.17160v1.pdf","comment":"Accepted to INTERSPEECH2024"},{"id":"http://arxiv.org/abs/2407.17150v1","updated":"2024-07-24T10:49:19Z","published":"2024-07-24T10:49:19Z","title":"SimCT: A Simple Consistency Test Protocol in LLMs Development Lifecycle","summary":" In this work, we report our efforts to advance the standard operation\nprocedure of developing Large Language Models (LLMs) or LLMs-based systems or\nservices in industry. We introduce the concept of Large Language Model\nDevelopment Lifecycle (LDLC) and then highlight the importance of consistency\ntest in ensuring the delivery quality. The principled solution of consistency\ntest, however, is usually overlooked by industrial practitioners and not urgent\nin academia, and current practical solutions are insufficiently rigours and\nlabor-intensive. We thus propose a simple yet effective consistency test\nprotocol, named SimCT. SimCT is mainly to proactively check the consistency\nacross different development stages of \"bare metal\" LLMs or associated services\nwithout accessing the model artifacts, in an attempt to expedite the delivery\nby reducing the back-and-forth alignment communications among multiple teams\ninvolved in different development stages.\n Specifically, SimCT encompasses response-wise and model-wise tests. 
We\nimplement the protocol with LightGBM and Student's t-test for two components\nrespectively, and perform extensive experiments to substantiate the\neffectiveness of SimCT and the involved components.\n","authors":["Fufangchen Zhao","Guoqiang Jin","Rui Zhao","Jiangheng Huang","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2407.17150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17126v1","updated":"2024-07-24T09:57:51Z","published":"2024-07-24T09:57:51Z","title":"SDoH-GPT: Using Large Language Models to Extract Social Determinants of\n Health (SDoH)","summary":" Extracting social determinants of health (SDoH) from unstructured medical\nnotes depends heavily on labor-intensive annotations, which are typically\ntask-specific, hampering reusability and limiting sharing. In this study we\nintroduced SDoH-GPT, a simple and effective few-shot Large Language Model (LLM)\nmethod leveraging contrastive examples and concise instructions to extract SDoH\nwithout relying on extensive medical annotations or costly human intervention.\nIt achieved tenfold and twentyfold reductions in time and cost respectively,\nand superior consistency with human annotators measured by Cohen's kappa of up\nto 0.92. The innovative combination of SDoH-GPT and XGBoost leverages the\nstrengths of both, ensuring high accuracy and computational efficiency while\nconsistently maintaining 0.90+ AUROC scores. Testing across three distinct\ndatasets has confirmed its robustness and accuracy. This study highlights the\npotential of leveraging LLMs to revolutionize medical note classification,\ndemonstrating their capability to achieve highly accurate classifications with\nsignificantly reduced time and cost.\n","authors":["Bernardo Consoli","Xizhi Wu","Song Wang","Xinyu Zhao","Yanshan Wang","Justin Rousseau","Tom Hartvigsen","Li Shen","Huanmei Wu","Yifan Peng","Qi Long","Tianlong Chen","Ying Ding"],"pdf_url":"https://arxiv.org/pdf/2407.17126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17125v1","updated":"2024-07-24T09:48:48Z","published":"2024-07-24T09:48:48Z","title":"Behavioral Testing: Can Large Language Models Implicitly Resolve\n Ambiguous Entities?","summary":" One of the major aspects contributing to the striking performance of large\nlanguage models (LLMs) is the vast amount of factual knowledge accumulated\nduring pre-training. Yet, many LLMs suffer from self-inconsistency, which\nraises doubts about their trustworthiness and reliability. In this paper, we\nfocus on entity type ambiguity and analyze current state-of-the-art LLMs for\ntheir proficiency and consistency in applying their factual knowledge when\nprompted for entities under ambiguity. To do so, we propose an evaluation\nprotocol that disentangles knowing from applying knowledge, and test\nstate-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform\npoorly with ambiguous prompts, achieving only 80% accuracy. Our results further\ndemonstrate systematic discrepancies in LLM behavior and their failure to\nconsistently apply information, indicating that the models can exhibit\nknowledge without being able to utilize it, significant biases for preferred\nreadings, as well as self inconsistencies. 
Our study highlights the importance\nof handling entity ambiguity in future for more trustworthy LLMs\n","authors":["Anastasiia Sedova","Robert Litschko","Diego Frassinelli","Benjamin Roth","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2407.17125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15569v2","updated":"2024-07-24T08:56:11Z","published":"2024-01-28T05:12:09Z","title":"Efficient Tuning and Inference for Large Language Models on Textual\n Graphs","summary":" Rich textual and topological information of textual graphs need to be modeled\nin real-world applications such as webpages, e-commerce, and academic articles.\nPractitioners have been long following the path of adopting a shallow text\nencoder and a subsequent graph neural network (GNN) to solve this problem. In\nlight of recent advancements in large language models (LLMs), it is apparent\nthat integrating LLMs for enhanced textual encoding can substantially improve\nthe performance of textual graphs. Nevertheless, the efficiency of these\nmethods poses a significant challenge. In this paper, we propose ENGINE, a\nparameter- and memory-efficient fine-tuning method for textual graphs with an\nLLM encoder. The key insight is to combine the LLMs and GNNs through a tunable\nside structure, which significantly reduces the training complexity without\nimpairing the joint model's capacity. Extensive experiments on textual graphs\ndemonstrate our method's effectiveness by achieving the best model performance,\nmeanwhile having the lowest training cost compared to previous methods.\nMoreover, we introduce two variants with caching and dynamic early exit to\nfurther enhance training and inference speed. Specifically, caching accelerates\nENGINE's training by 12x, and dynamic early exit achieves up to 5x faster\ninference with a negligible performance drop (at maximum 1.17% relevant drop\nacross 7 datasets). Our codes are available at:\nhttps://github.com/ZhuYun97/ENGINE\n","authors":["Yun Zhu","Yaoke Wang","Haizhou Shi","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2401.15569v2.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2309.03227v2","updated":"2024-07-24T08:31:21Z","published":"2023-09-04T02:30:19Z","title":"Learning a Patent-Informed Biomedical Knowledge Graph Reveals\n Technological Potential of Drug Repositioning Candidates","summary":" Drug repositioning-a promising strategy for discovering new therapeutic uses\nfor existing drugs-has been increasingly explored in the computational science\nliterature using biomedical databases. However, the technological potential of\ndrug repositioning candidates has often been overlooked. This study presents a\nnovel protocol to comprehensively analyse various sources such as\npharmaceutical patents and biomedical databases, and identify drug\nrepositioning candidates with both technological potential and scientific\nevidence. To this end, first, we constructed a scientific biomedical knowledge\ngraph (s-BKG) comprising relationships between drugs, diseases, and genes\nderived from biomedical databases. Our protocol involves identifying drugs that\nexhibit limited association with the target disease but are closely located in\nthe s-BKG, as potential drug candidates. 
We constructed a patent-informed\nbiomedical knowledge graph (p-BKG) by adding pharmaceutical patent information.\nFinally, we developed a graph embedding protocol to ascertain the structure of\nthe p-BKG, thereby calculating the relevance scores of those candidates with\ntarget disease-related patents to evaluate their technological potential. Our\ncase study on Alzheimer's disease demonstrates its efficacy and feasibility,\nwhile the quantitative outcomes and systematic methods are expected to bridge\nthe gap between computational discoveries and successful market applications in\ndrug repositioning research.\n","authors":["Yongseung Jegal","Jaewoong Choi","Jiho Lee","Ki-Su Park","Seyoung Lee","Janghyeok Yoon"],"pdf_url":"https://arxiv.org/pdf/2309.03227v2.pdf","comment":"We are sorry to withdraw this paper. We found some critical errors in\n the introduction and results sections. Specifically, we found that the first\n author have wrongly inserted citations on background works and he made\n mistakes in the graph embedding methods and relevant results are wrongly\n calculated. In this regard, we tried to revise this paper and withdraw the\n current version. Thank you"},{"id":"http://arxiv.org/abs/2308.14484v2","updated":"2024-07-24T08:24:21Z","published":"2023-08-28T10:51:11Z","title":"Multimodal Detection of Bots on X (Twitter) using Transformers","summary":" Although not all bots are malicious, the vast majority of them are\nresponsible for spreading misinformation and manipulating the public opinion\nabout several issues, i.e., elections and many more. Therefore, the early\ndetection of bots is crucial. Although there have been proposed methods for\ndetecting bots in social media, there are still substantial limitations. For\ninstance, existing research initiatives still extract a large number of\nfeatures and train traditional machine learning algorithms or use GloVe\nembeddings and train LSTMs. However, feature extraction is a tedious procedure\ndemanding domain expertise. Also, language models based on transformers have\nbeen proved to be better than LSTMs. Other approaches create large graphs and\ntrain graph neural networks requiring in this way many hours for training and\naccess to computational resources. To tackle these limitations, this is the\nfirst study employing only the user description field and images of three\nchannels denoting the type and content of tweets posted by the users. Firstly,\nwe create digital DNA sequences, transform them to 3d images, and apply\npretrained models of the vision domain, including EfficientNet, AlexNet, VGG16,\netc. Next, we propose a multimodal approach, where we use TwHIN-BERT for\ngetting the textual representation of the user description field and employ\nVGG16 for acquiring the visual representation for the image modality. We\npropose three different fusion methods, namely concatenation, gated multimodal\nunit, and crossmodal attention, for fusing the different modalities and compare\ntheir performances. Finally, we present a qualitative analysis of the behavior\nof our best performing model. 
Extensive experiments conducted on the Cresci'17\nand TwiBot-20 datasets demonstrate valuable advantages of our introduced\napproaches over state-of-the-art ones.\n","authors":["Loukas Ilias","Ioannis Michail Kazelidis","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2308.14484v2.pdf","comment":"IEEE Transactions on Information Forensics and Security (Accepted)"},{"id":"http://arxiv.org/abs/2407.17081v1","updated":"2024-07-24T08:17:37Z","published":"2024-07-24T08:17:37Z","title":"A Survey Forest Diagram : Gain a Divergent Insight View on a Specific\n Research Topic","summary":" With the exponential growth in the number of papers and the trend of AI\nresearch, the use of Generative AI for information retrieval and\nquestion-answering has become popular for conducting research surveys. However,\nnovice researchers unfamiliar with a particular field may not significantly\nimprove their efficiency in interacting with Generative AI because they have\nnot developed divergent thinking in that field. This study aims to develop an\nin-depth Survey Forest Diagram that guides novice researchers in divergent\nthinking about the research topic by indicating the citation clues among\nmultiple papers, to help expand the survey perspective for novice researchers.\n","authors":["Jinghong Li","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2407.17081v1.pdf","comment":"This paper will submit to IEEE SMC 2024"},{"id":"http://arxiv.org/abs/2407.11100v3","updated":"2024-07-24T08:10:29Z","published":"2024-07-15T07:20:02Z","title":"Building Intelligence Identification System via Large Language Model\n Watermarking: A Survey and Beyond","summary":" Large Language Models (LLMs) are increasingly integrated into diverse\nindustries, posing substantial security risks due to unauthorized replication\nand misuse. To mitigate these concerns, robust identification mechanisms are\nwidely acknowledged as an effective strategy. Identification systems for LLMs\nnow rely heavily on watermarking technology to manage and protect intellectual\nproperty and ensure data security. However, previous studies have primarily\nconcentrated on the basic principles of algorithms and lacked a comprehensive\nanalysis of watermarking theory and practice from the perspective of\nintelligent identification. To bridge this gap, firstly, we explore how a\nrobust identity recognition system can be effectively implemented and managed\nwithin LLMs by various participants using watermarking technology. Secondly, we\npropose a mathematical framework based on mutual information theory, which\nsystematizes the identification process to achieve more precise and customized\nwatermarking. Additionally, we present a comprehensive evaluation of\nperformance metrics for LLM watermarking, reflecting participant preferences\nand advancing discussions on its identification applications. Lastly, we\noutline the existing challenges in current watermarking technologies and\ntheoretical frameworks, and provide directional guidance to address these\nchallenges. 
Our systematic classification and detailed exposition aim to\nenhance the comparison and evaluation of various methods, fostering further\nresearch and development toward a transparent, secure, and equitable LLM\necosystem.\n","authors":["Xuhong Wang","Haoyu Jiang","Yi Yu","Jingru Yu","Yilun Lin","Ping Yi","Yingchun Wang","Yu Qiao","Li Li","Fei-Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2407.11100v3.pdf","comment":"59 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.17075v1","updated":"2024-07-24T08:04:00Z","published":"2024-07-24T08:04:00Z","title":"SAFETY-J: Evaluating Safety with Critique","summary":" The deployment of Large Language Models (LLMs) in content generation raises\nsignificant safety concerns, particularly regarding the transparency and\ninterpretability of content evaluations. Current methods, primarily focused on\nbinary safety classifications, lack mechanisms for detailed critique, limiting\ntheir utility for model improvement and user trust. To address these\nlimitations, we introduce SAFETY-J, a bilingual generative safety evaluator for\nEnglish and Chinese with critique-based judgment. SAFETY-J utilizes a robust\ntraining dataset that includes diverse dialogues and augmented query-response\npairs to assess safety across various scenarios comprehensively. We establish\nan automated meta-evaluation benchmark that objectively assesses the quality of\ncritiques with minimal human intervention, facilitating scalable and continuous\nimprovement. Additionally, SAFETY-J employs an iterative preference learning\ntechnique to dynamically refine safety assessments based on meta-evaluations\nand critiques. Our evaluations demonstrate that SAFETY-J provides more nuanced\nand accurate safety evaluations, thereby enhancing both critique quality and\npredictive reliability in complex content scenarios. To facilitate further\nresearch and application, we will open-source SAFETY-J's training protocols,\ndatasets, and code.\n","authors":["Yixiu Liu","Yuxiang Zheng","Shijie Xia","Yuan Guo","Jiajun Li","Yi Tu","Chaoling Song","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17060v1","updated":"2024-07-24T07:37:12Z","published":"2024-07-24T07:37:12Z","title":"High Efficiency Image Compression for Large Visual-Language Models","summary":" In recent years, large visual language models (LVLMs) have shown impressive\nperformance and promising generalization capability in multi-modal tasks, thus\nreplacing humans as receivers of visual information in various application\nscenarios. In this paper, we pioneer to propose a variable bitrate image\ncompression framework consisting of a pre-editing module and an end-to-end\ncodec to achieve promising rate-accuracy performance for different LVLMs. In\nparticular, instead of optimizing an adaptive pre-editing network towards a\nparticular task or several representative tasks, we propose a new optimization\nstrategy tailored for LVLMs, which is designed based on the representation and\ndiscrimination capability with token-level distortion and rank. The pre-editing\nmodule and the variable bitrate end-to-end image codec are jointly trained by\nthe losses based on semantic tokens of the large model, which introduce\nenhanced generalization capability for various data and tasks. 
{Experimental\nresults demonstrate that the proposed framework could efficiently achieve much\nbetter rate-accuracy performance compared to the state-of-the-art coding\nstandard, Versatile Video Coding.} Meanwhile, experiments with multi-modal\ntasks have revealed the robustness and generalization capability of the\nproposed framework.\n","authors":["Binzhe Li","Shurun Wang","Shiqi Wang","Yan Ye"],"pdf_url":"https://arxiv.org/pdf/2407.17060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16190v2","updated":"2024-07-24T07:32:25Z","published":"2024-07-23T05:32:00Z","title":"Artificial Agency and Large Language Models","summary":" The arrival of Large Language Models (LLMs) has stirred up philosophical\ndebates about the possibility of realizing agency in an artificial manner. In\nthis work we contribute to the debate by presenting a theoretical model that\ncan be used as a threshold conception for artificial agents. The model defines\nagents as systems whose actions and goals are always influenced by a dynamic\nframework of factors that consists of the agent's accessible history, its\nadaptive repertoire and its external environment. This framework, in turn, is\ninfluenced by the actions that the agent takes and the goals that it forms. We\nshow with the help of the model that state-of-the-art LLMs are not agents yet,\nbut that there are elements to them that suggest a way forward. The paper\nargues that a combination of the agent architecture presented in Park et al.\n(2023) together with the use of modules like the Coscientist in Boiko et al.\n(2023) could potentially be a way to realize agency in an artificial manner. We\nend the paper by reflecting on the obstacles one might face in building such an\nartificial agent and by presenting possible directions for future research.\n","authors":["Maud van Lier","Gorka Muñoz-Gil"],"pdf_url":"https://arxiv.org/pdf/2407.16190v2.pdf","comment":"Accepted for publication in journal Intellectica, special issue\n \"Philosophies of AI: thinking and writing with LLMs\" (Intellectica, issue 81)"},{"id":"http://arxiv.org/abs/2402.13463v4","updated":"2024-07-24T06:50:18Z","published":"2024-02-21T01:39:56Z","title":"RefuteBench: Evaluating Refuting Instruction-Following for Large\n Language Models","summary":" The application scope of large language models (LLMs) is increasingly\nexpanding. In practical use, users might provide feedback based on the model's\noutput, hoping for a responsive model that can complete responses according to\ntheir feedback. Whether the model can appropriately respond to users' refuting\nfeedback and consistently follow through with execution has not been thoroughly\nanalyzed. In light of this, this paper proposes a comprehensive benchmark,\nRefuteBench, covering tasks such as question answering, machine translation,\nand email writing. The evaluation aims to assess whether models can positively\naccept feedback in form of refuting instructions and whether they can\nconsistently adhere to user demands throughout the conversation. We conduct\nevaluations on numerous LLMs and find that LLMs are stubborn, i.e. exhibit\ninclination to their internal knowledge, often failing to comply with user\nfeedback. Additionally, as the length of the conversation increases, models\ngradually forget the user's stated feedback and roll back to their own\nresponses. 
We further propose a recall-and-repeat prompts as a simple and\neffective way to enhance the model's responsiveness to feedback.\n","authors":["Jianhao Yan","Yun Luo","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.13463v4.pdf","comment":"ACL 2024 final version"},{"id":"http://arxiv.org/abs/2406.03855v3","updated":"2024-07-24T06:39:15Z","published":"2024-06-06T08:41:46Z","title":"Performance of large language models in numerical vs. semantic medical\n knowledge: Benchmarking on evidence-based Q&As","summary":" Clinical problem-solving requires processing of semantic medical knowledge\nsuch as illness scripts and numerical medical knowledge of diagnostic tests for\nevidence-based decision-making. As large language models (LLMs) show promising\nresults in many aspects of language-based clinical practice, their ability to\ngenerate non-language evidence-based answers to clinical questions is\ninherently limited by tokenization. Therefore, we evaluated LLMs' performance\non two question types: numeric (correlating findings) and semantic\n(differentiating entities) while examining differences within and between LLMs\nin medical aspects and comparing their performance to humans. To generate\nstraightforward multi-choice questions and answers (QAs) based on\nevidence-based medicine (EBM), we used a comprehensive medical knowledge graph\n(encompassed data from more than 50,00 peer-reviewed articles) and created the\n\"EBMQA\". EBMQA contains 105,000 QAs labeled with medical and non-medical topics\nand classified into numerical or semantic questions. We benchmarked this\ndataset using more than 24,500 QAs on two state-of-the-art LLMs: Chat-GPT4 and\nClaude3-Opus. We evaluated the LLMs accuracy on semantic and numerical question\ntypes and according to sub-labeled topics. For validation, six medical experts\nwere tested on 100 numerical EBMQA questions. We found that both LLMs excelled\nmore in semantic than numerical QAs, with Claude3 surpassing GPT4 in numerical\nQAs. However, both LLMs showed inter and intra gaps in different medical\naspects and remained inferior to humans. Thus, their medical advice should be\naddressed carefully.\n","authors":["Eden Avnat","Michal Levy","Daniel Herstain","Elia Yanko","Daniel Ben Joya","Michal Tzuchman Katz","Dafna Eshel","Sahar Laros","Yael Dagan","Shahar Barami","Joseph Mermelstein","Shahar Ovadia","Noam Shomron","Varda Shalev","Raja-Elie E. Abdulnour"],"pdf_url":"https://arxiv.org/pdf/2406.03855v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17023v1","updated":"2024-07-24T06:06:07Z","published":"2024-07-24T06:06:07Z","title":"From Internal Conflict to Contextual Adaptation of Language Models","summary":" Knowledge-intensive language understanding tasks require Language Models\n(LMs) to integrate relevant context, mitigating their inherent weaknesses, such\nas incomplete or outdated knowledge. Nevertheless, studies indicate that LMs\noften ignore the provided context as it can conflict with the pre-existing LM's\nmemory learned during pre-training. Moreover, conflicting knowledge can already\nbe present in the LM's parameters, termed intra-memory conflict. Existing works\nhave studied the two types of knowledge conflicts only in isolation. We\nconjecture that the (degree of) intra-memory conflicts can in turn affect LM's\nhandling of context-memory conflicts. 
To study this, we introduce the DYNAMICQA\ndataset, which includes facts with a temporal dynamic nature where a fact can\nchange with a varying time frequency and disputable dynamic facts, which can\nchange depending on the viewpoint. DYNAMICQA is the first to include real-world\nknowledge conflicts and provide context to study the link between the different\ntypes of knowledge conflicts. With the proposed dataset, we assess the use of\nuncertainty for measuring the intra-memory conflict and introduce a novel\nCoherent Persuasion (CP) score to evaluate the context's ability to sway LM's\nsemantic output. Our extensive experiments reveal that static facts, which are\nunlikely to change, are more easily updated with additional context, relative\nto temporal and disputable facts.\n","authors":["Sara Vera Marjanović","Haeun Yu","Pepa Atanasova","Maria Maistro","Christina Lioma","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2407.17023v1.pdf","comment":"22 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.17022v1","updated":"2024-07-24T06:02:57Z","published":"2024-07-24T06:02:57Z","title":"Can Language Models Evaluate Human Written Text? Case Study on Korean\n Student Writing for Education","summary":" Large language model (LLM)-based evaluation pipelines have demonstrated their\ncapability to robustly evaluate machine-generated text. Extending this\nmethodology to assess human-written text could significantly benefit\neducational settings by providing direct feedback to enhance writing skills,\nalthough this application is not straightforward. In this paper, we investigate\nwhether LLMs can effectively assess human-written text for educational\npurposes. We collected 100 texts from 32 Korean students across 15 types of\nwriting and employed GPT-4-Turbo to evaluate them using grammaticality,\nfluency, coherence, consistency, and relevance as criteria. Our analyses\nindicate that LLM evaluators can reliably assess grammaticality and fluency, as\nwell as more objective types of writing, though they struggle with other\ncriteria and types of writing. We publicly release our dataset and feedback.\n","authors":["Seungyoon Kim","Seungone Kim"],"pdf_url":"https://arxiv.org/pdf/2407.17022v1.pdf","comment":"Work In Progress"},{"id":"http://arxiv.org/abs/2403.09092v2","updated":"2024-07-24T05:57:01Z","published":"2024-03-14T04:32:13Z","title":"MCFEND: A Multi-source Benchmark Dataset for Chinese Fake News Detection","summary":" The prevalence of fake news across various online sources has had a\nsignificant influence on the public. Existing Chinese fake news detection\ndatasets are limited to news sourced solely from Weibo. However, fake news\noriginating from multiple sources exhibits diversity in various aspects,\nincluding its content and social context. Methods trained on purely one single\nnews source can hardly be applicable to real-world scenarios. Our pilot\nexperiment demonstrates that the F1 score of the state-of-the-art method that\nlearns from a large Chinese fake news detection dataset, Weibo-21, drops\nsignificantly from 0.943 to 0.470 when the test data is changed to multi-source\nnews data, failing to identify more than one-third of the multi-source fake\nnews. To address this limitation, we constructed the first multi-source\nbenchmark dataset for Chinese fake news detection, termed MCFEND, which is\ncomposed of news we collected from diverse sources such as social platforms,\nmessaging apps, and traditional online news outlets. 
Notably, such news has\nbeen fact-checked by 14 authoritative fact-checking agencies worldwide. In\naddition, various existing Chinese fake news detection methods are thoroughly\nevaluated on our proposed dataset in cross-source, multi-source, and unseen\nsource ways. MCFEND, as a benchmark dataset, aims to advance Chinese fake news\ndetection approaches in real-world scenarios.\n","authors":["Yupeng Li","Haorui He","Jin Bai","Dacheng Wen"],"pdf_url":"https://arxiv.org/pdf/2403.09092v2.pdf","comment":"Accepted by the ACM Web Conference 2024 (WWW 2024) oral, dataset\n available: https://github.com/TrustworthyComp"},{"id":"http://arxiv.org/abs/2407.17011v1","updated":"2024-07-24T05:26:52Z","published":"2024-07-24T05:26:52Z","title":"Unveiling In-Context Learning: A Coordinate System to Understand Its\n Working Mechanism","summary":" Large language models (LLMs) exhibit remarkable in-context learning (ICL)\ncapabilities. However, the underlying working mechanism of ICL remains poorly\nunderstood. Recent research presents two conflicting views on ICL: One\nattributes it to LLMs' inherent ability of task recognition, deeming label\ncorrectness and shot numbers of demonstrations as not crucial; the other\nemphasizes the impact of similar examples in the demonstrations, stressing the\nneed for label correctness and more shots. In this work, we provide a\nTwo-Dimensional Coordinate System that unifies both views into a systematic\nframework. The framework explains the behavior of ICL through two orthogonal\nvariables: whether LLMs can recognize the task and whether similar examples are\npresented in the demonstrations. We propose the peak inverse rank metric to\ndetect the task recognition ability of LLMs and study LLMs' reactions to\ndifferent definitions of similarity. Based on these, we conduct extensive\nexperiments to elucidate how ICL functions across each quadrant on multiple\nrepresentative classification tasks. Finally, we extend our analyses to\ngeneration tasks, showing that our coordinate system can also be used to\ninterpret ICL for generation tasks effectively.\n","authors":["Anhao Zhao","Fanghua Ye","Jinlan Fu","Xiaoyu Shen"],"pdf_url":"https://arxiv.org/pdf/2407.17011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11233v2","updated":"2024-07-24T05:22:48Z","published":"2024-06-17T06:00:24Z","title":"Probing the Decision Boundaries of In-context Learning in Large Language\n Models","summary":" In-context learning is a key paradigm in large language models (LLMs) that\nenables them to generalize to new tasks and domains by simply prompting these\nmodels with a few exemplars without explicit parameter updates. Many attempts\nhave been made to understand in-context learning in LLMs as a function of model\nscale, pretraining data, and other factors. In this work, we propose a new\nmechanism to probe and understand in-context learning from the lens of decision\nboundaries for in-context binary classification. Decision boundaries are\nstraightforward to visualize and provide important information about the\nqualitative behavior of the inductive biases of standard classifiers. To our\nsurprise, we find that the decision boundaries learned by current LLMs in\nsimple binary classification tasks are often irregular and non-smooth,\nregardless of linear separability in the underlying task. This paper\ninvestigates the factors influencing these decision boundaries and explores\nmethods to enhance their generalizability. 
We assess various approaches,\nincluding training-free and fine-tuning methods for LLMs, the impact of model\narchitecture, and the effectiveness of active prompting techniques for\nsmoothing decision boundaries in a data-efficient manner. Our findings provide\na deeper understanding of in-context learning dynamics and offer practical\nimprovements for enhancing robustness and generalizability of in-context\nlearning.\n","authors":["Siyan Zhao","Tung Nguyen","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2406.11233v2.pdf","comment":"18 pages, code at https://github.com/siyan-zhao/ICL_decision_boundary"},{"id":"http://arxiv.org/abs/2407.13787v2","updated":"2024-07-24T04:57:55Z","published":"2024-07-12T11:31:00Z","title":"The Honorific Effect: Exploring the Impact of Japanese Linguistic\n Formalities on AI-Generated Physics Explanations","summary":" This study investigates the influence of Japanese honorifics on the responses\nof large language models (LLMs) when explaining the law of conservation of\nmomentum. We analyzed the outputs of six state-of-the-art AI models, including\nvariations of ChatGPT, Coral, and Gemini, using 14 different honorific forms.\nOur findings reveal that honorifics significantly affect the quality,\nconsistency, and formality of AI-generated responses, demonstrating LLMs'\nability to interpret and adapt to social context cues embedded in language.\nNotable variations were observed across different models, with some emphasizing\nhistorical context and derivations, while others focused on intuitive\nexplanations. The study highlights the potential for using honorifics to adjust\nthe depth and complexity of AI-generated explanations in educational contexts.\nFurthermore, the responsiveness of AI models to cultural linguistic elements\nunderscores the importance of considering cultural factors in AI development\nfor educational applications. These results open new avenues for research in\nAI-assisted education and cultural adaptation in AI systems, with significant\nimplications for personalizing learning experiences and developing culturally\nsensitive AI tools for global education.\n","authors":["Keisuke Sato"],"pdf_url":"https://arxiv.org/pdf/2407.13787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17432v4","updated":"2024-07-24T04:44:11Z","published":"2023-12-29T01:56:17Z","title":"Video Understanding with Large Language Models: A Survey","summary":" With the burgeoning growth of online video platforms and the escalating\nvolume of video content, the demand for proficient video understanding tools\nhas intensified markedly. Given the remarkable capabilities of large language\nmodels (LLMs) in language and multimodal tasks, this survey provides a detailed\noverview of recent advancements in video understanding that harness the power\nof LLMs (Vid-LLMs). The emergent capabilities of Vid-LLMs are surprisingly\nadvanced, particularly their ability for open-ended multi-granularity (general,\ntemporal, and spatiotemporal) reasoning combined with commonsense knowledge,\nsuggesting a promising path for future video understanding. We examine the\nunique characteristics and capabilities of Vid-LLMs, categorizing the\napproaches into three main types: Video Analyzer x LLM, Video Embedder x LLM,\nand (Analyzer + Embedder) x LLM. Furthermore, we identify five sub-types based\non the functions of LLMs in Vid-LLMs: LLM as Summarizer, LLM as Manager, LLM as\nText Decoder, LLM as Regressor, and LLM as Hidden Layer. 
Furthermore, this\nsurvey presents a comprehensive study of the tasks, datasets, benchmarks, and\nevaluation methodologies for Vid-LLMs. Additionally, it explores the expansive\napplications of Vid-LLMs across various domains, highlighting their remarkable\nscalability and versatility in real-world video understanding challenges.\nFinally, it summarizes the limitations of existing Vid-LLMs and outlines\ndirections for future research. For more information, readers are recommended\nto visit the repository at\nhttps://github.com/yunlong10/Awesome-LLMs-for-Video-Understanding.\n","authors":["Yunlong Tang","Jing Bi","Siting Xu","Luchuan Song","Susan Liang","Teng Wang","Daoan Zhang","Jie An","Jingyang Lin","Rongyi Zhu","Ali Vosoughi","Chao Huang","Zeliang Zhang","Pinxin Liu","Mingqian Feng","Feng Zheng","Jianguo Zhang","Ping Luo","Jiebo Luo","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2312.17432v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16997v1","updated":"2024-07-24T04:39:24Z","published":"2024-07-24T04:39:24Z","title":"Revisiting Who's Harry Potter: Towards Targeted Unlearning from a Causal\n Intervention Perspective","summary":" This paper investigates Who's Harry Potter (WHP), a pioneering yet\ninsufficiently understood method for LLM unlearning. We explore it in two\nsteps. First, we introduce a new task of LLM targeted unlearning, where given\nan unlearning target (e.g., a person) and some unlearning documents, we aim to\nunlearn only the information about the target, rather than everything in the\nunlearning documents. We further argue that a successful unlearning should\nsatisfy criteria such as not outputting gibberish, not fabricating facts about\nthe unlearning target, and not releasing factual information under jailbreak\nattacks. Second, we construct a causal intervention framework for targeted\nunlearning, where the knowledge of the unlearning target is modeled as a\nconfounder between LLM input and output, and the unlearning process as a\ndeconfounding process. This framework justifies and extends WHP, deriving a\nsimple unlearning algorithm that includes WHP as a special case. Experiments on\nexisting and new datasets show that our approach, without explicitly optimizing\nfor the aforementioned criteria, achieves competitive performance in all of\nthem. Our code is available at\nhttps://github.com/UCSB-NLP-Chang/causal_unlearn.git.\n","authors":["Yujian Liu","Yang Zhang","Tommi Jaakkola","Shiyu Chang"],"pdf_url":"https://arxiv.org/pdf/2407.16997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16994v1","updated":"2024-07-24T04:27:55Z","published":"2024-07-24T04:27:55Z","title":"A Voter-Based Stochastic Rejection-Method Framework for Asymptotically\n Safe Language Model Outputs","summary":" This paper proposes a new method for preventing unsafe or otherwise low\nquality large language model (LLM) outputs, by leveraging the stochasticity of\nLLMs. We propose a system whereby LLM checkers vote on the acceptability of a\ngenerated output, regenerating it if a threshold of disapproval is reached,\nuntil sufficient checkers approve. We further propose estimators for cost and\nfailure rate, and based on those estimators and experimental data tailored to\nthe application, we propose an algorithm that achieves a desired failure rate\nat the least possible cost. 
We demonstrate that, under these models, failure\nrate decreases exponentially as a function of cost when voter count and\nthreshold are chosen according to the algorithm, and that the models reasonably\nestimate the actual performance of such a system in action, even with limited\ndata.\n","authors":["Jake R. Watts","Joel Sokol"],"pdf_url":"https://arxiv.org/pdf/2407.16994v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.12994v2","updated":"2024-07-24T03:53:41Z","published":"2024-07-17T20:23:19Z","title":"A Survey of Prompt Engineering Methods in Large Language Models for\n Different NLP Tasks","summary":" Large language models (LLMs) have shown remarkable performance on many\ndifferent Natural Language Processing (NLP) tasks. Prompt engineering plays a\nkey role in adding more to the already existing abilities of LLMs to achieve\nsignificant performance gains on various NLP tasks. Prompt engineering requires\ncomposing natural language instructions called prompts to elicit knowledge from\nLLMs in a structured way. Unlike previous state-of-the-art (SoTA) models,\nprompt engineering does not require extensive parameter re-training or\nfine-tuning based on the given NLP task and thus solely operates on the\nembedded knowledge of LLMs. Additionally, LLM enthusiasts can intelligently\nextract LLMs' knowledge through a basic natural language conversational\nexchange or prompt engineering, allowing more and more people even without deep\nmathematical machine learning background to experiment with LLMs. With prompt\nengineering gaining popularity in the last two years, researchers have come up\nwith numerous engineering techniques around designing prompts to improve\naccuracy of information extraction from the LLMs. In this paper, we summarize\ndifferent prompting techniques and club them together based on different NLP\ntasks that they have been used for. We further granularly highlight the\nperformance of these prompting strategies on various datasets belonging to that\nNLP task, talk about the corresponding LLMs used, present a taxonomy diagram\nand discuss the possible SoTA for specific datasets. In total, we read and\npresent a survey of 44 research papers which talk about 39 different prompting\nmethods on 29 different NLP tasks of which most of them have been published in\nthe last two years.\n","authors":["Shubham Vatsal","Harsh Dubey"],"pdf_url":"https://arxiv.org/pdf/2407.12994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16970v1","updated":"2024-07-24T03:32:05Z","published":"2024-07-24T03:32:05Z","title":"Towards Aligning Language Models with Textual Feedback","summary":" We present ALT (ALignment with Textual feedback), an approach that aligns\nlanguage models with user preferences expressed in text. We argue that text\noffers greater expressiveness, enabling users to provide richer feedback than\nsimple comparative preferences and this richer feedback can lead to more\nefficient and effective alignment. ALT aligns the model by conditioning its\ngeneration on the textual feedback. Our method relies solely on language\nmodeling techniques and requires minimal hyper-parameter tuning, though it\nstill presents the main benefits of RL-based alignment algorithms and can\neffectively learn from textual feedback. We explore the efficacy and efficiency\nof textual feedback across different tasks such as toxicity reduction,\nsummarization, and dialog response generation. 
We find that ALT outperforms PPO\nfor the task of toxicity reduction while being able to match its performance on\nsummarization with only 20% of the samples. We also explore how ALT can be used\nwith feedback provided by an existing LLM where we explore an LLM providing\nconstrained and unconstrained textual feedback. We also outline future\ndirections to align models with natural language feedback.\n","authors":["Saüc Abadal Lloret","Shehzaad Dhuliawala","Keerthiram Murugesan","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2407.16970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16951v1","updated":"2024-07-24T02:37:42Z","published":"2024-07-24T02:37:42Z","title":"Towards Transfer Unlearning: Empirical Evidence of Cross-Domain Bias\n Mitigation","summary":" Large language models (LLMs) often inherit biases from vast amounts of\ntraining corpora. Traditional debiasing methods, while effective to some\nextent, do not completely eliminate memorized biases and toxicity in LLMs. In\nthis paper, we study an unlearning-based approach to debiasing in LLMs by\nperforming gradient ascent on hate speech against minority groups, i.e.,\nminimizing the likelihood of biased or toxic content. Specifically, we propose\na mask language modeling unlearning technique, which unlearns the harmful part\nof the text. This method enables LLMs to selectively forget and disassociate\nfrom biased and harmful content. Experimental results demonstrate the\neffectiveness of our approach in diminishing bias while maintaining the\nlanguage modeling abilities. Surprisingly, the results also unveil an\nunexpected potential for cross-domain transfer unlearning: debiasing in one\nbias form (e.g. gender) may contribute to mitigating others (e.g. race and\nreligion).\n","authors":["Huimin Lu","Masaru Isonuma","Junichiro Mori","Ichiro Sakata"],"pdf_url":"https://arxiv.org/pdf/2407.16951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02363v2","updated":"2024-07-24T02:36:07Z","published":"2024-05-03T05:09:54Z","title":"LLM as Dataset Analyst: Subpopulation Structure Discovery with Large\n Language Model","summary":" The distribution of subpopulations is an important property hidden within a\ndataset. Uncovering and analyzing the subpopulation distribution within\ndatasets provides a comprehensive understanding of the datasets, standing as a\npowerful tool beneficial to various downstream tasks, including Dataset\nSubpopulation Organization, Subpopulation Shift, and Slice Discovery. Despite\nits importance, there has been no work that systematically explores the\nsubpopulation distribution of datasets to our knowledge. To address the\nlimitation and solve all the mentioned tasks in a unified way, we introduce a\nnovel concept of subpopulation structures to represent, analyze, and utilize\nsubpopulation distributions within datasets. To characterize the structures in\nan interpretable manner, we propose the Subpopulation Structure Discovery with\nLarge Language Models (SSD-LLM) framework, which employs world knowledge and\ninstruction-following capabilities of Large Language Models (LLMs) to\nlinguistically analyze informative image captions and summarize the structures.\nFurthermore, we propose complete workflows to address downstream tasks, named\nTask-specific Tuning, showcasing the application of the discovered structure to\na spectrum of subpopulation-related tasks, including dataset subpopulation\norganization, subpopulation shift, and slice discovery. 
Furthermore, we propose\ncomplete workflows to address downstream tasks, named Task-specific Tuning,\nshowcasing the application of the discovered structure to a spectrum of\nsubpopulation-related tasks, including dataset subpopulation organization,\nsubpopulation shift, and slice discovery.\n","authors":["Yulin Luo","Ruichuan An","Bocheng Zou","Yiming Tang","Jiaming Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02363v2.pdf","comment":"ECCV24 Camera Ready"},{"id":"http://arxiv.org/abs/2407.16939v1","updated":"2024-07-24T02:17:10Z","published":"2024-07-24T02:17:10Z","title":"Early screening of potential breakthrough technologies with enhanced\n interpretability: A patent-specific hierarchical attention network model","summary":" Despite the usefulness of machine learning approaches for the early screening\nof potential breakthrough technologies, their practicality is often hindered by\nopaque models. To address this, we propose an interpretable machine learning\napproach to predicting future citation counts from patent texts using a\npatent-specific hierarchical attention network (PatentHAN) model. Central to\nthis approach are (1) a patent-specific pre-trained language model, capturing\nthe meanings of technical words in patent claims, (2) a hierarchical network\nstructure, enabling detailed analysis at the claim level, and (3) a claim-wise\nself-attention mechanism, revealing pivotal claims during the screening\nprocess. A case study of 35,376 pharmaceutical patents demonstrates the\neffectiveness of our approach in early screening of potential breakthrough\ntechnologies while ensuring interpretability. Furthermore, we conduct\nadditional analyses using different language models and claim types to examine\nthe robustness of the approach. It is expected that the proposed approach will\nenhance expert-machine collaboration in identifying breakthrough technologies,\nproviding new insight derived from text mining into technological value.\n","authors":["Jaewoong Choi","Janghyeok Yoon","Changyong Lee"],"pdf_url":"https://arxiv.org/pdf/2407.16939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14850v2","updated":"2024-07-24T02:11:47Z","published":"2024-02-20T01:59:11Z","title":"CHATATC: Large Language Model-Driven Conversational Agents for\n Supporting Strategic Air Traffic Flow Management","summary":" Generative artificial intelligence (AI) and large language models (LLMs) have\ngained rapid popularity through publicly available tools such as ChatGPT. The\nadoption of LLMs for personal and professional use is fueled by the natural\ninteractions between human users and computer applications such as ChatGPT,\nalong with powerful summarization and text generation capabilities. Given the\nwidespread use of such generative AI tools, in this work we investigate how\nthese tools can be deployed in a non-safety critical, strategic traffic flow\nmanagement setting. Specifically, we train an LLM, CHATATC, based on a large\nhistorical data set of Ground Delay Program (GDP) issuances, spanning 2000-2023\nand consisting of over 80,000 GDP implementations, revisions, and\ncancellations. We test the query and response capabilities of CHATATC,\ndocumenting successes (e.g., providing correct GDP rates, durations, and\nreason) and shortcomings (e.g,. superlative questions). 
We also detail the\ndesign of a graphical user interface for future users to interact and\ncollaborate with the CHATATC conversational agent.\n","authors":["Sinan Abdulhak","Wayne Hubbard","Karthik Gopalakrishnan","Max Z. Li"],"pdf_url":"https://arxiv.org/pdf/2402.14850v2.pdf","comment":"8 pages, 5 figures; minor revisions to address reviewer feedback for\n final submission to the 11th International Conference on Research in Air\n Transportation (ICRAT)"},{"id":"http://arxiv.org/abs/2407.03718v2","updated":"2024-07-24T02:03:47Z","published":"2024-07-04T08:08:12Z","title":"Multi-Convformer: Extending Conformer with Multiple Convolution Kernels","summary":" Convolutions have become essential in state-of-the-art end-to-end Automatic\nSpeech Recognition~(ASR) systems due to their efficient modelling of local\ncontext. Notably, its use in Conformers has led to superior performance\ncompared to vanilla Transformer-based ASR systems. While components other than\nthe convolution module in the Conformer have been reexamined, altering the\nconvolution module itself has been far less explored. Towards this, we\nintroduce Multi-Convformer that uses multiple convolution kernels within the\nconvolution module of the Conformer in conjunction with gating. This helps in\nimproved modeling of local dependencies at varying granularities. Our model\nrivals existing Conformer variants such as CgMLP and E-Branchformer in\nperformance, while being more parameter efficient. We empirically compare our\napproach with Conformer and its variants across four different datasets and\nthree different modelling paradigms and show up to 8% relative word error\nrate~(WER) improvements.\n","authors":["Darshan Prabhu","Yifan Peng","Preethi Jyothi","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.03718v2.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.16931v1","updated":"2024-07-24T01:46:55Z","published":"2024-07-24T01:46:55Z","title":"ScholarChemQA: Unveiling the Power of Language Models in Chemical\n Research Question Answering","summary":" Question Answering (QA) effectively evaluates language models' reasoning and\nknowledge depth. While QA datasets are plentiful in areas like general domain\nand biomedicine, academic chemistry is less explored. Chemical QA plays a\ncrucial role in both education and research by effectively translating complex\nchemical information into readily understandable format. Addressing this gap,\nwe introduce ScholarChemQA, a large-scale QA dataset constructed from chemical\npapers. This dataset reflects typical real-world challenges, including an\nimbalanced data distribution and a substantial amount of unlabeled data that\ncan be potentially useful. Correspondingly, we introduce a QAMatch model,\nspecifically designed to effectively answer chemical questions by fully\nleveraging our collected data. We first address the issue of imbalanced label\ndistribution by re-weighting the instance-wise loss based on the inverse\nfrequency of each class, ensuring minority classes are not dominated by\nmajority ones during optimization. Next, we utilize the unlabeled data to\nenrich the learning process, generating a variety of augmentations based on a\nSoftMix operation and ensuring their predictions align with the same target,\ni.e., pseudo-labels. To ensure the quality of the pseudo-labels, we propose a\ncalibration procedure aimed at closely aligning the pseudo-label estimates of\nindividual samples with a desired ground truth distribution. 
Experiments show\nthat our QAMatch significantly outperforms the recent similar-scale baselines\nand Large Language Models (LLMs) not only on our ScholarChemQA dataset but also\non four benchmark datasets. We hope our benchmark and model can facilitate and\npromote more research on chemical QA.\n","authors":["Xiuying Chen","Tairan Wang","Taicheng Guo","Kehan Guo","Juexiao Zhou","Haoyang Li","Mingchen Zhuge","Jürgen Schmidhuber","Xin Gao","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16931v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2402.16568v2","updated":"2024-07-24T01:44:05Z","published":"2024-02-26T13:47:09Z","title":"Two-stage Generative Question Answering on Temporal Knowledge Graph\n Using Large Language Models","summary":" Temporal knowledge graph question answering (TKGQA) poses a significant\nchallenge task, due to the temporal constraints hidden in questions and the\nanswers sought from dynamic structured knowledge. Although large language\nmodels (LLMs) have made considerable progress in their reasoning ability over\nstructured data, their application to the TKGQA task is a relatively unexplored\narea. This paper first proposes a novel generative temporal knowledge graph\nquestion answering framework, GenTKGQA, which guides LLMs to answer temporal\nquestions through two phases: Subgraph Retrieval and Answer Generation. First,\nwe exploit LLM's intrinsic knowledge to mine temporal constraints and\nstructural links in the questions without extra training, thus narrowing down\nthe subgraph search space in both temporal and structural dimensions. Next, we\ndesign virtual knowledge indicators to fuse the graph neural network signals of\nthe subgraph and the text representations of the LLM in a non-shallow way,\nwhich helps the open-source LLM deeply understand the temporal order and\nstructural dependencies among the retrieved facts through instruction tuning.\nExperimental results on two widely used datasets demonstrate the superiority of\nour model.\n","authors":["Yifu Gao","Linbo Qiao","Zhigang Kan","Zhihua Wen","Yongquan He","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2402.16568v2.pdf","comment":"Accepted by ACL(Findings) 2024"},{"id":"http://arxiv.org/abs/2311.17086v2","updated":"2024-07-24T01:41:01Z","published":"2023-11-28T02:31:52Z","title":"PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation\n in non-English Text-to-Image Generation","summary":" Text-to-image diffusion models are well-known for their ability to generate\nrealistic images based on textual prompts. However, the existing works have\npredominantly focused on English, lacking support for non-English text-to-image\nmodels. The most commonly used translation methods cannot solve the generation\nproblem related to language culture, while training from scratch on a specific\nlanguage dataset is prohibitively expensive. In this paper, we are inspired to\npropose a simple plug-and-play language transfer method based on knowledge\ndistillation. All we need to do is train a lightweight MLP-like\nparameter-efficient adapter (PEA) with only 6M parameters under teacher\nknowledge distillation along with a small parallel data corpus. We are\nsurprised to find that freezing the parameters of UNet can still achieve\nremarkable performance on the language-specific prompt evaluation set,\ndemonstrating that PEA can stimulate the potential generation ability of the\noriginal UNet. 
Additionally, it closely approaches the performance of the\nEnglish text-to-image model on a general prompt evaluation set. Furthermore,\nour adapter can be used as a plugin to achieve significant results in\ndownstream tasks in cross-lingual text-to-image generation. Code will be\navailable at: https://github.com/OPPO-Mente-Lab/PEA-Diffusion\n","authors":["Jian Ma","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2311.17086v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.16920v1","updated":"2024-07-24T01:04:34Z","published":"2024-07-24T01:04:34Z","title":"Train-Attention: Meta-Learning Where to Focus in Continual Knowledge\n Learning","summary":" Previous studies on continual knowledge learning (CKL) in large language\nmodels (LLMs) have predominantly focused on approaches such as regularization,\narchitectural modifications, and rehearsal techniques to mitigate catastrophic\nforgetting. However, these methods naively inherit the inefficiencies of\nstandard training procedures, indiscriminately applying uniform weight across\nall tokens, which can lead to unnecessary parameter updates and increased\nforgetting. To address these shortcomings, we propose a novel CKL approach\ntermed Train-Attention-Augmented Language Model (TAALM), which enhances\nlearning efficiency by dynamically predicting and applying weights to tokens\nbased on their usefulness. This method employs a meta-learning framework that\noptimizes token importance predictions, facilitating targeted knowledge updates\nand minimizing forgetting. Also, we observe that existing benchmarks do not\nclearly exhibit the trade-off between learning and retaining, therefore we\npropose a new benchmark, \\textsc{LAMA-ckl}, to address this issue. Through\nexperiments conducted on both newly introduced and established CKL benchmarks,\nTAALM proves the state-of-the-art performance upon the baselines, and also\nshows synergistic compatibility when integrated with previous CKL approaches.\n","authors":["Yeongbin Seo","Dongha Lee","Jinyoung Yeo"],"pdf_url":"https://arxiv.org/pdf/2407.16920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10861v2","updated":"2024-07-24T00:10:04Z","published":"2024-05-17T15:48:30Z","title":"Tailoring Vaccine Messaging with Common-Ground Opinions","summary":" One way to personalize chatbot interactions is by establishing common ground\nwith the intended reader. A domain where establishing mutual understanding\ncould be particularly impactful is vaccine concerns and misinformation. Vaccine\ninterventions are forms of messaging which aim to answer concerns expressed\nabout vaccination. Tailoring responses in this domain is difficult, since\nopinions often have seemingly little ideological overlap. We define the task of\ntailoring vaccine interventions to a Common-Ground Opinion (CGO). Tailoring\nresponses to a CGO involves meaningfully improving the answer by relating it to\nan opinion or belief the reader holds. In this paper we introduce TAILOR-CGO, a\ndataset for evaluating how well responses are tailored to provided CGOs. We\nbenchmark several major LLMs on this task; finding GPT-4-Turbo performs\nsignificantly better than others. 
We also build automatic evaluation metrics,\nincluding an efficient and accurate BERT model that outperforms finetuned LLMs,\ninvestigate how to successfully tailor vaccine messaging to CGOs, and provide\nactionable recommendations from this investigation.\n Code and model weights: https://github.com/rickardstureborg/tailor-cgo\nDataset: https://huggingface.co/datasets/DukeNLP/tailor-cgo\n","authors":["Rickard Stureborg","Sanxing Chen","Ruoyu Xie","Aayushi Patel","Christopher Li","Chloe Qinyu Zhu","Tingnan Hu","Jun Yang","Bhuwan Dhingra"],"pdf_url":"https://arxiv.org/pdf/2405.10861v2.pdf","comment":"NAACL Findings 2024"},{"id":"http://arxiv.org/abs/2407.16607v2","updated":"2024-07-24T23:34:21Z","published":"2024-07-23T16:13:22Z","title":"Data Mixture Inference: What do BPE Tokenizers Reveal about their\n Training Data?","summary":" The pretraining data of today's strongest language models is opaque; in\nparticular, little is known about the proportions of various domains or\nlanguages represented. In this work, we tackle a task which we call data\nmixture inference, which aims to uncover the distributional make-up of training\ndata. We introduce a novel attack based on a previously overlooked source of\ninformation -- byte-pair encoding (BPE) tokenizers, used by the vast majority\nof modern language models. Our key insight is that the ordered list of merge\nrules learned by a BPE tokenizer naturally reveals information about the token\nfrequencies in its training data: the first merge is the most common byte pair,\nthe second is the most common pair after merging the first token, and so on.\nGiven a tokenizer's merge list along with data samples for each category of\ninterest, we formulate a linear program that solves for the proportion of each\ncategory in the tokenizer's training set. Importantly, to the extent to which\ntokenizer training data is representative of the pretraining data, we\nindirectly learn about pretraining data. In controlled experiments, we show\nthat our attack recovers mixture ratios with high precision for tokenizers\ntrained on known mixtures of natural languages, programming languages, and data\nsources. We then apply our approach to off-the-shelf tokenizers released with\nrecent LMs. We confirm much publicly disclosed information about these models,\nand also make several new inferences: GPT-4o's tokenizer is much more\nmultilingual than its predecessors, training on 39% non-English data; Llama3\nextends GPT-3.5's tokenizer primarily for multilingual (48%) use; GPT-3.5's and\nClaude's tokenizers are trained on predominantly code (~60%). We hope our work\nsheds light on current design practices for pretraining data, and inspires\ncontinued research into data mixture inference for LMs.\n","authors":["Jonathan Hayase","Alisa Liu","Yejin Choi","Sewoong Oh","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2407.16607v2.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.12784v2","updated":"2024-07-24T23:00:50Z","published":"2024-02-20T07:49:30Z","title":"Understanding and Mitigating the Threat of Vec2Text to Dense Retrieval\n Systems","summary":" The emergence of Vec2Text -- a method for text embedding inversion -- has\nraised serious privacy concerns for dense retrieval systems which use text\nembeddings, such as those offered by OpenAI and Cohere. This threat comes from\nthe ability for a malicious attacker with access to embeddings to reconstruct\nthe original text. 
In this paper, we investigate various factors related to\nembedding models that may impact text recoverability via Vec2Text. We explore\nfactors such as distance metrics, pooling functions, bottleneck pre-training,\ntraining with noise addition, embedding quantization, and embedding dimensions,\nwhich were not considered in the original Vec2Text paper. Through a\ncomprehensive analysis of these factors, our objective is to gain a deeper\nunderstanding of the key elements that affect the trade-offs between the text\nrecoverability and retrieval effectiveness of dense retrieval systems, offering\ninsights for practitioners designing privacy-aware dense retrieval systems. We\nalso propose a simple embedding transformation fix that guarantees equal\nranking effectiveness while mitigating the recoverability risk. Overall, this\nstudy reveals that Vec2Text could pose a threat to current dense retrieval\nsystems, but there are some effective methods to patch such systems.\n","authors":["Shengyao Zhuang","Bevan Koopman","Xiaoran Chu","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2402.12784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15612v2","updated":"2024-07-24T21:10:24Z","published":"2024-07-22T13:14:27Z","title":"Can GPT-4 learn to analyze moves in research article abstracts?","summary":" One of the most powerful and enduring ideas in written discourse analysis is\nthat genres can be described in terms of the moves which structure a writer's\npurpose. Considerable research has sought to identify these distinct\ncommunicative acts, but analyses have been beset by problems of subjectivity,\nreliability and the time-consuming need for multiple coders to confirm\nanalyses. In this paper we employ the affordances of GPT-4 to automate the\nannotation process by using natural language prompts. Focusing on abstracts\nfrom articles in four applied linguistics journals, we devise prompts which\nenable the model to identify moves effectively. The annotated outputs of these\nprompts were evaluated by two assessors with a third addressing disagreements.\nThe results show that an 8-shot prompt was more effective than one using two,\nconfirming that the inclusion of examples illustrating areas of variability can\nenhance GPT-4's ability to recognize multiple moves in a single sentence and\nreduce bias related to textual position. We suggest that GPT-4 offers\nconsiderable potential in automating this annotation process, when human actors\nwith domain specific linguistic expertise inform the prompting process.\n","authors":["Danni Yu","Marina Bondi","Ken Hyland"],"pdf_url":"https://arxiv.org/pdf/2407.15612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17638v1","updated":"2024-07-24T21:06:40Z","published":"2024-07-24T21:06:40Z","title":"Time Matters: Examine Temporal Effects on Biomedical Language Models","summary":" Time roots in applying language models for biomedical applications: models\nare trained on historical data and will be deployed for new or future data,\nwhich may vary from training data. While increasing biomedical tasks have\nemployed state-of-the-art language models, there are very few studies have\nexamined temporal effects on biomedical models when data usually shifts across\ndevelopment and deployment. This study fills the gap by statistically probing\nrelations between language model performance and data shifts across three\nbiomedical tasks. 
We deploy diverse metrics to evaluate model performance,\ndistance methods to measure data drifts, and statistical methods to quantify\ntemporal effects on biomedical language models. Our study shows that time\nmatters for deploying biomedical language models, while the degree of\nperformance degradation varies by biomedical tasks and statistical\nquantification approaches. We believe this study can establish a solid\nbenchmark to evaluate and assess temporal effects on deploying biomedical\nlanguage models.\n","authors":["Weisi Liu","Zhe He","Xiaolei Huang"],"pdf_url":"https://arxiv.org/pdf/2407.17638v1.pdf","comment":"Accept to AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2407.17636v1","updated":"2024-07-24T21:02:53Z","published":"2024-07-24T21:02:53Z","title":"IgnitionInnovators at \"Discharge Me!\": Chain-of-Thought Instruction\n Finetuning Large Language Models for Discharge Summaries","summary":" This paper presents our proposed approach to the Discharge Me! shared task,\ncollocated with the 23th Workshop on Biomedical Natural Language Processing\n(BioNLP). In this work, we develop an LLM-based framework for solving the\nDischarge Summary Documentation (DSD) task, i.e., generating the two critical\ntarget sections `Brief Hospital Course' and `Discharge Instructions' in the\ndischarge summary. By streamlining the recent instruction-finetuning process on\nLLMs, we explore several prompting strategies for optimally adapting LLMs to\nspecific generation task of DSD. Experimental results show that providing a\nclear output structure, complimented by a set of comprehensive\nChain-of-Thoughts (CoT) questions, effectively improves the model's reasoning\ncapability, and thereby, enhancing the structural correctness and faithfulness\nof clinical information in the generated text. Source code is available at:\nhttps://github.com/antangrocket1312/Discharge_LLM\n","authors":["An Quang Tang","Xiuzhen Zhang","Minh Ngoc Dinh"],"pdf_url":"https://arxiv.org/pdf/2407.17636v1.pdf","comment":"Accepted by BioNLP2024 Workshop"},{"id":"http://arxiv.org/abs/2401.07575v2","updated":"2024-07-24T20:50:04Z","published":"2024-01-15T10:18:08Z","title":"Cascaded Cross-Modal Transformer for Audio-Textual Classification","summary":" Speech classification tasks often require powerful language understanding\nmodels to grasp useful features, which becomes problematic when limited\ntraining data is available. To attain superior classification performance, we\npropose to harness the inherent value of multimodal representations by\ntranscribing speech using automatic speech recognition (ASR) models and\ntranslating the transcripts into different languages via pretrained translation\nmodels. We thus obtain an audio-textual (multimodal) representation for each\ndata sample. Subsequently, we combine language-specific Bidirectional Encoder\nRepresentations from Transformers (BERT) with Wav2Vec2.0 audio features via a\nnovel cascaded cross-modal transformer (CCMT). Our model is based on two\ncascaded transformer blocks. The first one combines text-specific features from\ndistinct languages, while the second one combines acoustic features with\nmultilingual features previously learned by the first transformer block. We\nemployed our system in the Requests Sub-Challenge of the ACM Multimedia 2023\nComputational Paralinguistics Challenge. CCMT was declared the winning\nsolution, obtaining an unweighted average recall (UAR) of 65.41% and 85.87% for\ncomplaint and request detection, respectively. 
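The temporal-effects analysis described above (distance methods to measure data drift, statistical methods to relate drift to performance) can be sketched in a few lines. The choice of Jensen-Shannon divergence over unigram counts, Spearman correlation, and the toy clinical snippets below are assumptions for illustration only; the study evaluates several metrics and tests.

```python
# Sketch: quantify how far each deployment period's data has drifted from the
# training period, then test whether performance degrades with drift.
from collections import Counter
import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import spearmanr

def unigram_dist(texts, vocab):
    counts = Counter(tok for t in texts for tok in t.lower().split())
    v = np.array([counts[w] for w in vocab], dtype=float) + 1.0  # add-one smoothing
    return v / v.sum()

# Hypothetical corpora and per-year F1 of a model trained on 2019 data.
corpora = {
    2019: ["patient admitted with chest pain", "started on aspirin"],
    2021: ["telehealth follow up for covid symptoms", "remote monitoring"],
    2023: ["post covid fatigue clinic referral", "wearable data reviewed"],
}
f1_by_year = {2019: 0.86, 2021: 0.81, 2023: 0.77}

vocab = sorted({tok for ts in corpora.values() for t in ts for tok in t.lower().split()})
ref = unigram_dist(corpora[2019], vocab)
drift = {y: jensenshannon(ref, unigram_dist(ts, vocab)) for y, ts in corpora.items()}

years = sorted(corpora)
rho, p = spearmanr([drift[y] for y in years], [f1_by_year[y] for y in years])
print({y: round(float(drift[y]), 3) for y in years}, "spearman rho:", round(rho, 2))
```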
Moreover, we applied our\nframework on the Speech Commands v2 and HarperValleyBank dialog data sets,\nsurpassing previous studies reporting results on these benchmarks. Our code is\nfreely available for download at: https://github.com/ristea/ccmt.\n","authors":["Nicolae-Catalin Ristea","Andrei Anghel","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2401.07575v2.pdf","comment":"Accepted for publication in Artificial Intelligence Review"},{"id":"http://arxiv.org/abs/2407.17629v1","updated":"2024-07-24T20:38:13Z","published":"2024-07-24T20:38:13Z","title":"Papilusion at DAGPap24: Paper or Illusion? Detecting AI-generated\n Scientific Papers","summary":" This paper presents Papilusion, an AI-generated scientific text detector\ndeveloped within the DAGPap24 shared task on detecting automatically generated\nscientific papers. We propose an ensemble-based approach and conduct ablation\nstudies to analyze the effect of the detector configurations on the\nperformance. Papilusion is ranked 6th on the leaderboard, and we improve our\nperformance after the competition ended, achieving 99.46 (+9.63) of the\nF1-score on the official test set.\n","authors":["Nikita Andreev","Alexander Shirnin","Vladislav Mikhailov","Ekaterina Artemova"],"pdf_url":"https://arxiv.org/pdf/2407.17629v1.pdf","comment":"to appear in DAGPAP 2024 proceedings"},{"id":"http://arxiv.org/abs/2407.17624v1","updated":"2024-07-24T20:30:55Z","published":"2024-07-24T20:30:55Z","title":"Traditional Methods Outperform Generative LLMs at Forecasting Credit\n Ratings","summary":" Large Language Models (LLMs) have been shown to perform well for many\ndownstream tasks. Transfer learning can enable LLMs to acquire skills that were\nnot targeted during pre-training. In financial contexts, LLMs can sometimes\nbeat well-established benchmarks. This paper investigates how well LLMs perform\nin the task of forecasting corporate credit ratings. We show that while LLMs\nare very good at encoding textual information, traditional methods are still\nvery competitive when it comes to encoding numeric and multimodal data. For our\ntask, current LLMs perform worse than a more traditional XGBoost architecture\nthat combines fundamental and macroeconomic data with high-density text-based\nembedding features.\n","authors":["Felix Drinkall","Janet B. Pierrehumbert","Stefan Zohren"],"pdf_url":"https://arxiv.org/pdf/2407.17624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17605v1","updated":"2024-07-24T19:29:13Z","published":"2024-07-24T19:29:13Z","title":"Coupling Speech Encoders with Downstream Text Models","summary":" We present a modular approach to building cascade speech translation (AST)\nmodels that guarantees that the resulting model performs no worse than the\n1-best cascade baseline while preserving state-of-the-art speech recognition\n(ASR) and text translation (MT) performance for a given task. Our novel\ncontribution is the use of an ``exporter'' layer that is trained under L2-loss\nto ensure a strong match between ASR embeddings and the MT token embeddings for\nthe 1-best sequence. The ``exporter'' output embeddings are fed directly to the\nMT model in lieu of 1-best token embeddings, thus guaranteeing that the\nresulting model performs no worse than the 1-best cascade baseline, while\nallowing back-propagation gradient to flow from the MT model into the ASR\ncomponents. 
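The credit-rating comparison above boils down to a multimodal feature construction: numeric fundamentals and macro variables concatenated with dense text embeddings, fed to a gradient-boosted tree model. The sketch below uses synthetic data and assumed feature names and dimensions; it shows only the feature layout, not the paper's pipeline.

```python
# Sketch: XGBoost over concatenated fundamentals + text-embedding features.
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
n = 2000
fundamentals = rng.normal(size=(n, 12))        # e.g. leverage, margins, macro indicators
text_embeddings = rng.normal(size=(n, 384))    # e.g. sentence embedding of filings/news
X = np.hstack([fundamentals, text_embeddings])
y = rng.integers(0, 3, size=n)                  # toy 3-class rating bucket

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
model.fit(X_tr, y_tr)
print("accuracy:", accuracy_score(y_te, model.predict(X_te)))
```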
The matched-embeddings cascade architecture provides a significant\nimprovement over its 1-best counterpart in scenarios where incremental training\nof the MT model is not an option and yet we seek to improve quality by\nleveraging (speech, transcription, translated transcription) data provided with\nthe AST task. The gain disappears when the MT model is incrementally trained on\nthe parallel text data available with the AST task. The approach holds promise\nfor other scenarios that seek to couple ASR encoders and immutable text models,\nsuch as large language models (LLMs).\n","authors":["Ciprian Chelba","Johan Schalkwyk"],"pdf_url":"https://arxiv.org/pdf/2407.17605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13067v3","updated":"2024-07-24T18:54:53Z","published":"2023-05-22T14:37:05Z","title":"Distilling Robustness into Natural Language Inference Models with\n Domain-Targeted Augmentation","summary":" Knowledge distillation optimises a smaller student model to behave similarly\nto a larger teacher model, retaining some of the performance benefits. While\nthis method can improve results on in-distribution examples, it does not\nnecessarily generalise to out-of-distribution (OOD) settings. We investigate\ntwo complementary methods for improving the robustness of the resulting student\nmodels on OOD domains. The first approach augments the distillation with\ngenerated unlabelled examples that match the target distribution. The second\nmethod upsamples data points among the training set that are similar to the\ntarget distribution. When applied on the task of natural language inference\n(NLI), our experiments on MNLI show that distillation with these modifications\noutperforms previous robustness solutions. We also find that these methods\nimprove performance on OOD domains even beyond the target domain.\n","authors":["Joe Stacey","Marek Rei"],"pdf_url":"https://arxiv.org/pdf/2305.13067v3.pdf","comment":"Accepted at ACL Findings 2024"},{"id":"http://arxiv.org/abs/2407.06023v3","updated":"2024-07-24T18:40:36Z","published":"2024-07-08T15:17:46Z","title":"Distilling System 2 into System 1","summary":" Large language models (LLMs) can spend extra compute during inference to\ngenerate intermediate thoughts, which helps to produce better final responses.\nSince Chain-of-Thought (Wei et al., 2022), many such System 2 techniques have\nbeen proposed, such as Rephrase and Respond (Deng et al., 2023a), System 2\nAttention (Weston and Sukhbaatar, 2023) and Branch-Solve-Merge (Saha et al.,\n2023). In this work we investigate self-supervised methods to ``compile''\n(distill) higher quality outputs from System 2 techniques back into LLM\ngenerations without intermediate reasoning token sequences, as this reasoning\nhas been distilled into System 1. We show that several such techniques can be\nsuccessfully distilled, resulting in improved results compared to the original\nSystem 1 performance, and with less inference cost than System 2. 
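The "exporter" layer described above can be sketched as a projection from ASR encoder states into the MT token-embedding space, trained with an L2 (MSE) loss against the embeddings of the 1-best transcript. The plain linear projection, the one-to-one alignment between ASR states and 1-best tokens, and all dimensions below are simplifying assumptions, not the paper's exact architecture.

```python
# Minimal exporter-layer sketch in PyTorch.
import torch
import torch.nn as nn

asr_dim, mt_dim, vocab = 512, 1024, 32000
exporter = nn.Linear(asr_dim, mt_dim)
mt_token_emb = nn.Embedding(vocab, mt_dim)    # frozen MT embedding table
mt_token_emb.requires_grad_(False)

opt = torch.optim.Adam(exporter.parameters(), lr=1e-4)

# Toy batch: ASR encoder outputs assumed aligned to the 1-best token sequence.
B, T = 8, 20
asr_states = torch.randn(B, T, asr_dim)
one_best_tokens = torch.randint(0, vocab, (B, T))

target = mt_token_emb(one_best_tokens)         # (B, T, mt_dim)
pred = exporter(asr_states)                    # (B, T, mt_dim)
loss = nn.functional.mse_loss(pred, target)
loss.backward()
opt.step()

# At inference, `pred` replaces token embeddings at the MT model's input, so the
# cascade can do no worse than consuming the 1-best tokens directly while still
# letting gradients flow back into the ASR components during joint training.
```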
We posit that\nsuch System 2 distillation will be an important feature of future continually\nlearning AI systems, enabling them to focus System 2 capabilities on the\nreasoning tasks that they cannot yet do well.\n","authors":["Ping Yu","Jing Xu","Jason Weston","Ilia Kulikov"],"pdf_url":"https://arxiv.org/pdf/2407.06023v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17987v4","updated":"2024-07-24T18:38:51Z","published":"2024-06-26T00:00:45Z","title":"Multi-step Inference over Unstructured Data","summary":" The advent of Large Language Models (LLMs) and Generative AI has\nrevolutionized natural language applications across various domains. However,\nhigh-stakes decision-making tasks in fields such as medical, legal and finance\nrequire a level of precision, comprehensiveness, and logical consistency that\npure LLM or Retrieval-Augmented-Generation (RAG) approaches often fail to\ndeliver. At Elemental Cognition (EC), we have developed a neuro-symbolic AI\nplatform to tackle these problems. The platform integrates fine-tuned LLMs for\nknowledge extraction and alignment with a robust symbolic reasoning engine for\nlogical inference, planning and interactive constraint solving. We describe\nCora, a Collaborative Research Assistant built on this platform, that is\ndesigned to perform complex research and discovery tasks in high-stakes\ndomains. This paper discusses the multi-step inference challenges inherent in\nsuch domains, critiques the limitations of existing LLM-based methods, and\ndemonstrates how Cora's neuro-symbolic approach effectively addresses these\nissues. We provide an overview of the system architecture, key algorithms for\nknowledge extraction and formal reasoning, and present preliminary evaluation\nresults that highlight Cora's superior performance compared to well-known LLM\nand RAG baselines.\n","authors":["Aditya Kalyanpur","Kailash Karthik Saravanakumar","Victor Barres","CJ McFate","Lori Moon","Nati Seifu","Maksim Eremeev","Jose Barrera","Abraham Bautista-Castillo","Eric Brown","David Ferrucci"],"pdf_url":"https://arxiv.org/pdf/2406.17987v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17546v1","updated":"2024-07-24T17:25:12Z","published":"2024-07-24T17:25:12Z","title":"Exploring Domain Robust Lightweight Reward Models based on Router\n Mechanism","summary":" Recent advancements in large language models have heavily relied on the large\nreward model from reinforcement learning from human feedback for fine-tuning.\nHowever, the use of a single reward model across various domains may not always\nbe optimal, often requiring retraining from scratch when new domain data is\nintroduced. To address these challenges, we explore the utilization of small\nlanguage models operating in a domain-specific manner based on router\nmechanisms. Our three approaches are: 1) utilize mixture of experts to form a\nsingle reward model by modularizing an internal router and experts, 2)\nemploying external router to select the appropriate reward model from multiple\ndomain-specific models, and 3) the framework reduces parameter size by loading\nreward models and router adapters onto a single small language model using\nadapters. 
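The second routing approach listed above (an external router picking one of several domain-specific reward models) can be sketched as follows. The embedding-similarity router, the stub reward models, and all names below are assumptions; any small classifier could play the router's role.

```python
# Sketch: external router over domain-specific reward models.
import numpy as np

class RewardModel:
    """Stand-in for a small domain-specific reward model."""
    def __init__(self, name: str):
        self.name = name
    def score(self, prompt: str, response: str) -> float:
        return float(len(response)) / 100.0      # placeholder scoring logic

def embed(text: str) -> np.ndarray:
    """Stand-in for a sentence embedding (deterministic toy vector)."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.normal(size=64)

domains = {"code": RewardModel("code"),
           "safety": RewardModel("safety"),
           "chat": RewardModel("chat")}
# Router: nearest domain centroid in embedding space.
centroids = {d: embed(f"examples of {d} prompts") for d in domains}

def route_and_score(prompt: str, response: str) -> tuple[str, float]:
    q = embed(prompt)
    best = max(centroids, key=lambda d: float(q @ centroids[d]))
    return best, domains[best].score(prompt, response)

print(route_and_score("Write a Python function to sort a list", "def sort(xs): ..."))
```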
Experimental validation underscores the effectiveness of our\napproach, demonstrating performance comparable to baseline methods while also\nreducing the total parameter size.\n","authors":["Hyuk Namgoong","Jeesu Jung","Sangkeun Jung","Yoonhyung Roh"],"pdf_url":"https://arxiv.org/pdf/2407.17546v1.pdf","comment":"This paper is accepted for ACL 2024"},{"id":"http://arxiv.org/abs/2407.17545v1","updated":"2024-07-24T16:33:04Z","published":"2024-07-24T16:33:04Z","title":"Large Language Models for Anomaly Detection in Computational Workflows:\n from Supervised Fine-Tuning to In-Context Learning","summary":" Anomaly detection in computational workflows is critical for ensuring system\nreliability and security. However, traditional rule-based methods struggle to\ndetect novel anomalies. This paper leverages large language models (LLMs) for\nworkflow anomaly detection by exploiting their ability to learn complex data\npatterns. Two approaches are investigated: 1) supervised fine-tuning (SFT),\nwhere pre-trained LLMs are fine-tuned on labeled data for sentence\nclassification to identify anomalies, and 2) in-context learning (ICL) where\nprompts containing task descriptions and examples guide LLMs in few-shot\nanomaly detection without fine-tuning. The paper evaluates the performance,\nefficiency, generalization of SFT models, and explores zero-shot and few-shot\nICL prompts and interpretability enhancement via chain-of-thought prompting.\nExperiments across multiple workflow datasets demonstrate the promising\npotential of LLMs for effective anomaly detection in complex executions.\n","authors":["Hongwei Jin","George Papadimitriou","Krishnan Raghavan","Pawel Zuk","Prasanna Balaprakash","Cong Wang","Anirban Mandal","Ewa Deelman"],"pdf_url":"https://arxiv.org/pdf/2407.17545v1.pdf","comment":"12 pages, 14 figures, paper is accepted by SC'24, source code, see:\n https://github.com/PoSeiDon-Workflows/LLM_AD"},{"id":"http://arxiv.org/abs/2407.17532v1","updated":"2024-07-24T03:33:47Z","published":"2024-07-24T03:33:47Z","title":"Generative artificial intelligence in dentistry: Current approaches and\n future challenges","summary":" Artificial intelligence (AI) has become a commodity for people because of the\nadvent of generative AI (GenAI) models that bridge the usability gap of AI by\nproviding a natural language interface to interact with complex models. These\nGenAI models range from text generation - such as two-way chat systems - to the\ngeneration of image or video from textual descriptions input by a user. These\nadvancements in AI have impacted Dentistry in multiple aspects. In dental\neducation, the student now has the opportunity to solve a plethora of questions\nby only prompting a GenAI model and have the answer in a matter of seconds.\nGenAI models can help us deliver better patient healthcare by helping\npractitioners gather knowledge quickly and efficiently. Finally, GenAI can also\nbe used in dental research, where the applications range from new drug\ndiscovery to assistance in academic writing. 
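The in-context-learning setup for workflow anomaly detection described above amounts to packing a task description, a few labelled log records, and optionally a chain-of-thought instruction into a prompt. The example records, labels, and the generic `llm` callable below are hypothetical placeholders, not a specific model API.

```python
# Sketch: few-shot ICL prompt assembly for workflow anomaly detection.
EXAMPLES = [
    ("job=align_reads runtime=312s exit=0 io_wait=2%",   "normal"),
    ("job=align_reads runtime=3120s exit=0 io_wait=61%", "anomalous"),
    ("job=variant_call runtime=95s exit=137 retries=3",  "anomalous"),
]

def build_prompt(record: str, use_cot: bool = False) -> str:
    lines = ["You label computational-workflow log records as 'normal' or 'anomalous'."]
    for text, label in EXAMPLES:
        lines.append(f"Record: {text}\nLabel: {label}")
    if use_cot:
        lines.append("Think step by step about runtime, exit codes and I/O before answering.")
    lines.append(f"Record: {record}\nLabel:")
    return "\n\n".join(lines)

def classify(record: str, llm) -> str:
    """`llm` is any callable mapping a prompt string to a completion string."""
    return llm(build_prompt(record, use_cot=True)).strip().lower()

print(build_prompt("job=align_reads runtime=2890s exit=0 io_wait=74%"))
```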
In this review, we first define\nGenAI models and describe their multiple generation modalities; then, we\nexplain and discuss their current and potential applications in Dentistry; and\nfinally, we describe the challenges these new technologies impose in our area.\n","authors":["Fabián Villena","Claudia Véliz","Rosario García-Huidobro","Sebastián Aguayo"],"pdf_url":"https://arxiv.org/pdf/2407.17532v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.17470v1","updated":"2024-07-24T17:59:43Z","published":"2024-07-24T17:59:43Z","title":"SV4D: Dynamic 3D Content Generation with Multi-Frame and Multi-View\n Consistency","summary":" We present Stable Video 4D (SV4D), a latent video diffusion model for\nmulti-frame and multi-view consistent dynamic 3D content generation. Unlike\nprevious methods that rely on separately trained generative models for video\ngeneration and novel view synthesis, we design a unified diffusion model to\ngenerate novel view videos of dynamic 3D objects. Specifically, given a\nmonocular reference video, SV4D generates novel views for each video frame that\nare temporally consistent. We then use the generated novel view videos to\noptimize an implicit 4D representation (dynamic NeRF) efficiently, without the\nneed for cumbersome SDS-based optimization used in most prior works. To train\nour unified novel view video generation model, we curated a dynamic 3D object\ndataset from the existing Objaverse dataset. Extensive experimental results on\nmultiple datasets and user studies demonstrate SV4D's state-of-the-art\nperformance on novel-view video synthesis as well as 4D generation compared to\nprior works.\n","authors":["Yiming Xie","Chun-Han Yao","Vikram Voleti","Huaizu Jiang","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2407.17470v1.pdf","comment":"Project page: https://sv4d.github.io/"},{"id":"http://arxiv.org/abs/2407.17460v1","updated":"2024-07-24T17:57:21Z","published":"2024-07-24T17:57:21Z","title":"SoNIC: Safe Social Navigation with Adaptive Conformal Inference and\n Constrained Reinforcement Learning","summary":" Reinforcement Learning (RL) has enabled social robots to generate\ntrajectories without human-designed rules or interventions, which makes it more\neffective than hard-coded systems for generalizing to complex real-world\nscenarios. However, social navigation is a safety-critical task that requires\nrobots to avoid collisions with pedestrians while previous RL-based solutions\nfall short in safety performance in complex environments. To enhance the safety\nof RL policies, to the best of our knowledge, we propose the first algorithm,\nSoNIC, that integrates adaptive conformal inference (ACI) with constrained\nreinforcement learning (CRL) to learn safe policies for social navigation. More\nspecifically, our method augments RL observations with ACI-generated\nnonconformity scores and provides explicit guidance for agents to leverage the\nuncertainty metrics to avoid safety-critical areas by incorporating safety\nconstraints with spatial relaxation. Our method outperforms state-of-the-art\nbaselines in terms of both safety and adherence to social norms by a large\nmargin and demonstrates much stronger robustness to out-of-distribution\nscenarios. Our code and video demos are available on our project website:\nhttps://sonic-social-nav.github.io/.\n","authors":["Jianpeng Yao","Xiaopan Zhang","Yu Xia","Zejin Wang","Amit K. 
Roy-Chowdhury","Jiachen Li"],"pdf_url":"https://arxiv.org/pdf/2407.17460v1.pdf","comment":"Project website: https://sonic-social-nav.github.io/"},{"id":"http://arxiv.org/abs/2407.17457v1","updated":"2024-07-24T17:50:00Z","published":"2024-07-24T17:50:00Z","title":"CSCPR: Cross-Source-Context Indoor RGB-D Place Recognition","summary":" We present a new algorithm, Cross-Source-Context Place Recognition (CSCPR),\nfor RGB-D indoor place recognition that integrates global retrieval and\nreranking into a single end-to-end model. Unlike prior approaches that\nprimarily focus on the RGB domain, CSCPR is designed to handle the RGB-D data.\nWe extend the Context-of-Clusters (CoCs) for handling noisy colorized point\nclouds and introduce two novel modules for reranking: the Self-Context Cluster\n(SCC) and Cross Source Context Cluster (CSCC), which enhance feature\nrepresentation and match query-database pairs based on local features,\nrespectively. We also present two new datasets, ScanNetIPR and ARKitIPR. Our\nexperiments demonstrate that CSCPR significantly outperforms state-of-the-art\nmodels on these datasets by at least 36.5% in Recall@1 at ScanNet-PR dataset\nand 44% in new datasets. Code and datasets will be released.\n","authors":["Jing Liang","Zhuo Deng","Zheming Zhou","Min Sun","Omid Ghasemalizadeh","Cheng-Hao Kuo","Arnie Sen","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2407.17457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17453v1","updated":"2024-07-24T17:37:05Z","published":"2024-07-24T17:37:05Z","title":"$VILA^2$: VILA Augmented VILA","summary":" Visual language models (VLMs) have rapidly progressed, driven by the success\nof large language models (LLMs). While model architectures and training\ninfrastructures advance rapidly, data curation remains under-explored. When\ndata quantity and quality become a bottleneck, existing work either directly\ncrawls more raw data from the Internet that does not have a guarantee of data\nquality or distills from black-box commercial models (e.g., GPT-4V / Gemini)\ncausing the performance upper bounded by that model. In this work, we introduce\na novel approach that includes a self-augment step and a specialist-augment\nstep to iteratively improve data quality and model performance. In the\nself-augment step, a VLM recaptions its own pretraining data to enhance data\nquality, and then retrains from scratch using this refined dataset to improve\nmodel performance. This process can iterate for several rounds. Once\nself-augmentation saturates, we employ several specialist VLMs finetuned from\nthe self-augmented VLM with domain-specific expertise, to further infuse\nspecialist knowledge into the generalist VLM through task-oriented recaptioning\nand retraining. With the combined self-augmented and specialist-augmented\ntraining, we introduce $VILA^2$ (VILA-augmented-VILA), a VLM family that\nconsistently improves the accuracy on a wide range of tasks over prior art, and\nachieves new state-of-the-art results on MMMU leaderboard among open-sourced\nmodels.\n","authors":["Yunhao Fang","Ligeng Zhu","Yao Lu","Yan Wang","Pavlo Molchanov","Jang Hyun Cho","Marco Pavone","Song Han","Hongxu Yin"],"pdf_url":"https://arxiv.org/pdf/2407.17453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17449v1","updated":"2024-07-24T17:30:21Z","published":"2024-07-24T17:30:21Z","title":"Looking at Model Debiasing through the Lens of Anomaly Detection","summary":" It is widely recognized that deep neural networks are sensitive to bias in\nthe data. 
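The SoNIC recipe above augments RL observations with nonconformity scores from adaptive conformal inference (ACI). Only the ACI step-size update below follows the standard ACI rule; the pedestrian-prediction-error score, the window size, and how the score is appended to the observation are assumptions for illustration.

```python
# Sketch: ACI-style adaptive threshold over nonconformity scores, appended to an
# RL observation as an uncertainty signal.
from collections import deque
import numpy as np

class ACI:
    def __init__(self, target_alpha=0.1, gamma=0.05, window=200):
        self.alpha_t = target_alpha     # adapted miscoverage level
        self.target = target_alpha
        self.gamma = gamma
        self.scores = deque(maxlen=window)   # recent nonconformity scores

    def quantile(self) -> float:
        if not self.scores:
            return np.inf
        q = min(max(1.0 - self.alpha_t, 0.0), 1.0)
        return float(np.quantile(np.array(self.scores), q))

    def update(self, score: float) -> float:
        """Observe a new score, adapt alpha, and return the current threshold."""
        thr = self.quantile()
        err = 1.0 if score > thr else 0.0              # miscoverage indicator
        self.alpha_t += self.gamma * (self.target - err)
        self.scores.append(score)
        return thr

aci = ACI()
obs = np.zeros(10)            # base RL observation (toy)
pred_err = 0.4                # |predicted - actual| pedestrian offset in metres (toy)
threshold = aci.update(pred_err)
extra = [pred_err, threshold if np.isfinite(threshold) else 0.0]
augmented_obs = np.concatenate([obs, extra])
```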
This means that during training these models are likely to learn\nspurious correlations between data and labels, resulting in limited\ngeneralization abilities and low performance. In this context, model debiasing\napproaches can be devised aiming at reducing the model's dependency on such\nunwanted correlations, either leveraging the knowledge of bias information or\nnot. In this work, we focus on the latter and more realistic scenario, showing\nthe importance of accurately predicting the bias-conflicting and bias-aligned\nsamples to obtain compelling performance in bias mitigation. On this ground, we\npropose to conceive the problem of model bias from an out-of-distribution\nperspective, introducing a new bias identification method based on anomaly\ndetection. We claim that when data is mostly biased, bias-conflicting samples\ncan be regarded as outliers with respect to the bias-aligned distribution in\nthe feature space of a biased model, thus allowing for precisely detecting them\nwith an anomaly detection method. Coupling the proposed bias identification\napproach with bias-conflicting data upsampling and augmentation in a two-step\nstrategy, we reach state-of-the-art performance on synthetic and real benchmark\ndatasets. Ultimately, our proposed approach shows that the data bias issue does\nnot necessarily require complex debiasing methods, given that an accurate bias\nidentification procedure is defined.\n","authors":["Vito Paolo Pastore","Massimiliano Ciranni","Davide Marinelli","Francesca Odone","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2407.17449v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.17442v1","updated":"2024-07-24T17:19:58Z","published":"2024-07-24T17:19:58Z","title":"AHMF: Adaptive Hybrid-Memory-Fusion Model for Driver Attention\n Prediction","summary":" Accurate driver attention prediction can serve as a critical reference for\nintelligent vehicles in understanding traffic scenes and making informed\ndriving decisions. Though existing studies on driver attention prediction\nimproved performance by incorporating advanced saliency detection techniques,\nthey overlooked the opportunity to achieve human-inspired prediction by\nanalyzing driving tasks from a cognitive science perspective. During driving,\ndrivers' working memory and long-term memory play crucial roles in scene\ncomprehension and experience retrieval, respectively. Together, they form\nsituational awareness, facilitating drivers to quickly understand the current\ntraffic situation and make optimal decisions based on past driving experiences.\nTo explicitly integrate these two types of memory, this paper proposes an\nAdaptive Hybrid-Memory-Fusion (AHMF) driver attention prediction model to\nachieve more human-like predictions. Specifically, the model first encodes\ninformation about specific hazardous stimuli in the current scene to form\nworking memories. Then, it adaptively retrieves similar situational experiences\nfrom the long-term memory for final prediction. Utilizing domain adaptation\ntechniques, the model performs parallel training across multiple datasets,\nthereby enriching the accumulated driving experience within the long-term\nmemory module. 
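The bias-identification idea above treats bias-conflicting samples as outliers of the feature distribution learned by a biased model. The sketch below uses a per-class IsolationForest and a fixed upsampling factor; both are assumptions, since the paper's concrete anomaly detector and two-step schedule may differ.

```python
# Sketch: flag bias-conflicting samples as anomalies in a biased model's feature space.
import numpy as np
from sklearn.ensemble import IsolationForest

def find_bias_conflicting(features: np.ndarray, labels: np.ndarray, contamination=0.1):
    """Return a boolean mask marking likely bias-conflicting samples."""
    mask = np.zeros(len(labels), dtype=bool)
    for c in np.unique(labels):
        idx = np.where(labels == c)[0]
        det = IsolationForest(contamination=contamination, random_state=0)
        det.fit(features[idx])
        mask[idx] = det.predict(features[idx]) == -1   # -1 marks an outlier
    return mask

# Toy features from a "biased" encoder: a dense aligned cluster plus a few outliers.
rng = np.random.default_rng(0)
feats = np.vstack([rng.normal(0, 1, (95, 32)), rng.normal(5, 1, (5, 32))])
labels = np.zeros(100, dtype=int)
conflicting = find_bias_conflicting(feats, labels)
print("flagged:", int(conflicting.sum()), "samples")

# Two-step use: upsample the flagged samples (and augment them) when retraining.
upsample_idx = np.concatenate([np.arange(100), np.repeat(np.where(conflicting)[0], 5)])
```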
Compared to existing models, our model demonstrates significant\nimprovements across various metrics on multiple public datasets, proving the\neffectiveness of integrating hybrid memories in driver attention prediction.\n","authors":["Dongyang Xu","Qingfan Wang","Ji Ma","Xiangyun Zeng","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17438v1","updated":"2024-07-24T17:15:58Z","published":"2024-07-24T17:15:58Z","title":"HumanVid: Demystifying Training Data for Camera-controllable Human Image\n Animation","summary":" Human image animation involves generating videos from a character photo,\nallowing user control and unlocking potential for video and movie production.\nWhile recent approaches yield impressive results using high-quality training\ndata, the inaccessibility of these datasets hampers fair and transparent\nbenchmarking. Moreover, these approaches prioritize 2D human motion and\noverlook the significance of camera motions in videos, leading to limited\ncontrol and unstable video generation.To demystify the training data, we\npresent HumanVid, the first large-scale high-quality dataset tailored for human\nimage animation, which combines crafted real-world and synthetic data. For the\nreal-world data, we compile a vast collection of copyright-free real-world\nvideos from the internet. Through a carefully designed rule-based filtering\nstrategy, we ensure the inclusion of high-quality videos, resulting in a\ncollection of 20K human-centric videos in 1080P resolution. Human and camera\nmotion annotation is accomplished using a 2D pose estimator and a SLAM-based\nmethod. For the synthetic data, we gather 2,300 copyright-free 3D avatar assets\nto augment existing available 3D assets. Notably, we introduce a rule-based\ncamera trajectory generation method, enabling the synthetic pipeline to\nincorporate diverse and precise camera motion annotation, which can rarely be\nfound in real-world data. To verify the effectiveness of HumanVid, we establish\na baseline model named CamAnimate, short for Camera-controllable Human\nAnimation, that considers both human and camera motions as conditions. Through\nextensive experimentation, we demonstrate that such simple baseline training on\nour HumanVid achieves state-of-the-art performance in controlling both human\npose and camera motions, setting a new benchmark. Code and data will be\npublicly available at \\url{https://github.com/zhenzhiwang/HumanVid/}.\n","authors":["Zhenzhi Wang","Yixuan Li","Yanhong Zeng","Youqing Fang","Yuwei Guo","Wenran Liu","Jing Tan","Kai Chen","Tianfan Xue","Bo Dai","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17438v1.pdf","comment":"camera controllable human image animation, a dataset and a baseline"},{"id":"http://arxiv.org/abs/2407.15708v2","updated":"2024-07-24T16:55:08Z","published":"2024-07-22T15:17:39Z","title":"SwinSF: Image Reconstruction from Spatial-Temporal Spike Streams","summary":" The spike camera, with its high temporal resolution, low latency, and high\ndynamic range, addresses high-speed imaging challenges like motion blur. It\ncaptures photons at each pixel independently, creating binary spike streams\nrich in temporal information but challenging for image reconstruction. Current\nalgorithms, both traditional and deep learning-based, still need to be improved\nin the utilization of the rich temporal detail and the restoration of the\ndetails of the reconstructed image. 
To overcome this, we introduce Swin\nSpikeformer (SwinSF), a novel model for dynamic scene reconstruction from spike\nstreams. SwinSF is composed of Spike Feature Extraction, Spatial-Temporal\nFeature Extraction, and Final Reconstruction Module. It combines shifted window\nself-attention and proposed temporal spike attention, ensuring a comprehensive\nfeature extraction that encapsulates both spatial and temporal dynamics,\nleading to a more robust and accurate reconstruction of spike streams.\nFurthermore, we build a new synthesized dataset for spike image reconstruction\nwhich matches the resolution of the latest spike camera, ensuring its relevance\nand applicability to the latest developments in spike camera imaging.\nExperimental results demonstrate that the proposed network SwinSF sets a new\nbenchmark, achieving state-of-the-art performance across a series of datasets,\nincluding both real-world and synthesized data across various resolutions. Our\ncodes and proposed dataset will be available soon.\n","authors":["Liangyan Jiang","Chuang Zhu","Yanxu Chen"],"pdf_url":"https://arxiv.org/pdf/2407.15708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17418v1","updated":"2024-07-24T16:53:17Z","published":"2024-07-24T16:53:17Z","title":"3D Gaussian Splatting: Survey, Technologies, Challenges, and\n Opportunities","summary":" 3D Gaussian Splatting (3DGS) has emerged as a prominent technique with the\npotential to become a mainstream method for 3D representations. It can\neffectively transform multi-view images into explicit 3D Gaussian\nrepresentations through efficient training, and achieve real-time rendering of\nnovel views. This survey aims to analyze existing 3DGS-related works from\nmultiple intersecting perspectives, including related tasks, technologies,\nchallenges, and opportunities. The primary objective is to provide newcomers\nwith a rapid understanding of the field and to assist researchers in\nmethodically organizing existing technologies and challenges. Specifically, we\ndelve into the optimization, application, and extension of 3DGS, categorizing\nthem based on their focuses or motivations. Additionally, we summarize and\nclassify nine types of technical modules and corresponding improvements\nidentified in existing works. Based on these analyses, we further examine the\ncommon challenges and technologies across various tasks, proposing potential\nresearch opportunities.\n","authors":["Yanqi Bao","Tianyu Ding","Jing Huo","Yaoli Liu","Yuxin Li","Wenbin Li","Yang Gao","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.17418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17412v1","updated":"2024-07-24T16:47:45Z","published":"2024-07-24T16:47:45Z","title":"(PASS) Visual Prompt Locates Good Structure Sparsity through a Recurrent\n HyperNetwork","summary":" Large-scale neural networks have demonstrated remarkable performance in\ndifferent domains like vision and language processing, although at the cost of\nmassive computation resources. As illustrated by compression literature,\nstructural model pruning is a prominent algorithm to encourage model\nefficiency, thanks to its acceleration-friendly sparsity patterns. One of the\nkey questions of structural pruning is how to estimate the channel\nsignificance. In parallel, work on data-centric AI has shown that\nprompting-based techniques enable impressive generalization of large language\nmodels across diverse downstream tasks. 
In this paper, we investigate a\ncharming possibility - \textit{leveraging visual prompts to capture the channel\nimportance and derive high-quality structural sparsity}. To this end, we\npropose a novel algorithmic framework, namely \texttt{PASS}. It is a tailored\nhyper-network to take both visual prompts and network weight statistics as\ninput, and output layer-wise channel sparsity in a recurrent manner. Such\ndesigns consider the intrinsic channel dependency between layers. Comprehensive\nexperiments across multiple network architectures and six datasets demonstrate\nthe superiority of \texttt{PASS} in locating good structural sparsity. For\nexample, at the same FLOPs level, \texttt{PASS} subnetworks achieve $1\%\sim\n3\%$ better accuracy on Food101 dataset; or with a similar performance of\n$80\%$ accuracy, \texttt{PASS} subnetworks obtain $0.35\times$ more speedup\nthan the baselines.\n","authors":["Tianjin Huang","Fang Meng","Li Shen","Fan Liu","Yulong Pei","Mykola Pechenizkiy","Shiwei Liu","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17412v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.17409v1","updated":"2024-07-24T16:43:38Z","published":"2024-07-24T16:43:38Z","title":"Generation of Training Data from HD Maps in the Lanelet2 Framework","summary":" Using HD maps directly as training data for machine learning tasks has seen a\nmassive surge in popularity and shown promising results, e.g. in the field of\nmap perception. Despite that, a standardized HD map framework supporting all\nparts of map-based automated driving and training label generation from map\ndata does not exist. Furthermore, feeding map perception models with map data\nas part of the input during real-time inference is not addressed by the\nresearch community. In order to fill this gap, we present lanelet2_ml_converter,\nan integrated extension to the HD map framework Lanelet2, widely used in\nautomated driving systems by academia and industry. With this addition Lanelet2\nunifies map based automated driving, machine learning inference and training,\nall from a single source of map data and format. Requirements for a unified\nframework are analyzed and the implementation of these requirements is\ndescribed. The usability of labels in state of the art machine learning is\ndemonstrated with application examples from the field of map perception. The\nsource code is available embedded in the Lanelet2 framework under\nhttps://github.com/fzi-forschungszentrum-informatik/Lanelet2/tree/feature_ml_converter\n","authors":["Fabian Immel","Richard Fehler","Frank Bieder","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2407.17409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03588v2","updated":"2024-07-24T16:26:41Z","published":"2024-07-04T02:45:29Z","title":"FDS: Feedback-guided Domain Synthesis with Multi-Source Conditional\n Diffusion Models for Domain Generalization","summary":" Domain Generalization techniques aim to enhance model robustness by\nsimulating novel data distributions during training, typically through various\naugmentation or stylization strategies. However, these methods frequently\nsuffer from limited control over the diversity of generated images and lack\nassurance that these images span distinct distributions. 
To address these\nchallenges, we propose FDS, Feedback-guided Domain Synthesis, a novel strategy\nthat employs diffusion models to synthesize novel, pseudo-domains by training a\nsingle model on all source domains and performing domain mixing based on\nlearned features. By incorporating images that pose classification challenges\nto models trained on original samples, alongside the original dataset, we\nensure the generation of a training set that spans a broad distribution\nspectrum. Our comprehensive evaluations demonstrate that this methodology sets\nnew benchmarks in domain generalization performance across a range of\nchallenging datasets, effectively managing diverse types of domain shifts. The\nimplementation is available at: \\url{https://github.com/Mehrdad-Noori/FDS.git}.\n","authors":["Mehrdad Noori","Milad Cheraghalikhani","Ali Bahri","Gustavo Adolfo Vargas Hakim","David Osowiechi","Moslem Yazdanpanah","Ismail Ben Ayed","Christian Desrosiers"],"pdf_url":"https://arxiv.org/pdf/2407.03588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17399v1","updated":"2024-07-24T16:23:46Z","published":"2024-07-24T16:23:46Z","title":"Self-Calibrated Variance-Stabilizing Transformations for Real-World\n Image Denoising","summary":" Supervised deep learning has become the method of choice for image denoising.\nIt involves the training of neural networks on large datasets composed of pairs\nof noisy and clean images. However, the necessity of training data that are\nspecific to the targeted application constrains the widespread use of denoising\nnetworks. Recently, several approaches have been developed to overcome this\ndifficulty by whether artificially generating realistic clean/noisy image\npairs, or training exclusively on noisy images. In this paper, we show that,\ncontrary to popular belief, denoising networks specialized in the removal of\nGaussian noise can be efficiently leveraged in favor of real-world image\ndenoising, even without additional training. For this to happen, an appropriate\nvariance-stabilizing transform (VST) has to be applied beforehand. We propose\nan algorithm termed Noise2VST for the learning of such a model-free VST. Our\napproach requires only the input noisy image and an off-the-shelf Gaussian\ndenoiser. We demonstrate through extensive experiments the efficiency and\nsuperiority of Noise2VST in comparison to existing methods trained in the\nabsence of specific clean/noisy pairs.\n","authors":["Sébastien Herbreteau","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2407.17399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17398v1","updated":"2024-07-24T16:22:27Z","published":"2024-07-24T16:22:27Z","title":"3D Question Answering for City Scene Understanding","summary":" 3D multimodal question answering (MQA) plays a crucial role in scene\nunderstanding by enabling intelligent agents to comprehend their surroundings\nin 3D environments. While existing research has primarily focused on indoor\nhousehold tasks and outdoor roadside autonomous driving tasks, there has been\nlimited exploration of city-level scene understanding tasks. Furthermore,\nexisting research faces challenges in understanding city scenes, due to the\nabsence of spatial semantic information and human-environment interaction\ninformation at the city level.To address these challenges, we investigate 3D\nMQA from both dataset and method perspectives. 
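The variance-stabilization idea behind Noise2VST above can be illustrated with the classical Anscombe transform for Poisson-like noise. This is a stand-in only: Noise2VST learns a model-free VST from the noisy image itself, whereas the fixed Anscombe pair and the Gaussian-filter "denoiser" below are assumptions made purely to show the stabilize / denoise / invert pattern.

```python
# Sketch: apply a VST, run an off-the-shelf Gaussian denoiser, invert the VST.
import numpy as np
from scipy.ndimage import gaussian_filter

def anscombe(x):
    """Map Poisson-ish noise to approximately unit-variance Gaussian noise."""
    return 2.0 * np.sqrt(np.maximum(x, 0) + 3.0 / 8.0)

def inverse_anscombe(y):
    """Simple algebraic inverse (unbiased variants exist)."""
    return (y / 2.0) ** 2 - 3.0 / 8.0

rng = np.random.default_rng(0)
clean = np.linspace(0, 30, 64 * 64).reshape(64, 64)
noisy = rng.poisson(clean).astype(float)

stabilized = anscombe(noisy)
denoised_vst = gaussian_filter(stabilized, sigma=1.0)   # stand-in Gaussian denoiser
restored = inverse_anscombe(denoised_vst)

print(f"MSE before: {np.mean((noisy - clean) ** 2):.2f}  "
      f"after: {np.mean((restored - clean) ** 2):.2f}")
```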
From the dataset perspective, we\nintroduce a novel 3D MQA dataset named City-3DQA for city-level scene\nunderstanding, which is the first dataset to incorporate scene semantic and\nhuman-environment interactive tasks within the city. From the method\nperspective, we propose a Scene graph enhanced City-level Understanding method\n(Sg-CityU), which utilizes the scene graph to introduce the spatial semantic. A\nnew benchmark is reported and our proposed Sg-CityU achieves accuracy of 63.94\n% and 63.76 % in different settings of City-3DQA. Compared to indoor 3D MQA\nmethods and zero-shot using advanced large language models (LLMs), Sg-CityU\ndemonstrates state-of-the-art (SOTA) performance in robustness and\ngeneralization.\n","authors":["Penglei Sun","Yaoxian Song","Xiang Liu","Xiaofei Yang","Qiang Wang","Tiefeng Li","Yang Yang","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2407.17398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17380v1","updated":"2024-07-24T16:04:18Z","published":"2024-07-24T16:04:18Z","title":"2D and 3D Deep Learning Models for MRI-based Parkinson's Disease\n Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold\n Networks, Convolutional Neural Networks, and Graph Convolutional Networks","summary":" Early and accurate diagnosis of Parkinson's Disease (PD) remains challenging.\nThis study compares deep learning architectures for MRI-based PD\nclassification, introducing the first three-dimensional (3D) implementation of\nConvolutional Kolmogorov-Arnold Networks (ConvKANs), a new approach that\ncombines convolution layers with adaptive, spline-based activations. We\nevaluated Convolutional Neural Networks (CNNs), ConvKANs, and Graph\nConvolutional Networks (GCNs) using three open-source datasets; a total of 142\nparticipants (75 with PD and 67 age-matched healthy controls). For 2D analysis,\nwe extracted 100 axial slices centred on the midbrain from each T1-weighted\nscan. For 3D analysis, we used the entire volumetric scans. ConvKANs integrate\nlearnable B-spline functions with convolutional layers. GCNs represent MRI data\nas graphs, theoretically capturing structural relationships that may be\noverlooked by traditional approaches. Interpretability visualizations,\nincluding the first ConvKAN spline activation maps, and projections of graph\nnode embeddings, were depicted. ConvKANs demonstrated high performance across\ndatasets and dimensionalities, achieving the highest 2D AUROC (0.98) in one\ndataset and matching CNN peak 3D performance (1.00). CNN models performed well,\nwhile GCN models improved in 3D analyses, reaching up to 0.97 AUROC. 3D\nimplementations yielded higher AUROC values compared to 2D counterparts across\nall models. ConvKAN implementation shows promise for MRI analysis in PD\nclassification, particularly in the context of early diagnosis. The improvement\nin 3D analyses highlights the value of volumetric data in capturing subtle\nPD-related changes. 
While MRI is not currently used for PD diagnosis, these\nfindings suggest its potential as a component of a multimodal diagnostic\napproach, especially for early detection.\n","authors":["Salil B Patel","Vicky Goh","James F FitzGerald","Chrystalina A Antoniades"],"pdf_url":"https://arxiv.org/pdf/2407.17380v1.pdf","comment":"19 Pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.18820v2","updated":"2024-07-24T16:04:02Z","published":"2024-03-27T17:59:54Z","title":"MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view\n Human Performance Capture and Rendering","summary":" Faithful human performance capture and free-view rendering from sparse RGB\nobservations is a long-standing problem in Vision and Graphics. The main\nchallenges are the lack of observations and the inherent ambiguities of the\nsetting, e.g. occlusions and depth ambiguity. As a result, radiance fields,\nwhich have shown great promise in capturing high-frequency appearance and\ngeometry details in dense setups, perform poorly when naively supervising them\non sparse camera views, as the field simply overfits to the sparse-view inputs.\nTo address this, we propose MetaCap, a method for efficient and high-quality\ngeometry recovery and novel view synthesis given very sparse or even a single\nview of the human. Our key idea is to meta-learn the radiance field weights\nsolely from potentially sparse multi-view videos, which can serve as a prior\nwhen fine-tuning them on sparse imagery depicting the human. This prior\nprovides a good network weight initialization, thereby effectively addressing\nambiguities in sparse-view capture. Due to the articulated structure of the\nhuman body and motion-induced surface deformations, learning such a prior is\nnon-trivial. Therefore, we propose to meta-learn the field weights in a\npose-canonicalized space, which reduces the spatial feature range and makes\nfeature learning more effective. Consequently, one can fine-tune our field\nparameters to quickly generalize to unseen poses, novel illumination conditions\nas well as novel and sparse (even monocular) camera views. For evaluating our\nmethod under different scenarios, we collect a new dataset, WildDynaCap, which\ncontains subjects captured in, both, a dense camera dome and in-the-wild sparse\ncamera rigs, and demonstrate superior results compared to recent\nstate-of-the-art methods on, both, public and WildDynaCap dataset.\n","authors":["Guoxing Sun","Rishabh Dabral","Pascal Fua","Christian Theobalt","Marc Habermann"],"pdf_url":"https://arxiv.org/pdf/2403.18820v2.pdf","comment":"Project page: https://vcai.mpi-inf.mpg.de/projects/MetaCap/"},{"id":"http://arxiv.org/abs/2407.17379v1","updated":"2024-07-24T15:59:01Z","published":"2024-07-24T15:59:01Z","title":"MMRA: A Benchmark for Multi-granularity Multi-image Relational\n Association","summary":" Given the remarkable success that large visual language models (LVLMs) have\nachieved in image perception tasks, the endeavor to make LVMLs perceive the\nworld like humans is drawing increasing attention. Current multi-modal\nbenchmarks mainly focus on the objective fact or certain topic related\npotential knowledge within a image, but overlook the associative relations\nbetween multiple images. Therefore, we define a multi-image relation\nassociation task, and meticulously curate \\textbf{MMRA} benchmark, a\n\\textbf{M}ulti-granularity \\textbf{M}ulti-image \\textbf{R}elational\n\\textbf{A}ssociation benchmark, consisted of \\textbf{1026} samples. 
In order to\nsystematically and comprehensively evaluate mainstream LVLMs, we establish an\nassociational relation system among images that contain \\textbf{11 subtasks}\n(e.g, UsageSimilarity, SubEvent, etc.) at two granularity levels (i.e.,\n\"\\textbf{image}\" and \"\\textbf{entity}\") according to the relations in\nConceptNet. Our experiments demonstrate that, on our MMRA benchmark, current\nmainstream LVLMs all have their own advantages and disadvantages across\ndifferent subtasks. It is worth noting that, at the entity level, the\nperformance of all models is worse than that of them at the image level,\nindicating that the fine-grained multi-image perception task is still\nchallenging for LVLMs. The tasks related to spatial perception are relatively\ndifficult for LVLMs to handle. Furthermore, we find that LVMLs exhibit a good\nability to perceive image details, and the key to enhancing their multi-image\nassociation capability is to strengthen the reasoning ability of their language\nmodel component. All our codes and data are released at\nhtt\\url{https://github.com/Wusiwei0410/MMRA}.\n","authors":["Siwei Wu","Kang Zhu","Yu Bai","Yiming Liang","Yizhi Li","Haoning Wu","Jiaheng Liu","Ruibo Liu","Xingwei Qu","Xuxin Cheng","Ge Zhang","Wenhao Huang","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17379v1.pdf","comment":"VLMS, Multi-Image Association"},{"id":"http://arxiv.org/abs/2407.17378v1","updated":"2024-07-24T15:58:24Z","published":"2024-07-24T15:58:24Z","title":"PrevPredMap: Exploring Temporal Modeling with Previous Predictions for\n Online Vectorized HD Map Construction","summary":" Temporal information is crucial for detecting occluded instances. Existing\ntemporal representations have progressed from BEV or PV features to more\ncompact query features. Compared to these aforementioned features, predictions\noffer the highest level of abstraction, providing explicit information. In the\ncontext of online vectorized HD map construction, this unique characteristic of\npredictions is potentially advantageous for long-term temporal modeling and the\nintegration of map priors. This paper introduces PrevPredMap, a pioneering\ntemporal modeling framework that leverages previous predictions for\nconstructing online vectorized HD maps. We have meticulously crafted two\nessential modules for PrevPredMap: the previous-predictions-based query\ngenerator and the dynamic-position-query decoder. Specifically, the\nprevious-predictions-based query generator is designed to separately encode\ndifferent types of information from previous predictions, which are then\neffectively utilized by the dynamic-position-query decoder to generate current\npredictions. Furthermore, we have developed a dual-mode strategy to ensure\nPrevPredMap's robust performance across both single-frame and temporal modes.\nExtensive experiments demonstrate that PrevPredMap achieves state-of-the-art\nperformance on the nuScenes and Argoverse2 datasets. Code will be available at\nhttps://github.com/pnnnnnnn/PrevPredMap.\n","authors":["Nan Peng","Xun Zhou","Mingming Wang","Xiaojun Yang","Songming Chen","Guisong Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17365v1","updated":"2024-07-24T15:42:34Z","published":"2024-07-24T15:42:34Z","title":"ViPer: Visual Personalization of Generative Models via Individual\n Preference Learning","summary":" Different users find different images generated for the same prompt\ndesirable. 
This gives rise to personalized image generation which involves\ncreating images aligned with an individual's visual preference. Current\ngenerative models are, however, unpersonalized, as they are tuned to produce\noutputs that appeal to a broad audience. Using them to generate images aligned\nwith individual users relies on iterative manual prompt engineering by the user\nwhich is inefficient and undesirable. We propose to personalize the image\ngeneration process by first capturing the generic preferences of the user in a\none-time process by inviting them to comment on a small selection of images,\nexplaining why they like or dislike each. Based on these comments, we infer a\nuser's structured liked and disliked visual attributes, i.e., their visual\npreference, using a large language model. These attributes are used to guide a\ntext-to-image model toward producing images that are tuned towards the\nindividual user's visual preference. Through a series of user studies and large\nlanguage model guided evaluations, we demonstrate that the proposed method\nresults in generations that are well aligned with individual users' visual\npreferences.\n","authors":["Sogand Salehi","Mahdi Shafiei","Teresa Yeo","Roman Bachmann","Amir Zamir"],"pdf_url":"https://arxiv.org/pdf/2407.17365v1.pdf","comment":"Project page at https://viper.epfl.ch/"},{"id":"http://arxiv.org/abs/2407.17361v1","updated":"2024-07-24T15:38:20Z","published":"2024-07-24T15:38:20Z","title":"MuST: Multi-Scale Transformers for Surgical Phase Recognition","summary":" Phase recognition in surgical videos is crucial for enhancing computer-aided\nsurgical systems as it enables automated understanding of sequential procedural\nstages. Existing methods often rely on fixed temporal windows for video\nanalysis to identify dynamic surgical phases. Thus, they struggle to\nsimultaneously capture short-, mid-, and long-term information necessary to\nfully understand complex surgical procedures. To address these issues, we\npropose Multi-Scale Transformers for Surgical Phase Recognition (MuST), a novel\nTransformer-based approach that combines a Multi-Term Frame encoder with a\nTemporal Consistency Module to capture information across multiple temporal\nscales of a surgical video. Our Multi-Term Frame Encoder computes\ninterdependencies across a hierarchy of temporal scales by sampling sequences\nat increasing strides around the frame of interest. Furthermore, we employ a\nlong-term Transformer encoder over the frame embeddings to further enhance\nlong-term reasoning. MuST achieves higher performance than previous\nstate-of-the-art methods on three different public benchmarks.\n","authors":["Alejandra Pérez","Santiago Rodríguez","Nicolás Ayobi","Nicolás Aparicio","Eugénie Dessevres","Pablo Arbeláez"],"pdf_url":"https://arxiv.org/pdf/2407.17361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17354v1","updated":"2024-07-24T15:27:21Z","published":"2024-07-24T15:27:21Z","title":"Deep Spherical Superpixels","summary":" Over the years, the use of superpixel segmentation has become very popular in\nvarious applications, serving as a preprocessing step to reduce data size by\nadapting to the content of the image, regardless of its semantic content. While\nthe superpixel segmentation of standard planar images, captured with a 90{\\deg}\nfield of view, has been extensively studied, there has been limited focus on\ndedicated methods to omnidirectional or spherical images, captured with a\n360{\\deg} field of view. 
In this study, we introduce the first deep\nlearning-based superpixel segmentation approach tailored for omnidirectional\nimages called DSS (for Deep Spherical Superpixels). Our methodology leverages\non spherical CNN architectures and the differentiable K-means clustering\nparadigm for superpixels, to generate superpixels that follow the spherical\ngeometry. Additionally, we propose to use data augmentation techniques\nspecifically designed for 360{\\deg} images, enabling our model to efficiently\nlearn from a limited set of annotated omnidirectional data. Our extensive\nvalidation across two datasets demonstrates that taking into account the\ninherent circular geometry of such images into our framework improves the\nsegmentation performance over traditional and deep learning-based superpixel\nmethods. Our code is available online.\n","authors":["Rémi Giraud","Michaël Clément"],"pdf_url":"https://arxiv.org/pdf/2407.17354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14154v2","updated":"2024-07-24T15:19:20Z","published":"2024-02-21T22:27:40Z","title":"MM-Soc: Benchmarking Multimodal Large Language Models in Social Media\n Platforms","summary":" Social media platforms are hubs for multimodal information exchange,\nencompassing text, images, and videos, making it challenging for machines to\ncomprehend the information or emotions associated with interactions in online\nspaces. Multimodal Large Language Models (MLLMs) have emerged as a promising\nsolution to these challenges, yet they struggle to accurately interpret human\nemotions and complex content such as misinformation. This paper introduces\nMM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of\nmultimodal social media content. MM-Soc compiles prominent multimodal datasets\nand incorporates a novel large-scale YouTube tagging dataset, targeting a range\nof tasks from misinformation detection, hate speech detection, and social\ncontext generation. Through our exhaustive evaluation on ten size-variants of\nfour open-source MLLMs, we have identified significant performance disparities,\nhighlighting the need for advancements in models' social understanding\ncapabilities. Our analysis reveals that, in a zero-shot setting, various types\nof MLLMs generally exhibit difficulties in handling social media tasks.\nHowever, MLLMs demonstrate performance improvements post fine-tuning,\nsuggesting potential pathways for improvement. Our code and data are available\nat https://github.com/claws-lab/MMSoc.git.\n","authors":["Yiqiao Jin","Minje Choi","Gaurav Verma","Jindong Wang","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.14154v2.pdf","comment":"In Proceedings of ACL 2024"},{"id":"http://arxiv.org/abs/2407.17339v1","updated":"2024-07-24T15:04:00Z","published":"2024-07-24T15:04:00Z","title":"Preliminary study on artificial intelligence methods for cybersecurity\n threat detection in computer networks based on raw data packets","summary":" Most of the intrusion detection methods in computer networks are based on\ntraffic flow characteristics. However, this approach may not fully exploit the\npotential of deep learning algorithms to directly extract features and patterns\nfrom raw packets. 
Moreover, it impedes real-time monitoring due to the\nnecessity of waiting for the processing pipeline to complete and introduces\ndependencies on additional software components.\n In this paper, we investigate deep learning methodologies capable of\ndetecting attacks in real-time directly from raw packet data within network\ntraffic. We propose a novel approach where packets are stacked into windows and\nseparately recognised, with a 2D image representation suitable for processing\nwith computer vision models. Our investigation utilizes the CIC IDS-2017\ndataset, which includes both benign traffic and prevalent real-world attacks,\nproviding a comprehensive foundation for our research.\n","authors":["Aleksander Ogonowski","Michał Żebrowski","Arkadiusz Ćwiek","Tobiasz Jarosiewicz","Konrad Klimaszewski","Adam Padee","Piotr Wasiuk","Michał Wójcik"],"pdf_url":"https://arxiv.org/pdf/2407.17339v1.pdf","comment":"Submitted to Computer Science Journal"},{"id":"http://arxiv.org/abs/2407.17336v1","updated":"2024-07-24T15:02:09Z","published":"2024-07-24T15:02:09Z","title":"Cascaded Light Propagation Volumes using Spherical Radial Basis\n Functions","summary":" This paper introduces a contribution made to one of the newest methods for\nsimulating indirect lighting in dynamic scenes , the cascaded light propagation\nvolumes . Our contribution consists on using Spherical Radial Basis Functions\ninstead of Spherical Harmonic, since the first achieves much better results\nwhen many coefficients are used. We explain how to integrate the Spherical\nRadial Basis Functions with the cascaded light propagation volumes, and\nevaluate our technique against the same implementation, but with Spherical\nharmonics.\n","authors":["Ludovic Silvestre","João Pereira"],"pdf_url":"https://arxiv.org/pdf/2407.17336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17331v1","updated":"2024-07-24T14:54:16Z","published":"2024-07-24T14:54:16Z","title":"Multi-label Cluster Discrimination for Visual Representation Learning","summary":" Contrastive Language Image Pre-training (CLIP) has recently demonstrated\nsuccess across various tasks due to superior feature representation empowered\nby image-text contrastive learning. However, the instance discrimination method\nused by CLIP can hardly encode the semantic structure of training data. To\nhandle this limitation, cluster discrimination has been proposed through\niterative cluster assignment and classification. Nevertheless, most cluster\ndiscrimination approaches only define a single pseudo-label for each image,\nneglecting multi-label signals in the image. In this paper, we propose a novel\nMulti-Label Cluster Discrimination method named MLCD to enhance representation\nlearning. In the clustering step, we first cluster the large-scale LAION-400M\ndataset into one million centers based on off-the-shelf embedding features.\nConsidering that natural images frequently contain multiple visual objects or\nattributes, we select the multiple closest centers as auxiliary class labels.\nIn the discrimination step, we design a novel multi-label classification loss,\nwhich elegantly separates losses from positive classes and negative classes,\nand alleviates ambiguity on decision boundary. We validate the proposed\nmulti-label cluster discrimination method with experiments on different scales\nof models and pre-training datasets. 
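The raw-packet representation described above (packets stacked into windows and treated as a 2D image) can be sketched directly. The packet width, window size, and normalisation below are assumptions; the point is only the byte-to-image layout that a computer-vision model would consume.

```python
# Sketch: turn a stream of raw packets into fixed-size 2D "images".
import numpy as np

PACKET_BYTES = 128      # bytes kept per packet (truncate or zero-pad)
WINDOW = 32             # packets per image

def packet_to_row(packet: bytes) -> np.ndarray:
    buf = packet[:PACKET_BYTES].ljust(PACKET_BYTES, b"\x00")
    return np.frombuffer(buf, dtype=np.uint8).astype(np.float32) / 255.0

def windows_from_stream(packets: list[bytes]):
    rows = np.stack([packet_to_row(p) for p in packets])
    for start in range(0, len(rows) - WINDOW + 1, WINDOW):
        yield rows[start:start + WINDOW]        # shape (WINDOW, PACKET_BYTES)

# Toy stream of random packets; each yielded window is a 32x128 grayscale image.
rng = np.random.default_rng(0)
stream = [rng.integers(0, 256, size=int(rng.integers(40, 200)), dtype=np.uint8).tobytes()
          for _ in range(100)]
for img in windows_from_stream(stream):
    assert img.shape == (WINDOW, PACKET_BYTES)
```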
Experimental results show that our method\nachieves state-of-the-art performance on multiple downstream tasks including\nlinear probe, zero-shot classification, and image-text retrieval.\n","authors":["Xiang An","Kaicheng Yang","Xiangzi Dai","Ziyong Feng","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2407.17331v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2407.17328v1","updated":"2024-07-24T14:52:18Z","published":"2024-07-24T14:52:18Z","title":"DarSwin-Unet: Distortion Aware Encoder-Decoder Architecture","summary":" Wide-angle fisheye images are becoming increasingly common for perception\ntasks in applications such as robotics, security, and mobility (e.g. drones,\navionics). However, current models often either ignore the distortions in\nwide-angle images or are not suitable to perform pixel-level tasks. In this\npaper, we present an encoder-decoder model based on a radial transformer\narchitecture that adapts to distortions in wide-angle lenses by leveraging the\nphysical characteristics defined by the radial distortion profile. In contrast\nto the original model, which only performs classification tasks, we introduce a\nU-Net architecture, DarSwin-Unet, designed for pixel level tasks. Furthermore,\nwe propose a novel strategy that minimizes sparsity when sampling the image for\ncreating its input tokens. Our approach enhances the model capability to handle\npixel-level tasks in wide-angle fisheye images, making it more effective for\nreal-world applications. Compared to other baselines, DarSwin-Unet achieves the\nbest results across different datasets, with significant gains when trained on\nbounded levels of distortions (very low, low, medium, and high) and tested on\nall, including out-of-distribution distortions. We demonstrate its performance\non depth estimation and show through extensive experiments that DarSwin-Unet\ncan perform zero-shot adaptation to unseen distortions of different wide-angle\nlenses.\n","authors":["Akshaya Athwale","Ichrak Shili","Émile Bergeron","Ola Ahmad","Jean-François Lalonde"],"pdf_url":"https://arxiv.org/pdf/2407.17328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17324v1","updated":"2024-07-24T14:48:40Z","published":"2024-07-24T14:48:40Z","title":"Enhanced Deep Learning Methodologies and MRI Selection Techniques for\n Dementia Diagnosis in the Elderly Population","summary":" Dementia, a debilitating neurological condition affecting millions worldwide,\npresents significant diagnostic challenges. In this work, we introduce a novel\nmethodology for the classification of demented and non-demented elderly\npatients using 3D brain Magnetic Resonance Imaging (MRI) scans. Our approach\nfeatures a unique technique for selectively processing MRI slices, focusing on\nthe most relevant brain regions and excluding less informative sections. This\nmethodology is complemented by a confidence-based classification committee\ncomposed of three custom deep learning models: Dem3D ResNet, Dem3D CNN, and\nDem3D EfficientNet. These models work synergistically to enhance\ndecision-making accuracy, leveraging their collective strengths. Tested on the\nOpen Access Series of Imaging Studies(OASIS) dataset, our method achieved an\nimpressive accuracy of 94.12%, surpassing existing methodologies. Furthermore,\nvalidation on the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset\nconfirmed the robustness and generalizability of our approach. 
The use of\nexplainable AI (XAI) techniques and comprehensive ablation studies further\nsubstantiate the effectiveness of our techniques, providing insights into the\ndecision-making process and the importance of our methodology. This research\noffers a significant advancement in dementia diagnosis, providing a highly\naccurate and efficient tool for clinical applications.\n","authors":["Nikolaos Ntampakis","Konstantinos Diamantaras","Ioanna Chouvarda","Vasileios Argyriou","Panagiotis Sarigianndis"],"pdf_url":"https://arxiv.org/pdf/2407.17324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17312v1","updated":"2024-07-24T14:29:05Z","published":"2024-07-24T14:29:05Z","title":"Physical Adversarial Attack on Monocular Depth Estimation via\n Shape-Varying Patches","summary":" Adversarial attacks against monocular depth estimation (MDE) systems pose\nsignificant challenges, particularly in safety-critical applications such as\nautonomous driving. Existing patch-based adversarial attacks for MDE are\nconfined to the vicinity of the patch, making it difficult to affect the entire\ntarget. To address this limitation, we propose a physics-based adversarial\nattack on monocular depth estimation, employing a framework called Attack with\nShape-Varying Patches (ASP), aiming to optimize patch content, shape, and\nposition to maximize effectiveness. We introduce various mask shapes, including\nquadrilateral, rectangular, and circular masks, to enhance the flexibility and\nefficiency of the attack. Furthermore, we propose a new loss function to extend\nthe influence of the patch beyond the overlapping regions. Experimental results\ndemonstrate that our attack method generates an average depth error of 18\nmeters on the target car with a patch area of 1/9, affecting over 98\\% of the\ntarget area.\n","authors":["Chenxing Zhao","Yang Li","Shihao Wu","Wenyi Tan","Shuangju Zhou","Quan Pan"],"pdf_url":"https://arxiv.org/pdf/2407.17312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17310v1","updated":"2024-07-24T14:22:55Z","published":"2024-07-24T14:22:55Z","title":"LangOcc: Self-Supervised Open Vocabulary Occupancy Estimation via Volume\n Rendering","summary":" Semantic occupancy has recently gained significant traction as a prominent\nmethod for 3D scene representation. However, most existing camera-based methods\nrely on costly datasets with fine-grained 3D voxel labels or LiDAR scans for\ntraining, which limits their practicality and scalability, raising the need for\nself-supervised approaches in this domain. Moreover, most methods are tied to a\npredefined set of classes which they can detect. In this work we present a\nnovel approach for open vocabulary occupancy estimation called\n\\textit{LangOcc}, that is trained only via camera images, and can detect\narbitrary semantics via vision-language alignment. In particular, we distill\nthe knowledge of the strong vision-language aligned encoder CLIP into a 3D\noccupancy model via differentiable volume rendering. Our model estimates\nvision-language aligned features in a 3D voxel grid using only images. It is\ntrained in a self-supervised manner by rendering our estimations back to 2D\nspace, where ground-truth features can be computed. This training mechanism\nautomatically supervises the scene geometry, allowing for a straight-forward\nand powerful training method without any explicit geometry supervision. 
LangOcc\noutperforms LiDAR-supervised competitors in open vocabulary occupancy by a\nlarge margin, solely relying on vision-based training. We also achieve\nstate-of-the-art results in self-supervised semantic occupancy estimation on\nthe Occ3D-nuScenes dataset, despite not being limited to a specific set of\ncategories, thus demonstrating the effectiveness of our proposed\nvision-language training.\n","authors":["Simon Boeder","Fabian Gigengack","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2407.17310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09920v2","updated":"2024-07-24T14:11:17Z","published":"2024-07-13T15:28:15Z","title":"MutDet: Mutually Optimizing Pre-training for Remote Sensing Object\n Detection","summary":" Detection pre-training methods for the DETR series detector have been\nextensively studied in natural scenes, e.g., DETReg. However, the detection\npre-training remains unexplored in remote sensing scenes. In existing\npre-training methods, alignment between object embeddings extracted from a\npre-trained backbone and detector features is significant. However, due to\ndifferences in feature extraction methods, a pronounced feature discrepancy\nstill exists and hinders the pre-training performance. The remote sensing\nimages with complex environments and more densely distributed objects\nexacerbate the discrepancy. In this work, we propose a novel Mutually\noptimizing pre-training framework for remote sensing object Detection, dubbed\nas MutDet. In MutDet, we propose a systemic solution against this challenge.\nFirstly, we propose a mutual enhancement module, which fuses the object\nembeddings and detector features bidirectionally in the last encoder layer,\nenhancing their information interaction.Secondly, contrastive alignment loss is\nemployed to guide this alignment process softly and simultaneously enhances\ndetector features' discriminativity. Finally, we design an auxiliary siamese\nhead to mitigate the task gap arising from the introduction of enhancement\nmodule. Comprehensive experiments on various settings show new state-of-the-art\ntransfer performance. The improvement is particularly pronounced when data\nquantity is limited. When using 10% of the DIOR-R data, MutDet improves DetReg\nby 6.1% in AP50. Codes and models are available at:\nhttps://github.com/floatingstarZ/MutDet.\n","authors":["Ziyue Huang","Yongchao Feng","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.09920v2.pdf","comment":"14 pages, 4 figures; Accept to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.17291v1","updated":"2024-07-24T14:02:20Z","published":"2024-07-24T14:02:20Z","title":"How Good (Or Bad) Are LLMs at Detecting Misleading Visualizations?","summary":" In this study, we address the growing issue of misleading charts, a prevalent\nproblem that undermines the integrity of information dissemination. Misleading\ncharts can distort the viewer's perception of data, leading to\nmisinterpretations and decisions based on false information. The development of\neffective automatic detection methods for misleading charts is an urgent field\nof research. The recent advancement of multimodal Large Language Models (LLMs)\nhas introduced a promising direction for addressing this challenge. We explored\nthe capabilities of these models in analyzing complex charts and assessing the\nimpact of different prompting strategies on the models' analyses. 
We utilized a\ndataset of misleading charts collected from the internet by prior research and\ncrafted nine distinct prompts, ranging from simple to complex, to test the\nability of four different multimodal LLMs in detecting over 21 different chart\nissues. Through three experiments--from initial exploration to detailed\nanalysis--we progressively gained insights into how to effectively prompt LLMs\nto identify misleading charts and developed strategies to address the\nscalability challenges encountered as we expanded our detection range from the\ninitial five issues to 21 issues in the final experiment. Our findings reveal\nthat multimodal LLMs possess a strong capability for chart comprehension and\ncritical thinking in data interpretation. There is significant potential in\nemploying multimodal LLMs to counter misleading information by supporting\ncritical thinking and enhancing visualization literacy. This study demonstrates\nthe applicability of LLMs in addressing the pressing concern of misleading\ncharts.\n","authors":["Leo Yu-Ho Lo","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2407.17291v1.pdf","comment":"To be presented at IEEE VIS 2024"},{"id":"http://arxiv.org/abs/2212.00749v2","updated":"2024-07-24T14:00:50Z","published":"2022-12-01T18:35:03Z","title":"Multimodal Query-guided Object Localization","summary":" Consider a scenario in one-shot query-guided object localization where\nneither an image of the object nor the object category name is available as a\nquery. In such a scenario, a hand-drawn sketch of the object could be a choice\nfor a query. However, hand-drawn crude sketches alone, when used as queries,\nmight be ambiguous for object localization, e.g., a sketch of a laptop could be\nconfused for a sofa. On the other hand, a linguistic definition of the\ncategory, e.g., a small portable computer small enough to use in your lap\"\nalong with the sketch query, gives better visual and semantic cues for object\nlocalization. In this work, we present a multimodal query-guided object\nlocalization approach under the challenging open-set setting. In particular, we\nuse queries from two modalities, namely, hand-drawn sketch and description of\nthe object (also known as gloss), to perform object localization. Multimodal\nquery-guided object localization is a challenging task, especially when a large\ndomain gap exists between the queries and the natural images, as well as due to\nthe challenge of combining the complementary and minimal information present\nacross the queries. For example, hand-drawn crude sketches contain abstract\nshape information of an object, while the text descriptions often capture\npartial semantic information about a given object category. To address the\naforementioned challenges, we present a novel cross-modal attention scheme that\nguides the region proposal network to generate object proposals relevant to the\ninput queries and a novel orthogonal projection-based proposal scoring\ntechnique that scores each proposal with respect to the queries, thereby\nyielding the final localization results. 
...\n","authors":["Aditay Tripathi","Rajath R Dani","Anand Mishra","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2212.00749v2.pdf","comment":"Accepted to MMTA"},{"id":"http://arxiv.org/abs/2311.17135v4","updated":"2024-07-24T13:55:48Z","published":"2023-11-28T18:54:16Z","title":"TLControl: Trajectory and Language Control for Human Motion Synthesis","summary":" Controllable human motion synthesis is essential for applications in AR/VR,\ngaming and embodied AI. Existing methods often focus solely on either language\nor full trajectory control, lacking precision in synthesizing motions aligned\nwith user-specified trajectories, especially for multi-joint control. To\naddress these issues, we present TLControl, a novel method for realistic human\nmotion synthesis, incorporating both low-level Trajectory and high-level\nLanguage semantics controls, through the integration of neural-based and\noptimization-based techniques. Specifically, we begin with training a VQ-VAE\nfor a compact and well-structured latent motion space organized by body parts.\nWe then propose a Masked Trajectories Transformer (MTT) for predicting a motion\ndistribution conditioned on language and trajectory. Once trained, we use MTT\nto sample initial motion predictions given user-specified partial trajectories\nand text descriptions as conditioning. Finally, we introduce a test-time\noptimization to refine these coarse predictions for precise trajectory control,\nwhich offers flexibility by allowing users to specify various optimization\ngoals and ensures high runtime efficiency. Comprehensive experiments show that\nTLControl significantly outperforms the state-of-the-art in trajectory accuracy\nand time efficiency, making it practical for interactive and high-quality\nanimation generation.\n","authors":["Weilin Wan","Zhiyang Dou","Taku Komura","Wenping Wang","Dinesh Jayaraman","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17135v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03135v6","updated":"2024-07-24T13:50:56Z","published":"2023-08-06T15:05:42Z","title":"EventBind: Learning a Unified Representation to Bind Them All for\n Event-based Open-world Understanding","summary":" In this paper, we propose EventBind, a novel and effective framework that\nunleashes the potential of vision-language models (VLMs) for event-based\nrecognition to compensate for the lack of large-scale event-based datasets. In\nparticular, due to the distinct modality gap with the image-text data and the\nlack of large-scale datasets, learning a common representation space for\nimages, texts, and events is non-trivial.Intuitively, we need to address two\nkey challenges: 1) how to generalize CLIP's visual encoder to event data while\nfully leveraging events' unique properties, e.g., sparsity and high temporal\nresolution; 2) how to effectively align the multi-modal embeddings, i.e.,\nimage, text, and events. Accordingly, we first introduce a novel event encoder\nthat subtly models the temporal information from events and meanwhile,\ngenerates event prompts for modality bridging. 
We then design a text encoder\nthat generates content prompts and utilizes hybrid text prompts to enhance\nEventBind's generalization ability across diverse datasets.With the proposed\nevent encoder, text encoder, and image encoder, a novel Hierarchical Triple\nContrastive Alignment (HTCA) module is introduced to jointly optimize the\ncorrelation and enable efficient knowledge transfer among the three modalities.\nWe evaluate various settings, including fine-tuning and few-shot on three\nbenchmarks, and our EventBind achieves new state-of-the-art accuracy compared\nwith the previous methods, such as on N-Caltech101 (+5.34% and +1.70%) and\nN-Imagenet (+5.65% and +1.99%) with fine-tuning and 20-shot settings,\nrespectively. Moreover, our EventBind can be flexibly extended to the event\nretrieval task using text or image queries, showing plausible performance.\nProject page:https://vlislab22.github.io/EventBind/.\n","authors":["Jiazhou Zhou","Xu Zheng","Yuanhuiyi Lyu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03135v6.pdf","comment":"ECCV 2024 Accepted. Camera-ready version with supplementary"},{"id":"http://arxiv.org/abs/2407.17274v1","updated":"2024-07-24T13:39:51Z","published":"2024-07-24T13:39:51Z","title":"Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken\n Generation","summary":" Text-to-image retrieval is a fundamental task in multimedia processing,\naiming to retrieve semantically relevant cross-modal content. Traditional\nstudies have typically approached this task as a discriminative problem,\nmatching the text and image via the cross-attention mechanism (one-tower\nframework) or in a common embedding space (two-tower framework). Recently,\ngenerative cross-modal retrieval has emerged as a new research line, which\nassigns images with unique string identifiers and generates the target\nidentifier as the retrieval target. Despite its great potential, existing\ngenerative approaches are limited due to the following issues: insufficient\nvisual information in identifiers, misalignment with high-level semantics, and\nlearning gap towards the retrieval target. To address the above issues, we\npropose an autoregressive voken generation method, named AVG. AVG tokenizes\nimages into vokens, i.e., visual tokens, and innovatively formulates the\ntext-to-image retrieval task as a token-to-voken generation problem. AVG\ndiscretizes an image into a sequence of vokens as the identifier of the image,\nwhile maintaining the alignment with both the visual information and high-level\nsemantics of the image. Additionally, to bridge the learning gap between\ngenerative training and the retrieval target, we incorporate discriminative\ntraining to modify the learning direction during token-to-voken training.\nExtensive experiments demonstrate that AVG achieves superior results in both\neffectiveness and efficiency.\n","authors":["Yongqi Li","Hongru Cai","Wenjie Wang","Leigang Qu","Yinwei Wei","Wenjie Li","Liqiang Nie","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2407.17274v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.17272v1","updated":"2024-07-24T13:39:07Z","published":"2024-07-24T13:39:07Z","title":"DenseTrack: Drone-based Crowd Tracking via Density-aware\n Motion-appearance Synergy","summary":" Drone-based crowd tracking faces difficulties in accurately identifying and\nmonitoring objects from an aerial perspective, largely due to their small size\nand close proximity to each other, which complicates both localization and\ntracking. 
To address these challenges, we present the Density-aware Tracking\n(DenseTrack) framework. DenseTrack capitalizes on crowd counting to precisely\ndetermine object locations, blending visual and motion cues to improve the\ntracking of small-scale objects. It specifically addresses the problem of\ncross-frame motion to enhance tracking accuracy and dependability. DenseTrack\nemploys crowd density estimates as anchors for exact object localization within\nvideo frames. These estimates are merged with motion and position information\nfrom the tracking network, with motion offsets serving as key tracking cues.\nMoreover, DenseTrack enhances the ability to distinguish small-scale objects\nusing insights from the visual-language model, integrating appearance with\nmotion cues. The framework utilizes the Hungarian algorithm to ensure the\naccurate matching of individuals across frames. Demonstrated on DroneCrowd\ndataset, our approach exhibits superior performance, confirming its\neffectiveness in scenarios captured by drones.\n","authors":["Yi Lei","Huilin Zhu","Jingling Yuan","Guangli Xiang","Xian Zhong","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2407.17272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12842v2","updated":"2024-07-24T13:38:33Z","published":"2023-09-22T12:59:39Z","title":"SRFNet: Monocular Depth Estimation with Fine-grained Structure via\n Spatial Reliability-oriented Fusion of Frames and Events","summary":" Monocular depth estimation is a crucial task to measure distance relative to\na camera, which is important for applications, such as robot navigation and\nself-driving. Traditional frame-based methods suffer from performance drops due\nto the limited dynamic range and motion blur. Therefore, recent works leverage\nnovel event cameras to complement or guide the frame modality via frame-event\nfeature fusion. However, event streams exhibit spatial sparsity, leaving some\nareas unperceived, especially in regions with marginal light changes.\nTherefore, direct fusion methods, e.g., RAMNet, often ignore the contribution\nof the most confident regions of each modality. This leads to structural\nambiguity in the modality fusion process, thus degrading the depth estimation\nperformance. In this paper, we propose a novel Spatial Reliability-oriented\nFusion Network (SRFNet), that can estimate depth with fine-grained structure at\nboth daytime and nighttime. Our method consists of two key technical\ncomponents. Firstly, we propose an attention-based interactive fusion (AIF)\nmodule that applies spatial priors of events and frames as the initial masks\nand learns the consensus regions to guide the inter-modal feature fusion. The\nfused feature are then fed back to enhance the frame and event feature\nlearning. Meanwhile, it utilizes an output head to generate a fused mask, which\nis iteratively updated for learning consensual spatial priors. Secondly, we\npropose the Reliability-oriented Depth Refinement (RDR) module to estimate\ndense depth with the fine-grained structure based on the fused features and\nmasks. We evaluate the effectiveness of our method on the synthetic and\nreal-world datasets, which shows that, even without pretraining, our method\noutperforms the prior methods, e.g., RAMNet, especially in night scenes. 
Our\nproject homepage: https://vlislab22.github.io/SRFNet.\n","authors":["Tianbo Pan","Zidong Cao","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2309.12842v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2407.17267v1","updated":"2024-07-24T13:30:46Z","published":"2024-07-24T13:30:46Z","title":"M4: Multi-Proxy Multi-Gate Mixture of Experts Network for Multiple\n Instance Learning in Histopathology Image Analysis","summary":" Multiple instance learning (MIL) has been successfully applied for whole\nslide images (WSIs) analysis in computational pathology, enabling a wide range\nof prediction tasks from tumor subtyping to inferring genetic mutations and\nmulti-omics biomarkers. However, existing MIL methods predominantly focus on\nsingle-task learning, resulting in not only overall low efficiency but also the\noverlook of inter-task relatedness. To address these issues, we proposed an\nadapted architecture of Multi-gate Mixture-of-experts with Multi-proxy for\nMultiple instance learning (M4), and applied this framework for simultaneous\nprediction of multiple genetic mutations from WSIs. The proposed M4 model has\ntwo main innovations: (1) utilizing a mixture of experts with multiple gating\nstrategies for multi-genetic mutation prediction on a single pathological\nslide; (2) constructing multi-proxy expert network and gate network for\ncomprehensive and effective modeling of pathological image information. Our\nmodel achieved significant improvements across five tested TCGA datasets in\ncomparison to current state-of-the-art single-task methods. The code is\navailable at:https://github.com/Bigyehahaha/M4.\n","authors":["Junyu Li","Ye Zhang","Wen Shu","Xiaobing Feng","Yingchun Wang","Pengju Yan","Xiaolin Li","Chulin Sha","Min He"],"pdf_url":"https://arxiv.org/pdf/2407.17267v1.pdf","comment":"25pages,5figures"},{"id":"http://arxiv.org/abs/2407.17265v1","updated":"2024-07-24T13:29:17Z","published":"2024-07-24T13:29:17Z","title":"SCIsegV2: A Universal Tool for Segmentation of Intramedullary Lesions in\n Spinal Cord Injury","summary":" Spinal cord injury (SCI) is a devastating incidence leading to permanent\nparalysis and loss of sensory-motor functions potentially resulting in the\nformation of lesions within the spinal cord. Imaging biomarkers obtained from\nmagnetic resonance imaging (MRI) scans can predict the functional recovery of\nindividuals with SCI and help choose the optimal treatment strategy. Currently,\nmost studies employ manual quantification of these MRI-derived biomarkers,\nwhich is a subjective and tedious task. In this work, we propose (i) a\nuniversal tool for the automatic segmentation of intramedullary SCI lesions,\ndubbed \\texttt{SCIsegV2}, and (ii) a method to automatically compute the width\nof the tissue bridges from the segmented lesion. Tissue bridges represent the\nspared spinal tissue adjacent to the lesion, which is associated with\nfunctional recovery in SCI patients. The tool was trained and validated on a\nheterogeneous dataset from 7 sites comprising patients from different SCI\nphases (acute, sub-acute, and chronic) and etiologies (traumatic SCI, ischemic\nSCI, and degenerative cervical myelopathy). Tissue bridges quantified\nautomatically did not significantly differ from those computed manually,\nsuggesting that the proposed automatic tool can be used to derive relevant MRI\nbiomarkers. 
\\texttt{SCIsegV2} and the automatic tissue bridges computation are\nopen-source and available in Spinal Cord Toolbox (v6.4 and above) via the\n\\texttt{sct\\_deepseg -task seg\\_sc\\_lesion\\_t2w\\_sci} and\n\\texttt{sct\\_analyze\\_lesion} functions, respectively.\n","authors":["Enamundram Naga Karthik","Jan Valošek","Lynn Farner","Dario Pfyffer","Simon Schading-Sassenhausen","Anna Lebret","Gergely David","Andrew C. Smith","Kenneth A. Weber II","Maryam Seif","RHSCIR Network Imaging Group","Patrick Freund","Julien Cohen-Adad"],"pdf_url":"https://arxiv.org/pdf/2407.17265v1.pdf","comment":"Accepted at MICCAI AMAI 2024 workshop"},{"id":"http://arxiv.org/abs/2407.17261v1","updated":"2024-07-24T13:24:25Z","published":"2024-07-24T13:24:25Z","title":"Embedding-Free Transformer with Inference Spatial Reduction for\n Efficient Semantic Segmentation","summary":" We present an Encoder-Decoder Attention Transformer, EDAFormer, which\nconsists of the Embedding-Free Transformer (EFT) encoder and the all-attention\ndecoder leveraging our Embedding-Free Attention (EFA) structure. The proposed\nEFA is a novel global context modeling mechanism that focuses on functioning\nthe global non-linearity, not the specific roles of the query, key and value.\nFor the decoder, we explore the optimized structure for considering the\nglobality, which can improve the semantic segmentation performance. In\naddition, we propose a novel Inference Spatial Reduction (ISR) method for the\ncomputational efficiency. Different from the previous spatial reduction\nattention methods, our ISR method further reduces the key-value resolution at\nthe inference phase, which can mitigate the computation-performance trade-off\ngap for the efficient semantic segmentation. Our EDAFormer shows the\nstate-of-the-art performance with the efficient computation compared to the\nexisting transformer-based semantic segmentation models in three public\nbenchmarks, including ADE20K, Cityscapes and COCO-Stuff. Furthermore, our ISR\nmethod reduces the computational cost by up to 61% with minimal mIoU\nperformance degradation on Cityscapes dataset. The code is available at\nhttps://github.com/hyunwoo137/EDAFormer.\n","authors":["Hyunwoo Yu","Yubin Cho","Beoungwoo Kang","Seunghun Moon","Kyeongbo Kong","Suk-Ju Kang"],"pdf_url":"https://arxiv.org/pdf/2407.17261v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2405.15700v2","updated":"2024-07-24T13:17:10Z","published":"2024-05-24T16:44:22Z","title":"Trackastra: Transformer-based cell tracking for live-cell microscopy","summary":" Cell tracking is a ubiquitous image analysis task in live-cell microscopy.\nUnlike multiple object tracking (MOT) for natural images, cell tracking\ntypically involves hundreds of similar-looking objects that can divide in each\nframe, making it a particularly challenging problem. Current state-of-the-art\napproaches follow the tracking-by-detection paradigm, i.e. first all cells are\ndetected per frame and successively linked in a second step to form\nbiologically consistent cell tracks. Linking is commonly solved via discrete\noptimization methods, which require manual tuning of hyperparameters for each\ndataset and are therefore cumbersome to use in practice. Here we propose\nTrackastra, a general purpose cell tracking approach that uses a simple\ntransformer architecture to directly learn pairwise associations of cells\nwithin a temporal window from annotated data. 
Importantly, unlike existing\ntransformer-based MOT pipelines, our learning architecture also accounts for\ndividing objects such as cells and allows for accurate tracking even with\nsimple greedy linking, thus making strides towards removing the requirement for\na complex linking step. The proposed architecture operates on the full\nspatio-temporal context of detections within a time window by avoiding the\ncomputational burden of processing dense images. We show that our tracking\napproach performs on par with or better than highly tuned state-of-the-art cell\ntracking algorithms for various biological datasets, such as bacteria, cell\ncultures and fluorescent particles. We provide code at\nhttps://github.com/weigertlab/trackastra.\n","authors":["Benjamin Gallusser","Martin Weigert"],"pdf_url":"https://arxiv.org/pdf/2405.15700v2.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2304.09691v5","updated":"2024-07-24T13:17:07Z","published":"2023-04-19T14:32:56Z","title":"DarSwin: Distortion Aware Radial Swin Transformer","summary":" Wide-angle lenses are commonly used in perception tasks requiring a large\nfield of view. Unfortunately, these lenses produce significant distortions,\nmaking conventional models that ignore the distortion effects unable to adapt\nto wide-angle images. In this paper, we present a novel transformer-based model\nthat automatically adapts to the distortion produced by wide-angle lenses. Our\nproposed image encoder architecture, dubbed DarSwin, leverages the physical\ncharacteristics of such lenses analytically defined by the radial distortion\nprofile. In contrast to conventional transformer-based architectures, DarSwin\ncomprises a radial patch partitioning, a distortion-based sampling technique\nfor creating token embeddings, and an angular position encoding for radial\npatch merging. Compared to other baselines, DarSwin achieves the best results\non different datasets with significant gains when trained on bounded levels of\ndistortions (very low, low, medium, and high) and tested on all, including\nout-of-distribution distortions. While the base DarSwin architecture requires\nknowledge of the radial distortion profile, we show it can be combined with a\nself-calibration network that estimates such a profile from the input image\nitself, resulting in a completely uncalibrated pipeline. Finally, we also\npresent DarSwin-Unet, which extends DarSwin, to an encoder-decoder architecture\nsuitable for pixel-level tasks. We demonstrate its performance on depth\nestimation and show through extensive experiments that DarSwin-Unet can perform\nzero-shot adaptation to unseen distortions of different wide-angle lenses. The\ncode and models are publicly available at https://lvsn.github.io/darswin/\n","authors":["Akshaya Athwale","Arman Afrasiyabi","Justin Lagüe","Ichrak Shili","Ola Ahmad","Jean-François Lalonde"],"pdf_url":"https://arxiv.org/pdf/2304.09691v5.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.16037v2","updated":"2024-07-24T13:16:38Z","published":"2023-11-27T17:58:21Z","title":"GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions","summary":" Recently, impressive results have been achieved in 3D scene editing with text\ninstructions based on a 2D diffusion model. 
However, current diffusion models\nprimarily generate images by predicting noise in the latent space, and the\nediting is usually applied to the whole image, which makes it challenging to\nperform delicate, especially localized, editing for 3D scenes. Inspired by\nrecent 3D Gaussian splatting, we propose a systematic framework, named\nGaussianEditor, to edit 3D scenes delicately via 3D Gaussians with text\ninstructions. Benefiting from the explicit property of 3D Gaussians, we design\na series of techniques to achieve delicate editing. Specifically, we first\nextract the region of interest (RoI) corresponding to the text instruction,\naligning it to 3D Gaussians. The Gaussian RoI is further used to control the\nediting process. Our framework can achieve more delicate and precise editing of\n3D scenes than previous methods while enjoying much faster training speed, i.e.\nwithin 20 minutes on a single V100 GPU, more than twice as fast as\nInstruct-NeRF2NeRF (45 minutes -- 2 hours).\n","authors":["Junjie Wang","Jiemin Fang","Xiaopeng Zhang","Lingxi Xie","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2311.16037v2.pdf","comment":"CVPR 2024, Project page: https://GaussianEditor.github.io"},{"id":"http://arxiv.org/abs/2407.13421v2","updated":"2024-07-24T13:09:22Z","published":"2024-07-18T11:43:26Z","title":"CycleMix: Mixing Source Domains for Domain Generalization in\n Style-Dependent Data","summary":" As deep learning-based systems have become an integral part of everyday life,\nlimitations in their generalization ability have begun to emerge. Machine\nlearning algorithms typically rely on the i.i.d. assumption, meaning that their\ntraining and validation data are expected to follow the same distribution,\nwhich does not necessarily hold in practice. In the case of image\nclassification, one frequent reason that algorithms fail to generalize is that\nthey rely on spurious correlations present in training data, such as\nassociating image styles with target classes. These associations may not be\npresent in the unseen test data, leading to significant degradation of their\neffectiveness. In this work, we attempt to mitigate this Domain Generalization\n(DG) problem by training a robust feature extractor which disregards features\nattributed to image-style but infers based on style-invariant image\nrepresentations. To achieve this, we train CycleGAN models to learn the\ndifferent styles present in the training data and randomly mix them together to\ncreate samples with novel style attributes to improve generalization.\nExperimental results on the PACS DG benchmark validate the proposed method.\n","authors":["Aristotelis Ballas","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2407.13421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07503v3","updated":"2024-07-24T13:07:26Z","published":"2024-07-10T09:41:36Z","title":"Inter and Intra Prior Learning-based Hyperspectral Image Reconstruction\n Using Snapshot SWIR Metasurface","summary":" Shortwave-infrared(SWIR) spectral information, ranging from 1 {\\mu}m to\n2.5{\\mu}m, overcomes the limitations of traditional color cameras in acquiring\nscene information. However, conventional SWIR hyperspectral imaging systems\nface challenges due to their bulky setups and low acquisition speeds. This work\nintroduces a snapshot SWIR hyperspectral imaging system based on a metasurface\nfilter and a corresponding filter selection method to achieve the lowest\ncorrelation coefficient among these filters. 
This system offers the advantages\nof compact size and snapshot imaging. We propose a novel inter and intra prior\nlearning unfolding framework to achieve high-quality SWIR hyperspectral image\nreconstruction, which bridges the gap between prior learning and cross-stage\ninformation interaction. Additionally, We design an adaptive feature transfer\nmechanism to adaptively transfer the contextual correlation of multi-scale\nencoder features to prevent detailed information loss in the decoder.\nExperiment results demonstrate that our method can reconstruct hyperspectral\nimages with high speed and superior performance over existing methods.\n","authors":["Linqiang Li","Jinglei Hao","Yongqiang Zhao","Pan Liu","Haofang Yan","Ziqin Zhang","Seong G. Kong"],"pdf_url":"https://arxiv.org/pdf/2407.07503v3.pdf","comment":"12 pages,9 figures"},{"id":"http://arxiv.org/abs/2407.16636v2","updated":"2024-07-24T13:04:19Z","published":"2024-07-23T16:52:42Z","title":"Velocity Driven Vision: Asynchronous Sensor Fusion Birds Eye View Models\n for Autonomous Vehicles","summary":" Fusing different sensor modalities can be a difficult task, particularly if\nthey are asynchronous. Asynchronisation may arise due to long processing times\nor improper synchronisation during calibration, and there must exist a way to\nstill utilise this previous information for the purpose of safe driving, and\nobject detection in ego vehicle/ multi-agent trajectory prediction.\nDifficulties arise in the fact that the sensor modalities have captured\ninformation at different times and also at different positions in space.\nTherefore, they are not spatially nor temporally aligned. This paper will\ninvestigate the challenge of radar and LiDAR sensors being asynchronous\nrelative to the camera sensors, for various time latencies. The spatial\nalignment will be resolved before lifting into BEV space via the transformation\nof the radar/LiDAR point clouds into the new ego frame coordinate system. Only\nafter this can we concatenate the radar/LiDAR point cloud and lifted camera\nfeatures. Temporal alignment will be remedied for radar data only, we will\nimplement a novel method of inferring the future radar point positions using\nthe velocity information. Our approach to resolving the issue of sensor\nasynchrony yields promising results. We demonstrate velocity information can\ndrastically improve IoU for asynchronous datasets, as for a time latency of 360\nmilliseconds (ms), IoU improves from 49.54 to 53.63. Additionally, for a time\nlatency of 550ms, the camera+radar (C+R) model outperforms the camera+LiDAR\n(C+L) model by 0.18 IoU. 
This is an advancement in utilising the\noften-neglected radar sensor modality, which is less favoured than LiDAR for\nautonomous driving purposes.\n","authors":["Seamie Hayes","Sushil Sharma","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2407.16636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04557v2","updated":"2024-07-24T12:48:47Z","published":"2023-06-07T16:04:08Z","title":"PhenoBench -- A Large Dataset and Benchmarks for Semantic Image\n Interpretation in the Agricultural Domain","summary":" The production of food, feed, fiber, and fuel is a key task of agriculture,\nwhich has to cope with many challenges in the upcoming decades, e.g., a higher\ndemand, climate change, lack of workers, and the availability of arable land.\nVision systems can support making better and more sustainable field management\ndecisions, but also support the breeding of new crop varieties by allowing\ntemporally dense and reproducible measurements. Recently, agricultural robotics\ngot an increasing interest in the vision and robotics communities since it is a\npromising avenue for coping with the aforementioned lack of workers and\nenabling more sustainable production. While large datasets and benchmarks in\nother domains are readily available and enable significant progress,\nagricultural datasets and benchmarks are comparably rare. We present an\nannotated dataset and benchmarks for the semantic interpretation of real\nagricultural fields. Our dataset recorded with a UAV provides high-quality,\npixel-wise annotations of crops and weeds, but also crop leaf instances at the\nsame time. Furthermore, we provide benchmarks for various tasks on a hidden\ntest set comprised of different fields: known fields covered by the training\ndata and a completely unseen field. Our dataset, benchmarks, and code are\navailable at \\url{https://www.phenobench.org}.\n","authors":["Jan Weyler","Federico Magistri","Elias Marks","Yue Linn Chong","Matteo Sodano","Gianmarco Roggiolani","Nived Chebrolu","Cyrill Stachniss","Jens Behley"],"pdf_url":"https://arxiv.org/pdf/2306.04557v2.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence (T-PAMI)"},{"id":"http://arxiv.org/abs/2407.17229v1","updated":"2024-07-24T12:32:24Z","published":"2024-07-24T12:32:24Z","title":"LPGen: Enhancing High-Fidelity Landscape Painting Generation through\n Diffusion Model","summary":" Generating landscape paintings expands the possibilities of artistic\ncreativity and imagination. Traditional landscape painting methods involve\nusing ink or colored ink on rice paper, which requires substantial time and\neffort. These methods are susceptible to errors and inconsistencies and lack\nprecise control over lines and colors. This paper presents LPGen, a\nhigh-fidelity, controllable model for landscape painting generation,\nintroducing a novel multi-modal framework that integrates image prompts into\nthe diffusion model. We extract its edges and contours by computing canny edges\nfrom the target landscape image. These, along with natural language text\nprompts and drawing style references, are fed into the latent diffusion model\nas conditions. We implement a decoupled cross-attention strategy to ensure\ncompatibility between image and text prompts, facilitating multi-modal image\ngeneration. A decoder generates the final image. Quantitative and qualitative\nanalyses demonstrate that our method outperforms existing approaches in\nlandscape painting generation and exceeds the current state-of-the-art. 
The\nLPGen network effectively controls the composition and color of landscape\npaintings, generates more accurate images, and supports further research in\ndeep learning-based landscape painting generation.\n","authors":["Wanggong Yang","Xiaona Wang","Yingrui Qiu","Yifei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.17229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03678v3","updated":"2024-07-24T12:23:52Z","published":"2023-12-06T18:41:01Z","title":"Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching","summary":" Non-isometric shape correspondence remains a fundamental challenge in\ncomputer vision. Traditional methods using Laplace-Beltrami operator (LBO)\neigenmodes face limitations in characterizing high-frequency extrinsic shape\nchanges like bending and creases. We propose a novel approach of combining the\nnon-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell\nhessian with the intrinsic ones of the LBO, creating a hybrid spectral space in\nwhich we construct functional maps. To this end, we present a theoretical\nframework to effectively integrate non-orthogonal basis functions into\ndescriptor- and learning-based functional map methods. Our approach can be\nincorporated easily into existing functional map pipelines across varying\napplications and is able to handle complex deformations beyond isometries. We\nshow extensive evaluations across various supervised and unsupervised settings\nand demonstrate significant improvements. Notably, our approach achieves up to\n15% better mean geodesic error for non-isometric correspondence settings and up\nto 45% improvement in scenarios with topological noise.\n","authors":["Lennart Bastian","Yizheng Xie","Nassir Navab","Zorah Lähner"],"pdf_url":"https://arxiv.org/pdf/2312.03678v3.pdf","comment":"Presented at CVPR 2024. This version contains two additional figures\n in the main paper and generalization experiments in the appendix. Please cite\n the official IEEE CVPR publication"},{"id":"http://arxiv.org/abs/2402.02333v2","updated":"2024-07-24T12:23:41Z","published":"2024-02-04T04:00:33Z","title":"Copyright Protection in Generative AI: A Technical Perspective","summary":" Generative AI has witnessed rapid advancement in recent years, expanding\ntheir capabilities to create synthesized content such as text, images, audio,\nand code. The high fidelity and authenticity of contents generated by these\nDeep Generative Models (DGMs) have sparked significant copyright concerns.\nThere have been various legal debates on how to effectively safeguard\ncopyrights in DGMs. This work delves into this issue by providing a\ncomprehensive overview of copyright protection from a technical perspective. We\nexamine from two distinct viewpoints: the copyrights pertaining to the source\ndata held by the data owners and those of the generative models maintained by\nthe model builders. For data copyright, we delve into methods data owners can\nprotect their content and DGMs can be utilized without infringing upon these\nrights. For model copyright, our discussion extends to strategies for\npreventing model theft and identifying outputs generated by specific models.\nFinally, we highlight the limitations of existing techniques and identify areas\nthat remain unexplored. 
Furthermore, we discuss prospective directions for the\nfuture of copyright protection, underscoring its importance for the sustainable\nand ethical development of Generative AI.\n","authors":["Jie Ren","Han Xu","Pengfei He","Yingqian Cui","Shenglai Zeng","Jiankun Zhang","Hongzhi Wen","Jiayuan Ding","Pei Huang","Lingjuan Lyu","Hui Liu","Yi Chang","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2402.02333v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.17219v1","updated":"2024-07-24T12:19:39Z","published":"2024-07-24T12:19:39Z","title":"Graph Neural Networks: A suitable Alternative to MLPs in Latent 3D\n Medical Image Classification?","summary":" Recent studies have underscored the capabilities of natural imaging\nfoundation models to serve as powerful feature extractors, even in a zero-shot\nsetting for medical imaging data. Most commonly, a shallow multi-layer\nperceptron (MLP) is appended to the feature extractor to facilitate end-to-end\nlearning and downstream prediction tasks such as classification, thus\nrepresenting the de facto standard. However, as graph neural networks (GNNs)\nhave become a practicable choice for various tasks in medical research in the\nrecent past, we direct attention to the question of how effective GNNs are\ncompared to MLP prediction heads for the task of 3D medical image\nclassification, proposing them as a potential alternative. In our experiments,\nwe devise a subject-level graph for each volumetric dataset instance. Therein\nlatent representations of all slices in the volume, encoded through a DINOv2\npretrained vision transformer (ViT), constitute the nodes and their respective\nnode features. We use public datasets to compare the classification heads\nnumerically and evaluate various graph construction and graph convolution\nmethods in our experiments. Our findings show enhancements of the GNN in\nclassification performance and substantial improvements in runtime compared to\nan MLP prediction head. Additional robustness evaluations further validate the\npromising performance of the GNN, promoting them as a suitable alternative to\ntraditional MLP classification heads. Our code is publicly available at:\nhttps://github.com/compai-lab/2024-miccai-grail-kiechle\n","authors":["Johannes Kiechle","Daniel M. Lang","Stefan M. Fischer","Lina Felsner","Jan C. Peeken","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2407.17219v1.pdf","comment":"Accepted at MICCAI 2024 - GRAIL Workshop"},{"id":"http://arxiv.org/abs/2407.17209v1","updated":"2024-07-24T12:09:07Z","published":"2024-07-24T12:09:07Z","title":"Nonverbal Immediacy Analysis in Education: A Multimodal Computational\n Model","summary":" This paper introduces a novel computational approach for analyzing nonverbal\nsocial behavior in educational settings. Integrating multimodal behavioral\ncues, including facial expressions, gesture intensity, and spatial dynamics,\nthe model assesses the nonverbal immediacy (NVI) of teachers from RGB classroom\nvideos. A dataset of 400 30-second video segments from German classrooms was\nconstructed for model training and validation. The gesture intensity regressor\nachieved a correlation of 0.84, the perceived distance regressor 0.55, and the\nNVI model 0.44 with median human ratings. The model demonstrates the potential\nto provide a valuable support in nonverbal behavior assessment, approximating\nthe accuracy of individual human raters. 
Validated against both questionnaire\ndata and trained observer ratings, our models show moderate to strong\ncorrelations with relevant educational outcomes, indicating their efficacy in\nreflecting effective teaching behaviors. This research advances the objective\nassessment of nonverbal communication behaviors, opening new pathways for\neducational research.\n","authors":["Uroš Petković","Jonas Frenkel","Olaf Hellwich","Rebecca Lazarides"],"pdf_url":"https://arxiv.org/pdf/2407.17209v1.pdf","comment":"12 pages, 3 figures. Camera-ready version for the SAB 2024: 17th\n International Conference on the Simulation of Adaptive Behavior"},{"id":"http://arxiv.org/abs/2407.17197v1","updated":"2024-07-24T11:58:31Z","published":"2024-07-24T11:58:31Z","title":"ALPI: Auto-Labeller with Proxy Injection for 3D Object Detection using\n 2D Labels Only","summary":" 3D object detection plays a crucial role in various applications such as\nautonomous vehicles, robotics and augmented reality. However, training 3D\ndetectors requires a costly precise annotation, which is a hindrance to scaling\nannotation to large datasets. To address this challenge, we propose a weakly\nsupervised 3D annotator that relies solely on 2D bounding box annotations from\nimages, along with size priors. One major problem is that supervising a 3D\ndetection model using only 2D boxes is not reliable due to ambiguities between\ndifferent 3D poses and their identical 2D projection. We introduce a simple yet\neffective and generic solution: we build 3D proxy objects with annotations by\nconstruction and add them to the training dataset. Our method requires only\nsize priors to adapt to new classes. To better align 2D supervision with 3D\ndetection, our method ensures depth invariance with a novel expression of the\n2D losses. Finally, to detect more challenging instances, our annotator follows\nan offline pseudo-labelling scheme which gradually improves its 3D\npseudo-labels. Extensive experiments on the KITTI dataset demonstrate that our\nmethod not only performs on-par or above previous works on the Car category,\nbut also achieves performance close to fully supervised methods on more\nchallenging classes. We further demonstrate the effectiveness and robustness of\nour method by being the first to experiment on the more challenging nuScenes\ndataset. We additionally propose a setting where weak labels are obtained from\na 2D detector pre-trained on MS-COCO instead of human annotations.\n","authors":["Saad Lahlali","Nicolas Granger","Hervé Le Borgne","Quoc-Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2407.17197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17193v1","updated":"2024-07-24T11:51:47Z","published":"2024-07-24T11:51:47Z","title":"Unpaired Photo-realistic Image Deraining with Energy-informed Diffusion\n Model","summary":" Existing unpaired image deraining approaches face challenges in accurately\ncapture the distinguishing characteristics between the rainy and clean domains,\nresulting in residual degradation and color distortion within the reconstructed\nimages. To this end, we propose an energy-informed diffusion model for unpaired\nphoto-realistic image deraining (UPID-EDM). Initially, we delve into the\nintricate visual-language priors embedded within the contrastive language-image\npre-training model (CLIP), and demonstrate that the CLIP priors aid in the\ndiscrimination of rainy and clean images. 
Furthermore, we introduce a\ndual-consistent energy function (DEF) that retains the rain-irrelevant\ncharacteristics while eliminating the rain-relevant features. This energy\nfunction is trained by the non-corresponding rainy and clean images. In\naddition, we employ the rain-relevance discarding energy function (RDEF) and\nthe rain-irrelevance preserving energy function (RPEF) to direct the reverse\nsampling procedure of a pre-trained diffusion model, effectively removing the\nrain streaks while preserving the image contents. Extensive experiments\ndemonstrate that our energy-informed model surpasses the existing unpaired\nlearning approaches in terms of both supervised and no-reference metrics.\n","authors":["Yuanbo Wen","Tao Gao","Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02776v2","updated":"2024-07-24T11:48:28Z","published":"2024-06-04T20:45:53Z","title":"MeshVPR: Citywide Visual Place Recognition Using 3D Meshes","summary":" Mesh-based scene representation offers a promising direction for simplifying\nlarge-scale hierarchical visual localization pipelines, combining a visual\nplace recognition step based on global features (retrieval) and a visual\nlocalization step based on local features. While existing work demonstrates the\nviability of meshes for visual localization, the impact of using synthetic\ndatabases rendered from them in visual place recognition remains largely\nunexplored. In this work we investigate using dense 3D textured meshes for\nlarge-scale Visual Place Recognition (VPR). We identify a significant\nperformance drop when using synthetic mesh-based image databases compared to\nreal-world images for retrieval. To address this, we propose MeshVPR, a novel\nVPR pipeline that utilizes a lightweight features alignment framework to bridge\nthe gap between real-world and synthetic domains. MeshVPR leverages pre-trained\nVPR models and is efficient and scalable for city-wide deployments. We\nintroduce novel datasets with freely available 3D meshes and manually collected\nqueries from Berlin, Paris, and Melbourne. Extensive evaluations demonstrate\nthat MeshVPR achieves competitive performance with standard VPR pipelines,\npaving the way for mesh-based localization systems. Data, code, and interactive\nvisualizations are available at https://meshvpr.github.io/\n","authors":["Gabriele Berton","Lorenz Junglas","Riccardo Zaccone","Thomas Pollok","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2406.02776v2.pdf","comment":"Website: https://mesh-vpr.github.io/"},{"id":"http://arxiv.org/abs/2407.17181v1","updated":"2024-07-24T11:32:33Z","published":"2024-07-24T11:32:33Z","title":"Trans2Unet: Neural fusion for Nuclei Semantic Segmentation","summary":" Nuclei segmentation, despite its fundamental role in histopathological image\nanalysis, is still a challenge work. The main challenge of this task is the\nexistence of overlapping areas, which makes separating independent nuclei more\ncomplicated. In this paper, we propose a new two-branch architecture by\ncombining the Unet and TransUnet networks for nuclei segmentation task. In the\nproposed architecture, namely Trans2Unet, the input image is first sent into\nthe Unet branch whose the last convolution layer is removed. This branch makes\nthe network combine features from different spatial regions of the input image\nand localizes more precisely the regions of interest. The input image is also\nfed into the second branch. 
In the second branch, which is called the TransUnet\nbranch, the input image is divided into image patches. With the Vision\nTransformer (ViT) in its architecture, TransUnet can serve as a powerful encoder\nfor medical image segmentation tasks and enhance image details by recovering\nlocalized spatial information. To boost Trans2Unet's efficiency and\nperformance, we propose to infuse TransUnet with a computationally efficient\nvariation called the \"Waterfall\" Atrous Spatial Pooling with Skip Connection\n(WASP-KC) module, which is inspired by the \"Waterfall\" Atrous Spatial Pooling\n(WASP) module. Experimental results on the 2018 Data Science Bowl benchmark show\nthe effectiveness and performance of the proposed architecture when compared\nwith previous segmentation models.\n","authors":["Dinh-Phu Tran","Quoc-Anh Nguyen","Van-Truong Pham","Thi-Thao Tran"],"pdf_url":"https://arxiv.org/pdf/2407.17181v1.pdf","comment":"ICCAIS 2022"},{"id":"http://arxiv.org/abs/2407.17170v1","updated":"2024-07-24T11:22:02Z","published":"2024-07-24T11:22:02Z","title":"Domain Generalized Recaptured Screen Image Identification Using SWIN\n Transformer","summary":" An increasing number of classification approaches have been developed to\naddress the issue of image rebroadcast and recapturing, a standard attack\nstrategy in insurance frauds, face spoofing, and video piracy. However, most of\nthem neglected scale variations and domain generalization scenarios, performing\npoorly in instances involving domain shifts, typically made worse by\ninter-domain and cross-domain scale variances. To overcome these issues, we\npropose a cascaded data augmentation and SWIN transformer domain generalization\nframework (DAST-DG) in the current research work. Initially, we examine the\ndisparity in dataset representation. A feature generator is trained to make\nauthentic images from various domains indistinguishable. This process is then\napplied to recaptured images, creating a dual adversarial learning setup.\nExtensive experiments demonstrate that our approach is practical and surpasses\nstate-of-the-art methods across different databases. Our model achieves an\naccuracy of approximately 82\\% with a precision of 95\\% on high-variance\ndatasets.\n","authors":["Preeti Mehta","Aman Sagar","Suchi Kumari"],"pdf_url":"https://arxiv.org/pdf/2407.17170v1.pdf","comment":"11 pages, 10 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.17162v1","updated":"2024-07-24T11:06:47Z","published":"2024-07-24T11:06:47Z","title":"Context-aware Multi-task Learning for Pedestrian Intent and Trajectory\n Prediction","summary":" The advancement of socially-aware autonomous vehicles hinges on precise\nmodeling of human behavior. Within this broad paradigm, the specific challenge\nlies in accurately predicting a pedestrian's trajectory and intention.\nTraditional methodologies have leaned heavily on historical trajectory data,\nfrequently overlooking vital contextual cues such as pedestrian-specific traits\nand environmental factors. Furthermore, there's a notable knowledge gap as\ntrajectory and intention prediction have largely been approached as separate\nproblems, despite their mutual dependence. To bridge this gap, we introduce\nPTINet (Pedestrian Trajectory and Intention Prediction Network), which jointly\nlearns trajectory and intention prediction by combining past trajectory\nobservations, local contextual features (individual pedestrian behaviors), and\nglobal features (signs, markings, etc.).
The efficacy of our approach is\nevaluated on widely used public datasets: JAAD and PIE, where it has\ndemonstrated superior performance over existing state-of-the-art models in\ntrajectory and intention prediction. The results from our experiments and\nablation studies robustly validate PTINet's effectiveness in jointly exploring\nintention and trajectory prediction for pedestrian behaviour modelling. The\nexperimental evaluation indicates the advantage of using global and local\ncontextual features for pedestrian trajectory and intention prediction. The\neffectiveness of PTINet in predicting pedestrian behavior paves the way for the\ndevelopment of automated systems capable of seamlessly interacting with\npedestrians in urban settings.\n","authors":["Farzeen Munir","Tomasz Piotr Kucner"],"pdf_url":"https://arxiv.org/pdf/2407.17162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17157v1","updated":"2024-07-24T11:00:08Z","published":"2024-07-24T11:00:08Z","title":"Establishing Truly Causal Relationship Between Whole Slide Image\n Predictions and Diagnostic Evidence Subregions in Deep Learning","summary":" In the field of deep learning-driven Whole Slide Image (WSI) classification,\nMultiple Instance Learning (MIL) has gained significant attention due to its\nability to be trained using only slide-level diagnostic labels. Previous MIL\nresearches have primarily focused on enhancing feature aggregators for globally\nanalyzing WSIs, but overlook a causal relationship in diagnosis: model's\nprediction should ideally stem solely from regions of the image that contain\ndiagnostic evidence (such as tumor cells), which usually occupy relatively\nsmall areas. To address this limitation and establish the truly causal\nrelationship between model predictions and diagnostic evidence regions, we\npropose Causal Inference Multiple Instance Learning (CI-MIL). CI-MIL integrates\nfeature distillation with a novel patch decorrelation mechanism, employing a\ntwo-stage causal inference approach to distill and process patches with high\ndiagnostic value. Initially, CI-MIL leverages feature distillation to identify\npatches likely containing tumor cells and extracts their corresponding feature\nrepresentations. These features are then mapped to random Fourier feature\nspace, where a learnable weighting scheme is employed to minimize inter-feature\ncorrelations, effectively reducing redundancy from homogenous patches and\nmitigating data bias. These processes strengthen the causal relationship\nbetween model predictions and diagnostically relevant regions, making the\nprediction more direct and reliable. Experimental results demonstrate that\nCI-MIL outperforms state-of-the-art methods. Additionally, CI-MIL exhibits\nsuperior interpretability, as its selected regions demonstrate high consistency\nwith ground truth annotations, promising more reliable diagnostic assistance\nfor pathologists.\n","authors":["Tianhang Nan","Yong Ding","Hao Quan","Deliang Li","Mingchen Zou","Xiaoyu Cui"],"pdf_url":"https://arxiv.org/pdf/2407.17157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17155v1","updated":"2024-07-24T10:53:14Z","published":"2024-07-24T10:53:14Z","title":"FIIH: Fully Invertible Image Hiding for Secure and Robust","summary":" Image hiding is the study of techniques for covert storage and transmission,\nwhich embeds a secret image into a container image and generates stego image to\nmake it similar in appearance to a normal image. 
However, existing image hiding\nmethods have a serious problem: the hiding and revealing process is not\nfully invertible, so the revealing network cannot\nrecover the secret image losslessly, making it impossible to\nsimultaneously achieve high fidelity and secure transmission of the secret\nimage in an insecure network environment. To solve this problem, this paper\nproposes a fully invertible image hiding architecture based on an invertible\nneural network, aiming to realize invertible hiding of secret images, which is\ninvertible with respect to both the data and the network. Based on this ingenious architecture, the\nmethod can withstand deep learning based image steganalysis. In addition, we\npropose a new method for enhancing the robustness of stego images after\ninterference during transmission. Experiments demonstrate that the FIIH\nproposed in this paper significantly outperforms other state-of-the-art image\nhiding methods in hiding a single image, and also significantly outperforms\nother state-of-the-art methods in robustness and security.\n","authors":["Lang Huang","Lin Huo","Zheng Gan","Xinrong He"],"pdf_url":"https://arxiv.org/pdf/2407.17155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17152v1","updated":"2024-07-24T10:51:46Z","published":"2024-07-24T10:51:46Z","title":"XMeCap: Meme Caption Generation with Sub-Image Adaptability","summary":" Humor, deeply rooted in societal meanings and cultural details, poses a\nunique challenge for machines. While advances have been made in natural\nlanguage processing, real-world humor often thrives in a multi-modal context,\nencapsulated distinctively by memes. This paper places particular emphasis on\nthe impact of multiple images on meme captioning. We then introduce the\n\\textsc{XMeCap} framework, a novel approach that adopts supervised fine-tuning\nand reinforcement learning based on an innovative reward model, which factors\nin both global and local similarities between visuals and text. Our results,\nbenchmarked against contemporary models, manifest a marked improvement in\ncaption generation for both single-image and multi-image memes, as well as\ndifferent meme categories. \\textsc{XMeCap} achieves an average evaluation score\nof 75.85 for single-image memes and 66.32 for multi-image memes, outperforming\nthe best baseline by 3.71\\% and 4.82\\%, respectively. This research not only\nestablishes a new frontier in meme-related studies but also underscores the\npotential of machines in understanding and generating humor in a multi-modal\nsetting.\n","authors":["Yuyan Chen","Songzhou Yan","Zhihong Zhu","Zhixu Li","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.17152v1.pdf","comment":"Accepted to MM 2024"},{"id":"http://arxiv.org/abs/2205.09615v5","updated":"2024-07-24T10:49:23Z","published":"2022-05-19T15:13:00Z","title":"EXACT: How to Train Your Accuracy","summary":" Classification tasks are usually evaluated in terms of accuracy. However,\naccuracy is discontinuous and cannot be directly optimized using gradient\nascent. Popular methods minimize cross-entropy, hinge loss, or other surrogate\nlosses, which can lead to suboptimal results. In this paper, we propose a new\noptimization framework by introducing stochasticity to a model's output and\noptimizing expected accuracy, i.e., the accuracy of the stochastic model.
Extensive\nexperiments on linear models and deep image classification show that the\nproposed optimization method is a powerful alternative to widely used\nclassification losses.\n","authors":["Ivan Karpukhin","Stanislav Dereka","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2205.09615v5.pdf","comment":"Pattern Recognition Letters (2024)"},{"id":"http://arxiv.org/abs/2404.12867v2","updated":"2024-07-24T10:33:11Z","published":"2024-04-19T13:08:43Z","title":"FipTR: A Simple yet Effective Transformer Framework for Future Instance\n Prediction in Autonomous Driving","summary":" The future instance prediction from a Bird's Eye View(BEV) perspective is a\nvital component in autonomous driving, which involves future instance\nsegmentation and instance motion prediction. Existing methods usually rely on a\nredundant and complex pipeline which requires multiple auxiliary outputs and\npost-processing procedures. Moreover, estimated errors on each of the auxiliary\npredictions will lead to degradation of the prediction performance. In this\npaper, we propose a simple yet effective fully end-to-end framework named\nFuture Instance Prediction Transformer(FipTR), which views the task as BEV\ninstance segmentation and prediction for future frames. We propose to adopt\ninstance queries representing specific traffic participants to directly\nestimate the corresponding future occupied masks, and thus get rid of complex\npost-processing procedures. Besides, we devise a flow-aware BEV predictor for\nfuture BEV feature prediction composed of a flow-aware deformable attention\nthat takes backward flow guiding the offset sampling. A novel future instance\nmatching strategy is also proposed to further improve the temporal coherence.\nExtensive experiments demonstrate the superiority of FipTR and its\neffectiveness under different temporal BEV encoders. The code is available at\nhttps://github.com/TabGuigui/FipTR .\n","authors":["Xingtai Gui","Tengteng Huang","Haonan Shao","Haotian Yao","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12867v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17140v1","updated":"2024-07-24T10:20:19Z","published":"2024-07-24T10:20:19Z","title":"RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time\n Detection Transformer","summary":" In this report, we present RT-DETRv2, an improved Real-Time DEtection\nTRansformer (RT-DETR). RT-DETRv2 builds upon the previous state-of-the-art\nreal-time detector, RT-DETR, and opens up a set of bag-of-freebies for\nflexibility and practicality, as well as optimizing the training strategy to\nachieve enhanced performance. To improve the flexibility, we suggest setting a\ndistinct number of sampling points for features at different scales in the\ndeformable attention to achieve selective multi-scale feature extraction by the\ndecoder. To enhance practicality, we propose an optional discrete sampling\noperator to replace the grid_sample operator that is specific to RT-DETR\ncompared to YOLOs. This removes the deployment constraints typically associated\nwith DETRs. For the training strategy, we propose dynamic data augmentation and\nscale-adaptive hyperparameters customization to improve performance without\nloss of speed. 
Source code and pre-trained models will be available at\nhttps://github.com/lyuwenyu/RT-DETR.\n","authors":["Wenyu Lv","Yian Zhao","Qinyao Chang","Kui Huang","Guanzhong Wang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04413v2","updated":"2024-07-24T10:16:33Z","published":"2024-06-06T18:01:30Z","title":"Efficient 3D-Aware Facial Image Editing via Attribute-Specific Prompt\n Learning","summary":" Drawing upon StyleGAN's expressivity and disentangled latent space, existing\n2D approaches employ textual prompting to edit facial images with different\nattributes. In contrast, 3D-aware approaches that generate faces at different\ntarget poses require attribute-specific classifiers, learning separate model\nweights for each attribute, and are not scalable for novel attributes. In this\nwork, we propose an efficient, plug-and-play, 3D-aware face editing framework\nbased on attribute-specific prompt learning, enabling the generation of facial\nimages with controllable attributes across various target poses. To this end,\nwe introduce a text-driven learnable style token-based latent attribute editor\n(LAE). The LAE harnesses a pre-trained vision-language model to find\ntext-guided attribute-specific editing direction in the latent space of any\npre-trained 3D-aware GAN. It utilizes learnable style tokens and style mappers\nto learn and transform this editing direction to 3D latent space. To train LAE\nwith multiple attributes, we use directional contrastive loss and style token\nloss. Furthermore, to ensure view consistency and identity preservation across\ndifferent poses and attributes, we employ several 3D-aware identity and pose\npreservation losses. Our experiments show that our proposed framework generates\nhigh-quality images with 3D awareness and view consistency while maintaining\nattribute-specific features. We demonstrate the effectiveness of our method on\ndifferent facial attributes, including hair color and style, expression, and\nothers.\n","authors":["Amandeep Kumar","Muhammad Awais","Sanath Narayan","Hisham Cholakkal","Salman Khan","Rao Muhammad Anwer"],"pdf_url":"https://arxiv.org/pdf/2406.04413v2.pdf","comment":"Accepted at ECCV, 2024. Amandeep Kumar and Muhammad Awais are joint\n first authors. More details are available at\n https://awaisrauf.github.io/3d_face_editing"},{"id":"http://arxiv.org/abs/2402.14654v2","updated":"2024-07-24T09:55:25Z","published":"2024-02-22T16:05:13Z","title":"Multi-HMR: Multi-Person Whole-Body Human Mesh Recovery in a Single Shot","summary":" We present Multi-HMR, a strong sigle-shot model for multi-person 3D human\nmesh recovery from a single RGB image. Predictions encompass the whole body,\ni.e., including hands and facial expressions, using the SMPL-X parametric model\nand 3D location in the camera coordinate system. Our model detects people by\npredicting coarse 2D heatmaps of person locations, using features produced by a\nstandard Vision Transformer (ViT) backbone. It then predicts their whole-body\npose, shape and 3D location using a new cross-attention module called the Human\nPrediction Head (HPH), with one query attending to the entire set of features\nfor each detected person. 
As direct prediction of fine-grained hands and facial\nposes in a single shot, i.e., without relying on explicit crops around body\nparts, is hard to learn from existing data, we introduce CUFFS, the Close-Up\nFrames of Full-Body Subjects dataset, containing humans close to the camera\nwith diverse hand poses. We show that incorporating it into the training data\nfurther enhances predictions, particularly for hands. Multi-HMR also optionally\naccounts for camera intrinsics, if available, by encoding camera ray directions\nfor each image token. This simple design achieves strong performance on\nwhole-body and body-only benchmarks simultaneously: a ViT-S backbone on\n$448{\\times}448$ images already yields a fast and competitive model, while\nlarger models and higher resolutions obtain state-of-the-art results.\n","authors":["Fabien Baradel","Matthieu Armando","Salma Galaaoui","Romain Brégier","Philippe Weinzaepfel","Grégory Rogez","Thomas Lucas"],"pdf_url":"https://arxiv.org/pdf/2402.14654v2.pdf","comment":"Accepted at ECCV'24 - Code: https://github.com/naver/multi-hmr"},{"id":"http://arxiv.org/abs/2404.17147v3","updated":"2024-07-24T09:28:11Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17114v1","updated":"2024-07-24T09:24:39Z","published":"2024-07-24T09:24:39Z","title":"A Self-Supervised Image Registration Approach for Measuring Local\n Response Patterns in Metastatic Ovarian Cancer","summary":" High-grade serous ovarian carcinoma (HGSOC) is characterised by significant\nspatial and temporal heterogeneity, typically manifesting at an advanced\nmetastatic stage. 
A major challenge in treating advanced HGSOC is effectively\nmonitoring localised change in tumour burden across multiple sites during\nneoadjuvant chemotherapy (NACT) and predicting long-term pathological response\nand overall patient survival. In this work, we propose a self-supervised\ndeformable image registration algorithm that utilises a general-purpose image\nencoder for image feature extraction to co-register contrast-enhanced\ncomputerised tomography scan images acquired before and after neoadjuvant\nchemotherapy. This approach addresses challenges posed by highly complex tumour\ndeformations and longitudinal lesion matching during treatment. Localised\ntumour changes are calculated using the Jacobian determinant maps of the\nregistration deformation at multiple disease sites and their macroscopic areas,\nincluding hypo-dense (i.e., cystic/necrotic), hyper-dense (i.e., calcified),\nand intermediate density (i.e., soft tissue) portions. A series of experiments\nis conducted to understand the role of a general-purpose image encoder and its\napplication in quantifying change in tumour burden during neoadjuvant\nchemotherapy in HGSOC. This work is the first to demonstrate the feasibility of\na self-supervised image registration approach in quantifying NACT-induced\nlocalised tumour changes across the whole disease burden of patients with\ncomplex multi-site HGSOC, which could be used as a potential marker for ovarian\ncancer patient's long-term pathological response and survival.\n","authors":["Inês P. Machado","Anna Reithmeir","Fryderyk Kogl","Leonardo Rundo","Gabriel Funingana","Marika Reinius","Gift Mungmeeprued","Zeyu Gao","Cathal McCague","Eric Kerfoot","Ramona Woitek","Evis Sala","Yangming Ou","James Brenton","Julia Schnabel","Mireia Crispin"],"pdf_url":"https://arxiv.org/pdf/2407.17114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16344v2","updated":"2024-07-24T08:57:32Z","published":"2024-07-23T09:45:25Z","title":"SOAP: Enhancing Spatio-Temporal Relation and Motion Information\n Capturing for Few-Shot Action Recognition","summary":" High frame-rate (HFR) videos of action recognition improve fine-grained\nexpression while reducing the spatio-temporal relation and motion information\ndensity. Thus, large amounts of video samples are continuously required for\ntraditional data-driven training. However, samples are not always sufficient in\nreal-world scenarios, promoting few-shot action recognition (FSAR) research. We\nobserve that most recent FSAR works build spatio-temporal relation of video\nsamples via temporal alignment after spatial feature extraction, cutting apart\nspatial and temporal features within samples. They also capture motion\ninformation via narrow perspectives between adjacent frames without considering\ndensity, leading to insufficient motion information capturing. Therefore, we\npropose a novel plug-and-play architecture for FSAR called Spatio-tempOral\nfrAme tuPle enhancer (SOAP) in this paper. The model we designed with such\narchitecture refers to SOAP-Net. Temporal connections between different feature\nchannels and spatio-temporal relation of features are considered instead of\nsimple feature extraction. Comprehensive motion information is also captured,\nusing frame tuples with multiple frames containing more motion information than\nadjacent frames. Combining frame tuples of diverse frame counts further\nprovides a broader perspective. 
SOAP-Net achieves new state-of-the-art\nperformance across well-known benchmarks such as SthSthV2, Kinetics, UCF101,\nand HMDB51. Extensive empirical evaluations underscore the competitiveness,\npluggability, generalization, and robustness of SOAP. The code is released at\nhttps://github.com/wenbohuang1002/SOAP.\n","authors":["Wenbo Huang","Jinghui Zhang","Xuwei Qian","Zhen Wu","Meng Wang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16344v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.17101v1","updated":"2024-07-24T08:53:29Z","published":"2024-07-24T08:53:29Z","title":"PiPa++: Towards Unification of Domain Adaptive Semantic Segmentation via\n Self-supervised Learning","summary":" Unsupervised domain adaptive segmentation aims to improve the segmentation\naccuracy of models on target domains without relying on labeled data from those\ndomains. This approach is crucial when labeled target domain data is scarce or\nunavailable. It seeks to align the feature representations of the source domain\n(where labeled data is available) and the target domain (where only unlabeled\ndata is present), thus enabling the model to generalize well to the target\ndomain. Current image- and video-level domain adaptation have been addressed\nusing different and specialized frameworks, training strategies and\noptimizations despite their underlying connections. In this paper, we propose a\nunified framework PiPa++, which leverages the core idea of ``comparing'' to (1)\nexplicitly encourage learning of discriminative pixel-wise features with\nintraclass compactness and inter-class separability, (2) promote the robust\nfeature learning of the identical patch against different contexts or\nfluctuations, and (3) enable the learning of temporal continuity under dynamic\nenvironments. With the designed task-smart contrastive sampling strategy,\nPiPa++ enables the mining of more informative training samples according to the\ntask demand. Extensive experiments demonstrate the effectiveness of our method\non both image-level and video-level domain adaption benchmarks. Moreover, the\nproposed method is compatible with other UDA approaches to further improve the\nperformance without introducing extra parameters.\n","authors":["Mu Chen","Zhedong Zheng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.17101v1.pdf","comment":"This study is under IEEE TMM review. arXiv admin note: substantial\n text overlap with arXiv:2211.07609"},{"id":"http://arxiv.org/abs/2407.17095v1","updated":"2024-07-24T08:46:58Z","published":"2024-07-24T08:46:58Z","title":"MemBench: Memorized Image Trigger Prompt Dataset for Diffusion Models","summary":" Diffusion models have achieved remarkable success in Text-to-Image generation\ntasks, leading to the development of many commercial models. However, recent\nstudies have reported that diffusion models often generate replicated images in\ntrain data when triggered by specific prompts, potentially raising social\nissues ranging from copyright to privacy concerns. To sidestep the\nmemorization, there have been recent studies for developing memorization\nmitigation methods for diffusion models. Nevertheless, the lack of benchmarks\nimpedes the assessment of the true effectiveness of these methods. In this\nwork, we present MemBench, the first benchmark for evaluating image\nmemorization mitigation methods. Our benchmark includes a large number of\nmemorized image trigger prompts in Stable Diffusion, the most popularly used\nmodel nowadays. 
Furthermore, in contrast to the prior work evaluating\nmitigation performance only on trigger prompts, we present metrics evaluating\non both trigger prompts and general prompts, so that we can see whether\nmitigation methods address the memorization issue while maintaining performance\nfor general prompts. This is an important development considering the practical\napplications which previous works have overlooked. Through evaluation on\nMemBench, we verify that the performance of existing image memorization\nmitigation methods is still insufficient for application to diffusion models.\n","authors":["Chunsan Hong","Tae-Hyun Oh","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2407.17095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09308v2","updated":"2024-07-24T08:43:36Z","published":"2024-04-14T17:33:33Z","title":"In My Perspective, In My Hands: Accurate Egocentric 2D Hand Pose and\n Action Recognition","summary":" Action recognition is essential for egocentric video understanding, allowing\nautomatic and continuous monitoring of Activities of Daily Living (ADLs)\nwithout user effort. Existing literature focuses on 3D hand pose input, which\nrequires computationally intensive depth estimation networks or wearing an\nuncomfortable depth sensor. In contrast, there has been insufficient research\nin understanding 2D hand pose for egocentric action recognition, despite the\navailability of user-friendly smart glasses in the market capable of capturing\na single RGB image. Our study aims to fill this research gap by exploring the\nfield of 2D hand pose estimation for egocentric action recognition, making two\ncontributions. Firstly, we introduce two novel approaches for 2D hand pose\nestimation, namely EffHandNet for single-hand estimation and EffHandEgoNet,\ntailored for an egocentric perspective, capturing interactions between hands\nand objects. Both methods outperform state-of-the-art models on H2O and FPHA\npublic benchmarks. Secondly, we present a robust action recognition\narchitecture from 2D hand and object poses. This method incorporates\nEffHandEgoNet, and a transformer-based action recognition method. Evaluated on\nH2O and FPHA datasets, our architecture has a faster inference time and\nachieves an accuracy of 91.32% and 94.43%, respectively, surpassing state of\nthe art, including 3D-based methods. Our work demonstrates that using 2D\nskeletal data is a robust approach for egocentric action understanding.\nExtensive evaluation and ablation studies show the impact of the hand pose\nestimation approach, and how each input affects the overall performance.\n","authors":["Wiktor Mucha","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2404.09308v2.pdf","comment":"Accepted at: The 18th IEEE International Conference on Automatic Face\n and Gesture Recognition"},{"id":"http://arxiv.org/abs/2401.03407v6","updated":"2024-07-24T08:27:47Z","published":"2024-01-07T07:56:47Z","title":"Bilateral Reference for High-Resolution Dichotomous Image Segmentation","summary":" We introduce a novel bilateral reference framework (BiRefNet) for\nhigh-resolution dichotomous image segmentation (DIS). It comprises two\nessential components: the localization module (LM) and the reconstruction\nmodule (RM) with our proposed bilateral reference (BiRef). The LM aids in\nobject localization using global semantic information. Within the RM, we\nutilize BiRef for the reconstruction process, where hierarchical patches of\nimages provide the source reference and gradient maps serve as the target\nreference. 
These components collaborate to generate the final predicted maps.\nWe also introduce auxiliary gradient supervision to enhance focus on regions\nwith finer details. Furthermore, we outline practical training strategies\ntailored for DIS to improve map quality and training process. To validate the\ngeneral applicability of our approach, we conduct extensive experiments on four\ntasks to evince that BiRefNet exhibits remarkable performance, outperforming\ntask-specific cutting-edge methods across all benchmarks. Our codes are\navailable at https://github.com/ZhengPeng7/BiRefNet.\n","authors":["Peng Zheng","Dehong Gao","Deng-Ping Fan","Li Liu","Jorma Laaksonen","Wanli Ouyang","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2401.03407v6.pdf","comment":"Version 6, the final version of the journal with a fixed institute"},{"id":"http://arxiv.org/abs/2407.13519v2","updated":"2024-07-24T08:23:26Z","published":"2024-07-18T13:53:15Z","title":"GPSFormer: A Global Perception and Local Structure Fitting-based\n Transformer for Point Cloud Understanding","summary":" Despite the significant advancements in pre-training methods for point cloud\nunderstanding, directly capturing intricate shape information from irregular\npoint clouds without reliance on external data remains a formidable challenge.\nTo address this problem, we propose GPSFormer, an innovative Global Perception\nand Local Structure Fitting-based Transformer, which learns detailed shape\ninformation from point clouds with remarkable precision. The core of GPSFormer\nis the Global Perception Module (GPM) and the Local Structure Fitting\nConvolution (LSFConv). Specifically, GPM utilizes Adaptive Deformable Graph\nConvolution (ADGConv) to identify short-range dependencies among similar\nfeatures in the feature space and employs Multi-Head Attention (MHA) to learn\nlong-range dependencies across all positions within the feature space,\nultimately enabling flexible learning of contextual representations. Inspired\nby Taylor series, we design LSFConv, which learns both low-order fundamental\nand high-order refinement information from explicitly encoded local geometric\nstructures. Integrating the GPM and LSFConv as fundamental components, we\nconstruct GPSFormer, a cutting-edge Transformer that effectively captures\nglobal and local structures of point clouds. Extensive experiments validate\nGPSFormer's effectiveness in three point cloud tasks: shape classification,\npart segmentation, and few-shot learning. The code of GPSFormer is available at\n\\url{https://github.com/changshuowang/GPSFormer}.\n","authors":["Changshuo Wang","Meiqing Wu","Siew-Kei Lam","Xin Ning","Shangshu Yu","Ruiping Wang","Weijun Li","Thambipillai Srikanthan"],"pdf_url":"https://arxiv.org/pdf/2407.13519v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.17085v1","updated":"2024-07-24T08:22:49Z","published":"2024-07-24T08:22:49Z","title":"OVR: A Dataset for Open Vocabulary Temporal Repetition Counting in\n Videos","summary":" We introduce a dataset of annotations of temporal repetitions in videos. The\ndataset, OVR (pronounced as over), contains annotations for over 72K videos,\nwith each annotation specifying the number of repetitions, the start and end\ntime of the repetitions, and also a free-form description of what is repeating.\nThe annotations are provided for videos sourced from Kinetics and Ego4D, and\nconsequently cover both Exo and Ego viewing conditions, with a huge variety of\nactions and activities. 
Moreover, OVR is almost an order of magnitude larger\nthan previous datasets for video repetition. We also propose a baseline\ntransformer-based counting model, OVRCounter, that can localise and count\nrepetitions in videos that are up to 320 frames long. The model is trained and\nevaluated on the OVR dataset, and its performance assessed with and without\nusing text to specify the target class to count. The performance is also\ncompared to a prior repetition counting model. The dataset is available for\ndownload at: https://sites.google.com/view/openvocabreps/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2407.17085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17083v1","updated":"2024-07-24T08:20:02Z","published":"2024-07-24T08:20:02Z","title":"When Text and Images Don't Mix: Bias-Correcting Language-Image\n Similarity Scores for Anomaly Detection","summary":" Contrastive Language-Image Pre-training (CLIP) achieves remarkable\nperformance in various downstream tasks through the alignment of image and text\ninput embeddings and holds great promise for anomaly detection. However, our\nempirical experiments show that the embeddings of text inputs unexpectedly\ntightly cluster together, far away from image embeddings, contrary to the\nmodel's contrastive training objective to align image-text input pairs. We show\nthat this phenomenon induces a `similarity bias' - in which false negative and\nfalse positive errors occur due to bias in the similarities between images and\nthe normal label text embeddings. To address this bias, we propose a novel\nmethodology called BLISS which directly accounts for this similarity bias\nthrough the use of an auxiliary, external set of text inputs. BLISS is simple,\nit does not require strong inductive biases about anomalous behaviour nor an\nexpensive training process, and it significantly outperforms baseline methods\non benchmark image datasets, even when access to normal data is extremely\nlimited.\n","authors":["Adam Goodge","Bryan Hooi","Wee Siong Ng"],"pdf_url":"https://arxiv.org/pdf/2407.17083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14077v2","updated":"2024-07-24T07:51:18Z","published":"2024-05-23T00:46:53Z","title":"Learning to Transform Dynamically for Better Adversarial Transferability","summary":" Adversarial examples, crafted by adding perturbations imperceptible to\nhumans, can deceive neural networks. Recent studies identify the adversarial\ntransferability across various models, \\textit{i.e.}, the cross-model attack\nability of adversarial samples. To enhance such adversarial transferability,\nexisting input transformation-based methods diversify input data with\ntransformation augmentation. However, their effectiveness is limited by the\nfinite number of available transformations. In our study, we introduce a novel\napproach named Learning to Transform (L2T). L2T increases the diversity of\ntransformed images by selecting the optimal combination of operations from a\npool of candidates, consequently improving adversarial transferability. We\nconceptualize the selection of optimal transformation combinations as a\ntrajectory optimization problem and employ a reinforcement learning strategy to\neffectively solve the problem. 
Comprehensive experiments on the ImageNet\ndataset, as well as practical tests with Google Vision and GPT-4V, reveal that\nL2T surpasses current methodologies in enhancing adversarial transferability,\nthereby confirming its effectiveness and practical significance. The code is\navailable at https://github.com/RongyiZhu/L2T.\n","authors":["Rongyi Zhu","Zeliang Zhang","Susan Liang","Zhuo Liu","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2405.14077v2.pdf","comment":"accepted as a poster in CVPR 2024"},{"id":"http://arxiv.org/abs/2407.04833v2","updated":"2024-07-24T07:49:00Z","published":"2024-07-05T19:38:10Z","title":"3D Adaptive Structural Convolution Network for Domain-Invariant Point\n Cloud Recognition","summary":" Adapting deep learning networks for point cloud data recognition in\nself-driving vehicles faces challenges due to the variability in datasets and\nsensor technologies, emphasizing the need for adaptive techniques to maintain\naccuracy across different conditions. In this paper, we introduce the 3D\nAdaptive Structural Convolution Network (3D-ASCN), a cutting-edge framework for\n3D point cloud recognition. It combines 3D convolution kernels, a structural\ntree structure, and adaptive neighborhood sampling for effective geometric\nfeature extraction. This method obtains domain-invariant features and\ndemonstrates robust, adaptable performance on a variety of point cloud\ndatasets, ensuring compatibility across diverse sensor configurations without\nthe need for parameter adjustments. This highlights its potential to\nsignificantly enhance the reliability and efficiency of self-driving vehicle\ntechnology.\n","authors":["Younggun Kim","Beomsik Cho","Seonghoon Ryoo","Soomok Lee"],"pdf_url":"https://arxiv.org/pdf/2407.04833v2.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.17064v1","updated":"2024-07-24T07:45:37Z","published":"2024-07-24T07:45:37Z","title":"AI-based Density Recognition","summary":" Learning-based analysis of images is commonly used in the fields of mobility\nand robotics for safe environmental motion and interaction. This requires not\nonly object recognition but also the assignment of certain properties to them.\nWith the help of this information, causally related actions can be adapted to\ndifferent circumstances. Such logical interactions can be optimized by\nrecognizing object-assigned properties. Density as a physical property offers\nthe possibility to recognize how heavy an object is, which material it is made\nof, which forces are at work, and consequently which influence it has on its\nenvironment. Our approach introduces an AI-based concept for assigning physical\nproperties to objects through the use of associated images. 
Based on\nsynthesized data, we derive specific patterns from 2D images using a neural\nnetwork to extract further information such as volume, material, or density.\nAccordingly, we discuss the possibilities of property-based feature extraction\nto improve causally related logics.\n","authors":["Simone Müller","Daniel Kolb","Matthias Müller","Dieter Kranzlmüller"],"pdf_url":"https://arxiv.org/pdf/2407.17064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17060v1","updated":"2024-07-24T07:37:12Z","published":"2024-07-24T07:37:12Z","title":"High Efficiency Image Compression for Large Visual-Language Models","summary":" In recent years, large visual language models (LVLMs) have shown impressive\nperformance and promising generalization capability in multi-modal tasks, thus\nreplacing humans as receivers of visual information in various application\nscenarios. In this paper, we pioneer to propose a variable bitrate image\ncompression framework consisting of a pre-editing module and an end-to-end\ncodec to achieve promising rate-accuracy performance for different LVLMs. In\nparticular, instead of optimizing an adaptive pre-editing network towards a\nparticular task or several representative tasks, we propose a new optimization\nstrategy tailored for LVLMs, which is designed based on the representation and\ndiscrimination capability with token-level distortion and rank. The pre-editing\nmodule and the variable bitrate end-to-end image codec are jointly trained by\nthe losses based on semantic tokens of the large model, which introduce\nenhanced generalization capability for various data and tasks. {Experimental\nresults demonstrate that the proposed framework could efficiently achieve much\nbetter rate-accuracy performance compared to the state-of-the-art coding\nstandard, Versatile Video Coding.} Meanwhile, experiments with multi-modal\ntasks have revealed the robustness and generalization capability of the\nproposed framework.\n","authors":["Binzhe Li","Shurun Wang","Shiqi Wang","Yan Ye"],"pdf_url":"https://arxiv.org/pdf/2407.17060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17058v1","updated":"2024-07-24T07:36:33Z","published":"2024-07-24T07:36:33Z","title":"DiffCD: A Symmetric Differentiable Chamfer Distance for Neural Implicit\n Surface Fitting","summary":" Neural implicit surfaces can be used to recover accurate 3D geometry from\nimperfect point clouds. In this work, we show that state-of-the-art techniques\nwork by minimizing an approximation of a one-sided Chamfer distance. This shape\nmetric is not symmetric, as it only ensures that the point cloud is near the\nsurface but not vice versa. As a consequence, existing methods can produce\ninaccurate reconstructions with spurious surfaces. Although one approach\nagainst spurious surfaces has been widely used in the literature, we\ntheoretically and experimentally show that it is equivalent to regularizing the\nsurface area, resulting in over-smoothing. As a more appealing alternative, we\npropose DiffCD, a novel loss function corresponding to the symmetric Chamfer\ndistance. In contrast to previous work, DiffCD also assures that the surface is\nnear the point cloud, which eliminates spurious surfaces without the need for\nadditional regularization. We experimentally show that DiffCD reliably recovers\na high degree of shape detail, substantially outperforming existing work across\nvarying surface complexity and noise levels. 
Project code is available at\nhttps://github.com/linusnie/diffcd.\n","authors":["Linus Härenstam-Nielsen","Lu Sang","Abhishek Saroha","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2407.17058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10121v3","updated":"2024-07-24T07:35:32Z","published":"2024-07-14T08:51:25Z","title":"MSD: A Benchmark Dataset for Floor Plan Generation of Building Complexes","summary":" Diverse and realistic floor plan data are essential for the development of\nuseful computer-aided methods in architectural design. Today's large-scale\nfloor plan datasets predominantly feature simple floor plan layouts, typically\nrepresenting single-apartment dwellings only. To compensate for the mismatch\nbetween current datasets and the real world, we develop \\textbf{Modified Swiss\nDwellings} (MSD) -- the first large-scale floor plan dataset that contains a\nsignificant share of layouts of multi-apartment dwellings. MSD features over\n5.3K floor plans of medium- to large-scale building complexes, covering over\n18.9K distinct apartments. We validate that existing approaches for floor plan\ngeneration, while effective in simpler scenarios, cannot yet seamlessly address\nthe challenges posed by MSD. Our benchmark calls for new research in floor plan\nmachine understanding. Code and data are open.\n","authors":["Casper van Engelenburg","Fatemeh Mostafavi","Emanuel Kuhn","Yuntae Jeon","Michael Franzen","Matthias Standfest","Jan van Gemert","Seyran Khademi"],"pdf_url":"https://arxiv.org/pdf/2407.10121v3.pdf","comment":"ECCV 2024 (incl. Suppl. Mat.)"},{"id":"http://arxiv.org/abs/2405.20443v2","updated":"2024-07-24T07:34:35Z","published":"2024-05-30T19:40:08Z","title":"P-MSDiff: Parallel Multi-Scale Diffusion for Remote Sensing Image\n Segmentation","summary":" Diffusion models and multi-scale features are essential components in\nsemantic segmentation tasks that deal with remote-sensing images. They\ncontribute to improved segmentation boundaries and offer significant contextual\ninformation. U-net-like architectures are frequently employed in diffusion\nmodels for segmentation tasks. These architectural designs include dense skip\nconnections that may pose challenges for interpreting intermediate features.\nConsequently, they might not efficiently convey semantic information throughout\nvarious layers of the encoder-decoder architecture. To address these\nchallenges, we propose a new model for semantic segmentation known as the\ndiffusion model with parallel multi-scale branches. This model consists of\nParallel Multiscale Diffusion modules (P-MSDiff) and a Cross-Bridge Linear\nAttention mechanism (CBLA). P-MSDiff enhances the understanding of semantic\ninformation across multiple levels of granularity and detects repetitive\ndistribution data through the integration of recursive denoising branches. It\nfurther facilitates the amalgamation of data by connecting relevant branches to\nthe primary framework to enable concurrent denoising. Furthermore, within the\ninterconnected transformer architecture, the LA module has been substituted\nwith the CBLA module. This module integrates a semidefinite matrix linked to\nthe query into the dot product computation of keys and values. This integration\nenables the adaptation of queries within the LA framework. This adjustment\nenhances the structure for multi-head attention computation, leading to\nenhanced network performance and CBLA is a plug-and-play module. 
Our model\ndemonstrates superior performance based on the J1 metric on both the UAVid and\nVaihingen Building datasets, showing improvements of 1.60% and 1.40% over\nstrong baseline models, respectively.\n","authors":["Qi Zhang","Guohua Geng","Longquan Yan","Pengbo Zhou","Zhaodi Li","Kang Li","Qinglin Liu"],"pdf_url":"https://arxiv.org/pdf/2405.20443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11442v2","updated":"2024-07-24T07:31:37Z","published":"2024-05-19T04:35:05Z","title":"Unifying 3D Vision-Language Understanding via Promptable Queries","summary":" A unified model for 3D vision-language (3D-VL) understanding is expected to\ntake various scene representations and perform a wide range of tasks in a 3D\nscene. However, a considerable gap exists between existing methods and such a\nunified model, due to the independent application of representation and\ninsufficient exploration of 3D multi-task training. In this paper, we introduce\nPQ3D, a unified model capable of using Promptable Queries to tackle a wide\nrange of 3D-VL tasks, from low-level instance segmentation to high-level\nreasoning and planning. This is achieved through three key innovations: (1)\nunifying various 3D scene representations (i.e., voxels, point clouds,\nmulti-view images) into a shared 3D coordinate space by segment-level grouping,\n(2) an attention-based query decoder for task-specific information retrieval\nguided by prompts, and (3) universal output heads for different tasks to\nsupport multi-task training. Tested across ten diverse 3D-VL datasets, PQ3D\ndemonstrates impressive performance on these tasks, setting new records on most\nbenchmarks. Particularly, PQ3D improves the state-of-the-art on ScanNet200 by\n4.9% (AP25), ScanRefer by 5.4% (acc@0.5), Multi3DRefer by 11.7% (F1@0.5), and\nScan2Cap by 13.4% (CIDEr@0.5). Moreover, PQ3D supports flexible inference with\nindividual or combined forms of available 3D representations, e.g., solely\nvoxel input.\n","authors":["Ziyu Zhu","Zhuofan Zhang","Xiaojian Ma","Xuesong Niu","Yixin Chen","Baoxiong Jia","Zhidong Deng","Siyuan Huang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2405.11442v2.pdf","comment":"ECCV 2024. Project page: https://pq3d.github.io"},{"id":"http://arxiv.org/abs/2312.06193v2","updated":"2024-07-24T07:21:35Z","published":"2023-12-11T08:16:55Z","title":"DisControlFace: Adding Disentangled Control to Diffusion Autoencoder for\n One-shot Explicit Facial Image Editing","summary":" In this work, we focus on exploring explicit fine-grained control of\ngenerative facial image editing, all while generating faithful facial\nappearances and consistent semantic details, which however, is quite\nchallenging and has not been extensively explored, especially under an one-shot\nscenario. We identify the key challenge as the exploration of disentangled\nconditional control between high-level semantics and explicit parameters (e.g.,\n3DMM) in the generation process, and accordingly propose a novel\ndiffusion-based editing framework, named DisControlFace. Specifically, we\nleverage a Diffusion Autoencoder (Diff-AE) as the semantic reconstruction\nbackbone. To enable explicit face editing, we construct an Exp-FaceNet that is\ncompatible with Diff-AE to generate spatial-wise explicit control conditions\nbased on estimated 3DMM parameters. 
Different from current diffusion-based\nediting methods that train the whole conditional generative model from scratch,\nwe freeze the pre-trained weights of the Diff-AE to maintain its semantically\ndeterministic conditioning capability and accordingly propose a random semantic\nmasking (RSM) strategy to effectively achieve an independent training of\nExp-FaceNet. This setting endows the model with disentangled face control\nmeanwhile reducing semantic information shift in editing. Our model can be\ntrained using 2D in-the-wild portrait images without requiring 3D or video data\nand perform robust editing on any new facial image through a simple one-shot\nfine-tuning. Comprehensive experiments demonstrate that DisControlFace can\ngenerate realistic facial images with better editing accuracy and identity\npreservation over state-of-the-art methods. Project page:\nhttps://discontrolface.github.io/\n","authors":["Haozhe Jia","Yan Li","Hengfei Cui","Di Xu","Yuwang Wang","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2312.06193v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16653v2","updated":"2024-07-24T07:18:46Z","published":"2024-07-23T17:14:01Z","title":"Aggregated Attributions for Explanatory Analysis of 3D Segmentation\n Models","summary":" Analysis of 3D segmentation models, especially in the context of medical\nimaging, is often limited to segmentation performance metrics that overlook the\ncrucial aspect of explainability and bias. Currently, effectively explaining\nthese models with saliency maps is challenging due to the high dimensions of\ninput images multiplied by the ever-growing number of segmented class labels.\nTo this end, we introduce Agg^2Exp, a methodology for aggregating fine-grained\nvoxel attributions of the segmentation model's predictions. Unlike classical\nexplanation methods that primarily focus on the local feature attribution,\nAgg^2Exp enables a more comprehensive global view on the importance of\npredicted segments in 3D images. Our benchmarking experiments show that\ngradient-based voxel attributions are more faithful to the model's predictions\nthan perturbation-based explanations. As a concrete use-case, we apply Agg^2Exp\nto discover knowledge acquired by the Swin UNEt TRansformer model trained on\nthe TotalSegmentator v2 dataset for segmenting anatomical structures in\ncomputed tomography medical images. Agg^2Exp facilitates the explanatory\nanalysis of large segmentation models beyond their predictive performance.\n","authors":["Maciej Chrabaszcz","Hubert Baniecki","Piotr Komorowski","Szymon Płotka","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2407.16653v2.pdf","comment":"Added Acknowledgments"},{"id":"http://arxiv.org/abs/2405.20091v3","updated":"2024-07-24T07:06:43Z","published":"2024-05-30T14:27:40Z","title":"VAAD: Visual Attention Analysis Dashboard applied to e-Learning","summary":" In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. 
Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v3.pdf","comment":"Accepted in CEDI 2024 (VII Congreso Espa\\~nol de Inform\\'atica), A\n Coru\\~na, Spain"},{"id":"http://arxiv.org/abs/2406.14556v3","updated":"2024-07-24T07:03:29Z","published":"2024-06-20T17:59:03Z","title":"Asynchronous Large Language Model Enhanced Planner for Autonomous\n Driving","summary":" Despite real-time planners exhibiting remarkable performance in autonomous\ndriving, the growing exploration of Large Language Models (LLMs) has opened\navenues for enhancing the interpretability and controllability of motion\nplanning. Nevertheless, LLM-based planners continue to encounter significant\nchallenges, including elevated resource consumption and extended inference\ntimes, which pose substantial obstacles to practical deployment. In light of\nthese challenges, we introduce AsyncDriver, a new asynchronous LLM-enhanced\nclosed-loop framework designed to leverage scene-associated instruction\nfeatures produced by the LLM to guide real-time planners in making precise and\ncontrollable trajectory predictions. On one hand, our method highlights the\nprowess of LLMs in comprehending and reasoning with vectorized scene data and a\nseries of routing instructions, demonstrating its effective assistance to\nreal-time planners. On the other hand, the proposed framework decouples the\ninference processes of the LLM and real-time planners. By capitalizing on the\nasynchronous nature of their inference frequencies, our approach has\nsuccessfully reduced the computational cost introduced by the LLM, while\nmaintaining comparable performance. Experiments show that our approach achieves\nsuperior closed-loop evaluation performance on nuPlan's challenging scenarios.\n","authors":["Yuan Chen","Zi-han Ding","Ziqin Wang","Yan Wang","Lijun Zhang","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2406.14556v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2306.08326v3","updated":"2024-07-24T06:53:19Z","published":"2023-06-14T07:58:14Z","title":"Early Detection of Late Blight Tomato Disease using Histogram Oriented\n Gradient based Support Vector Machine","summary":" The tomato is one of the most important fruits on earth. It plays an\nimportant and useful role in the agricultural production of any country. This\nresearch proposes a novel smart technique for early detection of late blight\ndiseases in tomatoes. This work improves the dataset with an increase in images\nfrom the field (the Plant Village dataset) and proposes a hybrid algorithm\ncomposed of support vector machines (SVM) and histogram-oriented gradients\n(HOG) for real-time detection of late blight tomato disease. The objectives are to propose a\nHOG-based SVM model for early detection of late blight tomato leaf disease and to\ncheck the performance of the proposed model in terms of MSE, accuracy,\nprecision, and recall as compared to Decision Tree and KNN. The integration of\nadvanced technology in agriculture has the potential to revolutionize the\nindustry, making it more efficient, sustainable, and profitable.
This research\nwork on the early detection of tomato diseases contributes to the growing\nimportance of smart farming, the need for climate-smart agriculture, the rising\nneed to more efficiently utilize natural resources, and the demand for higher\ncrop yields. The proposed hybrid algorithm of SVM and HOG has significant\npotential for the early detection of late blight disease in tomato plants. The\nperformance of the proposed model is compared against decision tree and KNN algorithms, and\nthe results may assist in selecting the best algorithm for future applications.\nThe research work can help farmers make data-driven decisions to optimize crop\nyield and quality while also reducing the environmental impact of farming\npractices.\n","authors":["Yousef Alhwaiti","Muhammad Ishaq","Muhammad Hameed Siddiqi","Muhammad Waqas","Madallah Alruwaili","Saad Alanazi","Asfandyar Khan","Faheem Khan"],"pdf_url":"https://arxiv.org/pdf/2306.08326v3.pdf","comment":"The article titled \"Early Detection of Late Blight Tomato Disease\n using Histogram Oriented Gradient based Support Vector Machine\" needs to be\n withdrawn because there are other contributors in the improvement of this article"},{"id":"http://arxiv.org/abs/2311.17050v3","updated":"2024-07-24T06:49:30Z","published":"2023-11-28T18:56:01Z","title":"Surf-D: Generating High-Quality Surfaces of Arbitrary Topologies Using\n Diffusion Models","summary":" We present Surf-D, a novel method for generating high-quality 3D shapes as\nSurfaces with arbitrary topologies using Diffusion models. Previous methods\nexplored shape generation with different representations and they suffer from\nlimited topologies and poor geometry details. To generate high-quality surfaces\nof arbitrary topologies, we use the Unsigned Distance Field (UDF) as our\nsurface representation to accommodate arbitrary topologies. Furthermore, we\npropose a new pipeline that employs a point-based AutoEncoder to learn a\ncompact and continuous latent space for accurately encoding UDF and support\nhigh-resolution mesh extraction. We further show that our new pipeline\nsignificantly outperforms the prior approaches to learning the distance fields,\nsuch as the grid-based AutoEncoder, which is not scalable and incapable of\nlearning accurate UDF. In addition, we adopt a curriculum learning strategy to\nefficiently embed various surfaces. With the pretrained shape latent space, we\nemploy a latent diffusion model to acquire the distribution of various shapes.\nExtensive experiments are presented on using Surf-D for unconditional\ngeneration, category conditional generation, image conditional generation, and\ntext-to-shape tasks. The experiments demonstrate the superior performance of\nSurf-D in shape generation across multiple modalities as conditions. Visit our\nproject page at https://yzmblog.github.io/projects/SurfD/.\n","authors":["Zhengming Yu","Zhiyang Dou","Xiaoxiao Long","Cheng Lin","Zekun Li","Yuan Liu","Norman Müller","Taku Komura","Marc Habermann","Christian Theobalt","Xin Li","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17050v3.pdf","comment":"Accepted to ECCV 2024. 
Project Page:\n https://yzmblog.github.io/projects/SurfD/"},{"id":"http://arxiv.org/abs/2406.18113v2","updated":"2024-07-24T06:43:07Z","published":"2024-06-26T06:59:09Z","title":"The Surprising Effectiveness of Multimodal Large Language Models for\n Video Moment Retrieval","summary":" Recent studies have shown promising results in utilizing multimodal large\nlanguage models (MLLMs) for computer vision tasks such as object detection and\nsemantic segmentation. However, many challenging video tasks remain\nunder-explored. Video-language tasks necessitate spatial and temporal\ncomprehension and require significant compute. Therefore, prior works have\ndeveloped complex, highly specialized architectures or leveraged additional\ninput signals such as video transcripts to best encode contextual and temporal\ninformation, which limits their generality and can be impractical. One\nparticularly challenging task is video moment retrieval, which requires precise\ntemporal and contextual grounding. This work demonstrates the surprising\neffectiveness of leveraging image-text pretrained MLLMs for moment retrieval.\nWe introduce Mr. BLIP (Mr. as in Moment Retrieval), a multimodal, single-stage\nmodel that requires no expensive video-language pretraining, no additional\ninput signal (e.g., no transcript or audio), and has a simpler and more\nversatile design than prior state-of-the-art methods. We achieve a new\nstate-of-the-art in moment retrieval on the widely used benchmarks\nCharades-STA, QVHighlights, and ActivityNet Captions. Notably, we attain over\n9% (absolute) higher Recall (at 0.5 and 0.7 IoU) on the challenging long-video\nmulti-moment QVHighlights benchmark. Our code is publicly available.\n","authors":["Meinardus Boris","Batra Anil","Rohrbach Anna","Rohrbach Marcus"],"pdf_url":"https://arxiv.org/pdf/2406.18113v2.pdf","comment":"Code: https://github.com/sudo-Boris/mr-Blip"},{"id":"http://arxiv.org/abs/2407.17035v1","updated":"2024-07-24T06:42:46Z","published":"2024-07-24T06:42:46Z","title":"Q-Ground: Image Quality Grounding with Large Multi-modality Models","summary":" Recent advances of large multi-modality models (LMM) have greatly improved\nthe ability of image quality assessment (IQA) method to evaluate and explain\nthe quality of visual content. However, these advancements are mostly focused\non overall quality assessment, and the detailed examination of local quality,\nwhich is crucial for comprehensive visual understanding, is still largely\nunexplored. In this work, we introduce Q-Ground, the first framework aimed at\ntackling fine-scale visual quality grounding by combining large multi-modality\nmodels with detailed visual quality analysis. Central to our contribution is\nthe introduction of the QGround-100K dataset, a novel resource containing 100k\ntriplets of (image, quality text, distortion segmentation) to facilitate deep\ninvestigations into visual quality. The dataset comprises two parts: one with\nhuman-labeled annotations for accurate quality assessment, and another labeled\nautomatically by LMMs such as GPT4V, which helps improve the robustness of\nmodel training while also reducing the costs of data collection. With the\nQGround-100K dataset, we propose a LMM-based method equipped with multi-scale\nfeature learning to learn models capable of performing both image quality\nanswering and distortion segmentation based on text prompts. 
This\ndual-capability approach not only refines the model's understanding of\nregion-aware image quality but also enables it to interactively respond to\ncomplex, text-based queries about image quality and specific distortions.\nQ-Ground takes a step towards sophisticated visual quality analysis in a finer\nscale, establishing a new benchmark for future research in the area. Codes and\ndataset are available at https://github.com/Q-Future/Q-Ground.\n","authors":["Chaofeng Chen","Sensen Yang","Haoning Wu","Liang Liao","Zicheng Zhang","Annan Wang","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17035v1.pdf","comment":"ACM Multimedia 2024 (Oral)"},{"id":"http://arxiv.org/abs/2407.17028v1","updated":"2024-07-24T06:15:28Z","published":"2024-07-24T06:15:28Z","title":"Enhancing Environmental Monitoring through Multispectral Imaging: The\n WasteMS Dataset for Semantic Segmentation of Lakeside Waste","summary":" Environmental monitoring of lakeside green areas is crucial for environmental\nprotection. Compared to manual inspections, computer vision technologies offer\na more efficient solution when deployed on-site. Multispectral imaging provides\ndiverse information about objects under different spectrums, aiding in the\ndifferentiation between waste and lakeside lawn environments. This study\nintroduces WasteMS, the first multispectral dataset established for the\nsemantic segmentation of lakeside waste. WasteMS includes a diverse range of\nwaste types in lawn environments, captured under various lighting conditions.\nWe implemented a rigorous annotation process to label waste in images.\nRepresentative semantic segmentation frameworks were used to evaluate\nsegmentation accuracy using WasteMS. Challenges encountered when using WasteMS\nfor segmenting waste on lakeside lawns were discussed. The WasteMS dataset is\navailable at https://github.com/zhuqinfeng1999/WasteMS.\n","authors":["Qinfeng Zhu","Ningxin Weng","Lei Fan","Yuanzhi Cai"],"pdf_url":"https://arxiv.org/pdf/2407.17028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09084v4","updated":"2024-07-24T06:07:28Z","published":"2023-08-17T16:23:52Z","title":"MovePose: A High-performance Human Pose Estimation Algorithm on Mobile\n and Edge Devices","summary":" We present MovePose, an optimized lightweight convolutional neural network\ndesigned specifically for real-time body pose estimation on CPU-based mobile\ndevices. The current solutions do not provide satisfactory accuracy and speed\nfor human posture estimation, and MovePose addresses this gap. It aims to\nmaintain real-time performance while improving the accuracy of human posture\nestimation for mobile devices. Our MovePose algorithm has attained an Mean\nAverage Precision (mAP) score of 68.0 on the COCO \\cite{cocodata} validation\ndataset. The MovePose algorithm displayed efficiency with a performance of 69+\nframes per second (fps) when run on an Intel i9-10920x CPU. Additionally, it\nshowcased an increased performance of 452+ fps on an NVIDIA RTX3090 GPU. On an\nAndroid phone equipped with a Snapdragon 8 + 4G processor, the fps reached\nabove 11. To enhance accuracy, we incorporated three techniques: deconvolution,\nlarge kernel convolution, and coordinate classification methods. Compared to\nbasic upsampling, deconvolution is trainable, improves model capacity, and\nenhances the receptive field. Large kernel convolution strengthens these\nproperties at a decreased computational cost. 
In summary, MovePose provides\nhigh accuracy and real-time performance, marking it a potential tool for a\nvariety of applications, including those focused on mobile-side human posture\nestimation. The code and models for this algorithm will be made publicly\naccessible.\n","authors":["Dongyang Yu","Haoyue Zhang","Ruisheng Zhao","Guoqi Chen","Wangpeng An","Yanhong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09084v4.pdf","comment":"This paper has been accepted by ICANN 2024 and is an oral\n presentation"},{"id":"http://arxiv.org/abs/2407.17020v1","updated":"2024-07-24T06:00:33Z","published":"2024-07-24T06:00:33Z","title":"EAFormer: Scene Text Segmentation with Edge-Aware Transformers","summary":" Scene text segmentation aims at cropping texts from scene images, which is\nusually used to help generative models edit or remove texts. The existing text\nsegmentation methods tend to involve various text-related supervisions for\nbetter performance. However, most of them ignore the importance of text edges,\nwhich are significant for downstream applications. In this paper, we propose\nEdge-Aware Transformers, termed EAFormer, to segment texts more accurately,\nespecially at the edge of texts. Specifically, we first design a text edge\nextractor to detect edges and filter out edges of non-text areas. Then, we\npropose an edge-guided encoder to make the model focus more on text edges.\nFinally, an MLP-based decoder is employed to predict text masks. We have\nconducted extensive experiments on commonly-used benchmarks to verify the\neffectiveness of EAFormer. The experimental results demonstrate that the\nproposed method can perform better than previous methods, especially on the\nsegmentation of text edges. Considering that the annotations of several\nbenchmarks (e.g., COCO_TS and MLT_S) are not accurate enough to fairly evaluate\nour methods, we have relabeled these datasets. Through experiments, we observe\nthat our method can achieve a higher performance improvement when more accurate\nannotations are used for training.\n","authors":["Haiyang Yu","Teng Fu","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2407.17020v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.16248v2","updated":"2024-07-24T05:56:55Z","published":"2024-07-23T07:36:54Z","title":"Spatiotemporal Graph Guided Multi-modal Network for Livestreaming\n Product Retrieval","summary":" With the rapid expansion of e-commerce, more consumers have become accustomed\nto making purchases via livestreaming. Accurately identifying the products\nbeing sold by salespeople, i.e., livestreaming product retrieval (LPR), poses a\nfundamental and daunting challenge. The LPR task encompasses three primary\ndilemmas in real-world scenarios: 1) the recognition of intended products from\ndistractor products present in the background; 2) the video-image heterogeneity\nthat the appearance of products showcased in live streams often deviates\nsubstantially from standardized product images in stores; 3) there are numerous\nconfusing products with subtle visual nuances in the shop. To tackle these\nchallenges, we propose the Spatiotemporal Graphing Multi-modal Network (SGMN).\nFirst, we employ a text-guided attention mechanism that leverages the spoken\ncontent of salespeople to guide the model to focus toward intended products,\nemphasizing their salience over cluttered background products. 
Second, a\nlong-range spatiotemporal graph network is further designed to achieve both\ninstance-level interaction and frame-level matching, solving the misalignment\ncaused by video-image heterogeneity. Third, we propose a multi-modal hard\nexample mining, assisting the model in distinguishing highly similar products\nwith fine-grained features across the video-image-text domain. Through\nextensive quantitative and qualitative experiments, we demonstrate the superior\nperformance of our proposed SGMN model, surpassing the state-of-the-art methods\nby a substantial margin. The code is available at\nhttps://github.com/Huxiaowan/SGMN.\n","authors":["Xiaowan Hu","Yiyi Chen","Yan Li","Minquan Wang","Haoqian Wang","Quan Chen","Han Li","Peng Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.16248v2.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.03725v2","updated":"2024-07-24T05:49:27Z","published":"2023-08-07T17:07:48Z","title":"Efficient Temporal Sentence Grounding in Videos with Multi-Teacher\n Knowledge Distillation","summary":" Temporal Sentence Grounding in Videos (TSGV) aims to detect the event\ntimestamps described by the natural language query from untrimmed videos. This\npaper discusses the challenge of achieving efficient computation in TSGV models\nwhile maintaining high performance. Most existing approaches exquisitely design\ncomplex architectures to improve accuracy with extra layers and loss, suffering\nfrom inefficiency and heaviness. Although some works have noticed that, they\nonly make an issue of feature fusion layers, which can hardly enjoy the\nhighspeed merit in the whole clunky network. To tackle this problem, we propose\na novel efficient multi-teacher model (EMTM) based on knowledge distillation to\ntransfer diverse knowledge from both heterogeneous and isomorphic networks.\nSpecifically, We first unify different outputs of the heterogeneous models into\none single form. Next, a Knowledge Aggregation Unit (KAU) is built to acquire\nhigh-quality integrated soft labels from multiple teachers. After that, the KAU\nmodule leverages the multi-scale video and global query information to\nadaptively determine the weights of different teachers. A Shared Encoder\nstrategy is then proposed to solve the problem that the student shallow layers\nhardly benefit from teachers, in which an isomorphic teacher is collaboratively\ntrained with the student to align their hidden states. Extensive experimental\nresults on three popular TSGV benchmarks demonstrate that our method is both\neffective and efficient without bells and whistles.\n","authors":["Renjie Liang","Yiming Yang","Hui Lu","Li Li"],"pdf_url":"https://arxiv.org/pdf/2308.03725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09624v3","updated":"2024-07-24T05:28:49Z","published":"2024-04-15T09:56:20Z","title":"AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics\n Perception","summary":" The highly abstract nature of image aesthetics perception (IAP) poses\nsignificant challenge for current multimodal large language models (MLLMs). The\nlack of human-annotated multi-modality aesthetic data further exacerbates this\ndilemma, resulting in MLLMs falling short of aesthetics perception\ncapabilities. To address the above challenge, we first introduce a\ncomprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT)\ndataset, which serves as the footstone for building multi-modality aesthetics\nfoundation models. 
Specifically, to align MLLMs with human aesthetics\nperception, we construct a corpus-rich aesthetic critique database with 21,904\ndiverse-sourced images and 88K human natural language feedbacks, which are\ncollected via progressive questions, ranging from coarse-grained aesthetic\ngrades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle\ndiverse queries, we further prompt GPT to refine the aesthetic critiques and\nassemble the large-scale aesthetic instruction tuning dataset, i.e. AesMMIT,\nwhich consists of 409K multi-typed instructions to activate stronger aesthetic\ncapabilities. Based on the AesMMIT database, we fine-tune the open-sourced\ngeneral foundation models, achieving multi-modality Aesthetic Expert models,\ndubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert\nmodels deliver significantly better aesthetic perception performances than the\nstate-of-the-art MLLMs, including the most advanced GPT-4V and\nGemini-Pro-Vision. Project homepage: https://yipoh.github.io/aes-expert/.\n","authors":["Yipo Huang","Xiangfei Sheng","Zhichao Yang","Quan Yuan","Zhichao Duan","Pengfei Chen","Leida Li","Weisi Lin","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2404.09624v3.pdf","comment":"Accepted by ACMMM24"},{"id":"http://arxiv.org/abs/2404.09490v2","updated":"2024-07-24T05:08:08Z","published":"2024-04-15T06:24:56Z","title":"Leveraging Temporal Contextualization for Video Action Recognition","summary":" We propose a novel framework for video understanding, called Temporally\nContextualized CLIP (TC-CLIP), which leverages essential temporal information\nthrough global interactions in a spatio-temporal domain within a video. To be\nspecific, we introduce Temporal Contextualization (TC), a layer-wise temporal\ninformation infusion mechanism for videos, which 1) extracts core information\nfrom each frame, 2) connects relevant information across frames for the\nsummarization into context tokens, and 3) leverages the context tokens for\nfeature encoding. Furthermore, the Video-conditional Prompting (VP) module\nprocesses context tokens to generate informative prompts in the text modality.\nExtensive experiments in zero-shot, few-shot, base-to-novel, and\nfully-supervised action recognition validate the effectiveness of our model.\nAblation studies for TC and VP support our design choices. Our project page\nwith the source code is available at https://github.com/naver-ai/tc-clip\n","authors":["Minji Kim","Dongyoon Han","Taekyung Kim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2404.09490v2.pdf","comment":"26 pages, 11 figures, 16 tables. To be presented at ECCV'24"},{"id":"http://arxiv.org/abs/2405.07987v4","updated":"2024-07-24T05:01:21Z","published":"2024-05-13T17:58:30Z","title":"The Platonic Representation Hypothesis","summary":" We argue that representations in AI models, particularly deep networks, are\nconverging. First, we survey many examples of convergence in the literature:\nover time and across multiple domains, the ways by which different neural\nnetworks represent data are becoming more aligned. Next, we demonstrate\nconvergence across data modalities: as vision models and language models get\nlarger, they measure distance between datapoints in a more and more alike way.\nWe hypothesize that this convergence is driving toward a shared statistical\nmodel of reality, akin to Plato's concept of an ideal reality. We term such a\nrepresentation the platonic representation and discuss several possible\nselective pressures toward it. 
Finally, we discuss the implications of these\ntrends, their limitations, and counterexamples to our analysis.\n","authors":["Minyoung Huh","Brian Cheung","Tongzhou Wang","Phillip Isola"],"pdf_url":"https://arxiv.org/pdf/2405.07987v4.pdf","comment":"Equal contributions. Project: https://phillipi.github.io/prh/ Code:\n https://github.com/minyoungg/platonic-rep"},{"id":"http://arxiv.org/abs/2407.17003v1","updated":"2024-07-24T05:00:31Z","published":"2024-07-24T05:00:31Z","title":"Progressive Query Refinement Framework for Bird's-Eye-View Semantic\n Segmentation from Surrounding Images","summary":" Expressing images with Multi-Resolution (MR) features has been widely adopted\nin many computer vision tasks. In this paper, we introduce the MR concept into\nBird's-Eye-View (BEV) semantic segmentation for autonomous driving. This\nintroduction enhances our model's ability to capture both global and local\ncharacteristics of driving scenes through our proposed residual learning.\nSpecifically, given a set of MR BEV query maps, the lowest resolution query map\nis initially updated using a View Transformation (VT) encoder. This updated\nquery map is then upscaled and merged with a higher resolution query map to\nundergo further updates in a subsequent VT encoder. This process is repeated\nuntil the resolution of the updated query map reaches the target. Finally, the\nlowest resolution map is added to the target resolution to generate the final\nquery map. During training, we enforce both the lowest and final query maps to\nalign with the ground-truth BEV semantic map to help our model effectively\ncapture the global and local characteristics. We also propose a visual feature\ninteraction network that promotes interactions between features across images\nand across feature levels, thus highly contributing to the performance\nimprovement. We evaluate our model on a large-scale real-world dataset. The\nexperimental results show that our model outperforms the SOTA models in terms\nof IoU metric. Codes are available at\nhttps://github.com/d1024choi/ProgressiveQueryRefineNet\n","authors":["Dooseop Choi","Jungyu Kang","Taeghyun An","Kyounghwan Ahn","KyoungWook Min"],"pdf_url":"https://arxiv.org/pdf/2407.17003v1.pdf","comment":"IROS 2024"},{"id":"http://arxiv.org/abs/2312.17432v4","updated":"2024-07-24T04:44:11Z","published":"2023-12-29T01:56:17Z","title":"Video Understanding with Large Language Models: A Survey","summary":" With the burgeoning growth of online video platforms and the escalating\nvolume of video content, the demand for proficient video understanding tools\nhas intensified markedly. Given the remarkable capabilities of large language\nmodels (LLMs) in language and multimodal tasks, this survey provides a detailed\noverview of recent advancements in video understanding that harness the power\nof LLMs (Vid-LLMs). The emergent capabilities of Vid-LLMs are surprisingly\nadvanced, particularly their ability for open-ended multi-granularity (general,\ntemporal, and spatiotemporal) reasoning combined with commonsense knowledge,\nsuggesting a promising path for future video understanding. We examine the\nunique characteristics and capabilities of Vid-LLMs, categorizing the\napproaches into three main types: Video Analyzer x LLM, Video Embedder x LLM,\nand (Analyzer + Embedder) x LLM. Furthermore, we identify five sub-types based\non the functions of LLMs in Vid-LLMs: LLM as Summarizer, LLM as Manager, LLM as\nText Decoder, LLM as Regressor, and LLM as Hidden Layer. 
Furthermore, this\nsurvey presents a comprehensive study of the tasks, datasets, benchmarks, and\nevaluation methodologies for Vid-LLMs. Additionally, it explores the expansive\napplications of Vid-LLMs across various domains, highlighting their remarkable\nscalability and versatility in real-world video understanding challenges.\nFinally, it summarizes the limitations of existing Vid-LLMs and outlines\ndirections for future research. For more information, readers are recommended\nto visit the repository at\nhttps://github.com/yunlong10/Awesome-LLMs-for-Video-Understanding.\n","authors":["Yunlong Tang","Jing Bi","Siting Xu","Luchuan Song","Susan Liang","Teng Wang","Daoan Zhang","Jie An","Jingyang Lin","Rongyi Zhu","Ali Vosoughi","Chao Huang","Zeliang Zhang","Pinxin Liu","Mingqian Feng","Feng Zheng","Jianguo Zhang","Ping Luo","Jiebo Luo","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2312.17432v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18293v3","updated":"2024-07-24T04:43:30Z","published":"2024-02-28T12:38:44Z","title":"Continuous Memory Representation for Anomaly Detection","summary":" There have been significant advancements in anomaly detection in an\nunsupervised manner, where only normal images are available for training.\nSeveral recent methods aim to detect anomalies based on a memory, comparing or\nreconstructing the input with directly stored normal features (or trained\nfeatures with normal images). However, such memory-based approaches operate on\na discrete feature space implemented by the nearest neighbor or attention\nmechanism, suffering from poor generalization or an identity shortcut issue\noutputting the same as input, respectively. Furthermore, the majority of\nexisting methods are designed to detect single-class anomalies, resulting in\nunsatisfactory performance when presented with multiple classes of objects. To\ntackle all of the above challenges, we propose CRAD, a novel anomaly detection\nmethod for representing normal features within a \"continuous\" memory, enabled\nby transforming spatial features into coordinates and mapping them to\ncontinuous grids. Furthermore, we carefully design the grids tailored for\nanomaly detection, representing both local and global normal features and\nfusing them effectively. Our extensive experiments demonstrate that CRAD\nsuccessfully generalizes the normal features and mitigates the identity\nshortcut, furthermore, CRAD effectively handles diverse classes in a single\nmodel thanks to the high-granularity continuous representation. In an\nevaluation using the MVTec AD dataset, CRAD significantly outperforms the\nprevious state-of-the-art method by reducing 65.0% of the error for multi-class\nunified anomaly detection. The project page is available at\nhttps://tae-mo.github.io/crad/.\n","authors":["Joo Chan Lee","Taejune Kim","Eunbyung Park","Simon S. Woo","Jong Hwan Ko"],"pdf_url":"https://arxiv.org/pdf/2402.18293v3.pdf","comment":"Project page: https://tae-mo.github.io/crad/"},{"id":"http://arxiv.org/abs/2407.16993v1","updated":"2024-07-24T04:27:03Z","published":"2024-07-24T04:27:03Z","title":"LoFormer: Local Frequency Transformer for Image Deblurring","summary":" Due to the computational complexity of self-attention (SA), prevalent\ntechniques for image deblurring often resort to either adopting localized SA or\nemploying coarse-grained global SA methods, both of which exhibit drawbacks\nsuch as compromising global modeling or lacking fine-grained correlation. 
In\norder to address this issue by effectively modeling long-range dependencies\nwithout sacrificing fine-grained details, we introduce a novel approach termed\nLocal Frequency Transformer (LoFormer). Within each unit of LoFormer, we\nincorporate a Local Channel-wise SA in the frequency domain (Freq-LC) to\nsimultaneously capture cross-covariance within low- and high-frequency local\nwindows. These operations offer the advantage of (1) ensuring equitable\nlearning opportunities for both coarse-grained structures and fine-grained\ndetails, and (2) exploring a broader range of representational properties\ncompared to coarse-grained global SA methods. Additionally, we introduce an MLP\nGating mechanism complementary to Freq-LC, which serves to filter out\nirrelevant features while enhancing global learning capabilities. Our\nexperiments demonstrate that LoFormer significantly improves performance in the\nimage deblurring task, achieving a PSNR of 34.09 dB on the GoPro dataset with\n126G FLOPs. https://github.com/DeepMed-Lab-ECNU/Single-Image-Deblur\n","authors":["Xintian Mao","Jiansheng Wang","Xingran Xie","Qingli Li","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16988v1","updated":"2024-07-24T04:13:43Z","published":"2024-07-24T04:13:43Z","title":"DreamCar: Leveraging Car-specific Prior for in-the-wild 3D Car\n Reconstruction","summary":" Self-driving industries usually employ professional artists to build\nexquisite 3D cars. However, it is expensive to craft large-scale digital\nassets. Since there are already numerous datasets available that contain a vast\nnumber of images of cars, we focus on reconstructing high-quality 3D car models\nfrom these datasets. However, these datasets only contain one side of cars in\nthe forward-moving scene. We try to use the existing generative models to\nprovide more supervision information, but they struggle to generalize well in\ncars since they are trained on synthetic datasets not car-specific. In\naddition, The reconstructed 3D car texture misaligns due to a large error in\ncamera pose estimation when dealing with in-the-wild images. These restrictions\nmake it challenging for previous methods to reconstruct complete 3D cars. To\naddress these problems, we propose a novel method, named DreamCar, which can\nreconstruct high-quality 3D cars given a few images even a single image. To\ngeneralize the generative model, we collect a car dataset, named Car360, with\nover 5,600 vehicles. With this dataset, we make the generative model more\nrobust to cars. We use this generative prior specific to the car to guide its\nreconstruction via Score Distillation Sampling. To further complement the\nsupervision information, we utilize the geometric and appearance symmetry of\ncars. Finally, we propose a pose optimization method that rectifies poses to\ntackle texture misalignment. Extensive experiments demonstrate that our method\nsignificantly outperforms existing methods in reconstructing high-quality 3D\ncars. 
\\href{https://xiaobiaodu.github.io/dreamcar-project/}{Our code is\navailable.}\n","authors":["Xiaobiao Du","Haiyang Sun","Ming Lu","Tianqing Zhu","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16988v1.pdf","comment":"Projet Page: https://xiaobiaodu.github.io/dreamcar-project/"},{"id":"http://arxiv.org/abs/2404.09512v2","updated":"2024-07-24T04:06:12Z","published":"2024-04-15T07:15:39Z","title":"Magic Clothing: Controllable Garment-Driven Image Synthesis","summary":" We propose Magic Clothing, a latent diffusion model (LDM)-based network\narchitecture for an unexplored garment-driven image synthesis task. Aiming at\ngenerating customized characters wearing the target garments with diverse text\nprompts, the image controllability is the most critical issue, i.e., to\npreserve the garment details and maintain faithfulness to the text prompts. To\nthis end, we introduce a garment extractor to capture the detailed garment\nfeatures, and employ self-attention fusion to incorporate them into the\npretrained LDMs, ensuring that the garment details remain unchanged on the\ntarget character. Then, we leverage the joint classifier-free guidance to\nbalance the control of garment features and text prompts over the generated\nresults. Meanwhile, the proposed garment extractor is a plug-in module\napplicable to various finetuned LDMs, and it can be combined with other\nextensions like ControlNet and IP-Adapter to enhance the diversity and\ncontrollability of the generated characters. Furthermore, we design\nMatched-Points-LPIPS (MP-LPIPS), a robust metric for evaluating the consistency\nof the target image to the source garment. Extensive experiments demonstrate\nthat our Magic Clothing achieves state-of-the-art results under various\nconditional controls for garment-driven image synthesis. Our source code is\navailable at https://github.com/ShineChen1024/MagicClothing.\n","authors":["Weifeng Chen","Tao Gu","Yuhao Xu","Chengcai Chen"],"pdf_url":"https://arxiv.org/pdf/2404.09512v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16982v1","updated":"2024-07-24T03:58:58Z","published":"2024-07-24T03:58:58Z","title":"Diffree: Text-Guided Shape Free Object Inpainting with Diffusion Model","summary":" This paper addresses an important problem of object addition for images with\nonly text guidance. It is challenging because the new object must be integrated\nseamlessly into the image with consistent visual context, such as lighting,\ntexture, and spatial location. While existing text-guided image inpainting\nmethods can add objects, they either fail to preserve the background\nconsistency or involve cumbersome human intervention in specifying bounding\nboxes or user-scribbled masks. To tackle this challenge, we introduce Diffree,\na Text-to-Image (T2I) model that facilitates text-guided object addition with\nonly text control. To this end, we curate OABench, an exquisite synthetic\ndataset by removing objects with advanced image inpainting techniques. OABench\ncomprises 74K real-world tuples of an original image, an inpainted image with\nthe object removed, an object mask, and object descriptions. Trained on OABench\nusing the Stable Diffusion model with an additional mask prediction module,\nDiffree uniquely predicts the position of the new object and achieves object\naddition with guidance from only text. 
Extensive experiments demonstrate that\nDiffree excels in adding new objects with a high success rate while maintaining\nbackground consistency, spatial appropriateness, and object relevance and\nquality.\n","authors":["Lirui Zhao","Tianshuo Yang","Wenqi Shao","Yuxin Zhang","Yu Qiao","Ping Luo","Kaipeng Zhang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.16982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16981v1","updated":"2024-07-24T03:58:07Z","published":"2024-07-24T03:58:07Z","title":"Case-Enhanced Vision Transformer: Improving Explanations of Image\n Similarity with a ViT-based Similarity Metric","summary":" This short paper presents preliminary research on the Case-Enhanced Vision\nTransformer (CEViT), a similarity measurement method aimed at improving the\nexplainability of similarity assessments for image data. Initial experimental\nresults suggest that integrating CEViT into k-Nearest Neighbor (k-NN)\nclassification yields classification accuracy comparable to state-of-the-art\ncomputer vision models, while adding capabilities for illustrating differences\nbetween classes. CEViT explanations can be influenced by prior cases, to\nillustrate aspects of similarity relevant to those cases.\n","authors":["Ziwei Zhao","David Leake","Xiaomeng Ye","David Crandall"],"pdf_url":"https://arxiv.org/pdf/2407.16981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16977v1","updated":"2024-07-24T03:45:35Z","published":"2024-07-24T03:45:35Z","title":"Selective Vision-Language Subspace Projection for Few-shot CLIP","summary":" Vision-language models such as CLIP are capable of mapping the different\nmodality data into a unified feature space, enabling zero/few-shot inference by\nmeasuring the similarity of given images and texts. However, most existing\nmethods overlook modality gaps in CLIP's encoded features, which is shown as\nthe text and image features lie far apart from each other, resulting in limited\nclassification performance. To tackle this issue, we introduce a method called\nSelective Vision-Language Subspace Projection (SSP), which incorporates local\nimage features and utilizes them as a bridge to enhance the alignment between\nimage-text pairs. Specifically, our SSP framework comprises two parallel\nmodules: a vision projector and a language projector. Both projectors utilize\nlocal image features to span the respective subspaces for image and texts,\nthereby projecting the image and text features into their respective subspaces\nto achieve alignment. Moreover, our approach entails only training-free matrix\ncalculations and can be seamlessly integrated into advanced CLIP-based few-shot\nlearning frameworks. Extensive experiments on 11 datasets have demonstrated\nSSP's superior text-image alignment capabilities, outperforming the\nstate-of-the-art alignment methods. 
The code is available at\nhttps://github.com/zhuhsingyuu/SSP\n","authors":["Xingyu Zhu","Beier Zhu","Yi Tan","Shuo Wang","Yanbin Hao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16977v1.pdf","comment":"Accepted to ACM MultiMedia 2024"},{"id":"http://arxiv.org/abs/2407.16962v1","updated":"2024-07-24T03:01:55Z","published":"2024-07-24T03:01:55Z","title":"Toward an Integrated Decision Making Framework for Optimized Stroke\n Diagnosis with DSA and Treatment under Uncertainty","summary":" This study addresses the challenge of stroke diagnosis and treatment under\nuncertainty, a critical issue given the rapid progression and severe\nconsequences of stroke conditions such as aneurysms, arteriovenous\nmalformations (AVM), and occlusions. Current diagnostic methods, including\nDigital Subtraction Angiography (DSA), face limitations due to high costs and\ntheir invasive nature. To overcome these challenges, we propose a novel approach\nusing a Partially Observable Markov Decision Process (POMDP) framework. Our\nmodel integrates advanced diagnostic tools and treatment approaches with a\ndecision-making algorithm that accounts for the inherent uncertainties in\nstroke diagnosis. Our approach combines noisy observations from CT scans,\nSiriraj scores, and DSA reports to inform the subsequent treatment options. We\nutilize the online solver DESPOT, which employs tree-search methods and\nparticle filters, to simulate potential future scenarios and guide our\nstrategies. The results indicate that our POMDP framework balances diagnostic\nand treatment objectives, striking a tradeoff between the need for precise\nstroke identification via invasive procedures like DSA and the constraints of\nlimited healthcare resources that necessitate more cost-effective strategies,\nsuch as in-hospital or at-home observation, by relying only on\nsimulation rollouts and not imposing any prior knowledge. Our study offers a\nsignificant contribution by presenting a systematic framework that optimally\nintegrates diagnostic and treatment processes for stroke and accounts for\nvarious uncertainties, thereby improving care and outcomes in stroke\nmanagement.\n","authors":["Nur Ahmad Khatim","Ahmad Azmul Asmar Irfan","Amaliya Mata'ul Hayah","Mansur M. Arief"],"pdf_url":"https://arxiv.org/pdf/2407.16962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16961v1","updated":"2024-07-24T03:00:53Z","published":"2024-07-24T03:00:53Z","title":"Pose Estimation from Camera Images for Underwater Inspection","summary":" High-precision localization is pivotal in underwater reinspection missions.\nTraditional localization methods like inertial navigation systems, Doppler\nvelocity loggers, and acoustic positioning face significant challenges and are\nnot cost-effective for some applications. Visual localization is a\ncost-effective alternative in such cases, leveraging the cameras already\nequipped on inspection vehicles to estimate poses from images of the\nsurrounding scene. Amongst these, machine learning-based pose estimation from\nimages shows promise in underwater environments, performing efficient\nrelocalization using models trained based on previously mapped scenes. We\nexplore the efficacy of learning-based pose estimators in both clear and turbid\nwater inspection missions, assessing the impact of image formats, model\narchitectures and training data diversity. 
We innovate by employing novel view\nsynthesis models to generate augmented training data, significantly enhancing\npose estimation in unexplored regions. Moreover, we enhance localization\naccuracy by integrating pose estimator outputs with sensor data via an extended\nKalman filter, demonstrating improved trajectory smoothness and accuracy.\n","authors":["Luyuan Peng","Hari Vishnu","Mandar Chitre","Yuen Min Too","Bharath Kalyan","Rajat Mishra","Soo Pieng Tan"],"pdf_url":"https://arxiv.org/pdf/2407.16961v1.pdf","comment":"Submitted to IEEE Journal of Oceanic Engineering"},{"id":"http://arxiv.org/abs/2406.11271v2","updated":"2024-07-24T02:59:40Z","published":"2024-06-17T07:21:36Z","title":"MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal\n Dataset with One Trillion Tokens","summary":" Multimodal interleaved datasets featuring free-form interleaved sequences of\nimages and text are crucial for training frontier large multimodal models\n(LMMs). Despite the rapid progression of open-source LMMs, there remains a\npronounced scarcity of large-scale, diverse open-source multimodal interleaved\ndatasets. In response, we introduce MINT-1T, the most extensive and diverse\nopen-source Multimodal INTerleaved dataset to date. MINT-1T comprises one\ntrillion text tokens and 3.4 billion images, a 10x scale-up from existing\nopen-source datasets. Additionally, we include previously untapped sources such\nas PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires\nsubstantial engineering effort, sharing the data curation process and releasing\nthe dataset greatly benefits the community. Our experiments show that LMMs\ntrained on MINT-1T rival the performance of models trained on the previous\nleading dataset, OBELICS. Our data and code will be released at\nhttps://github.com/mlfoundations/MINT-1T.\n","authors":["Anas Awadalla","Le Xue","Oscar Lo","Manli Shu","Hannah Lee","Etash Kumar Guha","Matt Jordan","Sheng Shen","Mohamed Awadalla","Silvio Savarese","Caiming Xiong","Ran Xu","Yejin Choi","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2406.11271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07720v2","updated":"2024-07-24T02:55:17Z","published":"2024-07-10T14:53:37Z","title":"SvANet: A Scale-variant Attention-based Network for Small Medical Object\n Segmentation","summary":" Early detection and accurate diagnosis can predict the risk of malignant\ndisease transformation, thereby increasing the probability of effective\ntreatment. A mild syndrome with small infected regions is an ominous warning\nand is foremost in the early diagnosis of diseases. Deep learning algorithms,\nsuch as convolutional neural networks (CNNs), have been used to segment natural\nor medical objects, showing promising results. However, analyzing medical\nobjects of small areas in images remains a challenge due to information losses\nand compression defects caused by convolution and pooling operations in CNNs.\nThese losses and defects become increasingly significant as the network\ndeepens, particularly for small medical objects. To address these challenges,\nwe propose a novel scale-variant attention-based network (SvANet) for accurate\nsmall-scale object segmentation in medical images. The SvANet consists of Monte\nCarlo attention, scale-variant attention, and vision transformer, which\nincorporates cross-scale features and alleviates compression artifacts for\nenhancing the discrimination of small medical objects. 
Quantitative\nexperimental results demonstrate the superior performance of SvANet, achieving\n96.12%, 96.11%, 89.79%, 84.15%, 80.25%, 73.05%, and 72.58% in mean Dice\ncoefficient for segmenting kidney tumors, skin lesions, hepatic tumors, polyps,\nsurgical excision cells, retinal vasculatures, and sperms, which occupy less\nthan 1% of the image areas in KiTS23, ISIC 2018, ATLAS, PolypGen, TissueNet,\nFIVES, and SpermHealth datasets, respectively.\n","authors":["Wei Dai"],"pdf_url":"https://arxiv.org/pdf/2407.07720v2.pdf","comment":"14 pages, 9 figures, under review"},{"id":"http://arxiv.org/abs/2402.16479v2","updated":"2024-07-24T02:53:00Z","published":"2024-02-26T10:54:26Z","title":"Edge Detectors Can Make Deep Convolutional Neural Networks More Robust","summary":" Deep convolutional neural networks (DCNN for short) are vulnerable to\nexamples with small perturbations. Improving DCNN's robustness is of great\nsignificance to the safety-critical applications, such as autonomous driving\nand industry automation. Inspired by the principal way that human eyes\nrecognize objects, i.e., largely relying on the shape features, this paper\nfirst employs the edge detectors as layer kernels and designs a binary edge\nfeature branch (BEFB for short) to learn the binary edge features, which can be\neasily integrated into any popular backbone. The four edge detectors can learn\nthe horizontal, vertical, positive diagonal, and negative diagonal edge\nfeatures, respectively, and the branch is stacked by multiple Sobel layers\n(using edge detectors as kernels) and one threshold layer. The binary edge\nfeatures learned by the branch, concatenated with the texture features learned\nby the backbone, are fed into the fully connected layers for classification. We\nintegrate the proposed branch into VGG16 and ResNet34, respectively, and\nconduct experiments on multiple datasets. Experimental results demonstrate the\nBEFB is lightweight and has no side effects on training. And the accuracy of\nthe BEFB integrated models is better than the original ones on all datasets\nwhen facing FGSM, PGD, and C\\&W attacks. Besides, BEFB integrated models\nequipped with the robustness enhancing techniques can achieve better\nclassification accuracy compared to the original models. The work in this paper\nfor the first time shows it is feasible to enhance the robustness of DCNNs\nthrough combining both shape-like features and texture features.\n","authors":["Jin Ding","Jie-Chao Zhao","Yong-Zhi Sun","Ping Tan","Jia-Wei Wang","Ji-En Ma","You-Tong Fang"],"pdf_url":"https://arxiv.org/pdf/2402.16479v2.pdf","comment":"26 pages, 18 figures, 7 tables"},{"id":"http://arxiv.org/abs/2407.16957v1","updated":"2024-07-24T02:48:30Z","published":"2024-07-24T02:48:30Z","title":"Raindrop Clarity: A Dual-Focused Dataset for Day and Night Raindrop\n Removal","summary":" Existing raindrop removal datasets have two shortcomings. First, they consist\nof images captured by cameras with a focus on the background, leading to the\npresence of blurry raindrops. To our knowledge, none of these datasets include\nimages where the focus is specifically on raindrops, which results in a blurry\nbackground. Second, these datasets predominantly consist of daytime images,\nthereby lacking nighttime raindrop scenarios. Consequently, algorithms trained\non these datasets may struggle to perform effectively in raindrop-focused or\nnighttime scenarios. 
The absence of datasets specifically designed for\nraindrop-focused and nighttime raindrops constrains research in this area. In\nthis paper, we introduce a large-scale, real-world raindrop removal dataset\ncalled Raindrop Clarity. Raindrop Clarity comprises 15,186 high-quality\npairs/triplets (raindrops, blur, and background) of images with raindrops and\nthe corresponding clear background images. There are 5,442 daytime raindrop\nimages and 9,744 nighttime raindrop images. Specifically, the 5,442 daytime\nimages include 3,606 raindrop- and 1,836 background-focused images. While the\n9,744 nighttime images contain 4,838 raindrop- and 4,906 background-focused\nimages. Our dataset will enable the community to explore background-focused and\nraindrop-focused images, including challenges unique to daytime and nighttime\nconditions. Our data and code are available at:\n\\url{https://github.com/jinyeying/RaindropClarity}\n","authors":["Yeying Jin","Xin Li","Jiadong Wang","Yan Zhang","Malu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16957v1.pdf","comment":"Accepted to ECCV2024, dataset and benchmark at:\n \\url{https://github.com/jinyeying/RaindropClarity}"},{"id":"http://arxiv.org/abs/2407.11219v2","updated":"2024-07-24T02:45:37Z","published":"2024-07-15T20:07:45Z","title":"TLRN: Temporal Latent Residual Networks For Large Deformation Image\n Registration","summary":" This paper presents a novel approach, termed {\\em Temporal Latent Residual\nNetwork (TLRN)}, to predict a sequence of deformation fields in time-series\nimage registration. The challenge of registering time-series images often lies\nin the occurrence of large motions, especially when images differ significantly\nfrom a reference (e.g., the start of a cardiac cycle compared to the peak\nstretching phase). To achieve accurate and robust registration results, we\nleverage the nature of motion continuity and exploit the temporal smoothness in\nconsecutive image frames. Our proposed TLRN highlights a temporal residual\nnetwork with residual blocks carefully designed in latent deformation spaces,\nwhich are parameterized by time-sequential initial velocity fields. We treat a\nsequence of residual blocks over time as a dynamic training system, where each\nblock is designed to learn the residual function between desired deformation\nfeatures and current input accumulated from previous time frames. We validate\nthe effectivenss of TLRN on both synthetic data and real-world cine cardiac\nmagnetic resonance (CMR) image videos. Our experimental results shows that TLRN\nis able to achieve substantially improved registration accuracy compared to the\nstate-of-the-art. Our code is publicly available at\nhttps://github.com/nellie689/TLRN.\n","authors":["Nian Wu","Jiarui Xing","Miaomiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.11219v2.pdf","comment":"10 pages. Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.16955v1","updated":"2024-07-24T02:44:41Z","published":"2024-07-24T02:44:41Z","title":"DVPE: Divided View Position Embedding for Multi-View 3D Object Detection","summary":" Sparse query-based paradigms have achieved significant success in multi-view\n3D detection for autonomous vehicles. Current research faces challenges in\nbalancing between enlarging receptive fields and reducing interference when\naggregating multi-view features. Moreover, different poses of cameras present\nchallenges in training global attention models. 
To address these problems, this\npaper proposes a divided view method, in which features are modeled globally\nvia the visibility cross-attention mechanism, but interact only with partial\nfeatures in a divided local virtual space. This effectively reduces\ninterference from other irrelevant features and alleviates the training\ndifficulties of the transformer by decoupling the position embedding from\ncamera poses. Additionally, 2D historical RoI features are incorporated into\nthe object-centric temporal modeling to utilize high-level visual semantic\ninformation. The model is trained using a one-to-many assignment strategy to\nfacilitate stability. Our framework, named DVPE, achieves state-of-the-art\nperformance (57.2% mAP and 64.5% NDS) on the nuScenes test set. Codes will be\navailable at https://github.com/dop0/DVPE.\n","authors":["Jiasen Wang","Zhenglin Li","Ke Sun","Xianyuan Liu","Yang Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.16955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16953v1","updated":"2024-07-24T02:41:19Z","published":"2024-07-24T02:41:19Z","title":"Open Challenges on Fairness of Artificial Intelligence in Medical\n Imaging Applications","summary":" Recently, the research community of computerized medical imaging has started\nto discuss and address potential fairness issues that may emerge when\ndeveloping and deploying AI systems for medical image analysis. This chapter\ncovers some of the pressing challenges encountered when doing research in this\narea, and it is intended to raise questions and provide food for thought for\nthose aiming to enter this research field. The chapter first discusses various\nsources of bias, including data collection, model training, and clinical\ndeployment, and their impact on the fairness of machine learning algorithms in\nmedical image computing. We then turn to discussing open challenges that we\nbelieve require attention from researchers and practitioners, as well as\npotential pitfalls of naive application of common methods in the field. We\ncover a variety of topics including the impact of biased metrics when auditing\nfor fairness, the leveling down effect, task difficulty variations among\nsubgroups, discovering biases in unseen populations, and explaining biases\nbeyond standard demographic attributes.\n","authors":["Enzo Ferrante","Rodrigo Echeveste"],"pdf_url":"https://arxiv.org/pdf/2407.16953v1.pdf","comment":"Published as part of the book \"Trustworthy AI in Medical Imaging\"\n (Elsevier, 2024) available at\n https://shop.elsevier.com/books/trustworthy-ai-in-medical-imaging/lorenzi/978-0-443-23761-4"},{"id":"http://arxiv.org/abs/2405.02363v2","updated":"2024-07-24T02:36:07Z","published":"2024-05-03T05:09:54Z","title":"LLM as Dataset Analyst: Subpopulation Structure Discovery with Large\n Language Model","summary":" The distribution of subpopulations is an important property hidden within a\ndataset. Uncovering and analyzing the subpopulation distribution within\ndatasets provides a comprehensive understanding of the datasets, standing as a\npowerful tool beneficial to various downstream tasks, including Dataset\nSubpopulation Organization, Subpopulation Shift, and Slice Discovery. Despite\nits importance, there has been no work that systematically explores the\nsubpopulation distribution of datasets to our knowledge. 
To address the\nlimitation and solve all the mentioned tasks in a unified way, we introduce a\nnovel concept of subpopulation structures to represent, analyze, and utilize\nsubpopulation distributions within datasets. To characterize the structures in\nan interpretable manner, we propose the Subpopulation Structure Discovery with\nLarge Language Models (SSD-LLM) framework, which employs world knowledge and\ninstruction-following capabilities of Large Language Models (LLMs) to\nlinguistically analyze informative image captions and summarize the structures.\nFurthermore, we propose complete workflows to address downstream tasks, named\nTask-specific Tuning, showcasing the application of the discovered structure to\na spectrum of subpopulation-related tasks, including dataset subpopulation\norganization, subpopulation shift, and slice discovery.\n","authors":["Yulin Luo","Ruichuan An","Bocheng Zou","Yiming Tang","Jiaming Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.02363v2.pdf","comment":"ECCV24 Camera Ready"},{"id":"http://arxiv.org/abs/2407.16945v1","updated":"2024-07-24T02:24:21Z","published":"2024-07-24T02:24:21Z","title":"Affective Behaviour Analysis via Progressive Learning","summary":" Affective Behavior Analysis aims to develop emotionally intelligent\ntechnology that can recognize and respond to human emotions. To advance this,\nthe 7th Affective Behavior Analysis in-the-wild (ABAW) competition establishes\ntwo tracks: i.e., the Multi-task Learning (MTL) Challenge and the Compound\nExpression (CE) challenge based on Aff-Wild2 and C-EXPR-DB datasets. In this\npaper, we present our methods and experimental results for the two competition\ntracks. Specifically, it can be summarized in the following four aspects: 1) To\nattain high-quality facial features, we train a Masked-Auto Encoder in a\nself-supervised manner. 2) We devise a temporal convergence module to capture\nthe temporal information between video frames and explore the impact of window\nsize and sequence length on each sub-task. 3) To facilitate the joint\noptimization of various sub-tasks, we explore the impact of sub-task joint\ntraining and feature fusion from individual tasks on each task performance\nimprovement. 4) We utilize curriculum learning to transition the model from\nrecognizing single expressions to recognizing compound expressions, thereby\nimproving the accuracy of compound expression recognition. Extensive\nexperiments demonstrate the superiority of our designs.\n","authors":["Chen Liu","Wei Zhang","Feng Qiu","Lincheng Li","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16945v1.pdf","comment":"Technical Report for 7th ABAW Competition"},{"id":"http://arxiv.org/abs/2407.16943v1","updated":"2024-07-24T02:23:02Z","published":"2024-07-24T02:23:02Z","title":"McGAN: Generating Manufacturable Designs by Embedding Manufacturing\n Rules into Conditional Generative Adversarial Network","summary":" Generative design (GD) methods aim to automatically generate a wide variety\nof designs that satisfy functional or aesthetic design requirements. However,\nresearch to date generally lacks considerations of manufacturability of the\ngenerated designs. 
To this end, we propose a novel GD approach by using deep\nneural networks to encode design for manufacturing (DFM) rules, thereby\nmodifying part designs to make them manufacturable by a given manufacturing\nprocess. Specifically, a three-step approach is proposed: first, an instance\nsegmentation method, Mask R-CNN, is used to decompose a part design into\nsubregions. Second, a conditional generative adversarial neural network (cGAN),\nPix2Pix, transforms unmanufacturable decomposed subregions into manufacturable\nsubregions. The transformed subregions of designs are subsequently reintegrated\ninto a unified manufacturable design. These three steps, Mask-RCNN, Pix2Pix,\nand reintegration, form the basis of the proposed Manufacturable conditional\nGAN (McGAN) framework. Experimental results show that McGAN can transform\nexisting unmanufacturable designs to generate their corresponding\nmanufacturable counterparts automatically that realize the specified\nmanufacturing rules in an efficient and robust manner. The effectiveness of\nMcGAN is demonstrated through two-dimensional design case studies of an\ninjection molding process.\n","authors":["Zhichao Wang","Xiaoliang Yan","Shreyes Melkote","David Rosen"],"pdf_url":"https://arxiv.org/pdf/2407.16943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17086v2","updated":"2024-07-24T01:41:01Z","published":"2023-11-28T02:31:52Z","title":"PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation\n in non-English Text-to-Image Generation","summary":" Text-to-image diffusion models are well-known for their ability to generate\nrealistic images based on textual prompts. However, the existing works have\npredominantly focused on English, lacking support for non-English text-to-image\nmodels. The most commonly used translation methods cannot solve the generation\nproblem related to language culture, while training from scratch on a specific\nlanguage dataset is prohibitively expensive. In this paper, we are inspired to\npropose a simple plug-and-play language transfer method based on knowledge\ndistillation. All we need to do is train a lightweight MLP-like\nparameter-efficient adapter (PEA) with only 6M parameters under teacher\nknowledge distillation along with a small parallel data corpus. We are\nsurprised to find that freezing the parameters of UNet can still achieve\nremarkable performance on the language-specific prompt evaluation set,\ndemonstrating that PEA can stimulate the potential generation ability of the\noriginal UNet. Additionally, it closely approaches the performance of the\nEnglish text-to-image model on a general prompt evaluation set. Furthermore,\nour adapter can be used as a plugin to achieve significant results in\ndownstream tasks in cross-lingual text-to-image generation. Code will be\navailable at: https://github.com/OPPO-Mente-Lab/PEA-Diffusion\n","authors":["Jian Ma","Chen Chen","Qingsong Xie","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2311.17086v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.16921v1","updated":"2024-07-24T01:11:28Z","published":"2024-07-24T01:11:28Z","title":"SAR to Optical Image Translation with Color Supervised Diffusion Model","summary":" Synthetic Aperture Radar (SAR) offers all-weather, high-resolution imaging\ncapabilities, but its complex imaging mechanism often poses challenges for\ninterpretation. 
In response to these limitations, this paper introduces an\ninnovative generative model designed to transform SAR images into more\nintelligible optical images, thereby enhancing the interpretability of SAR\nimages. Specifically, our model backbone is based on the recent diffusion\nmodels, which have powerful generative capabilities. We employ SAR images as\nconditional guides in the sampling process and integrate color supervision to\ncounteract color shift issues effectively. We conducted experiments on the\nSEN12 dataset and employed quantitative evaluations using peak signal-to-noise\nratio, structural similarity, and fr\\'echet inception distance. The results\ndemonstrate that our model not only surpasses previous methods in quantitative\nassessments but also significantly enhances the visual quality of the generated\nimages.\n","authors":["Xinyu Bai","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2407.16921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16552v2","updated":"2024-07-24T01:09:36Z","published":"2024-07-23T15:05:55Z","title":"MicroEmo: Time-Sensitive Multimodal Emotion Recognition with\n Micro-Expression Dynamics in Video Dialogues","summary":" Multimodal Large Language Models (MLLMs) have demonstrated remarkable\nmultimodal emotion recognition capabilities, integrating multimodal cues from\nvisual, acoustic, and linguistic contexts in the video to recognize human\nemotional states. However, existing methods ignore capturing local facial\nfeatures of temporal dynamics of micro-expressions and do not leverage the\ncontextual dependencies of the utterance-aware temporal segments in the video,\nthereby limiting their expected effectiveness to a certain extent. In this\nwork, we propose MicroEmo, a time-sensitive MLLM aimed at directing attention\nto the local facial micro-expression dynamics and the contextual dependencies\nof utterance-aware video clips. Our model incorporates two key architectural\ncontributions: (1) a global-local attention visual encoder that integrates\nglobal frame-level timestamp-bound image features with local facial features of\ntemporal dynamics of micro-expressions; (2) an utterance-aware video Q-Former\nthat captures multi-scale and contextual dependencies by generating visual\ntoken sequences for each utterance segment and for the entire video then\ncombining them. Preliminary qualitative experiments demonstrate that in a new\nExplainable Multimodal Emotion Recognition (EMER) task that exploits\nmulti-modal and multi-faceted clues to predict emotions in an open-vocabulary\n(OV) manner, MicroEmo demonstrates its effectiveness compared with the latest\nmethods.\n","authors":["Liyun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02508v4","updated":"2024-07-24T00:53:26Z","published":"2024-05-03T22:42:00Z","title":"Rasterized Edge Gradients: Handling Discontinuities Differentiably","summary":" Computing the gradients of a rendering process is paramount for diverse\napplications in computer vision and graphics. However, accurate computation of\nthese gradients is challenging due to discontinuities and rendering\napproximations, particularly for surface-based representations and\nrasterization-based rendering. We present a novel method for computing\ngradients at visibility discontinuities for rasterization-based differentiable\nrenderers. 
Our method elegantly simplifies the traditionally complex problem\nthrough a carefully designed approximation strategy, allowing for a\nstraightforward, effective, and performant solution. We introduce a novel\nconcept of micro-edges, which allows us to treat the rasterized images as\noutcomes of a differentiable, continuous process aligned with the inherently\nnon-differentiable, discrete-pixel rasterization. This technique eliminates the\nnecessity for rendering approximations or other modifications to the forward\npass, preserving the integrity of the rendered image, which makes it applicable\nto rasterized masks, depth, and normals images where filtering is prohibitive.\nUtilizing micro-edges simplifies gradient interpretation at discontinuities and\nenables handling of geometry intersections, offering an advantage over the\nprior art. We showcase our method in dynamic human head scene reconstruction,\ndemonstrating effective handling of camera images and segmentation masks.\n","authors":["Stanislav Pidhorskyi","Tomas Simon","Gabriel Schwartz","He Wen","Yaser Sheikh","Jason Saragih"],"pdf_url":"https://arxiv.org/pdf/2405.02508v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16369v2","updated":"2024-07-24T00:49:00Z","published":"2024-07-23T10:34:02Z","title":"FCNR: Fast Compressive Neural Representation of Visualization Images","summary":" We present FCNR, a fast compressive neural representation for tens of\nthousands of visualization images under varying viewpoints and timesteps. The\nexisting NeRVI solution, albeit enjoying a high compression ratio, incurs slow\nspeeds in encoding and decoding. Built on the recent advances in stereo image\ncompression, FCNR assimilates stereo context modules and joint context transfer\nmodules to compress image pairs. Our solution significantly improves encoding\nand decoding speed while maintaining high reconstruction quality and satisfying\ncompression ratio. To demonstrate its effectiveness, we compare FCNR with\nstate-of-the-art neural compression methods, including E-NeRV, HNeRV, NeRVI,\nand ECSIC. The source code can be found at\nhttps://github.com/YunfeiLu0112/FCNR.\n","authors":["Yunfei Lu","Pengfei Gu","Chaoli Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17673v1","updated":"2024-07-24T23:39:10Z","published":"2024-07-24T23:39:10Z","title":"CRASAR-U-DROIDs: A Large Scale Benchmark Dataset for Building Alignment\n and Damage Assessment in Georectified sUAS Imagery","summary":" This document presents the Center for Robot Assisted Search And Rescue -\nUncrewed Aerial Systems - Disaster Response Overhead Inspection Dataset\n(CRASAR-U-DROIDs) for building damage assessment and spatial alignment\ncollected from small uncrewed aerial systems (sUAS) geospatial imagery. This\ndataset is motivated by the increasing use of sUAS in disaster response and the\nlack of previous work in utilizing high-resolution geospatial sUAS imagery for\nmachine learning and computer vision models, the lack of alignment with\noperational use cases, and with hopes of enabling further investigations\nbetween sUAS and satellite imagery. 
The CRASAR-U-DROIDs dataset consists of\nfifty-two (52) orthomosaics from ten (10) federally declared disasters\n(Hurricane Ian, Hurricane Ida, Hurricane Harvey, Hurricane Idalia, Hurricane\nLaura, Hurricane Michael, Musset Bayou Fire, Mayfield Tornado, Kilauea\nEruption, and Champlain Towers Collapse) spanning 67.98 square kilometers\n(26.245 square miles), containing 21,716 building polygons and damage labels,\nand 7,880 adjustment annotations. The imagery was tiled and presented in\nconjunction with overlaid building polygons to a pool of 130 annotators who\nprovided human judgments of damage according to the Joint Damage Scale. These\nannotations were then reviewed via a two-stage review process in which building\npolygon damage labels were first reviewed individually and then again by\ncommittee. Additionally, the building polygons have been aligned spatially to\nprecisely overlap with the imagery to enable more performant machine learning\nmodels to be trained. It appears that CRASAR-U-DROIDs is the largest labeled\ndataset of sUAS orthomosaic imagery.\n","authors":["Thomas Manzini","Priyankari Perali","Raisa Karnik","Robin Murphy"],"pdf_url":"https://arxiv.org/pdf/2407.17673v1.pdf","comment":"16 Pages, 7 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2407.17671v1","updated":"2024-07-24T23:23:38Z","published":"2024-07-24T23:23:38Z","title":"Unsqueeze [CLS] Bottleneck to Learn Rich Representations","summary":" Distillation-based self-supervised learning typically leads to more\ncompressed representations due to its radical clustering process and the\nimplementation of a sharper target distribution. To overcome this limitation\nand preserve more information from input, we introduce UDI, conceptualized as\nUnsqueezed Distillation-based self-supervised learning (SSL). UDI enriches the\nlearned representation by encouraging multimodal prediction distilled from a\nconsolidated profile of local predictions that are derived via stratified\nsampling. Our evaluations show that UDI not only promotes semantically\nmeaningful representations at instance level, delivering superior or\ncompetitive results to state-of-the-art SSL methods in image classification,\nbut also effectively preserves the nuisance of input, which yields significant\nimprovement in dense prediction tasks, including object detection and\nsegmentation. Additionally, UDI performs competitively in low-shot image\nclassification, improving the scalability of joint-embedding pipelines. Various\nvisualizations and ablation studies are presented to further elucidate the\nmechanisms behind UDI. Our source code is available at\nhttps://github.com/ISL-CV/udi.\n","authors":["Qing Su","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2407.17671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03411v2","updated":"2024-07-24T23:03:01Z","published":"2024-06-05T16:09:01Z","title":"Interactive Text-to-Image Retrieval with Large Language Models: A\n Plug-and-Play Approach","summary":" In this paper, we primarily address the issue of dialogue-form context query\nwithin the interactive text-to-image retrieval task. Our methodology, PlugIR,\nactively utilizes the general instruction-following capability of LLMs in two\nways. First, by reformulating the dialogue-form context, we eliminate the\nnecessity of fine-tuning a retrieval model on existing visual dialogue data,\nthereby enabling the use of any arbitrary black-box model. 
Second, we construct\nthe LLM questioner to generate non-redundant questions about the attributes of\nthe target image, based on the information of retrieval candidate images in the\ncurrent context. This approach mitigates the issues of noisiness and redundancy\nin the generated questions. Beyond our methodology, we propose a novel\nevaluation metric, Best log Rank Integral (BRI), for a comprehensive assessment\nof the interactive retrieval system. PlugIR demonstrates superior performance\ncompared to both zero-shot and fine-tuned baselines in various benchmarks.\nAdditionally, the two methodologies comprising PlugIR can be flexibly applied\ntogether or separately in various situations. Our codes are available at\nhttps://github.com/Saehyung-Lee/PlugIR.\n","authors":["Saehyung Lee","Sangwon Yu","Junsung Park","Jihun Yi","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2406.03411v2.pdf","comment":"ACL 2024 Oral"},{"id":"http://arxiv.org/abs/2312.13317v2","updated":"2024-07-24T22:46:33Z","published":"2023-12-20T11:02:59Z","title":"Deep Hybrid Camera Deblurring for Smartphone Cameras","summary":" Mobile cameras, despite their significant advancements, still have difficulty\nin low-light imaging due to compact sensors and lenses, leading to longer\nexposures and motion blur. Traditional blind deconvolution methods and\nlearning-based deblurring methods can be potential solutions to remove blur.\nHowever, achieving practical performance still remains a challenge. To address\nthis, we propose a learning-based deblurring framework for smartphones,\nutilizing wide and ultra-wide cameras as a hybrid camera system. We\nsimultaneously capture a long-exposure wide image and short-exposure burst\nultra-wide images, and utilize the burst images to deblur the wide image. To\nfully exploit burst ultra-wide images, we present HCDeblur, a practical\ndeblurring framework that includes novel deblurring networks, HC-DNet and\nHC-FNet. HC-DNet utilizes motion information extracted from burst images to\ndeblur a wide image, and HC-FNet leverages burst images as reference images to\nfurther enhance a deblurred output. For training and evaluating the proposed\nmethod, we introduce the HCBlur dataset, which consists of synthetic and\nreal-world datasets. Our experiments demonstrate that HCDeblur achieves\nstate-of-the-art deblurring quality. Code and datasets are available at\nhttps://cg.postech.ac.kr/research/HCDeblur.\n","authors":["Jaesung Rim","Junyong Lee","Heemin Yang","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2312.13317v2.pdf","comment":"SIGGRAPH 2024, Project page:\n http://cg.postech.ac.kr/research/HCDeblur"},{"id":"http://arxiv.org/abs/2407.17664v1","updated":"2024-07-24T22:21:35Z","published":"2024-07-24T22:21:35Z","title":"SDLNet: Statistical Deep Learning Network for Co-Occurring Object\n Detection and Identification","summary":" With the growing advances in deep learning based technologies the detection\nand identification of co-occurring objects is a challenging task which has many\napplications in areas such as, security and surveillance. In this paper, we\npropose a novel framework called SDLNet- Statistical analysis with Deep\nLearning Network that identifies co-occurring objects in conjunction with base\nobjects in multilabel object categories. The pipeline of proposed work is\nimplemented in two stages: in the first stage of SDLNet we deal with multilabel\ndetectors for discovering labels, and in the second stage we perform\nco-occurrence matrix analysis. 
In co-occurrence matrix analysis, we learn\nco-occurrence statistics by setting base classes and frequently occurring\nclasses, following this we build association rules and generate frequent\npatterns. The crucial part of SDLNet is recognizing base classes and making\nconsideration for co-occurring classes. Finally, the generated co-occurrence\nmatrix based on frequent patterns will show base classes and their\ncorresponding co-occurring classes. SDLNet is evaluated on two publicly\navailable datasets: Pascal VOC and MS-COCO. The experimental results on these\nbenchmark datasets are reported in Sec 4.\n","authors":["Binay Kumar Singh","Niels Da Vitoria Lobo"],"pdf_url":"https://arxiv.org/pdf/2407.17664v1.pdf","comment":"8 pages, 3 figures, ICMLT-2024. arXiv admin note: text overlap with\n arXiv:2403.17223"},{"id":"http://arxiv.org/abs/2402.14566v2","updated":"2024-07-24T21:52:29Z","published":"2024-02-22T14:04:41Z","title":"Self-supervised Visualisation of Medical Image Datasets","summary":" Self-supervised learning methods based on data augmentations, such as SimCLR,\nBYOL, or DINO, allow obtaining semantically meaningful representations of image\ndatasets and are widely used prior to supervised fine-tuning. A recent\nself-supervised learning method, $t$-SimCNE, uses contrastive learning to\ndirectly train a 2D representation suitable for visualisation. When applied to\nnatural image datasets, $t$-SimCNE yields 2D visualisations with semantically\nmeaningful clusters. In this work, we used $t$-SimCNE to visualise medical\nimage datasets, including examples from dermatology, histology, and blood\nmicroscopy. We found that increasing the set of data augmentations to include\narbitrary rotations improved the results in terms of class separability,\ncompared to data augmentations used for natural images. Our 2D representations\nshow medically relevant structures and can be used to aid data exploration and\nannotation, improving on common approaches for data visualisation.\n","authors":["Ifeoma Veronica Nwabufo","Jan Niklas Böhm","Philipp Berens","Dmitry Kobak"],"pdf_url":"https://arxiv.org/pdf/2402.14566v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07673v3","updated":"2024-07-24T20:39:47Z","published":"2024-07-10T14:00:19Z","title":"Towards Adaptive Pseudo-label Learning for Semi-Supervised Temporal\n Action Localization","summary":" Alleviating noisy pseudo labels remains a key challenge in Semi-Supervised\nTemporal Action Localization (SS-TAL). Existing methods often filter pseudo\nlabels based on strict conditions, but they typically assess classification and\nlocalization quality separately, leading to suboptimal pseudo-label ranking and\nselection. In particular, there might be inaccurate pseudo labels within\nselected positives, alongside reliable counterparts erroneously assigned to\nnegatives. To tackle these problems, we propose a novel Adaptive Pseudo-label\nLearning (APL) framework to facilitate better pseudo-label selection.\nSpecifically, to improve the ranking quality, Adaptive Label Quality Assessment\n(ALQA) is proposed to jointly learn classification confidence and localization\nreliability, followed by dynamically selecting pseudo labels based on the joint\nscore. Additionally, we propose an Instance-level Consistency Discriminator\n(ICD) for eliminating ambiguous positives and mining potential positives\nsimultaneously based on inter-instance intrinsic consistency, thereby leading\nto a more precise selection. 
We further introduce a general unsupervised\nAction-aware Contrastive Pre-training (ACP) to enhance the discrimination both\nwithin actions and between actions and backgrounds, which benefits SS-TAL.\nExtensive experiments on THUMOS14 and ActivityNet v1.3 demonstrate that our\nmethod achieves state-of-the-art performance under various semi-supervised\nsettings.\n","authors":["Feixiang Zhou","Bryan Williams","Hossein Rahmani"],"pdf_url":"https://arxiv.org/pdf/2407.07673v3.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.17630v1","updated":"2024-07-24T20:39:17Z","published":"2024-07-24T20:39:17Z","title":"Revising the Problem of Partial Labels from the Perspective of CNNs'\n Robustness","summary":" Convolutional neural networks (CNNs) have gained increasing popularity and\nversatility in recent decades, finding applications in diverse domains. These\nremarkable achievements are greatly attributed to the support of extensive\ndatasets with precise labels. However, annotating image datasets is intricate\nand complex, particularly in the case of multi-label datasets. Hence, the\nconcept of partial-label setting has been proposed to reduce annotation costs,\nand numerous corresponding solutions have been introduced. The evaluation\nmethods for these existing solutions have been primarily based on accuracy.\nThat is, their performance is assessed by their predictive accuracy on the test\nset. However, we insist that such an evaluation is insufficient and one-sided.\nOn one hand, since the quality of the test set has not been evaluated, the\nassessment results are unreliable. On the other hand, the partial-label problem\nmay also be raised by undergoing adversarial attacks. Therefore, incorporating\nrobustness into the evaluation system is crucial. For this purpose, we first\npropose two attack models to generate multiple partial-label datasets with\nvarying degrees of label missing rates. Subsequently, we introduce a\nlightweight partial-label solution using pseudo-labeling techniques and a\ndesigned loss function. Then, we employ D-Score to analyze both the proposed\nand existing methods to determine whether they can enhance robustness while\nimproving accuracy. Extensive experimental results demonstrate that while\ncertain methods may improve accuracy, the enhancement in robustness is not\nsignificant, and in some cases, it even diminishes.\n","authors":["Xin Zhang","Yuqi Song","Wyatt McCurdy","Xiaofeng Wang","Fei Zuo"],"pdf_url":"https://arxiv.org/pdf/2407.17630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17628v1","updated":"2024-07-24T20:35:20Z","published":"2024-07-24T20:35:20Z","title":"PEEKABOO: Hiding parts of an image for unsupervised object localization","summary":" Localizing objects in an unsupervised manner poses significant challenges due\nto the absence of key visual information such as the appearance, type and\nnumber of objects, as well as the lack of labeled object classes typically\navailable in supervised settings. While recent approaches to unsupervised\nobject localization have demonstrated significant progress by leveraging\nself-supervised visual representations, they often require computationally\nintensive training processes, resulting in high resource demands in terms of\ncomputation, learnable parameters, and data. 
They also lack explicit modeling\nof visual context, potentially limiting their accuracy in object localization.\nTo tackle these challenges, we propose a single-stage learning framework,\ndubbed PEEKABOO, for unsupervised object localization by learning context-based\nrepresentations at both the pixel- and shape-level of the localized objects\nthrough image masking. The key idea is to selectively hide parts of an image\nand leverage the remaining image information to infer the location of objects\nwithout explicit supervision. The experimental results, both quantitative and\nqualitative, across various benchmark datasets, demonstrate the simplicity,\neffectiveness and competitive performance of our approach compared to\nstate-of-the-art methods in both single object discovery and unsupervised\nsalient object detection tasks. Code and pre-trained models are available at:\nhttps://github.com/hasibzunair/peekaboo\n","authors":["Hasib Zunair","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2407.17628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17620v1","updated":"2024-07-24T20:17:05Z","published":"2024-07-24T20:17:05Z","title":"CoMoTo: Unpaired Cross-Modal Lesion Distillation Improves Breast Lesion\n Detection in Tomosynthesis","summary":" Digital Breast Tomosynthesis (DBT) is an advanced breast imaging modality\nthat offers superior lesion detection accuracy compared to conventional\nmammography, albeit at the trade-off of longer reading time. Accelerating\nlesion detection from DBT using deep learning is hindered by limited data\navailability and huge annotation costs. A possible solution to this issue could\nbe to leverage the information provided by a more widely available modality,\nsuch as mammography, to enhance DBT lesion detection. In this paper, we present\na novel framework, CoMoTo, for improving lesion detection in DBT. Our framework\nleverages unpaired mammography data to enhance the training of a DBT model,\nimproving practicality by eliminating the need for mammography during\ninference. Specifically, we propose two novel components, Lesion-specific\nKnowledge Distillation (LsKD) and Intra-modal Point Alignment (ImPA). LsKD\nselectively distills lesion features from a mammography teacher model to a DBT\nstudent model, disregarding background features. ImPA further enriches LsKD by\nensuring the alignment of lesion features within the teacher before distilling\nknowledge to the student. Our comprehensive evaluation shows that CoMoTo is\nsuperior to traditional pretraining and image-level KD, improving performance\nby 7% Mean Sensitivity under low-data setting. Our code is available at\nhttps://github.com/Muhammad-Al-Barbary/CoMoTo .\n","authors":["Muhammad Alberb","Marawan Elbatel","Aya Elgebaly","Ricardo Montoya-del-Angel","Xiaomeng Li","Robert Martí"],"pdf_url":"https://arxiv.org/pdf/2407.17620v1.pdf","comment":"ADSMI @ MICCAI 2024"},{"id":"http://arxiv.org/abs/2311.13976v2","updated":"2024-07-24T20:01:08Z","published":"2023-11-23T12:42:52Z","title":"Low Latency Instance Segmentation by Continuous Clustering for LiDAR\n Sensors","summary":" Low-latency instance segmentation of LiDAR point clouds is crucial in\nreal-world applications because it serves as an initial and frequently-used\nbuilding block in a robot's perception pipeline, where every task adds further\ndelay. Particularly in dynamic environments, this total delay can result in\nsignificant positional offsets of dynamic objects, as seen in highway\nscenarios. 
To address this issue, we employ a new technique, which we call\ncontinuous clustering. Unlike most existing clustering approaches, which use a\nfull revolution of the LiDAR sensor, we process the data stream in a continuous\nand seamless fashion. Our approach does not rely on the concept of complete or\npartial sensor rotations with multiple discrete range images; instead, it views\nthe range image as a single and infinitely horizontally growing entity. Each\nnew column of this continuous range image is processed as soon it is available.\nObstacle points are clustered to existing instances in real-time and it is\nchecked at a high-frequency which instances are completed in order to publish\nthem without waiting for the completion of the revolution or some other\nintegration period. In the case of rotating sensors, no problematic\ndiscontinuities between the points of the end and the start of a scan are\nobserved. In this work we describe the two-layered data structure and the\ncorresponding algorithm for continuous clustering. It is able to achieve an\naverage latency of just 5 ms with respect to the latest timestamp of all points\nin the cluster. We are publishing the source code at\nhttps://github.com/UniBwTAS/continuous_clustering.\n","authors":["Andreas Reich","Mirko Maehlisch"],"pdf_url":"https://arxiv.org/pdf/2311.13976v2.pdf","comment":"Accompanying Video: https://www.youtube.com/watch?v=ex4qcR2bkWs"},{"id":"http://arxiv.org/abs/2407.17596v1","updated":"2024-07-24T19:02:01Z","published":"2024-07-24T19:02:01Z","title":"Quality Assured: Rethinking Annotation Strategies in Imaging AI","summary":" This paper does not describe a novel method. Instead, it studies an essential\nfoundation for reliable benchmarking and ultimately real-world application of\nAI-based image analysis: generating high-quality reference annotations.\nPrevious research has focused on crowdsourcing as a means of outsourcing\nannotations. However, little attention has so far been given to annotation\ncompanies, specifically regarding their internal quality assurance (QA)\nprocesses. Therefore, our aim is to evaluate the influence of QA employed by\nannotation companies on annotation quality and devise methodologies for\nmaximizing data annotation efficacy. Based on a total of 57,648 instance\nsegmented images obtained from a total of 924 annotators and 34 QA workers from\nfour annotation companies and Amazon Mechanical Turk (MTurk), we derived the\nfollowing insights: (1) Annotation companies perform better both in terms of\nquantity and quality compared to the widely used platform MTurk. (2) Annotation\ncompanies' internal QA only provides marginal improvements, if any. However,\nimproving labeling instructions instead of investing in QA can substantially\nboost annotation performance. (3) The benefit of internal QA depends on\nspecific image characteristics. Our work could enable researchers to derive\nsubstantially more value from a fixed annotation budget and change the way\nannotation companies conduct internal QA.\n","authors":["Tim Rädsch","Annika Reinke","Vivienn Weru","Minu D. 
Tizabi","Nicholas Heller","Fabian Isensee","Annette Kopp-Schneider","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2407.17596v1.pdf","comment":"Accepted at ECCV 2024, preprint, Computer Vision, Data Annotation"},{"id":"http://arxiv.org/abs/2406.03556v2","updated":"2024-07-24T18:50:51Z","published":"2024-06-05T18:10:49Z","title":"Npix2Cpix: A GAN-based Image-to-Image Translation Network with\n Retrieval-Classification Integration for Watermark Retrieval from Historical\n Document Images","summary":" The identification and restoration of ancient watermarks have long been a\nmajor topic in codicology and history. Classifying historical documents based\non watermarks is challenging due to their diversity, noisy samples, multiple\nrepresentation modes, and minor distinctions between classes and intra-class\nvariations. This paper proposes a modified U-net-based conditional generative\nadversarial network (GAN) named Npix2Cpix to translate noisy raw historical\nwatermarked images into clean, handwriting-free watermarked images by\nperforming image translation from degraded (noisy) pixels to clean pixels.\nUsing image-to-image translation and adversarial learning, the network creates\nclutter-free images for watermark restoration and categorization. The generator\nand discriminator of the proposed GAN are trained using two separate loss\nfunctions, each based on the distance between images, to learn the mapping from\nthe input noisy image to the output clean image. After using the proposed GAN\nto pre-process noisy watermarked images, Siamese-based one-shot learning is\nemployed for watermark classification. Experimental results on a large-scale\nhistorical watermark dataset demonstrate that cleaning the noisy watermarked\nimages can help to achieve high one-shot classification accuracy. The\nqualitative and quantitative evaluation of the retrieved watermarked image\nhighlights the effectiveness of the proposed approach.\n","authors":["Utsab Saha","Sawradip Saha","Shaikh Anowarul Fattah","Mohammad Saquib"],"pdf_url":"https://arxiv.org/pdf/2406.03556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06863v3","updated":"2024-07-24T18:09:48Z","published":"2024-07-09T13:50:43Z","title":"Beyond Aesthetics: Cultural Competence in Text-to-Image Models","summary":" Text-to-Image (T2I) models are being increasingly adopted in diverse global\ncommunities where they create visual representations of their unique cultures.\nCurrent T2I benchmarks primarily focus on faithfulness, aesthetics, and realism\nof generated images, overlooking the critical dimension of cultural competence.\nIn this work, we introduce a framework to evaluate cultural competence of T2I\nmodels along two crucial dimensions: cultural awareness and cultural diversity,\nand present a scalable approach using a combination of structured knowledge\nbases and large language models to build a large dataset of cultural artifacts\nto enable this evaluation. In particular, we apply this approach to build CUBE\n(CUltural BEnchmark for Text-to-Image models), a first-of-its-kind benchmark to\nevaluate cultural competence of T2I models. CUBE covers cultural artifacts\nassociated with 8 countries across different geo-cultural regions and along 3\nconcepts: cuisine, landmarks, and art. CUBE consists of 1) CUBE-1K, a set of\nhigh-quality prompts that enable the evaluation of cultural awareness, and 2)\nCUBE-CSpace, a larger dataset of cultural artifacts that serves as grounding to\nevaluate cultural diversity. 
We also introduce cultural diversity as a novel\nT2I evaluation component, leveraging quality-weighted Vendi score. Our\nevaluations reveal significant gaps in the cultural awareness of existing\nmodels across countries and provide valuable insights into the cultural\ndiversity of T2I outputs for under-specified prompts. Our methodology is\nextendable to other cultural regions and concepts, and can facilitate the\ndevelopment of T2I models that better cater to the global population.\n","authors":["Nithish Kannen","Arif Ahmad","Marco Andreetto","Vinodkumar Prabhakaran","Utsav Prabhu","Adji Bousso Dieng","Pushpak Bhattacharyya","Shachi Dave"],"pdf_url":"https://arxiv.org/pdf/2407.06863v3.pdf","comment":"30 pages, 10 figures, preprint"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.17451v1","updated":"2024-07-24T17:31:48Z","published":"2024-07-24T17:31:48Z","title":"BlueTempNet: A Temporal Multi-network Dataset of Social Interactions in\n Bluesky Social","summary":" Decentralized social media platforms like Bluesky Social (Bluesky) have made\nit possible to publicly disclose some user behaviors with millisecond-level\nprecision. Embracing Bluesky's principles of open-source and open-data, we\npresent the first collection of the temporal dynamics of user-driven social\ninteractions. BlueTempNet integrates multiple types of networks into a single\nmulti-network, including user-to-user interactions (following and blocking\nusers) and user-to-community interactions (creating and joining communities).\nCommunities are user-formed groups in custom Feeds, where users subscribe to\nposts aligned with their interests. Following Bluesky's public data policy, we\ncollect existing Bluesky Feeds, including the users who liked and generated\nthese Feeds, and provide tools to gather users' social interactions within a\ndate range. This data-collection strategy captures past user behaviors and\nsupports the future data collection of user behavior.\n","authors":["Ujun Jeong","Bohan Jiang","Zhen Tan","H. Russell Bernard","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17451v1.pdf","comment":"to appear in IEEE Data Description"},{"id":"http://arxiv.org/abs/2406.08461v2","updated":"2024-07-24T16:40:41Z","published":"2024-06-12T17:51:47Z","title":"Bridging the Gap: Unravelling Local Government Data Sharing Barriers in\n Estonia and Beyond","summary":" Open Government Data (OGD) plays a crucial role in transforming smart cities\ninto sustainable and intelligent entities by providing data for analytics,\nreal-time monitoring, and informed decision-making. This data is increasingly\nused in urban digital twins, enhancing city management through stakeholder\ncollaboration. However, local administrative data remains underutilized even in\ndigitally advanced countries like Estonia. This study explores barriers\npreventing Estonian municipalities from sharing OGD, using a qualitative\napproach through interviews with Estonian municipalities and drawing on the\nOGD-adapted Innovation Resistance Theory model (IRT). Interviews with local\ngovernment officials highlight ongoing issues in data provision and quality.\nBy addressing overlooked weaknesses in the Estonian open data ecosystem and\nproviding actionable recommendations, this research contributes to a more\nresilient and sustainable open data ecosystem. 
Additionally, by validating the\nOGD-adapted Innovation Resistance Theory model and proposing a revised version\ntailored for local government contexts, the study advances theoretical\nframeworks for understanding data sharing resistance. Ultimately, this study\nserves as a call to action for policymakers and practitioners to prioritize\nlocal OGD initiatives, ensuring the full utilization of OGD in smart city\ndevelopment.\n","authors":["Katrin Rajamäe Soosaar","Anastasija Nikiforova"],"pdf_url":"https://arxiv.org/pdf/2406.08461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12517v5","updated":"2024-07-24T15:10:41Z","published":"2023-05-21T17:14:31Z","title":"Description-Based Text Similarity","summary":" Identifying texts with a given semantics is central for many information\nseeking scenarios. Similarity search over vector embeddings appear to be\ncentral to this ability, yet the similarity reflected in current text\nembeddings is corpus-driven, and is inconsistent and sub-optimal for many use\ncases. What, then, is a good notion of similarity for effective retrieval of\ntext?\n We identify the need to search for texts based on abstract descriptions of\ntheir content, and the corresponding notion of \\emph{description based\nsimilarity}. We demonstrate the inadequacy of current text embeddings and\npropose an alternative model that significantly improves when used in standard\nnearest neighbor search. The model is trained using positive and negative pairs\nsourced through prompting a LLM, demonstrating how data from LLMs can be used\nfor creating new capabilities not immediately possible using the original\nmodel.\n","authors":["Shauli Ravfogel","Valentina Pyatkin","Amir DN Cohen","Avshalom Manevich","Yoav Goldberg"],"pdf_url":"https://arxiv.org/pdf/2305.12517v5.pdf","comment":"Accepted in COLM 2024"},{"id":"http://arxiv.org/abs/2407.17284v1","updated":"2024-07-24T13:50:21Z","published":"2024-07-24T13:50:21Z","title":"A Novel Two-Step Fine-Tuning Pipeline for Cold-Start Active Learning in\n Text Classification Tasks","summary":" This is the first work to investigate the effectiveness of BERT-based\ncontextual embeddings in active learning (AL) tasks on cold-start scenarios,\nwhere traditional fine-tuning is infeasible due to the absence of labeled data.\nOur primary contribution is the proposal of a more robust fine-tuning pipeline\n- DoTCAL - that diminishes the reliance on labeled data in AL using two steps:\n(1) fully leveraging unlabeled data through domain adaptation of the embeddings\nvia masked language modeling and (2) further adjusting model weights using\nlabeled data selected by AL. Our evaluation contrasts BERT-based embeddings\nwith other prevalent text representation paradigms, including Bag of Words\n(BoW), Latent Semantic Indexing (LSI), and FastText, at two critical stages of\nthe AL process: instance selection and classification. Experiments conducted on\neight ATC benchmarks with varying AL budgets (number of labeled instances) and\nnumber of instances (about 5,000 to 300,000) demonstrate DoTCAL's superior\neffectiveness, achieving up to a 33% improvement in Macro-F1 while reducing\nlabeling efforts by half compared to the traditional one-step method. 
We also\nfound that in several tasks, BoW and LSI (due to information aggregation)\nproduce results superior (up to 59%) to BERT, especially in low-budget\nscenarios and hard-to-classify tasks, which is quite surprising.\n","authors":["Fabiano Belém","Washington Cunha","Celso França","Claudio Andrade","Leonardo Rocha","Marcos André Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2407.17284v1.pdf","comment":"11 pages, 4 figures, 2 Tables, and 1 algorithm"},{"id":"http://arxiv.org/abs/2407.07503v3","updated":"2024-07-24T13:07:26Z","published":"2024-07-10T09:41:36Z","title":"Inter and Intra Prior Learning-based Hyperspectral Image Reconstruction\n Using Snapshot SWIR Metasurface","summary":" Shortwave-infrared (SWIR) spectral information, ranging from 1 {\\mu}m to\n2.5{\\mu}m, overcomes the limitations of traditional color cameras in acquiring\nscene information. However, conventional SWIR hyperspectral imaging systems\nface challenges due to their bulky setups and low acquisition speeds. This work\nintroduces a snapshot SWIR hyperspectral imaging system based on a metasurface\nfilter and a corresponding filter selection method to achieve the lowest\ncorrelation coefficient among these filters. This system offers the advantages\nof compact size and snapshot imaging. We propose a novel inter and intra prior\nlearning unfolding framework to achieve high-quality SWIR hyperspectral image\nreconstruction, which bridges the gap between prior learning and cross-stage\ninformation interaction. Additionally, we design an adaptive feature transfer\nmechanism to adaptively transfer the contextual correlation of multi-scale\nencoder features to prevent detailed information loss in the decoder.\nExperiment results demonstrate that our method can reconstruct hyperspectral\nimages with high speed and superior performance over existing methods.\n","authors":["Linqiang Li","Jinglei Hao","Yongqiang Zhao","Pan Liu","Haofang Yan","Ziqin Zhang","Seong G. Kong"],"pdf_url":"https://arxiv.org/pdf/2407.07503v3.pdf","comment":"12 pages,9 figures"},{"id":"http://arxiv.org/abs/2407.17234v1","updated":"2024-07-24T12:42:41Z","published":"2024-07-24T12:42:41Z","title":"Intent-Guided Heterogeneous Graph Contrastive Learning for\n Recommendation","summary":" Contrastive Learning (CL)-based recommender systems have gained prominence in\nthe context of Heterogeneous Graph (HG) due to their capacity to enhance the\nconsistency of representations across different views. Nonetheless, existing\nframeworks often neglect the fact that user-item interactions within HG are\ngoverned by diverse latent intents (for instance, preferences towards specific\nbrands or the demographic characteristics of item audiences), which are pivotal\nin capturing fine-grained relations. The exploration of these underlying\nintents, particularly through the lens of meta-paths in HGs, presents us with\ntwo principal challenges: i) How to integrate CL mechanisms with latent\nintents; ii) How to mitigate the noise associated with these complicated\nintents. To address these challenges, we propose an innovative framework termed\nIntent-Guided Heterogeneous Graph Contrastive Learning (IHGCL), which is designed\nto enhance CL-based recommendation by capturing the intents contained within\nmeta-paths. 
Specifically, the IHGCL framework includes: i) it employs a\nmeta-path-based dual contrastive learning approach to effectively integrate\nintents into the recommendation, constructing meta-path contrast and view\ncontrast; ii) it uses an bottlenecked autoencoder that combines mask\npropagation with the information bottleneck principle to significantly reduce\nnoise perturbations introduced by meta-paths. Empirical evaluations conducted\nacross six distinct datasets demonstrate the superior performance of our IHGCL\nframework relative to conventional baseline methods. Our model implementation\nis available at https://github.com/wangyu0627/IHGCL.\n","authors":["Lei Sang","Yu Wang","Yi Zhang","Yiwen Zhang","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17234v1.pdf","comment":"14pages, 11figures"},{"id":"http://arxiv.org/abs/2407.11245v2","updated":"2024-07-24T11:54:26Z","published":"2024-07-15T21:14:13Z","title":"Pacer and Runner: Cooperative Learning Framework between Single- and\n Cross-Domain Sequential Recommendation","summary":" Cross-Domain Sequential Recommendation (CDSR) improves recommendation\nperformance by utilizing information from multiple domains, which contrasts\nwith Single-Domain Sequential Recommendation (SDSR) that relies on a historical\ninteraction within a specific domain. However, CDSR may underperform compared\nto the SDSR approach in certain domains due to negative transfer, which occurs\nwhen there is a lack of relation between domains or different levels of data\nsparsity. To address the issue of negative transfer, our proposed CDSR model\nestimates the degree of negative transfer of each domain and adaptively assigns\nit as a weight factor to the prediction loss, to control gradient flows through\ndomains with significant negative transfer. To this end, our model compares the\nperformance of a model trained on multiple domains (CDSR) with a model trained\nsolely on the specific domain (SDSR) to evaluate the negative transfer of each\ndomain using our asymmetric cooperative network. In addition, to facilitate the\ntransfer of valuable cues between the SDSR and CDSR tasks, we developed an\nauxiliary loss that maximizes the mutual information between the representation\npairs from both tasks on a per-domain basis. This cooperative learning between\nSDSR and CDSR tasks is similar to the collaborative dynamics between pacers and\nrunners in a marathon. Our model outperformed numerous previous works in\nextensive experiments on two real-world industrial datasets across ten service\ndomains. We also have deployed our model in the recommendation system of our\npersonal assistant app service, resulting in 21.4% increase in click-through\nrate compared to existing models, which is valuable to real-world business.\n","authors":["Chung Park","Taesan Kim","Hyungjun Yoon","Junui Hong","Yelim Yu","Mincheol Cho","Minsung Choi","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2407.11245v2.pdf","comment":"Accepted at SIGIR'24 (Best Paper Honorable Mention)"},{"id":"http://arxiv.org/abs/2407.17115v1","updated":"2024-07-24T09:24:49Z","published":"2024-07-24T09:24:49Z","title":"Reinforced Prompt Personalization for Recommendation with Large Language\n Models","summary":" Designing effective prompts can empower LLMs to understand user preferences\nand provide recommendations by leveraging LLMs' intent comprehension and\nknowledge utilization capabilities. 
However, existing research predominantly\nconcentrates on task-wise prompting, developing fixed prompt templates composed\nof four patterns (i.e., role-playing, history records, reasoning guidance, and\noutput format) and applying them to all users for a given task. Although\nconvenient, task-wise prompting overlooks individual user differences, leading\nto potential mismatches in capturing user preferences. To address it, we\nintroduce the concept of instance-wise prompting to personalize discrete\nprompts for individual users and propose Reinforced Prompt Personalization\n(RPP) to optimize the four patterns in prompts using multi-agent reinforcement\nlearning (MARL). To boost efficiency, RPP formulates prompt personalization as\nselecting optimal sentences holistically across the four patterns, rather than\noptimizing word-by-word. To ensure the quality of prompts, RPP meticulously\ncrafts diverse expressions for each of the four patterns, considering multiple\nanalytical perspectives for specific recommendation tasks. In addition to RPP,\nour proposal of RPP+ aims to enhance the scalability of action space by\ndynamically refining actions with LLMs throughout the iterative process. We\nevaluate the effectiveness of RPP/RPP+ in ranking tasks over various datasets.\nExperimental results demonstrate the superiority of RPP/RPP+ over traditional\nrecommender models, few-shot methods, and other prompt-based methods,\nunderscoring the significance of instance-wise prompting for LLMs in\nrecommendation tasks and validating the effectiveness of RPP/RPP+. Our code is\navailable at https://github.com/maowenyu-11/RPP.\n","authors":["Wenyu Mao","Jiancan Wu","Weijian Chen","Chongming Gao","Xiang Wang","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2407.17115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03365v2","updated":"2024-07-24T08:48:38Z","published":"2024-01-31T11:03:58Z","title":"Heterophily-Aware Fair Recommendation using Graph Convolutional Networks","summary":" In recent years, graph neural networks (GNNs) have become a popular tool to\nimprove the accuracy and performance of recommender systems. Modern recommender\nsystems are not only designed to serve end users, but also to benefit other\nparticipants, such as items and items providers. These participants may have\ndifferent or conflicting goals and interests, which raise the need for fairness\nand popularity bias considerations. GNN-based recommendation methods also face\nthe challenges of unfairness and popularity bias and their normalization and\naggregation processes suffer from these challenges. In this paper, we propose a\nfair GNN-based recommender system, called HetroFair, to improve items' side\nfairness. HetroFair uses two separate components to generate fairness-aware\nembeddings: i) fairnessaware attention which incorporates dot product in the\nnormalization process of GNNs, to decrease the effect of nodes' degrees, and\nii) heterophily feature weighting to assign distinct weights to different\nfeatures during the aggregation process. In order to evaluate the effectiveness\nof HetroFair, we conduct extensive experiments over six real-world datasets.\nOur experimental results reveal that HetroFair not only alleviates the\nunfairness and popularity bias on items' side, but also achieves superior\naccuracy on users' side. 
Our implementation is publicly available at\nhttps://github.com/NematGH/HetroFair.\n","authors":["Nemat Gholinejad","Mostafa Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2402.03365v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16984v1","updated":"2024-07-24T04:01:09Z","published":"2024-07-24T04:01:09Z","title":"scGHSOM: Hierarchical clustering and visualization of single-cell and\n CRISPR data using growing hierarchical SOM","summary":" High-dimensional single-cell data poses significant challenges in identifying\nunderlying biological patterns due to the complexity and heterogeneity of\ncellular states. We propose a comprehensive gene-cell dependency visualization\nvia unsupervised clustering, Growing Hierarchical Self-Organizing Map (GHSOM),\nspecifically designed for analyzing high-dimensional single-cell data like\nsingle-cell sequencing and CRISPR screens. GHSOM is applied to cluster samples\nin a hierarchical structure such that the self-growth structure of clusters\nsatisfies the required variations between and within. We propose a novel\nSignificant Attributes Identification Algorithm to identify features that\ndistinguish clusters. This algorithm pinpoints attributes with minimal\nvariation within a cluster but substantial variation between clusters. These\nkey attributes can then be used for targeted data retrieval and downstream\nanalysis. Furthermore, we present two innovative visualization tools: Cluster\nFeature Map and Cluster Distribution Map. The Cluster Feature Map highlights\nthe distribution of specific features across the hierarchical structure of\nGHSOM clusters. This allows for rapid visual assessment of cluster uniqueness\nbased on chosen features. The Cluster Distribution Map depicts leaf clusters as\ncircles on the GHSOM grid, with circle size reflecting cluster data size and\ncolor customizable to visualize features like cell type or other attributes. We\napply our analysis to three single-cell datasets and one CRISPR dataset\n(cell-gene database) and evaluate clustering methods with internal and external\nCH and ARI scores. GHSOM performs well, being the best performer in internal\nevaluation (CH=4.2). In external evaluation, GHSOM has the third-best\nperformance of all methods.\n","authors":["Shang-Jung Wen","Jia-Ming Chang","Fang Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16984v1.pdf","comment":"Abstract presentation at BIOKDD@ACM KDD 2024"},{"id":"http://arxiv.org/abs/2407.08108v2","updated":"2024-07-24T03:37:17Z","published":"2024-07-11T00:54:56Z","title":"CADC: Encoding User-Item Interactions for Compressing Recommendation\n Model Training Data","summary":" Deep learning recommendation models (DLRMs) are at the heart of the current\ne-commerce industry. However, the amount of training data used to train these\nlarge models is growing exponentially, leading to substantial training hurdles.\nThe training dataset contains two primary types of information: content-based\ninformation (features of users and items) and collaborative information\n(interactions between users and items). One approach to reduce the training\ndataset is to remove user-item interactions. But that significantly diminishes\ncollaborative information, which is crucial for maintaining accuracy due to its\ninclusion of interaction histories. 
This loss profoundly impacts DLRM\nperformance.\n This paper makes an important observation that if one can capture the\nuser-item interaction history to enrich the user and item embeddings, then the\ninteraction history can be compressed without losing model accuracy. Thus, this\nwork, Collaborative Aware Data Compression (CADC), takes a two-step approach to\ntraining dataset compression. In the first step, we use matrix factorization of\nthe user-item interaction matrix to create a novel embedding representation for\nboth the users and items. Once the user and item embeddings are enriched by the\ninteraction history information the approach then applies uniform random\nsampling of the training dataset to drastically reduce the training dataset\nsize while minimizing model accuracy drop. The source code of CADC is available\nat\n\\href{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}.\n","authors":["Hossein Entezari Zarch","Abdulla Alshabanah","Chaoyi Jiang","Murali Annavaram"],"pdf_url":"https://arxiv.org/pdf/2407.08108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17688v2","updated":"2024-07-24T01:35:07Z","published":"2024-03-26T13:31:33Z","title":"Large Language Models Enhanced Collaborative Filtering","summary":" Recent advancements in Large Language Models (LLMs) have attracted\nconsiderable interest among researchers to leverage these models to enhance\nRecommender Systems (RSs). Existing work predominantly utilizes LLMs to\ngenerate knowledge-rich texts or utilizes LLM-derived embeddings as features to\nimprove RSs. Although the extensive world knowledge embedded in LLMs generally\nbenefits RSs, the application can only take limited number of users and items\nas inputs, without adequately exploiting collaborative filtering information.\nConsidering its crucial role in RSs, one key challenge in enhancing RSs with\nLLMs lies in providing better collaborative filtering information through LLMs.\nIn this paper, drawing inspiration from the in-context learning and chain of\nthought reasoning in LLMs, we propose the Large Language Models enhanced\nCollaborative Filtering (LLM-CF) framework, which distils the world knowledge\nand reasoning capabilities of LLMs into collaborative filtering. We also\nexplored a concise and efficient instruction-tuning method, which improves the\nrecommendation capabilities of LLMs while preserving their general\nfunctionalities (e.g., not decreasing on the LLM benchmark). Comprehensive\nexperiments on three real-world datasets demonstrate that LLM-CF significantly\nenhances several backbone recommendation models and consistently outperforms\ncompetitive baselines, showcasing its effectiveness in distilling the world\nknowledge and reasoning capabilities of LLM into collaborative filtering.\n","authors":["Zhongxiang Sun","Zihua Si","Xiaoxue Zang","Kai Zheng","Yang Song","Xiao Zhang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2403.17688v2.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2407.09017v3","updated":"2024-07-24T01:15:20Z","published":"2024-07-12T06:10:01Z","title":"AI-Driven Guided Response for Security Operation Centers with Microsoft\n Copilot for Security","summary":" Security operation centers contend with a constant stream of security\nincidents, ranging from straightforward to highly complex. 
To address this, we\ndeveloped Copilot Guided Response (CGR), an industry-scale ML architecture that\nguides security analysts across three key tasks -- (1) investigation, providing\nessential historical context by identifying similar incidents; (2) triaging to\nascertain the nature of the incident -- whether it is a true positive, false\npositive, or benign positive; and (3) remediation, recommending tailored\ncontainment actions. CGR is integrated into the Microsoft Defender XDR product\nand deployed worldwide, generating millions of recommendations across thousands\nof customers. Our extensive evaluation, incorporating internal evaluation,\ncollaboration with security experts, and customer feedback, demonstrates that\nCGR delivers high-quality recommendations across all three tasks. We provide a\ncomprehensive overview of the CGR architecture, setting a precedent as the\nfirst cybersecurity company to openly discuss these capabilities in such depth.\nAdditionally, we GUIDE, the largest public collection of real-world security\nincidents, spanning 13M evidences across 1M annotated incidents. By enabling\nresearchers and practitioners to conduct research on real-world data, GUIDE\nadvances the state of cybersecurity and supports the development of\nnext-generation machine learning systems.\n","authors":["Scott Freitas","Jovan Kalajdjieski","Amir Gharib","Robert McCann"],"pdf_url":"https://arxiv.org/pdf/2407.09017v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12784v2","updated":"2024-07-24T23:00:50Z","published":"2024-02-20T07:49:30Z","title":"Understanding and Mitigating the Threat of Vec2Text to Dense Retrieval\n Systems","summary":" The emergence of Vec2Text -- a method for text embedding inversion -- has\nraised serious privacy concerns for dense retrieval systems which use text\nembeddings, such as those offered by OpenAI and Cohere. This threat comes from\nthe ability for a malicious attacker with access to embeddings to reconstruct\nthe original text. In this paper, we investigate various factors related to\nembedding models that may impact text recoverability via Vec2Text. We explore\nfactors such as distance metrics, pooling functions, bottleneck pre-training,\ntraining with noise addition, embedding quantization, and embedding dimensions,\nwhich were not considered in the original Vec2Text paper. Through a\ncomprehensive analysis of these factors, our objective is to gain a deeper\nunderstanding of the key elements that affect the trade-offs between the text\nrecoverability and retrieval effectiveness of dense retrieval systems, offering\ninsights for practitioners designing privacy-aware dense retrieval systems. We\nalso propose a simple embedding transformation fix that guarantees equal\nranking effectiveness while mitigating the recoverability risk. Overall, this\nstudy reveals that Vec2Text could pose a threat to current dense retrieval\nsystems, but there are some effective methods to patch such systems.\n","authors":["Shengyao Zhuang","Bevan Koopman","Xiaoran Chu","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2402.12784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17631v1","updated":"2024-07-24T20:44:36Z","published":"2024-07-24T20:44:36Z","title":"BLAZE: Cross-Language and Cross-Project Bug Localization via Dynamic\n Chunking and Hard Example Learning","summary":" Software bugs require developers to exert significant effort to identify and\nresolve them, often consuming about one-third of their time. 
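On the Vec2Text entry above: the paper proposes an embedding transformation that keeps ranking effectiveness while reducing recoverability, and the exact fix is not reproduced here. As a purely illustrative example of a transformation with the ranking-preservation property, a fixed secret orthogonal rotation leaves all dot products (and hence rankings) unchanged, while an inverter trained on the original embedding space cannot be applied directly without knowing the rotation.

import numpy as np

rng = np.random.default_rng(0)
d = 8
Q, _ = np.linalg.qr(rng.normal(size=(d, d)))     # secret orthogonal matrix, kept server-side

query = rng.normal(size=d)
docs = rng.normal(size=(5, d))

orig_scores = docs @ query
rotated_scores = (docs @ Q.T) @ (Q @ query)       # scores computed in the transformed space

# dot products, and hence rankings, are identical up to floating-point error
assert np.allclose(orig_scores, rotated_scores)
print(np.argsort(-orig_scores), np.argsort(-rotated_scores))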
Bug localization,\nthe process of pinpointing the exact source code files that need modification,\nis crucial in reducing this effort. Existing bug localization tools, typically\nreliant on deep learning techniques, face limitations in cross-project\napplicability and effectiveness in multi-language environments. Recent\nadvancements with Large Language Models (LLMs) offer detailed representations\nfor bug localization. However, they encounter challenges with limited context\nwindows and mapping accuracy. To address these issues, we propose BLAZE, an\napproach that employs dynamic chunking and hard example learning. First, BLAZE\ndynamically segments source code to minimize continuity loss. Then, BLAZE\nfine-tunes a GPT-based model using challenging bug cases, in order to enhance\ncross-project and cross-language bug localization. To support the capability of\nBLAZE, we create the BEETLEBOX dataset, which comprises 26,321 bugs from 29\nlarge and thriving open-source projects across five different programming\nlanguages (Java, C++, Python, Go, and JavaScript). Our evaluations of BLAZE on\nthree benchmark datasets BEETLEBOX, SWE-Bench, and Ye et al. demonstrate\nsubstantial improvements compared to six state-of-the-art baselines.\nSpecifically, BLAZE achieves up to an increase of 120% in Top 1 accuracy, 144%\nin Mean Average Precision (MAP), and 100% in Mean Reciprocal Rank (MRR). An\nextensive ablation study confirms the contributions of our pipeline components\nto the overall performance enhancement.\n","authors":["Partha Chakraborty","Mahmoud Alfadel","Meiyappan Nagappan"],"pdf_url":"https://arxiv.org/pdf/2407.17631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21056v1","updated":"2024-07-24T13:26:02Z","published":"2024-07-24T13:26:02Z","title":"What Matters in Explanations: Towards Explainable Fake Review Detection\n Focusing on Transformers","summary":" Customers' reviews and feedback play crucial role on electronic\ncommerce~(E-commerce) platforms like Amazon, Zalando, and eBay in influencing\nother customers' purchasing decisions. However, there is a prevailing concern\nthat sellers often post fake or spam reviews to deceive potential customers and\nmanipulate their opinions about a product. Over the past decade, there has been\nconsiderable interest in using machine learning (ML) and deep learning (DL)\nmodels to identify such fraudulent reviews. Unfortunately, the decisions made\nby complex ML and DL models - which often function as \\emph{black-boxes} - can\nbe surprising and difficult for general users to comprehend. In this paper, we\npropose an explainable framework for detecting fake reviews with high precision\nin identifying fraudulent content with explanations and investigate what\ninformation matters most for explaining particular decisions by conducting\nempirical user evaluation. Initially, we develop fake review detection models\nusing DL and transformer models including XLNet and DistilBERT. We then\nintroduce layer-wise relevance propagation (LRP) technique for generating\nexplanations that can map the contributions of words toward the predicted\nclass. The experimental results on two benchmark fake review detection datasets\ndemonstrate that our predictive models achieve state-of-the-art performance and\noutperform several existing methods. 
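The BLAZE entry above describes dynamic chunking only as segmenting source code to minimize continuity loss. The sketch below is a generic overlapping, line-based chunker of the kind commonly used to fit files into a limited context window; function and parameter names are illustrative, not BLAZE's.

def chunk_source(lines, max_lines=40, overlap=8):
    """Split a source file into overlapping line windows so that code spanning a
    chunk boundary still appears intact at the start of the next chunk."""
    chunks, start = [], 0
    while start < len(lines):
        end = min(start + max_lines, len(lines))
        chunks.append("\n".join(lines[start:end]))
        if end == len(lines):
            break
        start = end - overlap            # step forward, keeping `overlap` lines of context
    return chunks

demo_file = [f"def f{i}(): return {i}" for i in range(100)]   # stand-in for a real source file
print([len(c.splitlines()) for c in chunk_source(demo_file)])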
Furthermore, the empirical user evaluation\nof the generated explanations concludes which important information needs to be\nconsidered in generating explanations in the context of fake review\nidentification.\n","authors":["Md Shajalal","Md Atabuzzaman","Alexander Boden","Gunnar Stevens","Delong Du"],"pdf_url":"https://arxiv.org/pdf/2407.21056v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.17467v1","updated":"2024-07-24T17:59:02Z","published":"2024-07-24T17:59:02Z","title":"CMR Scaling Law: Predicting Critical Mixture Ratios for Continual\n Pre-training of Language Models","summary":" Large Language Models (LLMs) excel in diverse tasks but often underperform in\nspecialized fields due to limited domain-specific or proprietary corpus.\nContinual pre-training (CPT) enhances LLM capabilities by imbuing new\ndomain-specific or proprietary knowledge while replaying general corpus to\nprevent catastrophic forgetting. The data mixture ratio of general corpus and\ndomain-specific corpus, however, has been chosen heuristically, leading to\nsub-optimal training efficiency in practice. In this context, we attempt to\nre-visit the scaling behavior of LLMs under the hood of CPT, and discover a\npower-law relationship between loss, mixture ratio, and training tokens scale.\nWe formalize the trade-off between general and domain-specific capabilities,\nleading to a well-defined Critical Mixture Ratio (CMR) of general and domain\ndata. By striking the balance, CMR maintains the model's general ability and\nachieves the desired domain transfer, ensuring the highest utilization of\navailable resources. Therefore, if we value the balance between efficiency and\neffectiveness, CMR can be consider as the optimal mixture ratio.Through\nextensive experiments, we ascertain the predictability of CMR, and propose CMR\nscaling law and have substantiated its generalization. These findings offer\npractical guidelines for optimizing LLM training in specialized domains,\nensuring both general and domain-specific performance while efficiently\nmanaging training resources.\n","authors":["Jiawei Gu","Zacc Yang","Chuanghao Ding","Rui Zhao","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2407.17467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17466v1","updated":"2024-07-24T17:58:49Z","published":"2024-07-24T17:58:49Z","title":"Traversing Pareto Optimal Policies: Provably Efficient Multi-Objective\n Reinforcement Learning","summary":" This paper investigates multi-objective reinforcement learning (MORL), which\nfocuses on learning Pareto optimal policies in the presence of multiple reward\nfunctions. Despite MORL's significant empirical success, there is still a lack\nof satisfactory understanding of various MORL optimization targets and\nefficient learning algorithms. Our work offers a systematic analysis of several\noptimization targets to assess their abilities to find all Pareto optimal\npolicies and controllability over learned policies by the preferences for\ndifferent objectives. We then identify Tchebycheff scalarization as a favorable\nscalarization method for MORL. Considering the non-smoothness of Tchebycheff\nscalarization, we reformulate its minimization problem into a new min-max-max\noptimization problem. 
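For the fake-review entry above, the explanation step maps word contributions toward the predicted class with layer-wise relevance propagation. LRP for a transformer does not fit in a short snippet, so the sketch below uses the simplest possible stand-in, per-word contributions of a linear bag-of-words classifier, only to illustrate the "which words drove the decision" style of output; the vocabulary and weights are invented.

import numpy as np

vocab = ["great", "amazing", "free", "click", "refund", "quality"]
weights = np.array([-0.4, -0.2, 1.3, 1.1, 0.9, -0.8])   # toy weights, positive pushes toward "fake"
bias = -0.1

def explain(review):
    counts = np.array([review.lower().split().count(w) for w in vocab])
    score = counts @ weights + bias
    contributions = counts * weights                      # per-word contribution to the score
    label = "fake" if score > 0 else "genuine"
    ranked = sorted(zip(vocab, contributions), key=lambda t: -abs(t[1]))
    return label, [(w, round(c, 2)) for w, c in ranked if c != 0]

print(explain("click here for a free refund , amazing quality"))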
Then, for the stochastic policy class, we propose\nefficient algorithms using this reformulation to learn Pareto optimal policies.\nWe first propose an online UCB-based algorithm to achieve an $\\varepsilon$\nlearning error with an $\\tilde{\\mathcal{O}}(\\varepsilon^{-2})$ sample\ncomplexity for a single given preference. To further reduce the cost of\nenvironment exploration under different preferences, we propose a\npreference-free framework that first explores the environment without\npre-defined preferences and then generates solutions for any number of\npreferences. We prove that it only requires an\n$\\tilde{\\mathcal{O}}(\\varepsilon^{-2})$ exploration complexity in the\nexploration phase and demands no additional exploration afterward. Lastly, we\nanalyze the smooth Tchebycheff scalarization, an extension of Tchebycheff\nscalarization, which is proved to be more advantageous in distinguishing the\nPareto optimal policies from other weakly Pareto optimal policies based on\nentry values of preference vectors. Furthermore, we extend our algorithms and\ntheoretical analysis to accommodate this optimization target.\n","authors":["Shuang Qiu","Dake Zhang","Rui Yang","Boxiang Lyu","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.17466v1.pdf","comment":"Initially submitted in May 2024"},{"id":"http://arxiv.org/abs/2407.17465v1","updated":"2024-07-24T17:58:42Z","published":"2024-07-24T17:58:42Z","title":"u-$μ$P: The Unit-Scaled Maximal Update Parametrization","summary":" The Maximal Update Parametrization ($\\mu$P) aims to make the optimal\nhyperparameters (HPs) of a model independent of its size, allowing them to be\nswept using a cheap proxy model rather than the full-size target model. We\npresent a new scheme, u-$\\mu$P, which improves upon $\\mu$P by combining it with\nUnit Scaling, a method for designing models that makes them easy to train in\nlow-precision. The two techniques have a natural affinity: $\\mu$P ensures that\nthe scale of activations is independent of model size, and Unit Scaling ensures\nthat activations, weights and gradients begin training with a scale of one.\nThis synthesis opens the door to a simpler scheme, whose default values are\nnear-optimal. This in turn facilitates a more efficient sweeping strategy, with\nu-$\\mu$P models reaching a lower loss than comparable $\\mu$P models and working\nout-of-the-box in FP8.\n","authors":["Charlie Blake","Constantin Eichenberg","Josef Dean","Lukas Balles","Luke Y. Prince","Björn Deiseroth","Andres Felipe Cruz-Salinas","Carlo Luschi","Samuel Weinbach","Douglas Orr"],"pdf_url":"https://arxiv.org/pdf/2407.17465v1.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2407.17460v1","updated":"2024-07-24T17:57:21Z","published":"2024-07-24T17:57:21Z","title":"SoNIC: Safe Social Navigation with Adaptive Conformal Inference and\n Constrained Reinforcement Learning","summary":" Reinforcement Learning (RL) has enabled social robots to generate\ntrajectories without human-designed rules or interventions, which makes it more\neffective than hard-coded systems for generalizing to complex real-world\nscenarios. However, social navigation is a safety-critical task that requires\nrobots to avoid collisions with pedestrians while previous RL-based solutions\nfall short in safety performance in complex environments. 
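For readers new to the multi-objective RL entry above, the weighted Tchebycheff scalarization it builds on is commonly written as

\min_{\pi} \; \max_{1 \le i \le m} \; \lambda_i \bigl( z_i^{*} - J_i(\pi) \bigr), \qquad \lambda \in \Delta_m,

where J_i(\pi) is the value of objective i, \lambda is a preference vector on the simplex, and z^{*} is an ideal point with z_i^{*} \ge \max_{\pi} J_i(\pi). The min-max-max reformulation and the algorithms built on it are the paper's contribution and are not reproduced here.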
To enhance the safety\nof RL policies, to the best of our knowledge, we propose the first algorithm,\nSoNIC, that integrates adaptive conformal inference (ACI) with constrained\nreinforcement learning (CRL) to learn safe policies for social navigation. More\nspecifically, our method augments RL observations with ACI-generated\nnonconformity scores and provides explicit guidance for agents to leverage the\nuncertainty metrics to avoid safety-critical areas by incorporating safety\nconstraints with spatial relaxation. Our method outperforms state-of-the-art\nbaselines in terms of both safety and adherence to social norms by a large\nmargin and demonstrates much stronger robustness to out-of-distribution\nscenarios. Our code and video demos are available on our project website:\nhttps://sonic-social-nav.github.io/.\n","authors":["Jianpeng Yao","Xiaopan Zhang","Yu Xia","Zejin Wang","Amit K. Roy-Chowdhury","Jiachen Li"],"pdf_url":"https://arxiv.org/pdf/2407.17460v1.pdf","comment":"Project website: https://sonic-social-nav.github.io/"},{"id":"http://arxiv.org/abs/2403.14236v3","updated":"2024-07-24T17:56:32Z","published":"2024-03-21T08:54:24Z","title":"A Unified Framework for Model Editing","summary":" ROME and MEMIT are largely believed to be two different model editing\nalgorithms, with the major difference between them being the ability to perform\nbatched edits. In this paper, we unify these two algorithms under a single\nconceptual umbrella, optimizing for the same goal, which we call the\npreservation-memorization objective. ROME uses an equality constraint to\noptimize this objective to perform one edit at a time, whereas MEMIT employs a\nmore flexible least-square constraint that allows for batched edits. We\ngeneralize ROME and enable batched editing with equality constraint in the form\nof EMMET - an Equality-constrained Mass Model Editing algorithm for\nTransformers, a new batched memory-editing algorithm. EMMET can perform\nbatched-edits up to a batch-size of 10,000, with very similar performance to\nMEMIT across multiple dimensions. With the introduction of EMMET, we truly\nunify ROME and MEMIT and show that both algorithms are equivalent in terms of\ntheir optimization objective, their abilities (singular and batched editing),\ntheir model editing performance and their limitations.\n","authors":["Akshat Gupta","Dev Sajnani","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2403.14236v3.pdf","comment":"Under review. To appear as poster at KnowledgeableLM Workshop\n co-located with ACL 2024"},{"id":"http://arxiv.org/abs/2407.17459v1","updated":"2024-07-24T17:54:07Z","published":"2024-07-24T17:54:07Z","title":"Hidden or Inferred: Fair Learning-To-Rank with Unknown Demographics","summary":" As learning-to-rank models are increasingly deployed for decision-making in\nareas with profound life implications, the FairML community has been developing\nfair learning-to-rank (LTR) models. These models rely on the availability of\nsensitive demographic features such as race or sex. However, in practice,\nregulatory obstacles and privacy concerns protect this data from collection and\nuse. As a result, practitioners may either need to promote fairness despite the\nabsence of these features or turn to demographic inference tools to attempt to\ninfer them. Given that these tools are fallible, this paper aims to further\nunderstand how errors in demographic inference impact the fairness performance\nof popular fair LTR strategies. 
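The SoNIC entry above augments RL observations with nonconformity scores from adaptive conformal inference. The exact integration is the paper's, but the underlying ACI recursion (in the style of Gibbs and Candès) is standard and sketched below; the exponential "nonconformity score" stream is a made-up stand-in for the real quantity.

import numpy as np

rng = np.random.default_rng(0)
alpha_target, gamma = 0.1, 0.02
alpha_t = alpha_target
calib_scores = rng.exponential(size=500)          # calibration nonconformity scores

for t in range(200):
    # threshold = empirical (1 - alpha_t) quantile of the nonconformity scores seen so far
    q = np.quantile(calib_scores, min(max(1 - alpha_t, 0.0), 1.0))
    score_t = rng.exponential()                    # new observation's nonconformity score
    err_t = float(score_t > q)                     # 1 if the score falls outside the current set
    alpha_t += gamma * (alpha_target - err_t)      # adaptive conformal inference update
    calib_scores = np.append(calib_scores, score_t)

print(round(alpha_t, 3), round(q, 3))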
In which cases would it be better to keep such\ndemographic attributes hidden from models versus infer them? We examine a\nspectrum of fair LTR strategies ranging from fair LTR with and without\ndemographic features hidden versus inferred to fairness-unaware LTR followed by\nfair re-ranking. We conduct a controlled empirical investigation modeling\ndifferent levels of inference errors by systematically perturbing the inferred\nsensitive attribute. We also perform three case studies with real-world\ndatasets and popular open-source inference methods. Our findings reveal that as\ninference noise grows, LTR-based methods that incorporate fairness\nconsiderations into the learning process may increase bias. In contrast, fair\nre-ranking strategies are more robust to inference errors. All source code,\ndata, and experimental artifacts of our experimental study are available here:\nhttps://github.com/sewen007/hoiltr.git\n","authors":["Oluseun Olulana","Kathleen Cachel","Fabricio Murai","Elke Rundensteiner"],"pdf_url":"https://arxiv.org/pdf/2407.17459v1.pdf","comment":"This paper has been accepted by AAAI/AIES to the AIES 2024 conference"},{"id":"http://arxiv.org/abs/2407.17458v1","updated":"2024-07-24T17:50:54Z","published":"2024-07-24T17:50:54Z","title":"EuroCropsML: A Time Series Benchmark Dataset For Few-Shot Crop Type\n Classification","summary":" We introduce EuroCropsML, an analysis-ready remote sensing machine learning\ndataset for time series crop type classification of agricultural parcels in\nEurope. It is the first dataset designed to benchmark transnational few-shot\ncrop type classification algorithms that supports advancements in algorithmic\ndevelopment and research comparability. It comprises 706 683 multi-class\nlabeled data points across 176 classes, featuring annual time series of\nper-parcel median pixel values from Sentinel-2 L1C data for 2021, along with\ncrop type labels and spatial coordinates. Based on the open-source EuroCrops\ncollection, EuroCropsML is publicly available on Zenodo.\n","authors":["Joana Reuss","Jan Macdonald","Simon Becker","Lorenz Richter","Marco Körner"],"pdf_url":"https://arxiv.org/pdf/2407.17458v1.pdf","comment":"5 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.17449v1","updated":"2024-07-24T17:30:21Z","published":"2024-07-24T17:30:21Z","title":"Looking at Model Debiasing through the Lens of Anomaly Detection","summary":" It is widely recognized that deep neural networks are sensitive to bias in\nthe data. This means that during training these models are likely to learn\nspurious correlations between data and labels, resulting in limited\ngeneralization abilities and low performance. In this context, model debiasing\napproaches can be devised aiming at reducing the model's dependency on such\nunwanted correlations, either leveraging the knowledge of bias information or\nnot. In this work, we focus on the latter and more realistic scenario, showing\nthe importance of accurately predicting the bias-conflicting and bias-aligned\nsamples to obtain compelling performance in bias mitigation. On this ground, we\npropose to conceive the problem of model bias from an out-of-distribution\nperspective, introducing a new bias identification method based on anomaly\ndetection. We claim that when data is mostly biased, bias-conflicting samples\ncan be regarded as outliers with respect to the bias-aligned distribution in\nthe feature space of a biased model, thus allowing for precisely detecting them\nwith an anomaly detection method. 
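The controlled study in the fair learning-to-rank entry above models inference errors by systematically perturbing the inferred sensitive attribute. A minimal version of that perturbation, a symmetric flip at a chosen error rate, is sketched below; the variable names are illustrative and not taken from the released code.

import numpy as np

def perturb_attribute(attr, error_rate, rng):
    """Flip each binary sensitive attribute independently with probability error_rate."""
    flips = rng.random(len(attr)) < error_rate
    return np.where(flips, 1 - attr, attr)

rng = np.random.default_rng(0)
true_attr = rng.integers(0, 2, size=1000)
for err in (0.0, 0.1, 0.3):
    inferred = perturb_attribute(true_attr, err, rng)
    print(err, "observed disagreement:", np.mean(inferred != true_attr))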
Coupling the proposed bias identification\napproach with bias-conflicting data upsampling and augmentation in a two-step\nstrategy, we reach state-of-the-art performance on synthetic and real benchmark\ndatasets. Ultimately, our proposed approach shows that the data bias issue does\nnot necessarily require complex debiasing methods, given that an accurate bias\nidentification procedure is defined.\n","authors":["Vito Paolo Pastore","Massimiliano Ciranni","Davide Marinelli","Francesca Odone","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2407.17449v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.14436v2","updated":"2024-07-24T17:26:07Z","published":"2024-04-19T20:03:30Z","title":"Investigating Resource-efficient Neutron/Gamma Classification ML Models\n Targeting eFPGAs","summary":" There has been considerable interest and resulting progress in implementing\nmachine learning (ML) models in hardware over the last several years from the\nparticle and nuclear physics communities. A big driver has been the release of\nthe Python package, hls4ml, which has enabled porting models specified and\ntrained using Python ML libraries to register transfer level (RTL) code. So\nfar, the primary end targets have been commercial FPGAs or synthesized custom\nblocks on ASICs. However, recent developments in open-source embedded FPGA\n(eFPGA) frameworks now provide an alternate, more flexible pathway for\nimplementing ML models in hardware. These customized eFPGA fabrics can be\nintegrated as part of an overall chip design. In general, the decision between\na fully custom, eFPGA, or commercial FPGA ML implementation will depend on the\ndetails of the end-use application. In this work, we explored the parameter\nspace for eFPGA implementations of fully-connected neural network (fcNN) and\nboosted decision tree (BDT) models using the task of neutron/gamma\nclassification with a specific focus on resource efficiency. We used data\ncollected using an AmBe sealed source incident on Stilbene, which was optically\ncoupled to an OnSemi J-series SiPM to generate training and test data for this\nstudy. We investigated relevant input features and the effects of\nbit-resolution and sampling rate as well as trade-offs in hyperparameters for\nboth ML architectures while tracking total resource usage. The performance\nmetric used to track model performance was the calculated neutron efficiency at\na gamma leakage of 10$^{-3}$. The results of the study will be used to aid the\nspecification of an eFPGA fabric, which will be integrated as part of a test\nchip.\n","authors":["Jyothisraj Johnson","Billy Boxer","Tarun Prakash","Carl Grace","Peter Sorensen","Mani Tripathi"],"pdf_url":"https://arxiv.org/pdf/2404.14436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17446v1","updated":"2024-07-24T17:23:14Z","published":"2024-07-24T17:23:14Z","title":"Fractional signature: a generalisation of the signature inspired by\n fractional calculus","summary":" In this paper, we propose a novel generalisation of the signature of a path,\nmotivated by fractional calculus, which is able to describe the solutions of\nlinear Caputo controlled FDEs. We also propose another generalisation of the\nsignature, inspired by the previous one, but more convenient to use in machine\nlearning. 
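The model-debiasing entry above treats bias-conflicting samples as outliers with respect to the bias-aligned distribution in a biased model's feature space, then upsamples them. The sketch below uses a plain Mahalanobis distance as the anomaly score and naive index repetition as the upsampling; both are generic stand-ins, not the paper's specific detector or augmentation.

import numpy as np

rng = np.random.default_rng(0)
feats = rng.normal(size=(1000, 32))            # penultimate-layer features of a biased model
feats[:50] += 4.0                              # pretend the first 50 samples are bias-conflicting

mu = feats.mean(axis=0)
cov_inv = np.linalg.pinv(np.cov(feats, rowvar=False))
d2 = np.einsum("ij,jk,ik->i", feats - mu, cov_inv, feats - mu)   # squared Mahalanobis distance

threshold = np.quantile(d2, 0.95)
conflicting = np.nonzero(d2 > threshold)[0]    # flagged as outliers, i.e. bias-conflicting

# naive upsampling: repeat flagged indices so the next training epoch sees them more often
upsampled = np.concatenate([np.arange(len(feats)), np.repeat(conflicting, 5)])
print(len(conflicting), "flagged,", len(upsampled), "training indices after upsampling")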
Finally, we test this last signature in a toy application to the\nproblem of handwritten digit recognition, where significant improvements in\naccuracy rates are observed compared to those of the original signature.\n","authors":["José Manuel Corcuera","Rubén Jiménez"],"pdf_url":"https://arxiv.org/pdf/2407.17446v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.17438v1","updated":"2024-07-24T17:15:58Z","published":"2024-07-24T17:15:58Z","title":"HumanVid: Demystifying Training Data for Camera-controllable Human Image\n Animation","summary":" Human image animation involves generating videos from a character photo,\nallowing user control and unlocking potential for video and movie production.\nWhile recent approaches yield impressive results using high-quality training\ndata, the inaccessibility of these datasets hampers fair and transparent\nbenchmarking. Moreover, these approaches prioritize 2D human motion and\noverlook the significance of camera motions in videos, leading to limited\ncontrol and unstable video generation.To demystify the training data, we\npresent HumanVid, the first large-scale high-quality dataset tailored for human\nimage animation, which combines crafted real-world and synthetic data. For the\nreal-world data, we compile a vast collection of copyright-free real-world\nvideos from the internet. Through a carefully designed rule-based filtering\nstrategy, we ensure the inclusion of high-quality videos, resulting in a\ncollection of 20K human-centric videos in 1080P resolution. Human and camera\nmotion annotation is accomplished using a 2D pose estimator and a SLAM-based\nmethod. For the synthetic data, we gather 2,300 copyright-free 3D avatar assets\nto augment existing available 3D assets. Notably, we introduce a rule-based\ncamera trajectory generation method, enabling the synthetic pipeline to\nincorporate diverse and precise camera motion annotation, which can rarely be\nfound in real-world data. To verify the effectiveness of HumanVid, we establish\na baseline model named CamAnimate, short for Camera-controllable Human\nAnimation, that considers both human and camera motions as conditions. Through\nextensive experimentation, we demonstrate that such simple baseline training on\nour HumanVid achieves state-of-the-art performance in controlling both human\npose and camera motions, setting a new benchmark. Code and data will be\npublicly available at \\url{https://github.com/zhenzhiwang/HumanVid/}.\n","authors":["Zhenzhi Wang","Yixuan Li","Yanhong Zeng","Youqing Fang","Yuwei Guo","Wenran Liu","Jing Tan","Kai Chen","Tianfan Xue","Bo Dai","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17438v1.pdf","comment":"camera controllable human image animation, a dataset and a baseline"},{"id":"http://arxiv.org/abs/2402.06912v2","updated":"2024-07-24T17:15:44Z","published":"2024-02-10T09:15:21Z","title":"Solving Deep Reinforcement Learning Tasks with Evolution Strategies and\n Linear Policy Networks","summary":" Although deep reinforcement learning methods can learn effective policies for\nchallenging problems such as Atari games and robotics tasks, algorithms are\ncomplex, and training times are often long. This study investigates how\nEvolution Strategies perform compared to gradient-based deep reinforcement\nlearning methods. We use Evolution Strategies to optimize the weights of a\nneural network via neuroevolution, performing direct policy search. 
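The fractional-signature entry above generalizes the ordinary path signature. For orientation, the ordinary (non-fractional) signature of a piecewise-linear path up to level two can be computed with the elementary recursion below, which follows from Chen's identity; this is textbook material, not the paper's new construction.

import numpy as np

def signature_level2(path):
    """Level-1 and level-2 signature of a piecewise-linear path given as an array (T, d)."""
    increments = np.diff(path, axis=0)
    d = path.shape[1]
    s1 = np.zeros(d)
    s2 = np.zeros((d, d))
    for delta in increments:
        # Chen's identity applied to the concatenation of one more linear segment
        s2 += np.outer(s1, delta) + 0.5 * np.outer(delta, delta)
        s1 += delta
    return s1, s2

path = np.array([[0.0, 0.0], [1.0, 0.5], [1.5, 2.0], [0.5, 2.5]])
s1, s2 = signature_level2(path)
print(s1)                          # equals path[-1] - path[0]
print(s2[0, 1] - s2[1, 0])         # twice the signed (Levy) area swept by the 2-D path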
We\nbenchmark both deep policy networks and networks consisting of a single linear\nlayer from observations to actions for three gradient-based methods, such as\nProximal Policy Optimization. These methods are evaluated against three\nclassical Evolution Strategies and Augmented Random Search, which all use\nlinear policy networks. Our results reveal that Evolution Strategies can find\neffective linear policies for many reinforcement learning benchmark tasks,\nunlike deep reinforcement learning methods that can only find successful\npolicies using much larger networks, suggesting that current benchmarks are\neasier to solve than previously assumed. Interestingly, Evolution Strategies\nalso achieve results comparable to gradient-based deep reinforcement learning\nalgorithms for higher-complexity tasks. Furthermore, we find that by directly\naccessing the memory state of the game, Evolution Strategies can find\nsuccessful policies in Atari that outperform the policies found by Deep\nQ-Learning. Evolution Strategies also outperform Augmented Random Search in\nmost benchmarks, demonstrating superior sample efficiency and robustness in\ntraining linear policy networks.\n","authors":["Annie Wong","Jacob de Nobel","Thomas Bäck","Aske Plaat","Anna V. Kononova"],"pdf_url":"https://arxiv.org/pdf/2402.06912v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01267v2","updated":"2024-07-24T17:13:55Z","published":"2024-03-02T17:10:44Z","title":"Dissecting Language Models: Machine Unlearning via Selective Pruning","summary":" Understanding and shaping the behaviour of Large Language Models (LLMs) is\nincreasingly important as applications become more powerful and more frequently\nadopted. This paper introduces a machine unlearning method specifically\ndesigned for LLMs. We introduce a selective pruning method for LLMs that\nremoves neurons based on their relative importance on a targeted capability\ncompared to overall network performance. This approach is a compute- and\ndata-efficient method for identifying and removing neurons that enable specific\nbehaviours. Our findings reveal that both feed-forward and attention neurons in\nLLMs are specialized; that is, for specific tasks, certain neurons are more\ncrucial than others. Code from all experiments is available at\nhttps://github.com/nickypro/selective-pruning\n","authors":["Nicholas Pochinkov","Nandi Schoots"],"pdf_url":"https://arxiv.org/pdf/2403.01267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17437v1","updated":"2024-07-24T17:13:31Z","published":"2024-07-24T17:13:31Z","title":"Nerva: a Truly Sparse Implementation of Neural Networks","summary":" We introduce Nerva, a fast neural network library under development in C++.\nIt supports sparsity by using the sparse matrix operations of Intel's Math\nKernel Library (MKL), which eliminates the need for binary masks. We show that\nNerva significantly decreases training time and memory usage while reaching\nequivalent accuracy to PyTorch. We run static sparse experiments with an MLP on\nCIFAR-10. On high sparsity levels like $99\\%$, the runtime is reduced by a\nfactor of $4\\times$ compared to a PyTorch model using masks. 
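The Evolution Strategies entry above searches directly over the weights of a linear policy. The toy sketch below runs a (1+1)-ES on a synthetic reward (a stand-in for an environment rollout), only to make the "perturb the weights, keep them if the return improves" loop concrete; it is not any of the ES variants benchmarked in the paper.

import numpy as np

rng = np.random.default_rng(0)
obs_dim, act_dim = 8, 2

def episode_return(W, rng):
    """Toy stand-in for a rollout: reward peaks when W matches a hidden target policy."""
    target = np.ones((act_dim, obs_dim))
    obs = rng.normal(size=(64, obs_dim))
    return -np.mean(((W - target) @ obs.T) ** 2)

W = np.zeros((act_dim, obs_dim))           # linear policy: action = W @ observation
best = episode_return(W, rng)
sigma = 0.5
for step in range(300):                    # simple (1+1)-ES: keep a perturbation if it helps
    candidate = W + sigma * rng.normal(size=W.shape)
    score = episode_return(candidate, rng)
    if score > best:
        W, best = candidate, score
print(round(best, 4))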
Similar to other\npopular frameworks such as PyTorch and Keras, Nerva offers a Python interface\nfor users to work with.\n","authors":["Wieger Wesselink","Bram Grooten","Qiao Xiao","Cassio de Campos","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2407.17437v1.pdf","comment":"The Nerva library is available at https://github.com/wiegerw/nerva"},{"id":"http://arxiv.org/abs/2407.13018v2","updated":"2024-07-24T17:04:35Z","published":"2024-07-17T21:14:05Z","title":"Proof-of-Collaborative-Learning: A Multi-winner Federated Learning\n Consensus Algorithm","summary":" Regardless of their variations, blockchains require a consensus mechanism to\nvalidate transactions, supervise added blocks, maintain network security,\nsynchronize the network state, and distribute incentives. Proof-of-Work (PoW),\none of the most influential implementations of consensus mechanisms, consumes\nan extraordinary amount of energy for a task that lacks direct productive\noutput. In this paper, we propose Proof-of-Collaborative-Learning (PoCL), a\nmulti-winner federated learning validated consensus mechanism that redirects\nthe computation power of blockchains to train federated learning models. In\naddition, we present a novel evaluation mechanism to ensure the efficiency of\nthe locally trained models of miners. We evaluated the security of our\nevaluation mechanism by introducing and conducting probable attacks. Moreover,\nwe present a novel reward distribution mechanism to incentivize winning miners\nfairly, and demonstrate that our reward system is fair both within and across\nall rounds.\n","authors":["Amirreza Sokhankhosh","Sara Rouhani"],"pdf_url":"https://arxiv.org/pdf/2407.13018v2.pdf","comment":"8 pages. Accepted at the 7th IEEE International Conference on\n Blockchain (Blockchain 2024)"},{"id":"http://arxiv.org/abs/2403.14606v2","updated":"2024-07-24T16:56:17Z","published":"2024-03-21T17:55:16Z","title":"The Elements of Differentiable Programming","summary":" Artificial intelligence has recently experienced remarkable advances, fueled\nby large models, vast datasets, accelerated hardware, and, last but not least,\nthe transformative power of differentiable programming. This new programming\nparadigm enables end-to-end differentiation of complex computer programs\n(including those with control flows and data structures), making gradient-based\noptimization of program parameters possible. As an emerging paradigm,\ndifferentiable programming builds upon several areas of computer science and\napplied mathematics, including automatic differentiation, graphical models,\noptimization and statistics. This book presents a comprehensive review of the\nfundamental concepts useful for differentiable programming. We adopt two main\nperspectives, that of optimization and that of probability, with clear\nanalogies between the two. Differentiable programming is not merely the\ndifferentiation of programs, but also the thoughtful design of programs\nintended for differentiation. 
By making programs differentiable, we inherently\nintroduce probability distributions over their execution, providing a means to\nquantify the uncertainty associated with program outputs.\n","authors":["Mathieu Blondel","Vincent Roulet"],"pdf_url":"https://arxiv.org/pdf/2403.14606v2.pdf","comment":"Draft version 2"},{"id":"http://arxiv.org/abs/2402.14925v2","updated":"2024-07-24T16:54:33Z","published":"2024-02-22T19:15:50Z","title":"Efficient Unbiased Sparsification","summary":" An unbiased $m$-sparsification of a vector $p\\in \\mathbb{R}^n$ is a random\nvector $Q\\in \\mathbb{R}^n$ with mean $p$ that has at most $m0}$ the Brownian kernel,\nand the distribution of the projections $w$ is learnt. This can also be viewed\nas an infinite-width one-hidden layer neural network, optimising the first\nlayer's weights through gradient descent and explicitly adjusting the\nnon-linearity and weights of the second layer. We introduce an efficient\ncomputation method for the estimator, called Brownian Kernel Neural Network\n(BKerNN), using particles to approximate the expectation. The optimisation is\nprincipled due to the positive homogeneity of the Brownian kernel. Using\nRademacher complexity, we show that BKerNN's expected risk converges to the\nminimal risk with explicit high-probability rates of $O( \\min((d/n)^{1/2},\nn^{-1/6}))$ (up to logarithmic factors). Numerical experiments confirm our\noptimisation intuitions, and BKerNN outperforms kernel ridge regression, and\nfavourably compares to a one-hidden layer neural network with ReLU activations\nin various settings and real data sets.\n","authors":["Bertille Follain","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2407.17280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14324v2","updated":"2024-07-24T13:34:14Z","published":"2023-11-24T07:53:48Z","title":"Large Language Models as Topological Structure Enhancers for\n Text-Attributed Graphs","summary":" The latest advancements in large language models (LLMs) have revolutionized\nthe field of natural language processing (NLP). Inspired by the success of LLMs\nin NLP tasks, some recent work has begun investigating the potential of\napplying LLMs in graph learning tasks. However, most of the existing work\nfocuses on utilizing LLMs as powerful node feature augmenters, leaving\nemploying LLMs to enhance graph topological structures an understudied problem.\nIn this work, we explore how to leverage the information retrieval and text\ngeneration capabilities of LLMs to refine/enhance the topological structure of\ntext-attributed graphs (TAGs) under the node classification setting. First, we\npropose using LLMs to help remove unreliable edges and add reliable ones in the\nTAG. Specifically, we first let the LLM output the semantic similarity between\nnode attributes through delicate prompt designs, and then perform edge deletion\nand edge addition based on the similarity. Second, we propose using\npseudo-labels generated by the LLM to improve graph topology, that is, we\nintroduce the pseudo-label propagation as a regularization to guide the graph\nneural network (GNN) in learning proper edge weights. Finally, we incorporate\nthe two aforementioned LLM-based methods for graph topological refinement into\nthe process of GNN training, and perform extensive experiments on four\nreal-world datasets. 
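The sparsification entry above defines an unbiased m-sparsification of a vector p as a random vector with mean p and at most m nonzero coordinates. The simplest valid construction, uniform coordinate sampling with inverse-probability reweighting, is sketched below; the paper's efficient scheme is presumably more refined.

import numpy as np

def uniform_unbiased_sparsify(p, m, rng):
    """Keep m coordinates chosen uniformly at random, rescaled by n/m so that E[Q] = p."""
    n = len(p)
    q = np.zeros(n)
    idx = rng.choice(n, size=m, replace=False)
    q[idx] = p[idx] * (n / m)
    return q

rng = np.random.default_rng(0)
p = rng.normal(size=10)
samples = np.stack([uniform_unbiased_sparsify(p, 3, rng) for _ in range(20000)])
print(np.max(np.abs(samples.mean(axis=0) - p)))   # close to 0: the estimator is unbiased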
The experimental results demonstrate the effectiveness of\nLLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain\non public benchmarks).\n","authors":["Shengyin Sun","Yuxiang Ren","Chen Ma","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.14324v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2209.15609v3","updated":"2024-07-24T13:31:07Z","published":"2022-09-30T17:34:48Z","title":"$Φ$-DVAE: Physics-Informed Dynamical Variational Autoencoders for\n Unstructured Data Assimilation","summary":" Incorporating unstructured data into physical models is a challenging problem\nthat is emerging in data assimilation. Traditional approaches focus on\nwell-defined observation operators whose functional forms are typically assumed\nto be known. This prevents these methods from achieving a consistent model-data\nsynthesis in configurations where the mapping from data-space to model-space is\nunknown. To address these shortcomings, in this paper we develop a\nphysics-informed dynamical variational autoencoder ($\\Phi$-DVAE) to embed\ndiverse data streams into time-evolving physical systems described by\ndifferential equations. Our approach combines a standard, possibly nonlinear,\nfilter for the latent state-space model and a VAE, to assimilate the\nunstructured data into the latent dynamical system. Unstructured data, in our\nexample systems, comes in the form of video data and velocity field\nmeasurements, however the methodology is suitably generic to allow for\narbitrary unknown observation operators. A variational Bayesian framework is\nused for the joint estimation of the encoding, latent states, and unknown\nsystem parameters. To demonstrate the method, we provide case studies with the\nLorenz-63 ordinary differential equation, and the advection and Korteweg-de\nVries partial differential equations. Our results, with synthetic data, show\nthat $\\Phi$-DVAE provides a data efficient dynamics encoding methodology which\nis competitive with standard approaches. Unknown parameters are recovered with\nuncertainty quantification, and unseen data are accurately predicted.\n","authors":["Alex Glyn-Davies","Connor Duffin","Ö. Deniz Akyildiz","Mark Girolami"],"pdf_url":"https://arxiv.org/pdf/2209.15609v3.pdf","comment":"29 pages, 9 figures, updated version"},{"id":"http://arxiv.org/abs/2301.07609v5","updated":"2024-07-24T13:23:17Z","published":"2023-01-18T15:40:19Z","title":"Physics-informed Information Field Theory for Modeling Physical Systems\n with Uncertainty Quantification","summary":" Data-driven approaches coupled with physical knowledge are powerful\ntechniques to model systems. The goal of such models is to efficiently solve\nfor the underlying field by combining measurements with known physical laws. As\nmany systems contain unknown elements, such as missing parameters, noisy data,\nor incomplete physical laws, this is widely approached as an uncertainty\nquantification problem. The common techniques to handle all the variables\ntypically depend on the numerical scheme used to approximate the posterior, and\nit is desirable to have a method which is independent of any such\ndiscretization. Information field theory (IFT) provides the tools necessary to\nperform statistics over fields that are not necessarily Gaussian. We extend IFT\nto physics-informed IFT (PIFT) by encoding the functional priors with\ninformation about the physical laws which describe the field. 
The posteriors\nderived from this PIFT remain independent of any numerical scheme and can\ncapture multiple modes, allowing for the solution of problems which are\nill-posed. We demonstrate our approach through an analytical example involving\nthe Klein-Gordon equation. We then develop a variant of stochastic gradient\nLangevin dynamics to draw samples from the joint posterior over the field and\nmodel parameters. We apply our method to numerical examples with various\ndegrees of model-form error and to inverse problems involving nonlinear\ndifferential equations. As an addendum, the method is equipped with a metric\nwhich allows the posterior to automatically quantify model-form uncertainty.\nBecause of this, our numerical experiments show that the method remains robust\nto even an incorrect representation of the physics given sufficient data. We\nnumerically demonstrate that the method correctly identifies when the physics\ncannot be trusted, in which case it automatically treats learning the field as\na regression problem.\n","authors":["Alex Alberts","Ilias Bilionis"],"pdf_url":"https://arxiv.org/pdf/2301.07609v5.pdf","comment":"32 pages, 8 figures. Published in Journal of Computational Physics"},{"id":"http://arxiv.org/abs/2306.00833v2","updated":"2024-07-24T13:13:20Z","published":"2023-06-01T15:55:46Z","title":"When Does Bottom-up Beat Top-down in Hierarchical Community Detection?","summary":" Hierarchical clustering of networks consists in finding a tree of\ncommunities, such that lower levels of the hierarchy reveal finer-grained\ncommunity structures. There are two main classes of algorithms tackling this\nproblem. Divisive ($\\textit{top-down}$) algorithms recursively partition the\nnodes into two communities, until a stopping rule indicates that no further\nsplit is needed. In contrast, agglomerative ($\\textit{bottom-up}$) algorithms\nfirst identify the smallest community structure and then repeatedly merge the\ncommunities using a $\\textit{linkage}$ method. In this article, we establish\ntheoretical guarantees for the recovery of the hierarchical tree and community\nstructure of a Hierarchical Stochastic Block Model by a bottom-up algorithm. We\nalso establish that this bottom-up algorithm attains the information-theoretic\nthreshold for exact recovery at intermediate levels of the hierarchy. Notably,\nthese recovery conditions are less restrictive compared to those existing for\ntop-down algorithms. This shows that bottom-up algorithms extend the feasible\nregion for achieving exact recovery at intermediate levels. Numerical\nexperiments on both synthetic and real data sets confirm the superiority of\nbottom-up algorithms over top-down algorithms. We also observe that top-down\nalgorithms can produce dendrograms with inversions. These findings contribute\nto a better understanding of hierarchical clustering techniques and their\napplications in network analysis.\n","authors":["Maximilien Dreveton","Daichi Kuroda","Matthias Grossglauser","Patrick Thiran"],"pdf_url":"https://arxiv.org/pdf/2306.00833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17246v1","updated":"2024-07-24T13:05:17Z","published":"2024-07-24T13:05:17Z","title":"Channel-Aware Low-Rank Adaptation in Time Series Forecasting","summary":" The balance between model capacity and generalization has been a key focus of\nrecent discussions in long-term time series forecasting. 
Two representative\nchannel strategies are closely associated with model expressivity and\nrobustness, including channel independence (CI) and channel dependence (CD).\nThe former adopts individual channel treatment and has been shown to be more\nrobust to distribution shifts, but lacks sufficient capacity to model\nmeaningful channel interactions. The latter is more expressive for representing\ncomplex cross-channel dependencies, but is prone to overfitting. To balance the\ntwo strategies, we present a channel-aware low-rank adaptation method to\ncondition CD models on identity-aware individual components. As a plug-in\nsolution, it is adaptable for a wide range of backbone architectures. Extensive\nexperiments show that it can consistently and significantly improve the\nperformance of both CI and CD models with demonstrated efficiency and\nflexibility. The code is available at https://github.com/tongnie/C-LoRA.\n","authors":["Tong Nie","Yuewen Mei","Guoyang Qin","Jian Sun","Wei Ma"],"pdf_url":"https://arxiv.org/pdf/2407.17246v1.pdf","comment":"Accepted by CIKM 2024, short research paper track"},{"id":"http://arxiv.org/abs/2401.17505v4","updated":"2024-07-24T12:57:56Z","published":"2024-01-30T23:46:35Z","title":"Arrows of Time for Large Language Models","summary":" We study the probabilistic modeling performed by Autoregressive Large\nLanguage Models (LLMs) through the angle of time directionality, addressing a\nquestion first raised in (Shannon, 1951). For large enough models, we\nempirically find a time asymmetry in their ability to learn natural language: a\ndifference in the average log-perplexity when trying to predict the next token\nversus when trying to predict the previous one. This difference is at the same\ntime subtle and very consistent across various modalities (language, model\nsize, training time, ...). Theoretically, this is surprising: from an\ninformation-theoretic point of view, there should be no such difference. We\nprovide a theoretical framework to explain how such an asymmetry can appear\nfrom sparsity and computational complexity considerations, and outline a number\nof perspectives opened by our results.\n","authors":["Vassilis Papadopoulos","Jérémie Wenger","Clément Hongler"],"pdf_url":"https://arxiv.org/pdf/2401.17505v4.pdf","comment":"Corrected typos in Table 2. Added links. 12 figures, 20 pages"},{"id":"http://arxiv.org/abs/2405.15771v2","updated":"2024-07-24T12:56:41Z","published":"2024-03-13T17:47:39Z","title":"Adaptive Splitting of Reusable Temporal Monitors for Rare Traffic\n Violations","summary":" Autonomous Vehicles (AVs) are often tested in simulation to estimate the\nprobability they will violate safety specifications. Two common issues arise\nwhen using existing techniques to produce this estimation: If violations occur\nrarely, simple Monte-Carlo sampling techniques can fail to produce efficient\nestimates; if simulation horizons are too long, importance sampling techniques\n(which learn proposal distributions from past simulations) can fail to\nconverge. This paper addresses both issues by interleaving rare-event sampling\ntechniques with online specification monitoring algorithms. We use adaptive\nmulti-level splitting to decompose simulations into partial trajectories, then\ncalculate the distance of those partial trajectories to failure by leveraging\nrobustness metrics from Signal Temporal Logic (STL). By caching those partial\nrobustness metric values, we can efficiently re-use computations across\nmultiple sampling stages. 
Our experiments on an interstate lane-change scenario\nshow our method is viable for testing simulated AV-pipelines, efficiently\nestimating failure probabilities for STL specifications based on real traffic\nrules. We produce better estimates than Monte-Carlo and importance sampling in\nfewer simulations.\n","authors":["Craig Innes","Subramanian Ramamoorthy"],"pdf_url":"https://arxiv.org/pdf/2405.15771v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17238v1","updated":"2024-07-24T12:53:26Z","published":"2024-07-24T12:53:26Z","title":"Pretrained Visual Representations in Reinforcement Learning","summary":" Visual reinforcement learning (RL) has made significant progress in recent\nyears, but the choice of visual feature extractor remains a crucial design\ndecision. This paper compares the performance of RL algorithms that train a\nconvolutional neural network (CNN) from scratch with those that utilize\npre-trained visual representations (PVRs). We evaluate the Dormant Ratio\nMinimization (DRM) algorithm, a state-of-the-art visual RL method, against\nthree PVRs: ResNet18, DINOv2, and Visual Cortex (VC). We use the Metaworld\nPush-v2 and Drawer-Open-v2 tasks for our comparison. Our results show that the\nchoice of training from scratch compared to using PVRs for maximising\nperformance is task-dependent, but PVRs offer advantages in terms of reduced\nreplay buffer size and faster training times. We also identify a strong\ncorrelation between the dormant ratio and model performance, highlighting the\nimportance of exploration in visual RL. Our study provides insights into the\ntrade-offs between training from scratch and using PVRs, informing the design\nof future visual RL algorithms.\n","authors":["Emlyn Williams","Athanasios Polydoros"],"pdf_url":"https://arxiv.org/pdf/2407.17238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17236v1","updated":"2024-07-24T12:45:02Z","published":"2024-07-24T12:45:02Z","title":"Statistical Batch-Based Bearing Fault Detection","summary":" In the domain of rotating machinery, bearings are vulnerable to different\nmechanical faults, including ball, inner, and outer race faults. Various\ntechniques can be used in condition-based monitoring, from classical signal\nanalysis to deep learning methods. Based on the complex working conditions of\nrotary machines, multivariate statistical process control charts such as\nHotelling's $T^2$ and Squared Prediction Error are useful for providing early\nwarnings. However, these methods are rarely applied to condition monitoring of\nrotating machinery due to the univariate nature of the datasets. In the present\npaper, we propose a multivariate statistical process control-based fault\ndetection method that utilizes multivariate data composed of Fourier transform\nfeatures extracted for fixed-time batches. Our approach makes use of the\nmultidimensional nature of Fourier transform characteristics, which record more\ndetailed information about the machine's status, in an effort to enhance early\ndefect detection and diagnosis. Experiments with varying vibration measurement\nlocations (Fan End, Drive End), fault types (ball, inner, and outer race\nfaults), and motor loads (0-3 horsepower) are used to validate the suggested\napproach. 
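The bearing-fault entry above builds multivariate control charts on Fourier features extracted for fixed-time batches. A bare-bones version of that pipeline (FFT-magnitude features per batch, Hotelling's T^2 against a limit estimated from healthy batches) is sketched below on simulated vibration signals; the sampling rate, bin counts and injected fault harmonic are all invented.

import numpy as np

rng = np.random.default_rng(0)
fs, batch_len, n_bins = 2048, 2048, 16

def batch_features(signal):
    """FFT magnitude spectrum of one batch, pooled into a few frequency bins."""
    mag = np.abs(np.fft.rfft(signal))[1:]            # drop the DC component
    return mag.reshape(n_bins, -1).mean(axis=1)

healthy = np.stack([batch_features(rng.normal(size=batch_len)) for _ in range(200)])
mu = healthy.mean(axis=0)
cov_inv = np.linalg.pinv(np.cov(healthy, rowvar=False))

def t2(x):
    d = batch_features(x) - mu
    return d @ cov_inv @ d                            # Hotelling's T^2 statistic

limit = np.quantile([t2(rng.normal(size=batch_len)) for _ in range(200)], 0.99)
t = np.arange(batch_len) / fs
faulty = rng.normal(size=batch_len) + 0.8 * np.sin(2 * np.pi * 157 * t)   # added fault harmonic
print(t2(rng.normal(size=batch_len)) < limit, t2(faulty) > limit)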
The outcomes illustrate our method's effectiveness in fault detection\nand point to possible broader uses in industrial maintenance.\n","authors":["Victoria Jorrya","Zina-Sabrina Duma","Tuomas Sihvonen","Satu-Pia Reinikainen","Lassi Roininen"],"pdf_url":"https://arxiv.org/pdf/2407.17236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02263v3","updated":"2024-07-24T12:36:41Z","published":"2024-07-02T13:40:29Z","title":"FreeCG: Free the Design Space of Clebsch-Gordan Transform for Machine\n Learning Force Fields","summary":" The Clebsch-Gordan Transform (CG transform) effectively encodes many-body\ninteractions. Many studies have proven its accuracy in depicting atomic\nenvironments, although this comes with high computational needs. The\ncomputational burden of this challenge is hard to reduce due to the need for\npermutation equivariance, which limits the design space of the CG transform\nlayer. We show that, implementing the CG transform layer on\npermutation-invariant inputs allows complete freedom in the design of this\nlayer without affecting symmetry. Developing further on this premise, our idea\nis to create a CG transform layer that operates on permutation-invariant\nabstract edges generated from real edge information. We bring in group CG\ntransform with sparse path, abstract edges shuffling, and attention enhancer to\nform a powerful and efficient CG transform layer. Our method, known as FreeCG,\nachieves State-of-The-Art (SoTA) results in force prediction for MD17, rMD17,\nMD22, and property prediction in QM9 datasets with notable enhancement. The\nextensibility to other models is also examined. Molecular dynamics simulations\nare carried out on MD17 and other periodic systems, including water and LiPS,\nshowcasing the capacity for real-world applications of FreeCG. It introduces a\nnovel paradigm for carrying out efficient and expressive CG transform in future\ngeometric neural network designs.\n","authors":["Shihao Shao","Haoran Geng","Zun Wang","Qinghua Cui"],"pdf_url":"https://arxiv.org/pdf/2407.02263v3.pdf","comment":"29 pages, 8 tables, 10 figures"},{"id":"http://arxiv.org/abs/2407.17228v1","updated":"2024-07-24T12:32:08Z","published":"2024-07-24T12:32:08Z","title":"A Hybrid Federated Kernel Regularized Least Squares Algorithm","summary":" Federated learning is becoming an increasingly viable and accepted strategy\nfor building machine learning models in critical privacy-preserving scenarios\nsuch as clinical settings. Often, the data involved is not limited to clinical\ndata but also includes additional omics features (e.g. proteomics).\nConsequently, data is distributed not only across hospitals but also across\nomics centers, which are labs capable of generating such additional features\nfrom biosamples. This scenario leads to a hybrid setting where data is\nscattered both in terms of samples and features. In this hybrid setting, we\npresent an efficient reformulation of the Kernel Regularized Least Squares\nalgorithm, introduce two variants and validate them using well-established\ndatasets. 
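For readers unfamiliar with the base learner in the hybrid federated entry above, the centralized Kernel Regularized Least Squares estimator has a simple closed form, sketched below with an RBF kernel. The federated and hybrid reformulations across hospitals and omics centers are the paper's contribution and are not shown.

import numpy as np

def rbf_kernel(A, B, gamma=0.5):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

rng = np.random.default_rng(0)
X = rng.normal(size=(80, 5))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=80)

lam = 1e-2
K = rbf_kernel(X, X)
alpha = np.linalg.solve(K + lam * len(X) * np.eye(len(X)), y)   # KRLS dual coefficients

X_test = rng.normal(size=(5, 5))
y_pred = rbf_kernel(X_test, X) @ alpha
print(np.round(y_pred, 3))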
Lastly, we discuss security measures to defend against possible\nattacks.\n","authors":["Celeste Damiani","Yulia Rodina","Sergio Decherchi"],"pdf_url":"https://arxiv.org/pdf/2407.17228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17226v1","updated":"2024-07-24T12:26:21Z","published":"2024-07-24T12:26:21Z","title":"Sublinear Regret for An Actor-Critic Algorithm in Continuous-Time\n Linear-Quadratic Reinforcement Learning","summary":" We study reinforcement learning (RL) for a class of continuous-time\nlinear-quadratic (LQ) control problems for diffusions where volatility of the\nstate processes depends on both state and control variables. We apply a\nmodel-free approach that relies neither on knowledge of model parameters nor on\ntheir estimations, and devise an actor-critic algorithm to learn the optimal\npolicy parameter directly. Our main contributions include the introduction of a\nnovel exploration schedule and a regret analysis of the proposed algorithm. We\nprovide the convergence rate of the policy parameter to the optimal one, and\nprove that the algorithm achieves a regret bound of $O(N^{\\frac{3}{4}})$ up to\na logarithmic factor. We conduct a simulation study to validate the theoretical\nresults and demonstrate the effectiveness and reliability of the proposed\nalgorithm. We also perform numerical comparisons between our method and those\nof the recent model-based stochastic LQ RL studies adapted to the state- and\ncontrol-dependent volatility setting, demonstrating a better performance of the\nformer in terms of regret bounds.\n","authors":["Yilie Huang","Yanwei Jia","Xun Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.17226v1.pdf","comment":"42 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.08582v2","updated":"2024-07-24T12:25:17Z","published":"2023-10-12T17:59:50Z","title":"Tree-Planner: Efficient Close-loop Task Planning with Large Language\n Models","summary":" This paper studies close-loop task planning, which refers to the process of\ngenerating a sequence of skills (a plan) to accomplish a specific goal while\nadapting the plan based on real-time observations. Recently, prompting Large\nLanguage Models (LLMs) to generate actions iteratively has become a prevalent\nparadigm due to its superior performance and user-friendliness. However, this\nparadigm is plagued by two inefficiencies: high token consumption and redundant\nerror correction, both of which hinder its scalability for large-scale testing\nand applications. To address these issues, we propose Tree-Planner, which\nreframes task planning with LLMs into three distinct phases: plan sampling,\naction tree construction, and grounded deciding. Tree-Planner starts by using\nan LLM to sample a set of potential plans before execution, followed by the\naggregation of them to form an action tree. Finally, the LLM performs a\ntop-down decision-making process on the tree, taking into account real-time\nenvironmental information. Experiments show that Tree-Planner achieves\nstate-of-the-art performance while maintaining high efficiency. By decomposing\nLLM queries into a single plan-sampling call and multiple grounded-deciding\ncalls, a considerable part of the prompt are less likely to be repeatedly\nconsumed. As a result, token consumption is reduced by 92.2% compared to the\npreviously best-performing model. 
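The Tree-Planner entry above aggregates sampled plans into an action tree before grounded deciding. The sketch below builds the obvious prefix tree over action sequences so a decision procedure can walk shared prefixes instead of re-reading whole plans; the data structure is chosen for illustration and is not taken from the paper's code.

from collections import defaultdict

def build_action_tree(plans):
    """Merge sampled plans (lists of action strings) into a nested prefix tree."""
    tree = lambda: defaultdict(tree)
    root = tree()
    for plan in plans:
        node = root
        for action in plan:
            node = node[action]
    return root

def show(node, depth=0):
    for action, child in node.items():
        print("  " * depth + action)
        show(child, depth + 1)

plans = [
    ["walk to kitchen", "open fridge", "grab milk"],
    ["walk to kitchen", "open fridge", "grab juice"],
    ["walk to kitchen", "open cabinet", "grab cup"],
]
show(build_action_tree(plans))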
Additionally, by enabling backtracking on the\naction tree as needed, the correction process becomes more flexible, leading to\na 40.5% decrease in error corrections.\n","authors":["Mengkang Hu","Yao Mu","Xinmiao Yu","Mingyu Ding","Shiguang Wu","Wenqi Shao","Qiguang Chen","Bin Wang","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2310.08582v2.pdf","comment":"Published in ICLR 2024"},{"id":"http://arxiv.org/abs/2402.02333v2","updated":"2024-07-24T12:23:41Z","published":"2024-02-04T04:00:33Z","title":"Copyright Protection in Generative AI: A Technical Perspective","summary":" Generative AI has witnessed rapid advancement in recent years, expanding\nits capabilities to create synthesized content such as text, images, audio,\nand code. The high fidelity and authenticity of contents generated by these\nDeep Generative Models (DGMs) have sparked significant copyright concerns.\nThere have been various legal debates on how to effectively safeguard\ncopyrights in DGMs. This work delves into this issue by providing a\ncomprehensive overview of copyright protection from a technical perspective. We\nexamine it from two distinct viewpoints: the copyrights pertaining to the source\ndata held by the data owners and those of the generative models maintained by\nthe model builders. For data copyright, we delve into methods by which data owners can\nprotect their content and how DGMs can be utilized without infringing upon these\nrights. For model copyright, our discussion extends to strategies for\npreventing model theft and identifying outputs generated by specific models.\nFinally, we highlight the limitations of existing techniques and identify areas\nthat remain unexplored. Furthermore, we discuss prospective directions for the\nfuture of copyright protection, underscoring its importance for the sustainable\nand ethical development of Generative AI.\n","authors":["Jie Ren","Han Xu","Pengfei He","Yingqian Cui","Shenglai Zeng","Jiankun Zhang","Hongzhi Wen","Jiayuan Ding","Pei Huang","Lingjuan Lyu","Hui Liu","Yi Chang","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2402.02333v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.17216v1","updated":"2024-07-24T12:15:59Z","published":"2024-07-24T12:15:59Z","title":"An Adaptive Second-order Method for a Class of Nonconvex Nonsmooth\n Composite Optimization","summary":" This paper explores a specific type of nonconvex sparsity-promoting\nregularization problems, namely those involving $\\ell_p$-norm regularization,\nin conjunction with a twice continuously differentiable loss function. We\npropose a novel second-order algorithm designed to effectively address this\nclass of challenging nonconvex and nonsmooth problems, showcasing several\ninnovative features: (i) The use of an alternating strategy to solve a\nreweighted $\\ell_1$ regularized subproblem and the subspace approximate Newton\nstep. (ii) The reweighted $\\ell_1$ regularized subproblem relies on a convex\napproximation to the nonconvex regularization term, enabling a closed-form\nsolution characterized by the soft-thresholding operator. This feature allows\nour method to be applied to various nonconvex regularization problems. (iii)\nOur algorithm ensures that the iterates maintain their sign values and that\nnonzero components are kept away from 0 for a sufficient number of iterations,\neventually transitioning to a perturbed Newton method. 
(iv) We provide\ntheoretical guarantees of global convergence, local superlinear convergence in\nthe presence of the Kurdyka-\\L ojasiewicz (KL) property, and local quadratic\nconvergence when employing the exact Newton step in our algorithm. We also\nshowcase the effectiveness of our approach through experiments on a diverse set\nof model prediction problems.\n","authors":["Hao Wang","Xiangyu Yang","Yichen Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.17216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17214v1","updated":"2024-07-24T12:14:19Z","published":"2024-07-24T12:14:19Z","title":"Application of Machine Learning and Convex Limiting to Subgrid Flux\n Modeling in the Shallow-Water Equations","summary":" We propose a combination of machine learning and flux limiting for\nproperty-preserving subgrid scale modeling in the context of flux-limited\nfinite volume methods for the one-dimensional shallow-water equations. The\nnumerical fluxes of a conservative target scheme are fitted to the coarse-mesh\naverages of a monotone fine-grid discretization using a neural network to\nparametrize the subgrid scale components. To ensure positivity preservation and\nthe validity of local maximum principles, we use a flux limiter that constrains\nthe intermediate states of an equivalent fluctuation form to stay in a convex\nadmissible set. The results of our numerical studies confirm that the proposed\ncombination of machine learning with monolithic convex limiting produces\nmeaningful closures even in scenarios for which the network was not trained.\n","authors":["Ilya Timofeyev","Alexey Schwarzmann","Dmitri Kuzmin"],"pdf_url":"https://arxiv.org/pdf/2407.17214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17213v1","updated":"2024-07-24T12:11:09Z","published":"2024-07-24T12:11:09Z","title":"Spectrum-Informed Multistage Neural Networks: Multiscale Function\n Approximators of Machine Precision","summary":" Deep learning frameworks have become powerful tools for approaching\nscientific problems such as turbulent flow, which has wide-ranging\napplications. In practice, however, existing scientific machine learning\napproaches have difficulty fitting complex, multi-scale dynamical systems to\nvery high precision, as required in scientific contexts. We propose using the\nnovel multistage neural network approach with a spectrum-informed\ninitialization to learn the residue from the previous stage, utilizing the\nspectral biases associated with neural networks to capture high frequency\nfeatures in the residue, and successfully tackle the spectral bias of neural\nnetworks. This approach allows the neural network to fit target functions to\ndouble floating-point machine precision $O(10^{-16})$.\n","authors":["Jakin Ng","Yongji Wang","Ching-Yao Lai"],"pdf_url":"https://arxiv.org/pdf/2407.17213v1.pdf","comment":"8 pages, 3 figures, ICML 2024 workshop (AI for Science: Scaling in AI\n for Scientific Discovery)"},{"id":"http://arxiv.org/abs/2407.17209v1","updated":"2024-07-24T12:09:07Z","published":"2024-07-24T12:09:07Z","title":"Nonverbal Immediacy Analysis in Education: A Multimodal Computational\n Model","summary":" This paper introduces a novel computational approach for analyzing nonverbal\nsocial behavior in educational settings. Integrating multimodal behavioral\ncues, including facial expressions, gesture intensity, and spatial dynamics,\nthe model assesses the nonverbal immediacy (NVI) of teachers from RGB classroom\nvideos. 
A dataset of 400 30-second video segments from German classrooms was\nconstructed for model training and validation. The gesture intensity regressor\nachieved a correlation of 0.84, the perceived distance regressor 0.55, and the\nNVI model 0.44 with median human ratings. The model demonstrates the potential\nto provide a valuable support in nonverbal behavior assessment, approximating\nthe accuracy of individual human raters. Validated against both questionnaire\ndata and trained observer ratings, our models show moderate to strong\ncorrelations with relevant educational outcomes, indicating their efficacy in\nreflecting effective teaching behaviors. This research advances the objective\nassessment of nonverbal communication behaviors, opening new pathways for\neducational research.\n","authors":["Uroš Petković","Jonas Frenkel","Olaf Hellwich","Rebecca Lazarides"],"pdf_url":"https://arxiv.org/pdf/2407.17209v1.pdf","comment":"12 pages, 3 figures. Camera-ready version for the SAB 2024: 17th\n International Conference on the Simulation of Adaptive Behavior"},{"id":"http://arxiv.org/abs/2407.17206v1","updated":"2024-07-24T12:06:09Z","published":"2024-07-24T12:06:09Z","title":"Take a Step and Reconsider: Sequence Decoding for Self-Improved Neural\n Combinatorial Optimization","summary":" The constructive approach within Neural Combinatorial Optimization (NCO)\ntreats a combinatorial optimization problem as a finite Markov decision\nprocess, where solutions are built incrementally through a sequence of\ndecisions guided by a neural policy network. To train the policy, recent\nresearch is shifting toward a 'self-improved' learning methodology that\naddresses the limitations of reinforcement learning and supervised approaches.\nHere, the policy is iteratively trained in a supervised manner, with solutions\nderived from the current policy serving as pseudo-labels. The way these\nsolutions are obtained from the policy determines the quality of the\npseudo-labels. In this paper, we present a simple and problem-independent\nsequence decoding method for self-improved learning based on sampling sequences\nwithout replacement. We incrementally follow the best solution found and repeat\nthe sampling process from intermediate partial solutions. By modifying the\npolicy to ignore previously sampled sequences, we force it to consider only\nunseen alternatives, thereby increasing solution diversity. Experimental\nresults for the Traveling Salesman and Capacitated Vehicle Routing Problem\ndemonstrate its strong performance. Furthermore, our method outperforms\nprevious NCO approaches on the Job Shop Scheduling Problem.\n","authors":["Jonathan Pirnay","Dominik G. Grimm"],"pdf_url":"https://arxiv.org/pdf/2407.17206v1.pdf","comment":"Accepted at ECAI-2024"},{"id":"http://arxiv.org/abs/2407.17200v1","updated":"2024-07-24T12:00:30Z","published":"2024-07-24T12:00:30Z","title":"Generalization Bounds of Surrogate Policies for Combinatorial\n Optimization Problems","summary":" A recent stream of structured learning approaches has improved the practical\nstate of the art for a range of combinatorial optimization problems with\ncomplex objectives encountered in operations research. Such approaches train\npolicies that chain a statistical model with a surrogate combinatorial\noptimization oracle to map any instance of the problem to a feasible solution.\nThe key idea is to exploit the statistical distribution over instances instead\nof dealing with instances separately. 
However learning such policies by risk\nminimization is challenging because the empirical risk is piecewise constant in\nthe parameters, and few theoretical guarantees have been provided so far. In\nthis article, we investigate methods that smooth the risk by perturbing the\npolicy, which eases optimization and improves generalization. Our main\ncontribution is a generalization bound that controls the perturbation bias, the\nstatistical learning error, and the optimization error. Our analysis relies on\nthe introduction of a uniform weak property, which captures and quantifies the\ninterplay of the statistical model and the surrogate combinatorial optimization\noracle. This property holds under mild assumptions on the statistical model,\nthe surrogate optimization, and the instance data distribution. We illustrate\nthe result on a range of applications such as stochastic vehicle scheduling. In\nparticular, such policies are relevant for contextual stochastic optimization\nand our results cover this case.\n","authors":["Pierre-Cyril Aubin-Frankowski","Yohann De Castro","Axel Parmentier","Alessandro Rudi"],"pdf_url":"https://arxiv.org/pdf/2407.17200v1.pdf","comment":"10 pages main document, 3 pages supplement"},{"id":"http://arxiv.org/abs/2407.17195v1","updated":"2024-07-24T11:55:18Z","published":"2024-07-24T11:55:18Z","title":"Surrogate-guided optimization in quantum networks","summary":" We propose an optimization algorithm to improve the design and performance of\nquantum communication networks. When physical architectures become too complex\nfor analytical methods, numerical simulation becomes essential to study quantum\nnetwork behavior. Although highly informative, these simulations involve\ncomplex numerical functions without known analytical forms, making traditional\noptimization techniques that assume continuity, differentiability, or convexity\ninapplicable. Additionally, quantum network simulations are computationally\ndemanding, rendering global approaches like Simulated Annealing or genetic\nalgorithms,\n which require extensive function evaluations, impractical. We introduce a\nmore efficient optimization workflow using machine learning models, which serve\nas surrogates for a given objective function. We demonstrate the effectiveness\nof our approach by applying it to three well-known optimization problems in\nquantum networking: quantum memory allocation for multiple network nodes,\ntuning an experimental parameter in all physical links of a quantum\nentanglement switch, and finding efficient protocol settings within a large\nasymmetric quantum network. The solutions found by our algorithm consistently\noutperform those obtained with our baseline approaches -- Simulated Annealing\nand Bayesian optimization -- in the allotted time limit by up to 18\\% and 20\\%,\nrespectively. Our framework thus allows for more comprehensive quantum network\nstudies, integrating surrogate-assisted optimization with existing quantum\nnetwork simulators.\n","authors":["Luise Prielinger","Álvaro G. 
Iñesta","Gayane Vardoyan"],"pdf_url":"https://arxiv.org/pdf/2407.17195v1.pdf","comment":"20 pages (including supplementary notes), 12 figures"},{"id":"http://arxiv.org/abs/2406.02765v3","updated":"2024-07-24T11:35:26Z","published":"2024-06-04T20:33:29Z","title":"Discovering Dynamic Symbolic Policies with Genetic Programming","summary":" Artificial intelligence techniques are increasingly being applied to solve\ncontrol problems, but often rely on black-box methods without transparent\noutput generation. To improve the interpretability and transparency in control\nsystems, models can be defined as white-box symbolic policies described by\nmathematical expressions. While current approaches to learn symbolic policies\nfocus on static policies that directly map observations to control signals,\nthese may fail in partially observable and volatile environments. We instead\nconsider dynamic symbolic policies with memory, optimised with genetic\nprogramming. The resulting policies are robust, and consist of easy to\ninterpret coupled differential equations. Our results show that dynamic\nsymbolic policies compare with black-box policies on a variety of control\ntasks. Furthermore, the benefit of the memory in dynamic policies is\ndemonstrated on experiments where static policies fall short. Overall, we\npresent a method for evolving high-performing symbolic policies that offer\ninterpretability and transparency, which lacks in black-box models.\n","authors":["Sigur de Vries","Sander Keemink","Marcel van Gerven"],"pdf_url":"https://arxiv.org/pdf/2406.02765v3.pdf","comment":"19 pages including references and appendix, 5 figures, 1 algorithm, 5\n tables"},{"id":"http://arxiv.org/abs/2407.17182v1","updated":"2024-07-24T11:34:24Z","published":"2024-07-24T11:34:24Z","title":"Solving the Electrical Impedance Tomography Problem with a DeepONet Type\n Neural Network: Theory and Application","summary":" In this work, we consider the non-invasive medical imaging modality of\nElectrical Impedance Tomography, where the problem is to recover the\nconductivity in a medium from a set of data that arises out of a\ncurrent-to-voltage map (Neumann-to-Dirichlet operator) defined on the boundary\nof the medium. We formulate this inverse problem as an operator-learning\nproblem where the goal is to learn the implicitly defined operator-to-function\nmap between the space of Neumann-to-Dirichlet operators to the space of\nadmissible conductivities. Subsequently, we use an operator-learning\narchitecture, popularly called DeepONets, to learn this operator-to-function\nmap. Thus far, most of the operator learning architectures have been\nimplemented to learn operators between function spaces. In this work, we\ngeneralize the earlier works and use a DeepONet to actually {learn an\noperator-to-function} map. We provide a Universal Approximation Theorem type\nresult which guarantees that this implicitly defined operator-to-function map\nbetween the space of Neumann-to-Dirichlet operator to the space of conductivity\nfunction can be approximated to an arbitrary degree using such a DeepONet.\nFurthermore, we provide a computational implementation of our proposed approach\nand compare it against a standard baseline. 
We show that the proposed approach\nachieves good reconstructions and outperforms the baseline method in our\nexperiments.\n","authors":["Anuj Abhishek","Thilo Strauss"],"pdf_url":"https://arxiv.org/pdf/2407.17182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17174v1","updated":"2024-07-24T11:24:25Z","published":"2024-07-24T11:24:25Z","title":"NarrationDep: Narratives on Social Media For Automatic Depression\n Detection","summary":" Social media posts provide valuable insight into the narrative of users and\ntheir intentions, including providing an opportunity to automatically model\nwhether a social media user is depressed or not. The challenge lies in\nfaithfully modelling user narratives from their online social media posts,\nwhich could potentially be useful in several different applications. We have\ndeveloped a novel and effective model called \\texttt{NarrationDep}, which\nfocuses on detecting narratives associated with depression. By analyzing a\nuser's tweets, \\texttt{NarrationDep} accurately identifies crucial narratives.\n\\texttt{NarrationDep} is a deep learning framework that jointly models\nindividual user tweet representations and clusters of users' tweets. As a\nresult, \\texttt{NarrationDep} is characterized by a novel two-layer deep\nlearning model: the first layer models using social media text posts, and the\nsecond layer learns semantic representations of tweets associated with a\ncluster. To faithfully model these cluster representations, the second layer\nincorporates a novel component that hierarchically learns from users' posts.\nThe results demonstrate that our framework outperforms other comparative models\nincluding recently developed models on a variety of datasets.\n","authors":["Hamad Zogan","Imran Razzak","Shoaib Jameel","Guandong Xu"],"pdf_url":"https://arxiv.org/pdf/2407.17174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.15260v2","updated":"2024-07-24T11:21:47Z","published":"2022-03-29T06:15:54Z","title":"Efficient Convex Optimization Requires Superlinear Memory","summary":" We show that any memory-constrained, first-order algorithm which minimizes\n$d$-dimensional, $1$-Lipschitz convex functions over the unit ball to\n$1/\\mathrm{poly}(d)$ accuracy using at most $d^{1.25 - \\delta}$ bits of memory\nmust make at least $\\tilde{\\Omega}(d^{1 + (4/3)\\delta})$ first-order queries\n(for any constant $\\delta \\in [0, 1/4]$). Consequently, the performance of such\nmemory-constrained algorithms are a polynomial factor worse than the optimal\n$\\tilde{O}(d)$ query bound for this problem obtained by cutting plane methods\nthat use $\\tilde{O}(d^2)$ memory. This resolves a COLT 2019 open problem of\nWoodworth and Srebro.\n","authors":["Annie Marsden","Vatsal Sharan","Aaron Sidford","Gregory Valiant"],"pdf_url":"https://arxiv.org/pdf/2203.15260v2.pdf","comment":"33 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.16417v2","updated":"2024-07-24T11:19:22Z","published":"2024-07-23T12:00:44Z","title":"On the Utility of Speech and Audio Foundation Models for Marmoset Call\n Analysis","summary":" Marmoset monkeys encode vital information in their calls and serve as a\nsurrogate model for neuro-biologists to understand the evolutionary origins of\nhuman vocal communication. Traditionally analyzed with signal processing-based\nfeatures, recent approaches have utilized self-supervised models pre-trained on\nhuman speech for feature extraction, capitalizing on their ability to learn a\nsignal's intrinsic structure independently of its acoustic domain. 
However, the\nutility of such foundation models remains unclear for marmoset call analysis in\nterms of multi-class classification, bandwidth, and pre-training domain. This\nstudy assesses feature representations derived from speech and general audio\ndomains, across pre-training bandwidths of 4, 8, and 16 kHz for marmoset\ncall-type and caller classification tasks. Results show that models with higher\nbandwidth improve performance, and pre-training on speech or general audio\nyields comparable results, improving over a spectral baseline.\n","authors":["Eklavya Sarkar","Mathew Magimai. -Doss"],"pdf_url":"https://arxiv.org/pdf/2407.16417v2.pdf","comment":"Accepted at Interspeech 2024 satellite event (VIHAR 2024)"},{"id":"http://arxiv.org/abs/2407.17164v1","updated":"2024-07-24T11:12:01Z","published":"2024-07-24T11:12:01Z","title":"Robust Deep Hawkes Process under Label Noise of Both Event and\n Occurrence","summary":" Integrating deep neural networks with the Hawkes process has significantly\nimproved predictive capabilities in finance, health informatics, and\ninformation technology. Nevertheless, these models often face challenges in\nreal-world settings, particularly due to substantial label noise. This issue is\nof significant concern in the medical field, where label noise can arise from\ndelayed updates in electronic medical records or misdiagnoses, leading to\nincreased prediction risks. Our research indicates that deep Hawkes process\nmodels exhibit reduced robustness when dealing with label noise, particularly\nwhen it affects both event types and timing. To address these challenges, we\nfirst investigate the influence of label noise in approximated intensity\nfunctions and present a novel framework, the Robust Deep Hawkes Process (RDHP),\nto overcome the impact of label noise on the intensity function of Hawkes\nmodels, considering both the events and their occurrences. We tested RDHP using\nmultiple open-source benchmarks with synthetic noise and conducted a case study\non obstructive sleep apnea-hypopnea syndrome (OSAHS) in a real-world setting\nwith inherent label noise. The results demonstrate that RDHP can effectively\nperform classification and regression tasks, even in the presence of noise\nrelated to events and their timing. To the best of our knowledge, this is the\nfirst study to successfully address both event and time label noise in deep\nHawkes process models, offering a promising solution for medical applications,\nspecifically in diagnosing OSAHS.\n","authors":["Xiaoyu Tan","Bin Li","Xihe Qiu","Jingjing Huang","Yinghui Xu","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2407.17164v1.pdf","comment":"ECAI2024"},{"id":"http://arxiv.org/abs/2407.17165v1","updated":"2024-07-24T11:12:01Z","published":"2024-07-24T11:12:01Z","title":"Explainable Artificial Intelligence Techniques for Irregular Temporal\n Classification of Multidrug Resistance Acquisition in Intensive Care Unit\n Patients","summary":" Antimicrobial Resistance represents a significant challenge in the Intensive\nCare Unit (ICU), where patients are at heightened risk of Multidrug-Resistant\n(MDR) infections-pathogens resistant to multiple antimicrobial agents. This\nstudy introduces a novel methodology that integrates Gated Recurrent Units\n(GRUs) with advanced intrinsic and post-hoc interpretability techniques for\ndetecting the onset of MDR in patients across time. 
Within interpretability\nmethods, we propose Explainable Artificial Intelligence (XAI) approaches to\nhandle irregular Multivariate Time Series (MTS), introducing Irregular Time\nShapley Additive Explanations (IT-SHAP), a modification of Shapley Additive\nExplanations designed for irregular MTS with Recurrent Neural Networks focused\non temporal outputs. Our methodology aims to identify specific risk factors\nassociated with MDR in ICU patients. GRU with Hadamard's attention demonstrated\nhigh initial specificity and increasing sensitivity over time, correlating with\nincreased nosocomial infection risks during prolonged ICU stays. XAI analysis,\nenhanced by Hadamard attention and IT-SHAP, identified critical factors such as\nprevious non-resistant cultures, specific antibiotic usage patterns, and\nhospital environment dynamics. These insights suggest that early detection of\nat-risk patients can inform interventions such as preventive isolation and\ncustomized treatments, significantly improving clinical outcomes. The proposed\nGRU model for temporal classification achieved an average Receiver Operating\nCharacteristic Area Under the Curve of 78.27 +- 1.26 over time, indicating\nstrong predictive performance. In summary, this study highlights the clinical\nutility of our methodology, which combines predictive accuracy with\ninterpretability, thereby facilitating more effective healthcare interventions\nby professionals.\n","authors":["Óscar Escudero-Arnanz","Cristina Soguero-Ruiz","Joaquín Álvarez-Rodríguez","Antonio G. Marques"],"pdf_url":"https://arxiv.org/pdf/2407.17165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17163v1","updated":"2024-07-24T11:07:20Z","published":"2024-07-24T11:07:20Z","title":"dlordinal: a Python package for deep ordinal classification","summary":" dlordinal is a new Python library that unifies many recent deep ordinal\nclassification methodologies available in the literature. Developed using\nPyTorch as underlying framework, it implements the top performing\nstate-of-the-art deep learning techniques for ordinal classification problems.\nOrdinal approaches are designed to leverage the ordering information present in\nthe target variable. Specifically, it includes loss functions, various output\nlayers, dropout techniques, soft labelling methodologies, and other\nclassification strategies, all of which are appropriately designed to\nincorporate the ordinal information. Furthermore, as the performance metrics to\nassess novel proposals in ordinal classification depend on the distance between\ntarget and predicted classes in the ordinal scale, suitable ordinal evaluation\nmetrics are also included. dlordinal is distributed under the BSD-3-Clause\nlicense and is available at https://github.com/ayrna/dlordinal.\n","authors":["Francisco Bérchez-Moreno","Víctor M. Vargas","Rafael Ayllón-Gavilán","David Guijo-Rubio","César Hervás-Martínez","Juan C. Fernández","Pedro A. Gutiérrez"],"pdf_url":"https://arxiv.org/pdf/2407.17163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17161v1","updated":"2024-07-24T11:05:05Z","published":"2024-07-24T11:05:05Z","title":"Quantum Supervised Learning","summary":" Recent advancements in quantum computing have positioned it as a prospective\nsolution for tackling intricate computational challenges, with supervised\nlearning emerging as a promising domain for its application. 
Despite this\npotential, the field of quantum machine learning is still in its early stages,\nand there persists a level of skepticism regarding a possible near-term quantum\nadvantage. This paper aims to provide a classical perspective on current\nquantum algorithms for supervised learning, effectively bridging traditional\nmachine learning principles with advancements in quantum machine learning.\nSpecifically, this study charts a research trajectory that diverges from the\npredominant focus of quantum machine learning literature, originating from the\nprerequisites of classical methodologies and elucidating the potential impact\nof quantum approaches. Through this exploration, our objective is to deepen the\nunderstanding of the convergence between classical and quantum methods, thereby\nlaying the groundwork for future advancements in both domains and fostering the\ninvolvement of classical practitioners in the field of quantum machine\nlearning.\n","authors":["Antonio Macaluso"],"pdf_url":"https://arxiv.org/pdf/2407.17161v1.pdf","comment":"16 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2407.16680v2","updated":"2024-07-24T10:58:48Z","published":"2024-07-23T17:45:16Z","title":"A Simulation Benchmark for Autonomous Racing with Large-Scale Human Data","summary":" Despite the availability of international prize-money competitions, scaled\nvehicles, and simulation environments, research on autonomous racing and the\ncontrol of sports cars operating close to the limit of handling has been\nlimited by the high costs of vehicle acquisition and management, as well as the\nlimited physics accuracy of open-source simulators. In this paper, we propose a\nracing simulation platform based on the simulator Assetto Corsa to test,\nvalidate, and benchmark autonomous driving algorithms, including reinforcement\nlearning (RL) and classical Model Predictive Control (MPC), in realistic and\nchallenging scenarios. Our contributions include the development of this\nsimulation platform, several state-of-the-art algorithms tailored to the racing\nenvironment, and a comprehensive dataset collected from human drivers.\nAdditionally, we evaluate algorithms in the offline RL setting. All the\nnecessary code (including environment and benchmarks), working examples,\ndatasets, and videos are publicly released and can be found at:\nhttps://assetto-corsa-gym.github.io\n","authors":["Adrian Remonda","Nicklas Hansen","Ayoub Raji","Nicola Musiu","Marko Bertogna","Eduardo Veas","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16680v2.pdf","comment":"Project page and code can be found at:\n \\url{https://assetto-corsa-gym.github.io/}"},{"id":"http://arxiv.org/abs/2407.17156v1","updated":"2024-07-24T10:54:23Z","published":"2024-07-24T10:54:23Z","title":"Path Following and Stabilisation of a Bicycle Model using a\n Reinforcement Learning Approach","summary":" Over the years, complex control approaches have been developed to control the\nmotion of a bicycle. Reinforcement Learning (RL), a branch of machine learning,\npromises easy deployment of so-called agents. Deployed agents are increasingly\nconsidered as an alternative to controllers for mechanical systems. The present\nwork introduces an RL approach to do path following with a virtual bicycle\nmodel while simultaneously stabilising it laterally. The bicycle, modelled as\nthe Whipple benchmark model and using multibody system dynamics, has no\nstabilisation aids. 
The agent succeeds in both path following and stabilisation\nof the bicycle model exclusively by outputting steering angles, which are\nconverted into steering torques via a PD controller. Curriculum learning is\napplied as a state-of-the-art training strategy. Different settings for the\nimplemented RL framework are investigated and compared to each other. The\nperformance of the deployed agents is evaluated using different types of paths\nand measurements. The ability of the deployed agents to do path following and\nstabilisation of the bicycle model travelling between 2m/s and 7m/s along\ncomplex paths including full circles, slalom manoeuvres, and lane changes is\ndemonstrated. Explanatory methods for machine learning are used to analyse the\nfunctionality of a deployed agent and link the introduced RL approach with\nresearch in the field of bicycle dynamics.\n","authors":["Sebastian Weyrer","Peter Manzl","A. L. Schwab","Johannes Gerstmayr"],"pdf_url":"https://arxiv.org/pdf/2407.17156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09615v5","updated":"2024-07-24T10:49:23Z","published":"2022-05-19T15:13:00Z","title":"EXACT: How to Train Your Accuracy","summary":" Classification tasks are usually evaluated in terms of accuracy. However,\naccuracy is discontinuous and cannot be directly optimized using gradient\nascent. Popular methods minimize cross-entropy, hinge loss, or other surrogate\nlosses, which can lead to suboptimal results. In this paper, we propose a new\noptimization framework by introducing stochasticity to a model's output and\noptimizing expected accuracy, i.e. accuracy of the stochastic model. Extensive\nexperiments on linear models and deep image classification show that the\nproposed optimization method is a powerful alternative to widely used\nclassification losses.\n","authors":["Ivan Karpukhin","Stanislav Dereka","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2205.09615v5.pdf","comment":"Pattern Recognition Letters (2024)"},{"id":"http://arxiv.org/abs/2309.08546v3","updated":"2024-07-24T10:16:59Z","published":"2023-09-15T17:10:51Z","title":"Towards Robust Continual Learning with Bayesian Adaptive Moment\n Regularization","summary":" The pursuit of long-term autonomy mandates that machine learning models must\ncontinuously adapt to their changing environments and learn to solve new tasks.\nContinual learning seeks to overcome the challenge of catastrophic forgetting,\nwhere learning to solve new tasks causes a model to forget previously learnt\ninformation. Prior-based continual learning methods are appealing as they are\ncomputationally efficient and do not require auxiliary models or data storage.\nHowever, prior-based approaches typically fail on important benchmarks and are\nthus limited in their potential applications compared to their memory-based\ncounterparts. We introduce Bayesian adaptive moment regularization (BAdam), a\nnovel prior-based method that better constrains parameter growth, reducing\ncatastrophic forgetting. 
Our method boasts a range of desirable properties such\nas being lightweight and task label-free, converging quickly, and offering\ncalibrated uncertainty that is important for safe real-world deployment.\nResults show that BAdam achieves state-of-the-art performance for prior-based\nmethods on challenging single-headed class-incremental experiments such as\nSplit MNIST and Split FashionMNIST, and does so without relying on task labels\nor discrete task boundaries.\n","authors":["Jack Foster","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2309.08546v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.07361v2","updated":"2024-07-24T09:56:06Z","published":"2020-08-14T11:00:13Z","title":"Logistic regression models for patient-level prediction based on massive\n observational data: Do we need all data?","summary":" Objective: Provide guidance on sample size considerations for developing\npredictive models by empirically establishing the adequate sample size, which\nbalances the competing objectives of improving model performance and reducing\nmodel complexity as well as computational requirements.\n Materials and Methods: We empirically assess the effect of sample size on\nprediction performance and model complexity by generating learning curves for\n81 prediction problems (23 outcomes predicted in a depression cohort, 58\noutcomes predicted in a hypertension cohort) in three large observational\nhealth databases, requiring training of 17,248 prediction models. The adequate\nsample size was defined as the sample size for which the performance of a model\nequalled the maximum model performance minus a small threshold value.\n Results: The adequate sample size achieves a median reduction of the number\nof observations of 9.5%, 37.3%, 58.5%, and 78.5% for the thresholds of 0.001,\n0.005, 0.01, and 0.02, respectively. The median reduction of the number of\npredictors in the models was 8.6%, 32.2%, 48.2%, and 68.3% for the thresholds\nof 0.001, 0.005, 0.01, and 0.02, respectively.\n Discussion: Based on our results a conservative, yet significant, reduction\nin sample size and model complexity can be estimated for future prediction\nwork. Though, if a researcher is willing to generate a learning curve a much\nlarger reduction of the model complexity may be possible as suggested by a\nlarge outcome-dependent variability.\n Conclusion: Our results suggest that in most cases only a fraction of the\navailable data was sufficient to produce a model close to the performance of\none developed on the full data set, but with a substantially reduced model\ncomplexity.\n","authors":["Luis H. John","Jan A. Kors","Jenna M. Reps","Patrick B. Ryan","Peter R. Rijnbeek"],"pdf_url":"https://arxiv.org/pdf/2008.07361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17125v1","updated":"2024-07-24T09:48:48Z","published":"2024-07-24T09:48:48Z","title":"Behavioral Testing: Can Large Language Models Implicitly Resolve\n Ambiguous Entities?","summary":" One of the major aspects contributing to the striking performance of large\nlanguage models (LLMs) is the vast amount of factual knowledge accumulated\nduring pre-training. Yet, many LLMs suffer from self-inconsistency, which\nraises doubts about their trustworthiness and reliability. In this paper, we\nfocus on entity type ambiguity and analyze current state-of-the-art LLMs for\ntheir proficiency and consistency in applying their factual knowledge when\nprompted for entities under ambiguity. 
To do so, we propose an evaluation\nprotocol that disentangles knowing from applying knowledge, and test\nstate-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform\npoorly with ambiguous prompts, achieving only 80% accuracy. Our results further\ndemonstrate systematic discrepancies in LLM behavior and their failure to\nconsistently apply information, indicating that the models can exhibit\nknowledge without being able to utilize it, significant biases for preferred\nreadings, as well as self-inconsistencies. Our study highlights the importance\nof handling entity ambiguity in the future for more trustworthy LLMs.\n","authors":["Anastasiia Sedova","Robert Litschko","Diego Frassinelli","Benjamin Roth","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2407.17125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06580v2","updated":"2024-07-24T09:38:49Z","published":"2024-02-09T17:55:01Z","title":"SAE: Single Architecture Ensemble Neural Networks","summary":" Ensembles of separate neural networks (NNs) have shown superior accuracy and\nconfidence calibration over a single NN across tasks. To improve the hardware\nefficiency of ensembles of separate NNs, recent methods create ensembles within\na single network via adding early exits or considering multi input multi output\napproaches. However, it is unclear which of these methods is the most effective\nfor a given task, needing a manual and separate search through each method. Our\nnovel Single Architecture Ensemble (SAE) framework enables an automatic and\njoint search through the early exit and multi input multi output configurations\nand their previously unobserved in-between combinations. SAE consists of two\nparts: a scalable search space that generalises the previous methods and their\nin-between configurations, and an optimisation objective that allows learning\nthe optimal configuration for a given task. Our image classification and\nregression experiments show that with SAE we can automatically find diverse\nconfigurations that fit the task, achieving competitive accuracy or confidence\ncalibration to baselines while reducing the compute operations or parameter\ncount by up to $1.5{\\sim}3.7\\times$.\n","authors":["Martin Ferianc","Hongxiang Fan","Miguel Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2402.06580v2.pdf","comment":"Accepted at BMVC'24"},{"id":"http://arxiv.org/abs/2407.17120v1","updated":"2024-07-24T09:30:04Z","published":"2024-07-24T09:30:04Z","title":"Parameter-Efficient Fine-Tuning for Continual Learning: A Neural Tangent\n Kernel Perspective","summary":" Parameter-efficient fine-tuning for continual learning (PEFT-CL) has shown\npromise in adapting pre-trained models to sequential tasks while mitigating the\ncatastrophic forgetting problem. However, understanding the mechanisms that\ndictate continual performance in this paradigm remains elusive. To tackle this\ncomplexity, we undertake a rigorous analysis of PEFT-CL dynamics to derive\nrelevant metrics for continual scenarios using Neural Tangent Kernel (NTK)\ntheory. With the aid of NTK as a mathematical analysis tool, we recast the\nchallenge of test-time forgetting into the quantifiable generalization gaps\nduring training, identifying three key factors that influence these gaps and\nthe performance of PEFT-CL: training sample size, task-level feature\northogonality, and regularization. 
To address these challenges, we introduce\nNTK-CL, a novel framework that eliminates task-specific parameter storage while\nadaptively generating task-relevant features. Aligning with theoretical\nguidance, NTK-CL triples the feature representation of each sample,\ntheoretically and empirically reducing the magnitude of both task-interplay and\ntask-specific generalization gaps. Grounded in NTK analysis, our approach\nimposes an adaptive exponential moving average mechanism and constraints on\ntask-level feature orthogonality, maintaining intra-task NTK forms while\nattenuating inter-task NTK forms. Ultimately, by fine-tuning optimizable\nparameters with appropriate regularization, NTK-CL achieves state-of-the-art\nperformance on established PEFT-CL benchmarks. This work provides a theoretical\nfoundation for understanding and improving PEFT-CL models, offering insights\ninto the interplay between feature representation, task orthogonality, and\ngeneralization, contributing to the development of more efficient continual\nlearning systems.\n","authors":["Jingren Liu","Zhong Ji","YunLong Yu","Jiale Cao","Yanwei Pang","Jungong Han","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2407.17120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17147v3","updated":"2024-07-24T09:28:11Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). 
These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17117v1","updated":"2024-07-24T09:25:54Z","published":"2024-07-24T09:25:54Z","title":"EverAdapt: Continuous Adaptation for Dynamic Machine Fault Diagnosis\n Environments","summary":" Unsupervised Domain Adaptation (UDA) has emerged as a key solution in\ndata-driven fault diagnosis, addressing domain shift where models underperform\nin changing environments. However, under the realm of continually changing\nenvironments, UDA tends to underperform on previously seen domains when\nadapting to new ones - a problem known as catastrophic forgetting. To address\nthis limitation, we introduce the EverAdapt framework, specifically designed\nfor continuous model adaptation in dynamic environments. Central to EverAdapt\nis a novel Continual Batch Normalization (CBN), which leverages source domain\nstatistics as a reference point to standardize feature representations across\ndomains. EverAdapt not only retains statistical information from previous\ndomains but also adapts effectively to new scenarios. Complementing CBN, we\ndesign a class-conditional domain alignment module for effective integration of\ntarget domains, and a Sample-efficient Replay strategy to reinforce memory\nretention. Experiments on real-world datasets demonstrate EverAdapt superiority\nin maintaining robust fault diagnosis in dynamic environments. Our code is\navailable: https://github.com/mohamedr002/EverAdapt\n","authors":[" Edward","Mohamed Ragab","Yuecong Xu","Min Wu","Yuecong Xu","Zhenghua Chen","Abdulla Alseiari","Xiaoli Li"],"pdf_url":"https://arxiv.org/pdf/2407.17117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17112v1","updated":"2024-07-24T09:23:22Z","published":"2024-07-24T09:23:22Z","title":"Neural Dueling Bandits","summary":" Contextual dueling bandit is used to model the bandit problems, where a\nlearner's goal is to find the best arm for a given context using observed noisy\npreference feedback over the selected arms for the past contexts. However,\nexisting algorithms assume the reward function is linear, which can be complex\nand non-linear in many real-life applications like online recommendations or\nranking web search results. To overcome this challenge, we use a neural network\nto estimate the reward function using preference feedback for the previously\nselected arms. We propose upper confidence bound- and Thompson sampling-based\nalgorithms with sub-linear regret guarantees that efficiently select arms in\neach round. 
We then extend our theoretical results to contextual bandit\nproblems with binary feedback, which is in itself a non-trivial contribution.\nExperimental results on the problem instances derived from synthetic datasets\ncorroborate our theoretical results.\n","authors":["Arun Verma","Zhongxiang Dai","Xiaoqiang Lin","Patrick Jaillet","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2407.17112v1.pdf","comment":"Accepted at ICML 2024 Workshop on Foundations of Reinforcement\n Learning and Control"},{"id":"http://arxiv.org/abs/2201.08712v3","updated":"2024-07-24T09:17:56Z","published":"2022-01-21T14:16:56Z","title":"Improved Random Features for Dot Product Kernels","summary":" Dot product kernels, such as polynomial and exponential (softmax) kernels,\nare among the most widely used kernels in machine learning, as they enable\nmodeling the interactions between input features, which is crucial in\napplications like computer vision, natural language processing, and recommender\nsystems. We make several novel contributions for improving the efficiency of\nrandom feature approximations for dot product kernels, to make these kernels\nmore useful in large scale learning. First, we present a generalization of\nexisting random feature approximations for polynomial kernels, such as\nRademacher and Gaussian sketches and TensorSRHT, using complex-valued random\nfeatures. We show empirically that the use of complex features can\nsignificantly reduce the variances of these approximations. Second, we provide\na theoretical analysis for understanding the factors affecting the efficiency\nof various random feature approximations, by deriving closed-form expressions\nfor their variances. These variance formulas elucidate conditions under which\ncertain approximations (e.g., TensorSRHT) achieve lower variances than others\n(e.g., Rademacher sketches), and conditions under which the use of complex\nfeatures leads to lower variances than real features. Third, by using these\nvariance formulas, which can be evaluated in practice, we develop a data-driven\noptimization approach to improve random feature approximations for general dot\nproduct kernels, which is also applicable to the Gaussian kernel. We describe\nthe improvements brought by these contributions with extensive experiments on a\nvariety of tasks and datasets.\n","authors":["Jonas Wacker","Motonobu Kanagawa","Maurizio Filippone"],"pdf_url":"https://arxiv.org/pdf/2201.08712v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17097v1","updated":"2024-07-24T08:49:18Z","published":"2024-07-24T08:49:18Z","title":"Towards Robust Knowledge Tracing Models via k-Sparse Attention","summary":" Knowledge tracing (KT) is the problem of predicting students' future\nperformance based on their historical interaction sequences. With the advanced\ncapability of capturing contextual long-term dependency, attention mechanism\nbecomes one of the essential components in many deep learning based KT (DLKT)\nmodels. In spite of the impressive performance achieved by these attentional\nDLKT models, many of them are often vulnerable to run the risk of overfitting,\nespecially on small-scale educational datasets. Therefore, in this paper, we\npropose \\textsc{sparseKT}, a simple yet effective framework to improve the\nrobustness and generalization of the attention based DLKT approaches.\nSpecifically, we incorporate a k-selection module to only pick items with the\nhighest attention scores. 
We propose two sparsification heuristics: (1)\nsoft-thresholding sparse attention and (2) top-$K$ sparse attention. We show\nthat our \\textsc{sparseKT} is able to help attentional KT models get rid of\nirrelevant student interactions and have comparable predictive performance when\ncompared to 11 state-of-the-art KT models on three publicly available\nreal-world educational datasets. To encourage reproducible research, we make\nour data and code publicly available at\n\\url{https://github.com/pykt-team/pykt-toolkit}\\footnote{We merged our model to\nthe \\textsc{pyKT} benchmark at \\url{https://pykt.org/}.}.\n","authors":["Shuyan Huang","Zitao Liu","Xiangyu Zhao","Weiqi Luo","Jian Weng"],"pdf_url":"https://arxiv.org/pdf/2407.17097v1.pdf","comment":"Accepted at SIGIR'2023 (revised version with additional results)"},{"id":"http://arxiv.org/abs/2402.03365v2","updated":"2024-07-24T08:48:38Z","published":"2024-01-31T11:03:58Z","title":"Heterophily-Aware Fair Recommendation using Graph Convolutional Networks","summary":" In recent years, graph neural networks (GNNs) have become a popular tool to\nimprove the accuracy and performance of recommender systems. Modern recommender\nsystems are not only designed to serve end users, but also to benefit other\nparticipants, such as items and item providers. These participants may have\ndifferent or conflicting goals and interests, which raise the need for fairness\nand popularity bias considerations. GNN-based recommendation methods also face\nthe challenges of unfairness and popularity bias and their normalization and\naggregation processes suffer from these challenges. In this paper, we propose a\nfair GNN-based recommender system, called HetroFair, to improve items' side\nfairness. HetroFair uses two separate components to generate fairness-aware\nembeddings: i) fairness-aware attention which incorporates dot product in the\nnormalization process of GNNs, to decrease the effect of nodes' degrees, and\nii) heterophily feature weighting to assign distinct weights to different\nfeatures during the aggregation process. In order to evaluate the effectiveness\nof HetroFair, we conduct extensive experiments over six real-world datasets.\nOur experimental results reveal that HetroFair not only alleviates the\nunfairness and popularity bias on items' side, but also achieves superior\naccuracy on users' side. Our implementation is publicly available at\nhttps://github.com/NematGH/HetroFair.\n","authors":["Nemat Gholinejad","Mostafa Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2402.03365v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17087v1","updated":"2024-07-24T08:34:08Z","published":"2024-07-24T08:34:08Z","title":"Assessing Non-Nested Configurations of Multifidelity Machine Learning\n for Quantum-Chemical Properties","summary":" Multifidelity machine learning (MFML) for quantum chemical (QC) properties\nhas seen strong development in recent years. The method has been shown to\nreduce the cost of generating training data for high-accuracy low-cost ML\nmodels. In such a set-up, the ML models are trained on molecular geometries and\nsome property of interest computed at various computational chemistry\naccuracies, or fidelities. These are then combined in training the MFML models.\nIn some multifidelity models, the training data is required to be nested, that\nis, the same molecular geometries are included to calculate the property across\nall the fidelities. 
In these multifidelity models, the requirement of a nested\nconfiguration restricts the kind of sampling that can be performed while\nselecting training samples at different fidelities.\n This work assesses the use of non-nested training data for two of these\nmultifidelity methods, namely MFML and optimized MFML (o-MFML). The assessment\nis carried out for the prediction of ground state energies and first vertical\nexcitation energies of a diverse collection of molecules of the CheMFi dataset.\nResults indicate that the MFML method still requires a nested structure of\ntraining data across the fidelities. However, the o-MFML method shows promising\nresults for non-nested multifidelity training data with model errors comparable\nto the nested configurations.\n","authors":["Vivin Vinod","Peter Zaspel"],"pdf_url":"https://arxiv.org/pdf/2407.17087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03227v2","updated":"2024-07-24T08:31:21Z","published":"2023-09-04T02:30:19Z","title":"Learning a Patent-Informed Biomedical Knowledge Graph Reveals\n Technological Potential of Drug Repositioning Candidates","summary":" Drug repositioning-a promising strategy for discovering new therapeutic uses\nfor existing drugs-has been increasingly explored in the computational science\nliterature using biomedical databases. However, the technological potential of\ndrug repositioning candidates has often been overlooked. This study presents a\nnovel protocol to comprehensively analyse various sources such as\npharmaceutical patents and biomedical databases, and identify drug\nrepositioning candidates with both technological potential and scientific\nevidence. To this end, first, we constructed a scientific biomedical knowledge\ngraph (s-BKG) comprising relationships between drugs, diseases, and genes\nderived from biomedical databases. Our protocol involves identifying drugs that\nexhibit limited association with the target disease but are closely located in\nthe s-BKG, as potential drug candidates. We constructed a patent-informed\nbiomedical knowledge graph (p-BKG) by adding pharmaceutical patent information.\nFinally, we developed a graph embedding protocol to ascertain the structure of\nthe p-BKG, thereby calculating the relevance scores of those candidates with\ntarget disease-related patents to evaluate their technological potential. Our\ncase study on Alzheimer's disease demonstrates its efficacy and feasibility,\nwhile the quantitative outcomes and systematic methods are expected to bridge\nthe gap between computational discoveries and successful market applications in\ndrug repositioning research.\n","authors":["Yongseung Jegal","Jaewoong Choi","Jiho Lee","Ki-Su Park","Seyoung Lee","Janghyeok Yoon"],"pdf_url":"https://arxiv.org/pdf/2309.03227v2.pdf","comment":"We are sorry to withdraw this paper. We found some critical errors in\n the introduction and results sections. Specifically, we found that the first\n author have wrongly inserted citations on background works and he made\n mistakes in the graph embedding methods and relevant results are wrongly\n calculated. In this regard, we tried to revise this paper and withdraw the\n current version. Thank you"},{"id":"http://arxiv.org/abs/2407.17085v1","updated":"2024-07-24T08:22:49Z","published":"2024-07-24T08:22:49Z","title":"OVR: A Dataset for Open Vocabulary Temporal Repetition Counting in\n Videos","summary":" We introduce a dataset of annotations of temporal repetitions in videos. 
The\ndataset, OVR (pronounced as over), contains annotations for over 72K videos,\nwith each annotation specifying the number of repetitions, the start and end\ntime of the repetitions, and also a free-form description of what is repeating.\nThe annotations are provided for videos sourced from Kinetics and Ego4D, and\nconsequently cover both Exo and Ego viewing conditions, with a huge variety of\nactions and activities. Moreover, OVR is almost an order of magnitude larger\nthan previous datasets for video repetition. We also propose a baseline\ntransformer-based counting model, OVRCounter, that can localise and count\nrepetitions in videos that are up to 320 frames long. The model is trained and\nevaluated on the OVR dataset, and its performance assessed with and without\nusing text to specify the target class to count. The performance is also\ncompared to a prior repetition counting model. The dataset is available for\ndownload at: https://sites.google.com/view/openvocabreps/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2407.17085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10834v2","updated":"2024-07-24T08:14:50Z","published":"2024-07-15T15:45:07Z","title":"MetaLLM: A High-performant and Cost-efficient Dynamic Framework for\n Wrapping LLMs","summary":" The rapid progress in machine learning (ML) has brought forth many large\nlanguage models (LLMs) that excel in various tasks and areas. These LLMs come\nwith different abilities and costs in terms of computation or pricing. Since\nthe demand for each query can vary, e.g., because of the queried domain or its\ncomplexity, defaulting to one LLM in an application is not usually the best\nchoice, whether it is the biggest, priciest, or even the one with the best\naverage test performance. Consequently, picking the right LLM that is both\naccurate and cost-effective for an application remains a challenge. In this\npaper, we introduce MetaLLM, a framework that dynamically and intelligently\nroutes each query to the optimal LLM (among several available LLMs) for\nclassification tasks, achieving significantly improved accuracy and\ncost-effectiveness. By framing the selection problem as a multi-armed bandit,\nMetaLLM balances prediction accuracy and cost efficiency under uncertainty. Our\nexperiments, conducted on popular LLM platforms such as OpenAI's GPT models,\nAmazon's Titan, Anthropic's Claude, and Meta's LLaMa, showcase MetaLLM's\nefficacy in real-world scenarios, laying the groundwork for future extensions\nbeyond classification tasks.\n","authors":["Quang H. Nguyen","Duy C. Hoang","Juliette Decugis","Saurav Manchanda","Nitesh V. Chawla","Khoa D. Doan"],"pdf_url":"https://arxiv.org/pdf/2407.10834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08330v2","updated":"2024-07-24T08:13:26Z","published":"2024-01-16T12:49:10Z","title":"Boosting Gradient Ascent for Continuous DR-submodular Maximization","summary":" Projected Gradient Ascent (PGA) is the most commonly used optimization scheme\nin machine learning and operations research areas. Nevertheless, numerous\nstudies and examples have shown that the PGA methods may fail to achieve the\ntight approximation ratio for continuous DR-submodular maximization problems.\nTo address this challenge, we present a boosting technique in this paper, which\ncan efficiently improve the approximation guarantee of the standard PGA to\n\\emph{optimal} with only small modifications on the objective function. 
The\nfundamental idea of our boosting technique is to exploit non-oblivious search\nto derive a novel auxiliary function $F$, whose stationary points are excellent\napproximations to the global maximum of the original DR-submodular objective\n$f$. Specifically, when $f$ is monotone and $\\gamma$-weakly DR-submodular, we\npropose an auxiliary function $F$ whose stationary points can provide a better\n$(1-e^{-\\gamma})$-approximation than the\n$(\\gamma^2/(1+\\gamma^2))$-approximation guaranteed by the stationary points of\n$f$ itself. Similarly, for the non-monotone case, we devise another auxiliary\nfunction $F$ whose stationary points can achieve an optimal\n$\\frac{1-\\min_{\\boldsymbol{x}\\in\\mathcal{C}}\\|\\boldsymbol{x}\\|_{\\infty}}{4}$-approximation\nguarantee where $\\mathcal{C}$ is a convex constraint set. In contrast, the\nstationary points of the original non-monotone DR-submodular function can be\narbitrarily bad~\\citep{chen2023continuous}. Furthermore, we demonstrate the\nscalability of our boosting technique on four problems. In all of these four\nproblems, our resulting variants of boosting PGA algorithm beat the previous\nstandard PGA in several aspects such as approximation ratio and efficiency.\nFinally, we corroborate our theoretical findings with numerical experiments,\nwhich demonstrate the effectiveness of our boosting PGA methods.\n","authors":["Qixin Zhang","Zongqi Wan","Zengde Deng","Zaiyi Chen","Xiaoming Sun","Jialin Zhang","Yu Yang"],"pdf_url":"https://arxiv.org/pdf/2401.08330v2.pdf","comment":"74 pages, 6 figures and 9 tables. An extended version of Stochastic\n Continuous Submodular Maximization: Boosting via Non-oblivious Function (ICML\n 2022)"},{"id":"http://arxiv.org/abs/2401.06821v4","updated":"2024-07-24T08:12:11Z","published":"2024-01-11T21:04:28Z","title":"Surrogate Neural Networks Local Stability for Aircraft Predictive\n Maintenance","summary":" Surrogate Neural Networks are nowadays routinely used in industry as\nsubstitutes for computationally demanding engineering simulations (e.g., in\nstructural analysis). They allow to generate faster predictions and thus\nanalyses in industrial applications e.g., during a product design, testing or\nmonitoring phases. Due to their performance and time-efficiency, these\nsurrogate models are now being developed for use in safety-critical\napplications. Neural network verification and in particular the assessment of\ntheir robustness (e.g., to perturbations) is the next critical step to allow\ntheir inclusion in real-life applications and certification. We assess the\napplicability and scalability of empirical and formal methods in the context of\naircraft predictive maintenance for surrogate neural networks designed to\npredict the stress sustained by an aircraft part from external loads. The case\nstudy covers a high-dimensional input and output space and the verification\nprocess thus accommodates multi-objective constraints. We explore the\ncomplementarity of verification methods in assessing the local stability\nproperty of such surrogate models to input noise. 
We showcase the effectiveness\nof sequentially combining methods in one verification 'pipeline' and\ndemonstrate the subsequent gain in runtime required to assess the targeted\nproperty.\n","authors":["Mélanie Ducoffe","Guillaume Povéda","Audrey Galametz","Ryma Boumazouza","Marion-Cécile Martin","Julien Baris","Derk Daverschot","Eugene O'Higgins"],"pdf_url":"https://arxiv.org/pdf/2401.06821v4.pdf","comment":"Peer-reviewed and accepted at the 29th International Conference on\n Formal Methods for Industrial Critical Systems (FMICS 2024) - 15 pages"},{"id":"http://arxiv.org/abs/2407.17073v1","updated":"2024-07-24T08:02:41Z","published":"2024-07-24T08:02:41Z","title":"Contrastive Learning Is Not Optimal for Quasiperiodic Time Series","summary":" Despite recent advancements in Self-Supervised Learning (SSL) for time series\nanalysis, a noticeable gap persists between the anticipated achievements and\nactual performance. While these methods have demonstrated formidable\ngeneralization capabilities with minimal labels in various domains, their\neffectiveness in distinguishing between different classes based on a limited\nnumber of annotated records is notably lacking. Our hypothesis attributes this\nbottleneck to the prevalent use of Contrastive Learning, a shared training\nobjective in previous state-of-the-art (SOTA) methods. By mandating\ndistinctiveness between representations for negative pairs drawn from separate\nrecords, this approach compels the model to encode unique record-based patterns\nbut simultaneously neglects changes occurring across the entire record. To\novercome this challenge, we introduce Distilled Embedding for Almost-Periodic\nTime Series (DEAPS) in this paper, offering a non-contrastive method tailored\nfor quasiperiodic time series, such as electrocardiogram (ECG) data. By\navoiding the use of negative pairs, we not only mitigate the model's blindness\nto temporal changes but also enable the integration of a \"Gradual Loss (Lgra)\"\nfunction. This function guides the model to effectively capture dynamic\npatterns evolving throughout the record. The outcomes are promising, as DEAPS\ndemonstrates a notable improvement of +10% over existing SOTA methods when just\na few annotated records are presented to fit a Machine Learning (ML) model\nbased on the learned representation.\n","authors":["Adrian Atienza","Jakob Bardram","Sadasivan Puthusserypady"],"pdf_url":"https://arxiv.org/pdf/2407.17073v1.pdf","comment":"Accepted to IJCAI 2024"},{"id":"http://arxiv.org/abs/2407.17072v1","updated":"2024-07-24T07:59:18Z","published":"2024-07-24T07:59:18Z","title":"An Efficient Procedure for Computing Bayesian Network Structure Learning","summary":" We propose a globally optimal Bayesian network structure discovery algorithm\nbased on a progressively leveled scoring approach. Bayesian network structure\ndiscovery is a fundamental yet NP-hard problem in the field of probabilistic\ngraphical models, and as the number of variables increases, memory usage grows\nexponentially. The simple and effective method proposed by Silander and\nMyllym\\\"aki has been widely applied in this field, as it incrementally\ncalculates local scores to achieve global optimality. However, existing methods\nthat utilize disk storage, while capable of handling networks with a larger\nnumber of variables, introduce issues such as latency, fragmentation, and\nadditional overhead associated with disk I/O operations. 
To avoid these\nproblems, we explore how to further enhance computational efficiency and reduce\npeak memory usage using only memory. We introduce an efficient hierarchical\ncomputation method that requires only a single traversal of all local\nstructures, retaining only the data and information necessary for the current\ncomputation, thereby improving efficiency and significantly reducing memory\nrequirements. Experimental results indicate that our method, when using only\nmemory, not only reduces peak memory usage but also improves computational\nefficiency compared to existing methods, demonstrating good scalability for\nhandling larger networks and exhibiting stable experimental results.\nUltimately, we successfully achieved the processing of a Bayesian network with\n28 variables using only memory.\n","authors":["Hongming Huang","Joe Suzuki"],"pdf_url":"https://arxiv.org/pdf/2407.17072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17070v1","updated":"2024-07-24T07:55:49Z","published":"2024-07-24T07:55:49Z","title":"Curriculum Negative Mining For Temporal Networks","summary":" Temporal networks are effective in capturing the evolving interactions of\nnetworks over time, such as social networks and e-commerce networks. In recent\nyears, researchers have primarily concentrated on developing specific model\narchitectures for Temporal Graph Neural Networks (TGNNs) in order to improve\nthe representation quality of temporal nodes and edges. However, limited\nattention has been given to the quality of negative samples during the training\nof TGNNs. When compared with static networks, temporal networks present two\nspecific challenges for negative sampling: positive sparsity and positive\nshift. Positive sparsity refers to the presence of a single positive sample\namidst numerous negative samples at each timestamp, while positive shift\nrelates to the variations in positive samples across different timestamps. To\nrobustly address these challenges in training TGNNs, we introduce Curriculum\nNegative Mining (CurNM), a model-aware curriculum learning framework that\nadaptively adjusts the difficulty of negative samples. Within this framework,\nwe first establish a dynamically updated negative pool that balances random,\nhistorical, and hard negatives to address the challenges posed by positive\nsparsity. Secondly, we implement a temporal-aware negative selection module\nthat focuses on learning from the disentangled factors of recently active\nedges, thus accurately capturing shifting preferences. Extensive experiments on\n12 datasets and 3 TGNNs demonstrate that our method outperforms baseline\nmethods by a significant margin. Additionally, thorough ablation studies and\nparameter sensitivity experiments verify the usefulness and robustness of our\napproach. Our code is available at https://github.com/zziyue83/CurNM.\n","authors":["Ziyue Chen","Tongya Zheng","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2407.17070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15665v2","updated":"2024-07-24T07:51:20Z","published":"2024-07-22T14:28:46Z","title":"A spatiotemporal deep learning framework for prediction of crack\n dynamics in heterogeneous solids: efficient mapping of concrete\n microstructures to its fracture properties","summary":" A spatiotemporal deep learning framework is proposed that is capable of 2D\nfull-field prediction of fracture in concrete mesostructures. 
This framework\nnot only predicts fractures but also captures the entire history of the\nfracture process, from the crack initiation in the interfacial transition zone\nto the subsequent propagation of the cracks in the mortar matrix. In addition,\na convolutional neural network is developed which can predict the averaged\nstress-strain curve of the mesostructures. The UNet modeling framework, which\ncomprises an encoder-decoder section with skip connections, is used as the deep\nlearning surrogate model. Training and test data are generated from\nhigh-fidelity fracture simulations of randomly generated concrete\nmesostructures. These mesostructures include geometric variabilities such as\ndifferent aggregate particle geometrical features, spatial distribution, and\nthe total volume fraction of aggregates. The fracture simulations are carried\nout in Abaqus, utilizing the cohesive phase-field fracture modeling technique\nas the fracture modeling approach. In this work, to reduce the number of\ntraining datasets, the spatial distribution of three sets of material\nproperties for three-phase concrete mesostructures, along with the spatial\nphase-field damage index, are fed to the UNet to predict the corresponding\nstress and spatial damage index at the subsequent step. It is shown that after\nthe training process using this methodology, the UNet model is capable of\naccurately predicting damage on the unseen test dataset by using 470 datasets.\nMoreover, another novel aspect of this work is the conversion of irregular\nfinite element data into regular grids using a developed pipeline. This\napproach allows for the implementation of less complex UNet architecture and\nfacilitates the integration of phase-field fracture equations into surrogate\nmodels for future developments.\n","authors":["Rasoul Najafi Koopas","Shahed Rezaei","Natalie Rauter","Richard Ostwald","Rolf Lammering"],"pdf_url":"https://arxiv.org/pdf/2407.15665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08261v3","updated":"2024-07-24T07:26:59Z","published":"2023-02-16T12:38:01Z","title":"Knowledge-augmented Graph Machine Learning for Drug Discovery: A Survey","summary":" The integration of Artificial Intelligence (AI) into the field of drug\ndiscovery has been a growing area of interdisciplinary scientific research.\nHowever, conventional AI models are heavily limited in handling complex\nbiomedical structures (such as 2D or 3D protein and molecule structures) and\nproviding interpretations for outputs, which hinders their practical\napplication. As of late, Graph Machine Learning (GML) has gained considerable\nattention for its exceptional ability to model graph-structured biomedical data\nand investigate their properties and functional relationships. Despite\nextensive efforts, GML methods still suffer from several deficiencies, such as\nthe limited ability to handle supervision sparsity and provide interpretability\nin learning and inference processes, and their ineffectiveness in utilising\nrelevant domain knowledge. In response, recent studies have proposed\nintegrating external biomedical knowledge into the GML pipeline to realise more\nprecise and interpretable drug discovery with limited training instances.\nHowever, a systematic definition for this burgeoning research direction is yet\nto be established. 
This survey presents a comprehensive overview of\nlong-standing drug discovery principles, provides the foundational concepts and\ncutting-edge techniques for graph-structured data and knowledge databases, and\nformally summarises Knowledge-augmented Graph Machine Learning (KaGML) for drug\ndiscovery. We propose a thorough review of related KaGML works, collected\nfollowing a carefully designed search methodology, and organise them into four\ncategories following a newly defined taxonomy. To facilitate research in this\npromptly emerging field, we also share collected practical resources that are\nvaluable for intelligent drug discovery and provide an in-depth discussion of\nthe potential avenues for future advancements.\n","authors":["Zhiqiang Zhong","Anastasia Barkova","Davide Mottin"],"pdf_url":"https://arxiv.org/pdf/2302.08261v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01804v2","updated":"2024-07-24T07:19:00Z","published":"2024-07-01T21:06:34Z","title":"DCoM: Active Learning for All Learners","summary":" Deep Active Learning (AL) techniques can be effective in reducing annotation\ncosts for training deep models. However, their effectiveness in low- and\nhigh-budget scenarios seems to require different strategies, and achieving\noptimal results across varying budget scenarios remains a challenge. In this\nstudy, we introduce Dynamic Coverage & Margin mix (DCoM), a novel active\nlearning approach designed to bridge this gap. Unlike existing strategies, DCoM\ndynamically adjusts its strategy, considering the competence of the current\nmodel. Through theoretical analysis and empirical evaluations on diverse\ndatasets, including challenging computer vision tasks, we demonstrate DCoM's\nability to overcome the cold start problem and consistently improve results\nacross different budgetary constraints. Thus DCoM achieves state-of-the-art\nperformance in both low- and high-budget regimes.\n","authors":["Inbal Mishal","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2407.01804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20091v3","updated":"2024-07-24T07:06:43Z","published":"2024-05-30T14:27:40Z","title":"VAAD: Visual Attention Analysis Dashboard applied to e-Learning","summary":" In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. 
Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v3.pdf","comment":"Accepted in CEDI 2024 (VII Congreso Espa\\~nol de Inform\\'atica), A\n Coru\\~na, Spain"},{"id":"http://arxiv.org/abs/2407.17040v1","updated":"2024-07-24T07:02:16Z","published":"2024-07-24T07:02:16Z","title":"Time Series Missing Imputation with Multivariate Radial Basis Function\n Neural Network","summary":" Researchers have been persistently working to address the issue of missing\nvalues in time series data. Numerous models have been proposed, striving to\nestimate the distribution of the data. The Radial Basis Functions Neural\nNetwork (RBFNN) has recently exhibited exceptional performance in estimating\ndata distribution. In this paper, we propose a time series imputation model\nbased on RBFNN. Our imputation model learns local information from timestamps\nto create a continuous function. Additionally, we incorporate time gaps to\nfacilitate learning information considering the missing terms of missing\nvalues. We name this model the Missing Imputation Multivariate RBFNN\n(MIM-RBFNN). However, MIM-RBFNN relies on a local information-based learning\napproach, which presents difficulties in utilizing temporal information.\nTherefore, we propose an extension called the Missing Value Imputation\nRecurrent Neural Network with Continuous Function (MIRNN-CF) using the\ncontinuous function generated by MIM-RBFNN. We evaluate the performance using\ntwo real-world datasets with non-random missing and random missing patterns,\nand conduct an ablation study comparing MIM-RBFNN and MIRNN-CF.\n","authors":["Chanyoung Jung","Yun Jang"],"pdf_url":"https://arxiv.org/pdf/2407.17040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17033v1","updated":"2024-07-24T06:39:58Z","published":"2024-07-24T06:39:58Z","title":"Sparse Inducing Points in Deep Gaussian Processes: Enhancing Modeling\n with Denoising Diffusion Variational Inference","summary":" Deep Gaussian processes (DGPs) provide a robust paradigm for Bayesian deep\nlearning. In DGPs, a set of sparse integration locations called inducing points\nare selected to approximate the posterior distribution of the model. This is\ndone to reduce computational complexity and improve model efficiency. However,\ninferring the posterior distribution of inducing points is not straightforward.\nTraditional variational inference approaches to posterior approximation often\nlead to significant bias. To address this issue, we propose an alternative\nmethod called Denoising Diffusion Variational Inference (DDVI) that uses a\ndenoising diffusion stochastic differential equation (SDE) to generate\nposterior samples of inducing variables. We rely on score matching methods for\ndenoising diffusion model to approximate score functions with a neural network.\nFurthermore, by combining classical mathematical theory of SDEs with the\nminimization of KL divergence between the approximate and true processes, we\npropose a novel explicit variational lower bound for the marginal likelihood\nfunction of DGP. 
Through experiments on various datasets and comparisons with\nbaseline methods, we empirically demonstrate the effectiveness of DDVI for\nposterior inference of inducing points for DGP models.\n","authors":["Jian Xu","Delu Zeng","John Paisley"],"pdf_url":"https://arxiv.org/pdf/2407.17033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17032v1","updated":"2024-07-24T06:35:05Z","published":"2024-07-24T06:35:05Z","title":"Gymnasium: A Standard Interface for Reinforcement Learning Environments","summary":" Gymnasium is an open-source library providing an API for reinforcement\nlearning environments. Its main contribution is a central abstraction for wide\ninteroperability between benchmark environments and training algorithms.\nGymnasium comes with various built-in environments and utilities to simplify\nresearchers' work along with being supported by most training libraries. This\npaper outlines the main design decisions for Gymnasium, its key features, and\nthe differences to alternative APIs.\n","authors":["Mark Towers","Ariel Kwiatkowski","Jordan Terry","John U. Balis","Gianluca De Cola","Tristan Deleu","Manuel Goulão","Andreas Kallinteris","Markus Krimmel","Arjun KG","Rodrigo Perez-Vicente","Andrea Pierré","Sander Schulhoff","Jun Jet Tai","Hannah Tan","Omar G. Younis"],"pdf_url":"https://arxiv.org/pdf/2407.17032v1.pdf","comment":"6 pages, 1 figure, preprint"},{"id":"http://arxiv.org/abs/2010.01874v3","updated":"2024-07-24T06:25:27Z","published":"2020-10-05T09:22:31Z","title":"Diversity-Preserving K-Armed Bandits, Revisited","summary":" We consider the bandit-based framework for diversity-preserving\nrecommendations introduced by Celis et al. (2019), who approached it in the\ncase of a polytope mainly by a reduction to the setting of linear bandits. We\ndesign a UCB algorithm using the specific structure of the setting and show\nthat it enjoys a bounded distribution-dependent regret in the natural cases\nwhen the optimal mixed actions put some probability mass on all actions (i.e.,\nwhen diversity is desirable). The regret lower bounds provided show that\notherwise, at least when the model is mean-unbounded, a $\\ln T$ regret is\nsuffered. We also discuss an example beyond the special case of polytopes.\n","authors":["Hédi Hadiji","Sébastien Gerchinovitz","Jean-Michel Loubes","Gilles Stoltz"],"pdf_url":"https://arxiv.org/pdf/2010.01874v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17029v1","updated":"2024-07-24T06:16:37Z","published":"2024-07-24T06:16:37Z","title":"Accurate and Efficient Fine-Tuning of Quantized Large Language Models\n Through Optimal Balance","summary":" Large Language Models (LLMs) have demonstrated impressive performance across\nvarious domains. However, the enormous number of model parameters makes\nfine-tuning challenging, significantly limiting their application and\ndeployment. Existing solutions combine parameter quantization with Low-Rank\nAdaptation (LoRA), greatly reducing memory usage but resulting in noticeable\nperformance degradation. In this paper, we identify an imbalance in fine-tuning\nquantized pre-trained models: overly complex adapter inputs and outputs versus\nlow effective trainability of the adaptation. We propose Quantized LLMs with\nBalanced-rank Adaptation (Q-BaRA), which simplifies the adapter inputs and\noutputs while increasing the adapter's rank to achieve a more suitable balance\nfor fine-tuning quantized LLMs. 
Additionally, for scenarios where fine-tuned\nLLMs need to be deployed as low-precision inference models, we introduce\nQuantization-Aware Fine-tuning with Higher Rank Adaptation (QA-HiRA), which\nsimplifies the adapter inputs and outputs to align with the pre-trained model's\nblock-wise quantization while employing a single matrix to achieve a higher\nrank. Both Q-BaRA and QA-HiRA are easily implemented and offer the following\noptimizations: (i) Q-BaRA consistently achieves the highest accuracy compared\nto baselines and other variants, requiring the same number of trainable\nparameters and computational effort; (ii) QA-HiRA naturally merges adapter\nparameters into the block-wise quantized model after fine-tuning, achieving the\nhighest accuracy compared to other methods. We apply our Q-BaRA and QA-HiRA to\nthe LLaMA and LLaMA2 model families and validate their effectiveness across\ndifferent fine-tuning datasets and downstream scenarios.\n Code will be made available at\n\\href{https://github.com/xiaocaigou/qbaraqahira}{https://github.com/xiaocaigou/qbaraqahira}\n","authors":["Ao Shen","Qiang Wang","Zhiquan Lai","Xionglve Li","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2407.17029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09084v4","updated":"2024-07-24T06:07:28Z","published":"2023-08-17T16:23:52Z","title":"MovePose: A High-performance Human Pose Estimation Algorithm on Mobile\n and Edge Devices","summary":" We present MovePose, an optimized lightweight convolutional neural network\ndesigned specifically for real-time body pose estimation on CPU-based mobile\ndevices. The current solutions do not provide satisfactory accuracy and speed\nfor human posture estimation, and MovePose addresses this gap. It aims to\nmaintain real-time performance while improving the accuracy of human posture\nestimation for mobile devices. Our MovePose algorithm has attained a Mean\nAverage Precision (mAP) score of 68.0 on the COCO \\cite{cocodata} validation\ndataset. The MovePose algorithm displayed efficiency with a performance of 69+\nframes per second (fps) when run on an Intel i9-10920x CPU. Additionally, it\nshowcased an increased performance of 452+ fps on an NVIDIA RTX3090 GPU. On an\nAndroid phone equipped with a Snapdragon 8 + 4G processor, the fps reached\nabove 11. To enhance accuracy, we incorporated three techniques: deconvolution,\nlarge kernel convolution, and coordinate classification methods. Compared to\nbasic upsampling, deconvolution is trainable, improves model capacity, and\nenhances the receptive field. Large kernel convolution strengthens these\nproperties at a decreased computational cost. In summary, MovePose provides\nhigh accuracy and real-time performance, marking it a potential tool for a\nvariety of applications, including those focused on mobile-side human posture\nestimation. 
The code and models for this algorithm will be made publicly\naccessible.\n","authors":["Dongyang Yu","Haoyue Zhang","Ruisheng Zhao","Guoqi Chen","Wangpeng An","Yanhong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09084v4.pdf","comment":"This paper has been accepted by ICANN 2024 and is an oral\n presentation"},{"id":"http://arxiv.org/abs/2406.11233v2","updated":"2024-07-24T05:22:48Z","published":"2024-06-17T06:00:24Z","title":"Probing the Decision Boundaries of In-context Learning in Large Language\n Models","summary":" In-context learning is a key paradigm in large language models (LLMs) that\nenables them to generalize to new tasks and domains by simply prompting these\nmodels with a few exemplars without explicit parameter updates. Many attempts\nhave been made to understand in-context learning in LLMs as a function of model\nscale, pretraining data, and other factors. In this work, we propose a new\nmechanism to probe and understand in-context learning from the lens of decision\nboundaries for in-context binary classification. Decision boundaries are\nstraightforward to visualize and provide important information about the\nqualitative behavior of the inductive biases of standard classifiers. To our\nsurprise, we find that the decision boundaries learned by current LLMs in\nsimple binary classification tasks are often irregular and non-smooth,\nregardless of linear separability in the underlying task. This paper\ninvestigates the factors influencing these decision boundaries and explores\nmethods to enhance their generalizability. We assess various approaches,\nincluding training-free and fine-tuning methods for LLMs, the impact of model\narchitecture, and the effectiveness of active prompting techniques for\nsmoothing decision boundaries in a data-efficient manner. Our findings provide\na deeper understanding of in-context learning dynamics and offer practical\nimprovements for enhancing robustness and generalizability of in-context\nlearning.\n","authors":["Siyan Zhao","Tung Nguyen","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2406.11233v2.pdf","comment":"18 pages, code at https://github.com/siyan-zhao/ICL_decision_boundary"},{"id":"http://arxiv.org/abs/2407.15899v2","updated":"2024-07-24T05:05:53Z","published":"2024-07-22T10:20:34Z","title":"Spatial-Temporal Cross-View Contrastive Pre-training for Check-in\n Sequence Representation Learning","summary":" The rapid growth of location-based services (LBS) has yielded massive amounts\nof data on human mobility. Effectively extracting meaningful representations\nfor user-generated check-in sequences is pivotal for facilitating various\ndownstream services. However, the user-generated check-in data are\nsimultaneously influenced by the surrounding objective circumstances and the\nuser's subjective intention. Specifically, the temporal uncertainty and spatial\ndiversity exhibited in check-in data make it difficult to capture the\nmacroscopic spatial-temporal patterns of users and to understand the semantics\nof user mobility activities. Furthermore, the distinct characteristics of the\ntemporal and spatial information in check-in sequences call for an effective\nfusion method to incorporate these two types of information. In this paper, we\npropose a novel Spatial-Temporal Cross-view Contrastive Representation (STCCR)\nframework for check-in sequence representation learning. 
Specifically, STCCR\naddresses the above challenges by employing self-supervision from \"spatial\ntopic\" and \"temporal intention\" views, facilitating effective fusion of spatial\nand temporal information at the semantic level. Besides, STCCR leverages\ncontrastive clustering to uncover users' shared spatial topics from diverse\nmobility activities, while employing angular momentum contrast to mitigate the\nimpact of temporal uncertainty and noise. We extensively evaluate STCCR on\nthree real-world datasets and demonstrate its superior performance across three\ndownstream tasks.\n","authors":["Letian Gong","Huaiyu Wan","Shengnan Guo","Xiucheng Li","Yan Lin","Erwen Zheng","Tianyi Wang","Zeyu Zhou","Youfang Lin"],"pdf_url":"https://arxiv.org/pdf/2407.15899v2.pdf","comment":"This paper has been accepted as a regular paper at IEEE TKDE"},{"id":"http://arxiv.org/abs/2405.07987v4","updated":"2024-07-24T05:01:21Z","published":"2024-05-13T17:58:30Z","title":"The Platonic Representation Hypothesis","summary":" We argue that representations in AI models, particularly deep networks, are\nconverging. First, we survey many examples of convergence in the literature:\nover time and across multiple domains, the ways by which different neural\nnetworks represent data are becoming more aligned. Next, we demonstrate\nconvergence across data modalities: as vision models and language models get\nlarger, they measure distance between datapoints in a more and more alike way.\nWe hypothesize that this convergence is driving toward a shared statistical\nmodel of reality, akin to Plato's concept of an ideal reality. We term such a\nrepresentation the platonic representation and discuss several possible\nselective pressures toward it. Finally, we discuss the implications of these\ntrends, their limitations, and counterexamples to our analysis.\n","authors":["Minyoung Huh","Brian Cheung","Tongzhou Wang","Phillip Isola"],"pdf_url":"https://arxiv.org/pdf/2405.07987v4.pdf","comment":"Equal contributions. Project: https://phillipi.github.io/prh/ Code:\n https://github.com/minyoungg/platonic-rep"},{"id":"http://arxiv.org/abs/2407.16999v1","updated":"2024-07-24T04:47:36Z","published":"2024-07-24T04:47:36Z","title":"SepsisLab: Early Sepsis Prediction with Uncertainty Quantification and\n Active Sensing","summary":" Sepsis is the leading cause of in-hospital mortality in the USA. Early sepsis\nonset prediction and diagnosis could significantly improve the survival of\nsepsis patients. Existing predictive models are usually trained on high-quality\ndata with few missing information, while missing values widely exist in\nreal-world clinical scenarios (especially in the first hours of admissions to\nthe hospital), which causes a significant decrease in accuracy and an increase\nin uncertainty for the predictive models. The common method to handle missing\nvalues is imputation, which replaces the unavailable variables with estimates\nfrom the observed data. The uncertainty of imputation results can be propagated\nto the sepsis prediction outputs, which have not been studied in existing works\non either sepsis prediction or uncertainty quantification. In this study, we\nfirst define such propagated uncertainty as the variance of prediction output\nand then introduce uncertainty propagation methods to quantify the propagated\nuncertainty. 
Moreover, for the potential high-risk patients with low confidence\ndue to limited observations, we propose a robust active sensing algorithm to\nincrease confidence by actively recommending clinicians to observe the most\ninformative variables. We validate the proposed models in both publicly\navailable data (i.e., MIMIC-III and AmsterdamUMCdb) and proprietary data in The\nOhio State University Wexner Medical Center (OSUWMC). The experimental results\nshow that the propagated uncertainty is dominant at the beginning of admissions\nto hospitals and the proposed algorithm outperforms state-of-the-art active\nsensing methods. Finally, we implement a SepsisLab system for early sepsis\nprediction and active sensing based on our pre-trained models. Clinicians and\npotential sepsis patients can benefit from the system in early prediction and\ndiagnosis of sepsis.\n","authors":["Changchang Yin","Pin-Yu Chen","Bingsheng Yao","Dakuo Wang","Jeffrey Caterino","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16999v1.pdf","comment":"To be published in KDD 2024"},{"id":"http://arxiv.org/abs/2407.14504v2","updated":"2024-07-24T04:33:55Z","published":"2024-07-19T17:58:00Z","title":"Nonlinear Schrödinger Network","summary":" Deep neural networks (DNNs) have achieved exceptional performance across\nvarious fields by learning complex nonlinear mappings from large-scale\ndatasets. However, they encounter challenges such as high computational costs\nand limited interpretability. To address these issues, hybrid approaches that\nintegrate physics with AI are gaining interest. This paper introduces a novel\nphysics-based AI model called the \"Nonlinear Schr\\\"odinger Network\", which\ntreats the Nonlinear Schr\\\"odinger Equation (NLSE) as a general-purpose\ntrainable model for learning complex patterns including nonlinear mappings and\nmemory effects from data. Existing physics-informed machine learning methods\nuse neural networks to approximate the solutions of partial differential\nequations (PDEs). In contrast, our approach directly treats the PDE as a\ntrainable model to obtain general nonlinear mappings that would otherwise\nrequire neural networks. As a type of physics-AI symbiosis, it offers a more\ninterpretable and parameter-efficient alternative to traditional black-box\nneural networks, achieving comparable or better accuracy in some time series\nclassification tasks while significantly reducing the number of required\nparameters. Notably, the trained Nonlinear Schr\\\"odinger Network is\ninterpretable, with all parameters having physical meanings as properties of a\nvirtual physical system that transforms the data to a more separable space.\nThis interpretability allows for insight into the underlying dynamics of the\ndata transformation process. Applications to time series forecasting have also\nbeen explored. 
While our current implementation utilizes the NLSE, the proposed\nmethod of using physics equations as trainable models to learn nonlinear\nmappings from data is not limited to the NLSE and may be extended to other\nmaster equations of physics.\n","authors":["Yiming Zhou","Callen MacPhee","Tingyi Zhou","Bahram Jalali"],"pdf_url":"https://arxiv.org/pdf/2407.14504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16994v1","updated":"2024-07-24T04:27:55Z","published":"2024-07-24T04:27:55Z","title":"A Voter-Based Stochastic Rejection-Method Framework for Asymptotically\n Safe Language Model Outputs","summary":" This paper proposes a new method for preventing unsafe or otherwise low\nquality large language model (LLM) outputs, by leveraging the stochasticity of\nLLMs. We propose a system whereby LLM checkers vote on the acceptability of a\ngenerated output, regenerating it if a threshold of disapproval is reached,\nuntil sufficient checkers approve. We further propose estimators for cost and\nfailure rate, and based on those estimators and experimental data tailored to\nthe application, we propose an algorithm that achieves a desired failure rate\nat the least possible cost. We demonstrate that, under these models, failure\nrate decreases exponentially as a function of cost when voter count and\nthreshold are chosen according to the algorithm, and that the models reasonably\nestimate the actual performance of such a system in action, even with limited\ndata.\n","authors":["Jake R. Watts","Joel Sokol"],"pdf_url":"https://arxiv.org/pdf/2407.16994v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2303.04778v2","updated":"2024-07-24T04:10:16Z","published":"2023-03-08T18:20:56Z","title":"Fourier-MIONet: Fourier-enhanced multiple-input neural operators for\n multiphase modeling of geological carbon sequestration","summary":" Geologic carbon sequestration (GCS) is a safety-critical technology that aims\nto reduce the amount of carbon dioxide in the atmosphere, which also places\nhigh demands on reliability. Multiphase flow in porous media is essential to\nunderstand CO$_2$ migration and pressure fields in the subsurface associated\nwith GCS. However, numerical simulation for such problems in 4D is\ncomputationally challenging and expensive, due to the multiphysics and\nmultiscale nature of the highly nonlinear governing partial differential\nequations (PDEs). It prevents us from considering multiple subsurface scenarios\nand conducting real-time optimization. Here, we develop a Fourier-enhanced\nmultiple-input neural operator (Fourier-MIONet) to learn the solution operator\nof the problem of multiphase flow in porous media. Fourier-MIONet utilizes the\nrecently developed framework of the multiple-input deep neural operators\n(MIONet) and incorporates the Fourier neural operator (FNO) in the network\narchitecture. Once Fourier-MIONet is trained, it can predict the evolution of\nsaturation and pressure of the multiphase flow under various reservoir\nconditions, such as permeability and porosity heterogeneity, anisotropy,\ninjection configurations, and multiphase flow properties. Compared to the\nenhanced FNO (U-FNO), the proposed Fourier-MIONet has 90% fewer unknown\nparameters, and it can be trained in significantly less time (about 3.5 times\nfaster) with much lower CPU memory ($<$ 15%) and GPU memory ($<$ 35%)\nrequirements, to achieve similar prediction accuracy. 
In addition to the lower\ncomputational cost, Fourier-MIONet can be trained with only 6 snapshots of time\nto predict the PDE solutions for 30 years. The excellent generalizability of\nFourier-MIONet is enabled by its adherence to the physical principle that the\nsolution to a PDE is continuous over time.\n","authors":["Zhongyi Jiang","Min Zhu","Lu Lu"],"pdf_url":"https://arxiv.org/pdf/2303.04778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16985v1","updated":"2024-07-24T04:04:56Z","published":"2024-07-24T04:04:56Z","title":"Sparse Tensor PCA via Tensor Decomposition for Unsupervised Feature\n Selection","summary":" Recently, introducing Tensor Decomposition (TD) methods into unsupervised\nfeature selection (UFS) has been a rising research point. A tensor structure is\nbeneficial for mining the relations between different modes and helps relieve\nthe computation burden. However, while existing methods exploit TD to minimize\nthe reconstruction error of a data tensor, they don't fully utilize the\ninterpretable and discriminative information in the factor matrices. Moreover,\nmost methods require domain knowledge to perform feature selection. To solve\nthe above problems, we develop two Sparse Tensor Principal Component Analysis\n(STPCA) models that utilize the projection directions in the factor matrices to\nperform UFS. The first model extends Tucker Decomposition to a multiview sparse\nregression form and is transformed into several alternatively solved convex\nsubproblems. The second model formulates a sparse version of the family of\nTensor Singular Value Decomposition (T-SVDs) and is transformed into individual\nconvex subproblems. For both models, we prove the optimal solution of each\nsubproblem falls onto the Hermitian Positive Semidefinite Cone (HPSD).\nAccordingly, we design two fast algorithms based on HPSD projection and prove\ntheir convergence. According to the experimental results on two original\nsynthetic datasets (Orbit and Array Signal) and five real-world datasets, the\ntwo proposed methods are suitable for handling different data tensor scenarios\nand outperform the state-of-the-art UFS methods.\n","authors":["Junjing Zheng","Xinyu Zhang","Weidong Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.16985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16984v1","updated":"2024-07-24T04:01:09Z","published":"2024-07-24T04:01:09Z","title":"scGHSOM: Hierarchical clustering and visualization of single-cell and\n CRISPR data using growing hierarchical SOM","summary":" High-dimensional single-cell data poses significant challenges in identifying\nunderlying biological patterns due to the complexity and heterogeneity of\ncellular states. We propose a comprehensive gene-cell dependency visualization\nvia unsupervised clustering, Growing Hierarchical Self-Organizing Map (GHSOM),\nspecifically designed for analyzing high-dimensional single-cell data like\nsingle-cell sequencing and CRISPR screens. GHSOM is applied to cluster samples\nin a hierarchical structure such that the self-growth structure of clusters\nsatisfies the required variations between and within. We propose a novel\nSignificant Attributes Identification Algorithm to identify features that\ndistinguish clusters. This algorithm pinpoints attributes with minimal\nvariation within a cluster but substantial variation between clusters. These\nkey attributes can then be used for targeted data retrieval and downstream\nanalysis. 
Furthermore, we present two innovative visualization tools: Cluster\nFeature Map and Cluster Distribution Map. The Cluster Feature Map highlights\nthe distribution of specific features across the hierarchical structure of\nGHSOM clusters. This allows for rapid visual assessment of cluster uniqueness\nbased on chosen features. The Cluster Distribution Map depicts leaf clusters as\ncircles on the GHSOM grid, with circle size reflecting cluster data size and\ncolor customizable to visualize features like cell type or other attributes. We\napply our analysis to three single-cell datasets and one CRISPR dataset\n(cell-gene database) and evaluate clustering methods with internal and external\nCH and ARI scores. GHSOM performs well, being the best performer in internal\nevaluation (CH=4.2). In external evaluation, GHSOM has the third-best\nperformance of all methods.\n","authors":["Shang-Jung Wen","Jia-Ming Chang","Fang Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16984v1.pdf","comment":"Abstract presentation at BIOKDD@ACM KDD 2024"},{"id":"http://arxiv.org/abs/2407.16975v1","updated":"2024-07-24T03:43:55Z","published":"2024-07-24T03:43:55Z","title":"On the Parameter Identifiability of Partially Observed Linear Causal\n Models","summary":" Linear causal models are important tools for modeling causal dependencies and\nyet in practice, only a subset of the variables can be observed. In this paper,\nwe examine the parameter identifiability of these models by investigating\nwhether the edge coefficients can be recovered given the causal structure and\npartially observed data. Our setting is more general than that of prior\nresearch - we allow all variables, including both observed and latent ones, to\nbe flexibly related, and we consider the coefficients of all edges, whereas\nmost existing works focus only on the edges between observed variables.\nTheoretically, we identify three types of indeterminacy for the parameters in\npartially observed linear causal models. We then provide graphical conditions\nthat are sufficient for all parameters to be identifiable and show that some of\nthem are provably necessary. Methodologically, we propose a novel\nlikelihood-based parameter estimation method that addresses the variance\nindeterminacy of latent variables in a specific way and can asymptotically\nrecover the underlying parameters up to trivial indeterminacy. Empirical\nstudies on both synthetic and real-world datasets validate our identifiability\ntheory and the effectiveness of the proposed method in the finite-sample\nregime.\n","authors":["Xinshuai Dong","Ignavier Ng","Biwei Huang","Yuewen Sun","Songyao Jin","Roberto Legaspi","Peter Spirtes","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06433v4","updated":"2024-07-24T03:43:32Z","published":"2023-02-13T15:12:15Z","title":"Label-efficient Time Series Representation Learning: A Review","summary":" Label-efficient time series representation learning, which aims to learn\neffective representations with limited labeled data, is crucial for deploying\ndeep learning models in real-world applications. To address the scarcity of\nlabeled time series data, various strategies, e.g., transfer learning,\nself-supervised learning, and semi-supervised learning, have been developed. In\nthis survey, we introduce a novel taxonomy for the first time, categorizing\nexisting approaches as in-domain or cross-domain, based on their reliance on\nexternal data sources or not. 
Furthermore, we present a review of the recent\nadvances in each strategy, conclude the limitations of current methodologies,\nand suggest future research directions that promise further improvements in the\nfield.\n","authors":["Emadeldeen Eldele","Mohamed Ragab","Zhenghua Chen","Min Wu","Chee-Keong Kwoh","Xiaoli Li"],"pdf_url":"https://arxiv.org/pdf/2302.06433v4.pdf","comment":"Accepted in the IEEE Transactions on Artificial Intelligence (TAI)\n https://ieeexplore.ieee.org/document/10601520"},{"id":"http://arxiv.org/abs/2407.08108v2","updated":"2024-07-24T03:37:17Z","published":"2024-07-11T00:54:56Z","title":"CADC: Encoding User-Item Interactions for Compressing Recommendation\n Model Training Data","summary":" Deep learning recommendation models (DLRMs) are at the heart of the current\ne-commerce industry. However, the amount of training data used to train these\nlarge models is growing exponentially, leading to substantial training hurdles.\nThe training dataset contains two primary types of information: content-based\ninformation (features of users and items) and collaborative information\n(interactions between users and items). One approach to reduce the training\ndataset is to remove user-item interactions. But that significantly diminishes\ncollaborative information, which is crucial for maintaining accuracy due to its\ninclusion of interaction histories. This loss profoundly impacts DLRM\nperformance.\n This paper makes an important observation that if one can capture the\nuser-item interaction history to enrich the user and item embeddings, then the\ninteraction history can be compressed without losing model accuracy. Thus, this\nwork, Collaborative Aware Data Compression (CADC), takes a two-step approach to\ntraining dataset compression. In the first step, we use matrix factorization of\nthe user-item interaction matrix to create a novel embedding representation for\nboth the users and items. Once the user and item embeddings are enriched by the\ninteraction history information the approach then applies uniform random\nsampling of the training dataset to drastically reduce the training dataset\nsize while minimizing model accuracy drop. The source code of CADC is available\nat\n\\href{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}.\n","authors":["Hossein Entezari Zarch","Abdulla Alshabanah","Chaoyi Jiang","Murali Annavaram"],"pdf_url":"https://arxiv.org/pdf/2407.08108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16452v2","updated":"2024-07-24T03:32:09Z","published":"2023-09-28T13:59:50Z","title":"On the Trade-offs between Adversarial Robustness and Actionable\n Explanations","summary":" As machine learning models are increasingly being employed in various\nhigh-stakes settings, it becomes important to ensure that predictions of these\nmodels are not only adversarially robust, but also readily explainable to\nrelevant stakeholders. However, it is unclear if these two notions can be\nsimultaneously achieved or if there exist trade-offs between them. In this\nwork, we make one of the first attempts at studying the impact of adversarially\nrobust models on actionable explanations which provide end users with a means\nfor recourse. We theoretically and empirically analyze the cost (ease of\nimplementation) and validity (probability of obtaining a positive model\nprediction) of recourses output by state-of-the-art algorithms when the\nunderlying models are adversarially robust vs. non-robust. 
More specifically,\nwe derive theoretical bounds on the differences between the cost and the\nvalidity of the recourses generated by state-of-the-art algorithms for\nadversarially robust vs. non-robust linear and non-linear models. Our empirical\nresults with multiple real-world datasets validate our theoretical results and\nshow the impact of varying degrees of model robustness on the cost and validity\nof the resulting recourses. Our analyses demonstrate that adversarially robust\nmodels significantly increase the cost and reduce the validity of the resulting\nrecourses, thus shedding light on the inherent trade-offs between adversarial\nrobustness and actionable explanations.\n","authors":["Satyapriya Krishna","Chirag Agarwal","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2309.16452v2.pdf","comment":"Accepted in the 7th AAAI Conference on AI, Ethics, and Society, 2024"},{"id":"http://arxiv.org/abs/2407.16970v1","updated":"2024-07-24T03:32:05Z","published":"2024-07-24T03:32:05Z","title":"Towards Aligning Language Models with Textual Feedback","summary":" We present ALT (ALignment with Textual feedback), an approach that aligns\nlanguage models with user preferences expressed in text. We argue that text\noffers greater expressiveness, enabling users to provide richer feedback than\nsimple comparative preferences and this richer feedback can lead to more\nefficient and effective alignment. ALT aligns the model by conditioning its\ngeneration on the textual feedback. Our method relies solely on language\nmodeling techniques and requires minimal hyper-parameter tuning, though it\nstill presents the main benefits of RL-based alignment algorithms and can\neffectively learn from textual feedback. We explore the efficacy and efficiency\nof textual feedback across different tasks such as toxicity reduction,\nsummarization, and dialog response generation. We find that ALT outperforms PPO\nfor the task of toxicity reduction while being able to match its performance on\nsummarization with only 20% of the samples. We also explore how ALT can be used\nwith feedback provided by an existing LLM where we explore an LLM providing\nconstrained and unconstrained textual feedback. We also outline future\ndirections to align models with natural language feedback.\n","authors":["Saüc Abadal Lloret","Shehzaad Dhuliawala","Keerthiram Murugesan","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2407.16970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16968v1","updated":"2024-07-24T03:26:26Z","published":"2024-07-24T03:26:26Z","title":"Stochastic Variance-Reduced Iterative Hard Thresholding in Graph\n Sparsity Optimization","summary":" Stochastic optimization algorithms are widely used for large-scale data\nanalysis due to their low per-iteration costs, but they often suffer from slow\nasymptotic convergence caused by inherent variance. Variance-reduced techniques\nhave been therefore used to address this issue in structured sparse models\nutilizing sparsity-inducing norms or $\\ell_0$-norms. However, these techniques\nare not directly applicable to complex (non-convex) graph sparsity models,\nwhich are essential in applications like disease outbreak monitoring and social\nnetwork analysis. In this paper, we introduce two stochastic variance-reduced\ngradient-based methods to solve graph sparsity optimization: GraphSVRG-IHT and\nGraphSCSG-IHT. We provide a general framework for theoretical analysis,\ndemonstrating that our methods enjoy a linear convergence speed. 
Extensive\nexperiments validate\n","authors":["Derek Fox","Samuel Hernandez","Qianqian Tong"],"pdf_url":"https://arxiv.org/pdf/2407.16968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16964v1","updated":"2024-07-24T03:02:57Z","published":"2024-07-24T03:02:57Z","title":"When AI Defeats Password Deception! A Deep Learning Framework to\n Distinguish Passwords and Honeywords","summary":" \"Honeywords\" have emerged as a promising defense mechanism for detecting data\nbreaches and foiling offline dictionary attacks (ODA) by deceiving attackers\nwith false passwords. In this paper, we propose PassFilter, a novel deep\nlearning (DL) based attack framework, fundamental in its ability to identify\npasswords from a set of sweetwords associated with a user account, effectively\nchallenging a variety of honeywords generation techniques (HGTs). The DL model\nin PassFilter is trained with a set of previously collected or adversarially\ngenerated passwords and honeywords, and carefully orchestrated to predict\nwhether a sweetword is the password or a honeyword. Our model can compromise\nthe security of state-of-the-art, heuristics-based, and representation\nlearning-based HGTs proposed by Dionysiou et al. Specifically, our analysis\nwith nine publicly available password datasets shows that PassFilter\nsignificantly outperforms the baseline random guessing success rate of 5%,\nachieving 6.10% to 52.78% on the 1st guessing attempt, considering 20\nsweetwords per account. This success rate rapidly increases with additional\nlogin attempts before account lock-outs, often allowed on many real-world\nonline services to maintain reasonable usability. For example, it ranges from\n41.78% to 96.80% for five attempts, and from 72.87% to 99.00% for ten attempts,\ncompared to 25% and 50% random guessing, respectively. We also examined\nPassFilter against general-purpose language models used for honeyword\ngeneration, like those proposed by Yu et al. These honeywords also proved\nvulnerable to our attack, with success rates of 14.19% for 1st guessing\nattempt, increasing to 30.23%, 41.70%, and 63.10% after 3rd, 5th, and 10th\nguessing attempts, respectively. Our findings demonstrate the effectiveness of\nDL model deployed in PassFilter in breaching state-of-the-art HGTs and\ncompromising password security based on ODA.\n","authors":["Jimmy Dani","Brandon McCulloh","Nitesh Saxena"],"pdf_url":"https://arxiv.org/pdf/2407.16964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11271v2","updated":"2024-07-24T02:59:40Z","published":"2024-06-17T07:21:36Z","title":"MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal\n Dataset with One Trillion Tokens","summary":" Multimodal interleaved datasets featuring free-form interleaved sequences of\nimages and text are crucial for training frontier large multimodal models\n(LMMs). Despite the rapid progression of open-source LMMs, there remains a\npronounced scarcity of large-scale, diverse open-source multimodal interleaved\ndatasets. In response, we introduce MINT-1T, the most extensive and diverse\nopen-source Multimodal INTerleaved dataset to date. MINT-1T comprises one\ntrillion text tokens and 3.4 billion images, a 10x scale-up from existing\nopen-source datasets. Additionally, we include previously untapped sources such\nas PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires\nsubstantial engineering effort, sharing the data curation process and releasing\nthe dataset greatly benefits the community. 
Our experiments show that LMMs\ntrained on MINT-1T rival the performance of models trained on the previous\nleading dataset, OBELICS. Our data and code will be released at\nhttps://github.com/mlfoundations/MINT-1T.\n","authors":["Anas Awadalla","Le Xue","Oscar Lo","Manli Shu","Hannah Lee","Etash Kumar Guha","Matt Jordan","Sheng Shen","Mohamed Awadalla","Silvio Savarese","Caiming Xiong","Ran Xu","Yejin Choi","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2406.11271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16959v1","updated":"2024-07-24T02:56:22Z","published":"2024-07-24T02:56:22Z","title":"Dynamic Graph Transformer with Correlated Spatial-Temporal Positional\n Encoding","summary":" Learning effective representations for Continuous-Time Dynamic Graphs (CTDGs)\nhas garnered significant research interest, largely due to its powerful\ncapabilities in modeling complex interactions between nodes. A fundamental and\ncrucial requirement for representation learning in CTDGs is the appropriate\nestimation and preservation of proximity. However, due to the sparse and\nevolving characteristics of CTDGs, the spatial-temporal properties inherent in\nhigh-order proximity remain largely unexplored. Despite its importance, this\nproperty presents significant challenges due to the computationally intensive\nnature of personalized interaction intensity estimation and the dynamic\nattributes of CTDGs. To this end, we propose a novel Correlated\nSpatial-Temporal Positional encoding that incorporates a parameter-free\npersonalized interaction intensity estimation under the weak assumption of the\nPoisson Point Process. Building on this, we introduce the Dynamic Graph\nTransformer with \\Correlated Spatial-Temporal Positional Encoding (CorDGT),\nwhich efficiently retains the evolving spatial-temporal high-order proximity\nfor effective node representation learning in CTDGs. Extensive experiments on\nseven small and two large-scale datasets demonstrate the superior performance\nand scalability of the proposed CorDGT.\n","authors":["Zhe Wang","Sheng Zhou","Jiawei Chen","Zhen Zhang","Binbin Hu","Yan Feng","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16958v1","updated":"2024-07-24T02:52:02Z","published":"2024-07-24T02:52:02Z","title":"Cheems: Wonderful Matrices More Efficient and More Effective\n Architecture","summary":" Recent studies have shown that, relative position encoding performs well in\nselective state space model scanning algorithms, and the architecture that\nbalances SSM and Attention enhances the efficiency and effectiveness of the\nalgorithm, while the sparse activation of the mixture of experts reduces the\ntraining cost. I studied the effectiveness of using different position\nencodings in structured state space dual algorithms, and the more effective\nSSD-Attn internal and external function mixing method, and designed a more\nefficient cross domain mixture of experts. I found that the same matrix is very\nwonderful in different algorithms, which allows us to establish a new hybrid\nsparse architecture: Cheems. 
Compared with other hybrid architectures, it is\nmore efficient and more effective in language modeling tasks.\n","authors":["Jingze Shi","Lu He","Yuhan Wang","Tianyu He","Bingheng Wu","Mingkun Hou"],"pdf_url":"https://arxiv.org/pdf/2407.16958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16951v1","updated":"2024-07-24T02:37:42Z","published":"2024-07-24T02:37:42Z","title":"Towards Transfer Unlearning: Empirical Evidence of Cross-Domain Bias\n Mitigation","summary":" Large language models (LLMs) often inherit biases from vast amounts of\ntraining corpora. Traditional debiasing methods, while effective to some\nextent, do not completely eliminate memorized biases and toxicity in LLMs. In\nthis paper, we study an unlearning-based approach to debiasing in LLMs by\nperforming gradient ascent on hate speech against minority groups, i.e.,\nminimizing the likelihood of biased or toxic content. Specifically, we propose\na mask language modeling unlearning technique, which unlearns the harmful part\nof the text. This method enables LLMs to selectively forget and disassociate\nfrom biased and harmful content. Experimental results demonstrate the\neffectiveness of our approach in diminishing bias while maintaining the\nlanguage modeling abilities. Surprisingly, the results also unveil an\nunexpected potential for cross-domain transfer unlearning: debiasing in one\nbias form (e.g. gender) may contribute to mitigating others (e.g. race and\nreligion).\n","authors":["Huimin Lu","Masaru Isonuma","Junichiro Mori","Ichiro Sakata"],"pdf_url":"https://arxiv.org/pdf/2407.16951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16944v1","updated":"2024-07-24T02:23:18Z","published":"2024-07-24T02:23:18Z","title":"An Adaptive Gradient Regularization Method","summary":" Optimizer plays an important role in neural network training with high\nefficiency and performance. Weight update based on its gradient is the central\npart of the optimizer. It has been shown that normalization and standardization\noperation on weight and gradient can accelerate the training process and\nimprove performance such as Weight Standardization (WS), weight normalization\n(WN) and gradient normalization (GN); there is also gradient centralization\n(GC). In this work, we introduce a new optimization technique based on the\ngradient magnitude in a gradient vector named adaptive gradient regularization\n(AGR), which normalizes the gradient vector in all dimensions as a coefficient\nvector and subtracts the product of the gradient and its coefficient vector by\nthe vanilla gradient. It can be viewed as an adaptive gradient clipping method.\nWe show that the AGR can improve the loss function Lipschitzness with a more\nstable training process and better generalization performance. AGR is very\nsimple to be embedded into vanilla optimizers such as Adan and AdamW with only\nthree lines of code. 
Our experiments are conducted in image generation, image\nclassification and language representation, which show that our AGR improves\nthe training result.\n","authors":["Huixiu Jiang","Yu Bao","Rutong Si"],"pdf_url":"https://arxiv.org/pdf/2407.16944v1.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2407.16940v1","updated":"2024-07-24T02:20:29Z","published":"2024-07-24T02:20:29Z","title":"GV-Rep: A Large-Scale Dataset for Genetic Variant Representation\n Learning","summary":" Genetic variants (GVs) are defined as differences in the DNA sequences among\nindividuals and play a crucial role in diagnosing and treating genetic\ndiseases. The rapid decrease in next generation sequencing cost has led to an\nexponential increase in patient-level GV data. This growth poses a challenge\nfor clinicians who must efficiently prioritize patient-specific GVs and\nintegrate them with existing genomic databases to inform patient management. To\naddress the interpretation of GVs, genomic foundation models (GFMs) have\nemerged. However, these models lack standardized performance assessments,\nleading to considerable variability in model evaluations. This poses the\nquestion: How effectively do deep learning methods classify unknown GVs and\nalign them with clinically-verified GVs? We argue that representation learning,\nwhich transforms raw data into meaningful feature spaces, is an effective\napproach for addressing both indexing and classification challenges. We\nintroduce a large-scale Genetic Variant dataset, named GV-Rep, featuring\nvariable-length contexts and detailed annotations, designed for deep learning\nmodels to learn GV representations across various traits, diseases, tissue\ntypes, and experimental contexts. Our contributions are three-fold: (i)\nConstruction of a comprehensive dataset with 7 million records, each labeled\nwith characteristics of the corresponding variants, alongside additional data\nfrom 17,548 gene knockout tests across 1,107 cell types, 1,808 variant\ncombinations, and 156 unique clinically verified GVs from real-world patients.\n(ii) Analysis of the structure and properties of the dataset. (iii)\nExperimentation of the dataset with pre-trained GFMs. The results show a\nsignificant gap between GFMs' current capabilities and accurate GV\nrepresentation. We hope this dataset will help advance genomic deep learning to\nbridge this gap.\n","authors":["Zehui Li","Vallijah Subasri","Guy-Bart Stan","Yiren Zhao","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16940v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.16938v1","updated":"2024-07-24T02:16:52Z","published":"2024-07-24T02:16:52Z","title":"Synthetic Trajectory Generation Through Convolutional Neural Networks","summary":" Location trajectories provide valuable insights for applications from urban\nplanning to pandemic control. However, mobility data can also reveal sensitive\ninformation about individuals, such as political opinions, religious beliefs,\nor sexual orientations. Existing privacy-preserving approaches for publishing\nthis data face a significant utility-privacy trade-off. Releasing synthetic\ntrajectory data generated through deep learning offers a promising solution.\nDue to the trajectories' sequential nature, most existing models are based on\nrecurrent neural networks (RNNs). However, research in generative adversarial\nnetworks (GANs) largely employs convolutional neural networks (CNNs) for image\ngeneration. 
This discrepancy raises the question of whether advances in\ncomputer vision can be applied to trajectory generation. In this work, we\nintroduce a Reversible Trajectory-to-CNN Transformation (RTCT) that adapts\ntrajectories into a format suitable for CNN-based models. We integrated this\ntransformation with the well-known DCGAN in a proof-of-concept (PoC) and\nevaluated its performance against an RNN-based trajectory GAN using four\nmetrics across two datasets. The PoC was superior in capturing spatial\ndistributions compared to the RNN model but had difficulty replicating\nsequential and temporal properties. Although the PoC's utility is not\nsufficient for practical applications, the results demonstrate the\ntransformation's potential to facilitate the use of CNNs for trajectory\ngeneration, opening up avenues for future research. To support continued\nresearch, all source code has been made available under an open-source license.\n","authors":["Jesse Merhi","Erik Buchholz","Salil S. Kanhere"],"pdf_url":"https://arxiv.org/pdf/2407.16938v1.pdf","comment":"To appear in the proceedings of the 21st Annual International\n Conference on Privacy, Security & Trust (PST 2024)"},{"id":"http://arxiv.org/abs/2407.16936v1","updated":"2024-07-24T02:15:48Z","published":"2024-07-24T02:15:48Z","title":"Provable Benefit of Annealed Langevin Monte Carlo for Non-log-concave\n Sampling","summary":" We address the outstanding problem of sampling from an unnormalized density\nthat may be non-log-concave and multimodal. To enhance the performance of\nsimple Markov chain Monte Carlo (MCMC) methods, techniques of annealing type\nhave been widely used. However, quantitative theoretical guarantees of these\ntechniques are under-explored. This study takes a first step toward providing a\nnon-asymptotic analysis of annealed MCMC. Specifically, we establish, for the\nfirst time, an oracle complexity of $\\widetilde{O}\\left(\\frac{d\\beta^2{\\cal\nA}^2}{\\varepsilon^6}\\right)$ for simple annealed Langevin Monte Carlo algorithm\nto achieve $\\varepsilon^2$ accuracy in Kullback-Leibler divergence to the\ntarget distribution $\\pi\\propto{\\rm e}^{-V}$ on $\\mathbb{R}^d$ with\n$\\beta$-smooth potential $V$. Here, ${\\cal A}$ represents the action of a curve\nof probability measures interpolating the target distribution $\\pi$ and a\nreadily sampleable distribution.\n","authors":["Wei Guo","Molei Tao","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2407.16936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10081v3","updated":"2024-07-24T02:08:25Z","published":"2023-06-16T07:07:58Z","title":"Optimizer's Information Criterion: Dissecting and Correcting Bias in\n Data-Driven Optimization","summary":" In data-driven optimization, the sample performance of the obtained decision\ntypically incurs an optimistic bias against the true performance, a phenomenon\ncommonly known as the Optimizer's Curse and intimately related to overfitting\nin machine learning. Common techniques to correct this bias, such as\ncross-validation, require repeatedly solving additional optimization problems\nand are therefore computationally expensive. We develop a general bias\ncorrection approach, building on what we call Optimizer's Information Criterion\n(OIC), that directly approximates the first-order bias and does not require\nsolving any additional optimization problems. 
Our OIC generalizes the\ncelebrated Akaike Information Criterion to evaluate the objective performance\nin data-driven optimization, which crucially involves not only model fitting\nbut also its interplay with the downstream optimization. As such it can be used\nfor decision selection instead of only model selection. We apply our approach\nto a range of data-driven optimization formulations comprising empirical and\nparametric models, their regularized counterparts, and furthermore contextual\noptimization. Finally, we provide numerical validation on the superior\nperformance of our approach under synthetic and real-world datasets.\n","authors":["Garud Iyengar","Henry Lam","Tianyu Wang"],"pdf_url":"https://arxiv.org/pdf/2306.10081v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03718v2","updated":"2024-07-24T02:03:47Z","published":"2024-07-04T08:08:12Z","title":"Multi-Convformer: Extending Conformer with Multiple Convolution Kernels","summary":" Convolutions have become essential in state-of-the-art end-to-end Automatic\nSpeech Recognition~(ASR) systems due to their efficient modelling of local\ncontext. Notably, its use in Conformers has led to superior performance\ncompared to vanilla Transformer-based ASR systems. While components other than\nthe convolution module in the Conformer have been reexamined, altering the\nconvolution module itself has been far less explored. Towards this, we\nintroduce Multi-Convformer that uses multiple convolution kernels within the\nconvolution module of the Conformer in conjunction with gating. This helps in\nimproved modeling of local dependencies at varying granularities. Our model\nrivals existing Conformer variants such as CgMLP and E-Branchformer in\nperformance, while being more parameter efficient. We empirically compare our\napproach with Conformer and its variants across four different datasets and\nthree different modelling paradigms and show up to 8% relative word error\nrate~(WER) improvements.\n","authors":["Darshan Prabhu","Yifan Peng","Preethi Jyothi","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.03718v2.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.16935v1","updated":"2024-07-24T02:03:28Z","published":"2024-07-24T02:03:28Z","title":"Federated Automatic Latent Variable Selection in Multi-output Gaussian\n Processes","summary":" This paper explores a federated learning approach that automatically selects\nthe number of latent processes in multi-output Gaussian processes (MGPs). The\nMGP has seen great success as a transfer learning tool when data is generated\nfrom multiple sources/units/entities. A common approach in MGPs to transfer\nknowledge across units involves gathering all data from each unit to a central\nserver and extracting common independent latent processes to express each unit\nas a linear combination of the shared latent patterns. However, this approach\nposes key challenges in (i) determining the adequate number of latent processes\nand (ii) relying on centralized learning which leads to potential privacy risks\nand significant computational burdens on the central server. To address these\nissues, we propose a hierarchical model that places spike-and-slab priors on\nthe coefficients of each latent process. These priors help automatically select\nonly needed latent processes by shrinking the coefficients of unnecessary ones\nto zero. 
To estimate the model while avoiding the drawbacks of centralized\nlearning, we propose a variational inference-based approach, that formulates\nmodel inference as an optimization problem compatible with federated settings.\nWe then design a federated learning algorithm that allows units to jointly\nselect and infer the common latent processes without sharing their data. We\nalso discuss an efficient learning approach for a new unit within our proposed\nfederated framework. Simulation and case studies on Li-ion battery degradation\nand air temperature data demonstrate the advantageous features of our proposed\napproach.\n","authors":["Jingyi Gao","Seokhyun Chung"],"pdf_url":"https://arxiv.org/pdf/2407.16935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16933v1","updated":"2024-07-24T01:54:30Z","published":"2024-07-24T01:54:30Z","title":"Deep Koopman-based Control of Quality Variation in Multistage\n Manufacturing Systems","summary":" This paper presents a modeling-control synthesis to address the quality\ncontrol challenges in multistage manufacturing systems (MMSs). A new\nfeedforward control scheme is developed to minimize the quality variations\ncaused by process disturbances in MMSs. Notably, the control framework\nleverages a stochastic deep Koopman (SDK) model to capture the quality\npropagation mechanism in the MMSs, highlighted by its ability to transform the\nnonlinear propagation dynamics into a linear one. Two roll-to-roll case studies\nare presented to validate the proposed method and demonstrate its\neffectiveness. The overall method is suitable for nonlinear MMSs and does not\nrequire extensive expert knowledge.\n","authors":["Zhiyi Chen","Harshal Maske","Devesh Upadhyay","Huanyi Shui","Xun Huan","Jun Ni"],"pdf_url":"https://arxiv.org/pdf/2407.16933v1.pdf","comment":"The paper was in the proceeding of 2024 American Control Conference.\n This submitted version addresses a minor correction to one equation (Eq. 14),\n while the results and conclusions remain the same"},{"id":"http://arxiv.org/abs/2407.04418v2","updated":"2024-07-24T01:32:05Z","published":"2024-07-05T11:09:05Z","title":"Enabling On-Device LLMs Personalization with Smartphone Sensing","summary":" This demo presents a novel end-to-end framework that combines on-device large\nlanguage models (LLMs) with smartphone sensing technologies to achieve\ncontext-aware and personalized services. The framework addresses critical\nlimitations of current personalization solutions via cloud LLMs, such as\nprivacy concerns, latency and cost, and limited personal information. To\nachieve this, we innovatively proposed deploying LLMs on smartphones with\nmultimodal sensor data through context-aware sensing and customized prompt\nengineering, ensuring privacy and enhancing personalization performance. A case\nstudy involving a university student demonstrated the capability of the\nframework to provide tailored recommendations. In addition, we show that the\nframework achieves the best trade-off in privacy, performance, latency, cost,\nbattery and energy consumption between on-device and cloud LLMs. To the best of\nour knowledge, this is the first framework to provide on-device LLMs\npersonalization with smartphone sensing. Future work will incorporate more\ndiverse sensor data and involve extensive user studies to enhance\npersonalization. 
Our proposed framework has the potential to substantially\nimprove user experiences across domains including healthcare, productivity, and\nentertainment.\n","authors":["Shiquan Zhang","Ying Ma","Le Fang","Hong Jia","Simon D'Alfonso","Vassilis Kostakos"],"pdf_url":"https://arxiv.org/pdf/2407.04418v2.pdf","comment":"5 pages, 3 figures, conference demo paper"},{"id":"http://arxiv.org/abs/2407.16927v1","updated":"2024-07-24T01:28:04Z","published":"2024-07-24T01:28:04Z","title":"DeepCell: A Ubiquitous Accurate Provider-side Cellular-based\n Localization","summary":" Although outdoor localization is already available to the general public and\nbusinesses through the wide spread use of the GPS, it is not supported by\nlow-end phones, requires a direct line of sight to satellites and can drain\nphone battery quickly. The current fingerprinting solutions can provide\nhigh-accuracy localization but are based on the client side. This limits their\nubiquitous deployment and accuracy. In this paper, we introduce DeepCell: a\nprovider-side fingerprinting localization system that can provide high accuracy\nlocalization for any cell phone. To build its fingerprint, DeepCell leverages\nthe unlabeled cellular measurements recorded by the cellular provider while\nopportunistically synchronizing with selected client devices to get location\nlabels. The fingerprint is then used to train a deep neural network model that\nis harnessed for localization. To achieve this goal, DeepCell need to address a\nnumber of challenges including using unlabeled data from the provider side,\nhandling noise and sparsity, scaling the data to large areas, and finally\nproviding enough data that is required for training deep models without\noverhead. Evaluation of DeepCell in a typical realistic environment shows that\nit can achieve a consistent median accuracy of 29m. This accuracy outperforms\nthe state-of-the-art client-based cellular-based systems by more than 75.4%. In\naddition, the same accuracy is extended to low-end phones.\n","authors":["Ahmed Shokry","Moustafa Youssef"],"pdf_url":"https://arxiv.org/pdf/2407.16927v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2106.13632"},{"id":"http://arxiv.org/abs/2407.16923v1","updated":"2024-07-24T01:15:53Z","published":"2024-07-24T01:15:53Z","title":"Handling Device Heterogeneity for Deep Learning-based Localization","summary":" Deep learning-based fingerprinting is one of the current promising\ntechnologies for outdoor localization in cellular networks. However, deploying\nsuch localization systems for heterogeneous phones affects their accuracy as\nthe cellular received signal strength (RSS) readings vary for different types\nof phones. In this paper, we introduce a number of techniques for addressing\nthe phones heterogeneity problem in the deep-learning based localization\nsystems. The basic idea is either to approximate a function that maps the\ncellular RSS measurements between different devices or to transfer the\nknowledge across them.\n Evaluation of the proposed techniques using different Android phones on four\nindependent testbeds shows that our techniques can improve the localization\naccuracy by more than 220% for the four testbeds as compared to the\nstate-of-the-art systems. 
This highlights the promise of the proposed device\nheterogeneity handling techniques for enabling a wide deployment of deep\nlearning-based localization systems over different devices.\n","authors":["Ahmed Shokry","Moustafa Youssef"],"pdf_url":"https://arxiv.org/pdf/2407.16923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14710v2","updated":"2024-07-24T01:15:40Z","published":"2024-07-20T00:11:59Z","title":"Universally Harmonizing Differential Privacy Mechanisms for Federated\n Learning: Boosting Accuracy and Convergence","summary":" Differentially private federated learning (DP-FL) is a promising technique\nfor collaborative model training while ensuring provable privacy for clients.\nHowever, optimizing the tradeoff between privacy and accuracy remains a\ncritical challenge. To the best of our knowledge, we propose the first DP-FL framework\n(namely UDP-FL), which universally harmonizes any randomization mechanism\n(e.g., an optimal one) with the Gaussian Moments Accountant (viz. DP-SGD) to\nsignificantly boost accuracy and convergence. Specifically, UDP-FL demonstrates\nenhanced model performance by mitigating the reliance on Gaussian noise. The\nkey mediator variable in this transformation is the R\\'enyi Differential\nPrivacy notion, which is carefully used to harmonize privacy budgets. We also\npropose an innovative method to theoretically analyze the convergence for DP-FL\n(including our UDP-FL) based on mode connectivity analysis. Moreover, we\nevaluate our UDP-FL through extensive experiments benchmarked against\nstate-of-the-art (SOTA) methods, demonstrating superior performance on both\nprivacy guarantees and model performance. Notably, UDP-FL exhibits substantial\nresilience against different inference attacks, indicating a significant\nadvance in safeguarding sensitive data in federated learning environments.\n","authors":["Shuya Feng","Meisam Mohammady","Hanbin Hong","Shenao Yan","Ashish Kundu","Binghui Wang","Yuan Hong"],"pdf_url":"https://arxiv.org/pdf/2407.14710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09017v3","updated":"2024-07-24T01:15:20Z","published":"2024-07-12T06:10:01Z","title":"AI-Driven Guided Response for Security Operation Centers with Microsoft\n Copilot for Security","summary":" Security operation centers contend with a constant stream of security\nincidents, ranging from straightforward to highly complex. To address this, we\ndeveloped Copilot Guided Response (CGR), an industry-scale ML architecture that\nguides security analysts across three key tasks -- (1) investigation, providing\nessential historical context by identifying similar incidents; (2) triaging to\nascertain the nature of the incident -- whether it is a true positive, false\npositive, or benign positive; and (3) remediation, recommending tailored\ncontainment actions. CGR is integrated into the Microsoft Defender XDR product\nand deployed worldwide, generating millions of recommendations across thousands\nof customers. Our extensive evaluation, incorporating internal evaluation,\ncollaboration with security experts, and customer feedback, demonstrates that\nCGR delivers high-quality recommendations across all three tasks. We provide a\ncomprehensive overview of the CGR architecture, setting a precedent as the\nfirst cybersecurity company to openly discuss these capabilities in such depth.\nAdditionally, we release GUIDE, the largest public collection of real-world security\nincidents, spanning 13M evidences across 1M annotated incidents. 
By enabling\nresearchers and practitioners to conduct research on real-world data, GUIDE\nadvances the state of cybersecurity and supports the development of\nnext-generation machine learning systems.\n","authors":["Scott Freitas","Jovan Kalajdjieski","Amir Gharib","Robert McCann"],"pdf_url":"https://arxiv.org/pdf/2407.09017v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11933v2","updated":"2024-07-24T01:13:35Z","published":"2024-02-19T08:19:26Z","title":"SLADE: Detecting Dynamic Anomalies in Edge Streams without Labels via\n Self-Supervised Learning","summary":" To detect anomalies in real-world graphs, such as social, email, and\nfinancial networks, various approaches have been developed. While they\ntypically assume static input graphs, most real-world graphs grow over time,\nnaturally represented as edge streams. In this context, we aim to achieve three\ngoals: (a) instantly detecting anomalies as they occur, (b) adapting to\ndynamically changing states, and (c) handling the scarcity of dynamic anomaly\nlabels. In this paper, we propose SLADE (Self-supervised Learning for Anomaly\nDetection in Edge Streams) for rapid detection of dynamic anomalies in edge\nstreams, without relying on labels. SLADE detects the shifts of nodes into\nabnormal states by observing deviations in their interaction patterns over\ntime. To this end, it trains a deep neural network to perform two\nself-supervised tasks: (a) minimizing drift in node representations and (b)\ngenerating long-term interaction patterns from short-term ones. Failure in\nthese tasks for a node signals its deviation from the norm. Notably, the neural\nnetwork and tasks are carefully designed so that all required operations can be\nperformed in constant time (w.r.t. the graph size) in response to each new edge\nin the input stream. In dynamic anomaly detection across four real-world\ndatasets, SLADE outperforms nine competing methods, even those leveraging label\nsupervision.\n","authors":["Jongha Lee","Sunwoo Kim","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2402.11933v2.pdf","comment":"15 pages, 6 figures, To Appear in KDD 2024"},{"id":"http://arxiv.org/abs/2404.01039v2","updated":"2024-07-24T01:10:49Z","published":"2024-04-01T10:50:34Z","title":"A Survey on Hypergraph Neural Networks: An In-Depth and Step-By-Step\n Guide","summary":" Higher-order interactions (HOIs) are ubiquitous in real-world complex systems\nand applications. Investigation of deep learning for HOIs, thus, has become a\nvaluable agenda for the data mining and machine learning communities. As\nnetworks of HOIs are expressed mathematically as hypergraphs, hypergraph neural\nnetworks (HNNs) have emerged as a powerful tool for representation learning on\nhypergraphs. Given the emerging trend, we present the first survey dedicated to\nHNNs, with an in-depth and step-by-step guide. Broadly, the present survey\noverviews HNN architectures, training strategies, and applications. First, we\nbreak existing HNNs down into four design components: (i) input features, (ii)\ninput structures, (iii) message-passing schemes, and (iv) training strategies.\nSecond, we examine how HNNs address and learn HOIs with each of their\ncomponents. Third, we overview the recent applications of HNNs in\nrecommendation, bioinformatics and medical science, time series analysis, and\ncomputer vision. 
Lastly, we conclude with a discussion on limitations and\nfuture directions.\n","authors":["Sunwoo Kim","Soo Yong Lee","Yue Gao","Alessia Antelmi","Mirko Polato","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2404.01039v2.pdf","comment":"To appear in KDD 2024 (survey paper)"},{"id":"http://arxiv.org/abs/2112.08222v5","updated":"2024-07-24T01:06:36Z","published":"2021-12-15T15:57:33Z","title":"Guaranteed Trajectory Tracking under Learned Dynamics with Contraction\n Metrics and Disturbance Estimation","summary":" This paper presents an approach to trajectory-centric learning control based\non contraction metrics and disturbance estimation for nonlinear systems subject\nto matched uncertainties. The approach uses deep neural networks to learn\nuncertain dynamics while still providing guarantees of transient tracking\nperformance throughout the learning phase. Within the proposed approach, a\ndisturbance estimation law is adopted to estimate the pointwise value of the\nuncertainty, with pre-computable estimation error bounds (EEBs). The learned\ndynamics, the estimated disturbances, and the EEBs are then incorporated in a\nrobust Riemann energy condition to compute the control law that guarantees\nexponential convergence of actual trajectories to desired ones throughout the\nlearning phase, even when the learned model is poor. On the other hand, with\nimproved accuracy, the learned model can help improve the robustness of the\ntracking controller, e.g., against input delays, and can be incorporated to\nplan better trajectories with improved performance, e.g., lower energy\nconsumption and shorter travel time.The proposed framework is validated on a\nplanar quadrotor example.\n","authors":["Pan Zhao","Ziyao Guo","Yikun Cheng","Aditya Gahlawat","Hyungsoo Kang","Naira Hovakimyan"],"pdf_url":"https://arxiv.org/pdf/2112.08222v5.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.16917v1","updated":"2024-07-24T00:44:52Z","published":"2024-07-24T00:44:52Z","title":"TelescopeML -- I. An End-to-End Python Package for Interpreting\n Telescope Datasets through Training Machine Learning Models, Generating\n Statistical Reports, and Visualizing Results","summary":" We are on the verge of a revolutionary era in space exploration, thanks to\nadvancements in telescopes such as the James Webb Space Telescope\n(\\textit{JWST}). High-resolution, high signal-to-noise spectra from exoplanet\nand brown dwarf atmospheres have been collected over the past few decades,\nrequiring the development of accurate and reliable pipelines and tools for\ntheir analysis. Accurately and swiftly determining the spectroscopic parameters\nfrom the observational spectra of these objects is crucial for understanding\ntheir atmospheric composition and guiding future follow-up observations.\n\\texttt{TelescopeML} is a Python package developed to perform three main tasks:\n1. Process the synthetic astronomical datasets for training a CNN model and\nprepare the observational dataset for later use for prediction; 2. Train a CNN\nmodel by implementing the optimal hyperparameters; and 3. Deploy the trained\nCNN models on the actual observational data to derive the output spectroscopic\nparameters.\n","authors":[" Ehsan"," Gharib-Nezhad","Natasha E. Batalha","Hamed Valizadegan","Miguel J. S. 
Martinho","Mahdi Habibi","Gopal Nookula"],"pdf_url":"https://arxiv.org/pdf/2407.16917v1.pdf","comment":"Please find the accepted paper with complete reference list at\n https://joss.theoj.org/papers/10.21105/joss.06346"},{"id":"http://arxiv.org/abs/2403.01046v4","updated":"2024-07-24T00:32:35Z","published":"2024-03-02T00:33:45Z","title":"A Library of Mirrors: Deep Neural Nets in Low Dimensions are Convex\n Lasso Models with Reflection Features","summary":" We prove that training neural networks on 1-D data is equivalent to solving\nconvex Lasso problems with discrete, explicitly defined dictionary matrices. We\nconsider neural networks with piecewise linear activations and depths ranging\nfrom 2 to an arbitrary but finite number of layers. We first show that\ntwo-layer networks with piecewise linear activations are equivalent to Lasso\nmodels using a discrete dictionary of ramp functions, with breakpoints\ncorresponding to the training data points. In certain general architectures\nwith absolute value or ReLU activations, a third layer surprisingly creates\nfeatures that reflect the training data about themselves. Additional layers\nprogressively generate reflections of these reflections. The Lasso\nrepresentation provides valuable insights into the analysis of globally optimal\nnetworks, elucidating their solution landscapes and enabling closed-form\nsolutions in certain special cases. Numerical results show that reflections\nalso occur when optimizing standard deep networks using standard non-convex\noptimizers. Additionally, we demonstrate our theory with autoregressive time\nseries models.\n","authors":["Emi Zeger","Yifei Wang","Aaron Mishkin","Tolga Ergen","Emmanuel Candès","Mert Pilanci"],"pdf_url":"https://arxiv.org/pdf/2403.01046v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16912v1","updated":"2024-07-24T00:13:00Z","published":"2024-07-24T00:13:00Z","title":"Cross-Domain Policy Transfer by Representation Alignment via\n Multi-Domain Behavioral Cloning","summary":" Transferring learned skills across diverse situations remains a fundamental\nchallenge for autonomous agents, particularly when agents are not allowed to\ninteract with an exact target setup. While prior approaches have predominantly\nfocused on learning domain translation, they often struggle with handling\nsignificant domain gaps or out-of-distribution tasks. In this paper, we present\na simple approach for cross-domain policy transfer that learns a shared latent\nrepresentation across domains and a common abstract policy on top of it. Our\napproach leverages multi-domain behavioral cloning on unaligned trajectories of\nproxy tasks and employs maximum mean discrepancy (MMD) as a regularization term\nto encourage cross-domain alignment. The MMD regularization better preserves\nstructures of latent state distributions than commonly used\ndomain-discriminative distribution matching, leading to higher transfer\nperformance. Moreover, our approach involves training only one multi-domain\npolicy, which makes extension easier than existing methods. Empirical\nevaluations demonstrate the efficacy of our method across various domain\nshifts, especially in scenarios where exact domain translation is challenging,\nsuch as cross-morphology or cross-viewpoint settings. 
Our ablation studies\nfurther reveal that multi-domain behavioral cloning implicitly contributes to\nrepresentation alignment alongside domain-adversarial regularization.\n","authors":["Hayato Watahiki","Ryo Iwase","Ryosuke Unno","Yoshimasa Tsuruoka"],"pdf_url":"https://arxiv.org/pdf/2407.16912v1.pdf","comment":"CoLLAs 2024 (Oral). Code:\n https://github.com/hwatahiki/portable-latent-policy"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.17274v1","updated":"2024-07-24T13:39:51Z","published":"2024-07-24T13:39:51Z","title":"Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken\n Generation","summary":" Text-to-image retrieval is a fundamental task in multimedia processing,\naiming to retrieve semantically relevant cross-modal content. Traditional\nstudies have typically approached this task as a discriminative problem,\nmatching the text and image via the cross-attention mechanism (one-tower\nframework) or in a common embedding space (two-tower framework). Recently,\ngenerative cross-modal retrieval has emerged as a new research line, which\nassigns images with unique string identifiers and generates the target\nidentifier as the retrieval target. Despite its great potential, existing\ngenerative approaches are limited due to the following issues: insufficient\nvisual information in identifiers, misalignment with high-level semantics, and\nlearning gap towards the retrieval target. To address the above issues, we\npropose an autoregressive voken generation method, named AVG. AVG tokenizes\nimages into vokens, i.e., visual tokens, and innovatively formulates the\ntext-to-image retrieval task as a token-to-voken generation problem. AVG\ndiscretizes an image into a sequence of vokens as the identifier of the image,\nwhile maintaining the alignment with both the visual information and high-level\nsemantics of the image. 
Additionally, to bridge the learning gap between\ngenerative training and the retrieval target, we incorporate discriminative\ntraining to modify the learning direction during token-to-voken training.\nExtensive experiments demonstrate that AVG achieves superior results in both\neffectiveness and efficiency.\n","authors":["Yongqi Li","Hongru Cai","Wenjie Wang","Leigang Qu","Yinwei Wei","Wenjie Li","Liqiang Nie","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2407.17274v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.17205v1","updated":"2024-07-24T12:04:25Z","published":"2024-07-24T12:04:25Z","title":"The Sketchfab 3D Creative Commons Collection (S3D3C)","summary":" The technology to capture, create, and use three-dimensional (3D) models has\nbecome increasingly accessible in recent years.\n With increasing numbers of use cases for 3D models and collections of rapidly\nincreasing size, better methods to analyze the content of 3D models are\nrequired.\n While previously proposed 3D model collections for research purposes exist,\nthese often contain only untextured geometry and are typically designed for a\nspecific application, which limits their use in quantitative evaluations of\nmodern 3D model analysis methods.\n In this paper, we introduce the Sketchfab 3D Creative Commons Collection\n(S3D3C), a new 3D model research collection consisting of 40,802 creative\ncommons licensed models downloaded from the 3D model platform Sketchfab.\n By including popular freely available models with a wide variety of technical\nproperties, such as textures, materials, and animations, we enable its use in\nthe evaluation of state-of-the-art geometry-based and view-based 3D model\nanalysis and retrieval techniques.\n","authors":["Florian Spiess","Raphael Waltenspül","Heiko Schuldt"],"pdf_url":"https://arxiv.org/pdf/2407.17205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01976v2","updated":"2024-07-24T11:45:48Z","published":"2024-07-02T06:29:05Z","title":"A Bounding Box is Worth One Token: Interleaving Layout and Text in a\n Large Language Model for Document Understanding","summary":" Recently, many studies have demonstrated that exclusively incorporating\nOCR-derived text and spatial layouts with large language models (LLMs) can be\nhighly effective for document understanding tasks. However, existing methods\nthat integrate spatial layouts with text have limitations, such as producing\noverly long text sequences or failing to fully leverage the autoregressive\ntraits of LLMs. In this work, we introduce Interleaving Layout and Text in a\nLarge Language Model (LayTextLLM)} for document understanding. In particular,\nLayTextLLM projects each bounding box to a single embedding and interleaves it\nwith text, efficiently avoiding long sequence issues while leveraging\nautoregressive traits of LLMs. LayTextLLM not only streamlines the interaction\nof layout and textual data but also shows enhanced performance in Key\nInformation Extraction (KIE) and Visual Question Answering (VQA). 
Comprehensive\nbenchmark evaluations reveal significant improvements, with a 27.2% increase on\nKIE tasks and 12.0% on VQA tasks compared to previous state-of-the-art document\nunderstanding MLLMs, as well as a 15.1% improvement over other SOTA OCR-based\nLLMs on KIE tasks.\n","authors":["Jinghui Lu","Haiyang Yu","Yanjie Wang","Yongjie Ye","Jingqun Tang","Ziwei Yang","Binghong Wu","Qi Liu","Hao Feng","Han Wang","Hao Liu","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2407.01976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17028v1","updated":"2024-07-24T06:15:28Z","published":"2024-07-24T06:15:28Z","title":"Enhancing Environmental Monitoring through Multispectral Imaging: The\n WasteMS Dataset for Semantic Segmentation of Lakeside Waste","summary":" Environmental monitoring of lakeside green areas is crucial for environmental\nprotection. Compared to manual inspections, computer vision technologies offer\na more efficient solution when deployed on-site. Multispectral imaging provides\ndiverse information about objects under different spectrums, aiding in the\ndifferentiation between waste and lakeside lawn environments. This study\nintroduces WasteMS, the first multispectral dataset established for the\nsemantic segmentation of lakeside waste. WasteMS includes a diverse range of\nwaste types in lawn environments, captured under various lighting conditions.\nWe implemented a rigorous annotation process to label waste in images.\nRepresentative semantic segmentation frameworks were used to evaluate\nsegmentation accuracy using WasteMS. Challenges encountered when using WasteMS\nfor segmenting waste on lakeside lawns were discussed. The WasteMS dataset is\navailable at https://github.com/zhuqinfeng1999/WasteMS.\n","authors":["Qinfeng Zhu","Ningxin Weng","Lei Fan","Yuanzhi Cai"],"pdf_url":"https://arxiv.org/pdf/2407.17028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16248v2","updated":"2024-07-24T05:56:55Z","published":"2024-07-23T07:36:54Z","title":"Spatiotemporal Graph Guided Multi-modal Network for Livestreaming\n Product Retrieval","summary":" With the rapid expansion of e-commerce, more consumers have become accustomed\nto making purchases via livestreaming. Accurately identifying the products\nbeing sold by salespeople, i.e., livestreaming product retrieval (LPR), poses a\nfundamental and daunting challenge. The LPR task encompasses three primary\ndilemmas in real-world scenarios: 1) the recognition of intended products from\ndistractor products present in the background; 2) the video-image heterogeneity\nthat the appearance of products showcased in live streams often deviates\nsubstantially from standardized product images in stores; 3) there are numerous\nconfusing products with subtle visual nuances in the shop. To tackle these\nchallenges, we propose the Spatiotemporal Graphing Multi-modal Network (SGMN).\nFirst, we employ a text-guided attention mechanism that leverages the spoken\ncontent of salespeople to guide the model to focus toward intended products,\nemphasizing their salience over cluttered background products. Second, a\nlong-range spatiotemporal graph network is further designed to achieve both\ninstance-level interaction and frame-level matching, solving the misalignment\ncaused by video-image heterogeneity. Third, we propose a multi-modal hard\nexample mining, assisting the model in distinguishing highly similar products\nwith fine-grained features across the video-image-text domain. 
Through\nextensive quantitative and qualitative experiments, we demonstrate the superior\nperformance of our proposed SGMN model, surpassing the state-of-the-art methods\nby a substantial margin. The code is available at\nhttps://github.com/Huxiaowan/SGMN.\n","authors":["Xiaowan Hu","Yiyi Chen","Yan Li","Minquan Wang","Haoqian Wang","Quan Chen","Han Li","Peng Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.16248v2.pdf","comment":"9 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.16977v1","updated":"2024-07-24T03:45:35Z","published":"2024-07-24T03:45:35Z","title":"Selective Vision-Language Subspace Projection for Few-shot CLIP","summary":" Vision-language models such as CLIP are capable of mapping the different\nmodality data into a unified feature space, enabling zero/few-shot inference by\nmeasuring the similarity of given images and texts. However, most existing\nmethods overlook modality gaps in CLIP's encoded features, which is shown as\nthe text and image features lie far apart from each other, resulting in limited\nclassification performance. To tackle this issue, we introduce a method called\nSelective Vision-Language Subspace Projection (SSP), which incorporates local\nimage features and utilizes them as a bridge to enhance the alignment between\nimage-text pairs. Specifically, our SSP framework comprises two parallel\nmodules: a vision projector and a language projector. Both projectors utilize\nlocal image features to span the respective subspaces for image and texts,\nthereby projecting the image and text features into their respective subspaces\nto achieve alignment. Moreover, our approach entails only training-free matrix\ncalculations and can be seamlessly integrated into advanced CLIP-based few-shot\nlearning frameworks. Extensive experiments on 11 datasets have demonstrated\nSSP's superior text-image alignment capabilities, outperforming the\nstate-of-the-art alignment methods. The code is available at\nhttps://github.com/zhuhsingyuu/SSP\n","authors":["Xingyu Zhu","Beier Zhu","Yi Tan","Shuo Wang","Yanbin Hao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16977v1.pdf","comment":"Accepted to ACM MultiMedia 2024"},{"id":"http://arxiv.org/abs/2407.16552v2","updated":"2024-07-24T01:09:36Z","published":"2024-07-23T15:05:55Z","title":"MicroEmo: Time-Sensitive Multimodal Emotion Recognition with\n Micro-Expression Dynamics in Video Dialogues","summary":" Multimodal Large Language Models (MLLMs) have demonstrated remarkable\nmultimodal emotion recognition capabilities, integrating multimodal cues from\nvisual, acoustic, and linguistic contexts in the video to recognize human\nemotional states. However, existing methods ignore capturing local facial\nfeatures of temporal dynamics of micro-expressions and do not leverage the\ncontextual dependencies of the utterance-aware temporal segments in the video,\nthereby limiting their expected effectiveness to a certain extent. In this\nwork, we propose MicroEmo, a time-sensitive MLLM aimed at directing attention\nto the local facial micro-expression dynamics and the contextual dependencies\nof utterance-aware video clips. 
Our model incorporates two key architectural\ncontributions: (1) a global-local attention visual encoder that integrates\nglobal frame-level timestamp-bound image features with local facial features of\ntemporal dynamics of micro-expressions; (2) an utterance-aware video Q-Former\nthat captures multi-scale and contextual dependencies by generating visual\ntoken sequences for each utterance segment and for the entire video then\ncombining them. Preliminary qualitative experiments demonstrate that in a new\nExplainable Multimodal Emotion Recognition (EMER) task that exploits\nmulti-modal and multi-faceted clues to predict emotions in an open-vocabulary\n(OV) manner, MicroEmo demonstrates its effectiveness compared with the latest\nmethods.\n","authors":["Liyun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17536v1","updated":"2024-07-24T07:32:26Z","published":"2024-07-24T07:32:26Z","title":"Improved symbolic drum style classification with grammar-based\n hierarchical representations","summary":" Deep learning models have become a critical tool for analysis and\nclassification of musical data. These models operate either on the audio\nsignal, e.g. waveform or spectrogram, or on a symbolic representation, such as\nMIDI. In the latter, musical information is often reduced to basic features,\ni.e. durations, pitches and velocities. Most existing works then rely on\ngeneric tokenization strategies from classical natural language processing, or\nmatrix representations, e.g. piano roll. In this work, we evaluate how enriched\nrepresentations of symbolic data can impact deep models, i.e. Transformers and\nRNN, for music style classification. In particular, we examine representations\nthat explicitly incorporate musical information implicitly present in MIDI-like\nencodings, such as rhythmic organization, and show that they outperform generic\ntokenization strategies. We introduce a new tree-based representation of MIDI\ndata built upon a context-free musical grammar. We show that this grammar\nrepresentation accurately encodes high-level rhythmic information and\noutperforms existing encodings on the GrooveMIDI Dataset for drumming style\nclassification, while being more compact and parameter-efficient.\n","authors":["Léo Géré","Philippe Rigaux","Nicolas Audebert"],"pdf_url":"https://arxiv.org/pdf/2407.17536v1.pdf","comment":"International Society for Music Information Retrieval Conference\n 2024, Nov 2024, San Francisco, United States"}]},"2024-07-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.18248v1","updated":"2024-07-25T17:59:16Z","published":"2024-07-25T17:59:16Z","title":"Self-Training with Direct Preference Optimization Improves\n Chain-of-Thought Reasoning","summary":" Effective training of language models (LMs) for mathematical reasoning tasks\ndemands high-quality supervised fine-tuning data. Besides obtaining annotations\nfrom human experts, a common alternative is sampling from larger and more\npowerful LMs. However, this knowledge distillation approach can be costly and\nunstable, particularly when relying on closed-source, proprietary LMs like\nGPT-4, whose behaviors are often unpredictable. In this work, we demonstrate\nthat the reasoning abilities of small-scale LMs can be enhanced through\nself-training, a process where models learn from their own outputs. 
We also\nshow that the conventional self-training can be further augmented by a\npreference learning algorithm called Direct Preference Optimization (DPO). By\nintegrating DPO into self-training, we leverage preference data to guide LMs\ntowards more accurate and diverse chain-of-thought reasoning. We evaluate our\nmethod across various mathematical reasoning tasks using different base models.\nOur experiments show that this approach not only improves LMs' reasoning\nperformance but also offers a more cost-effective and scalable solution\ncompared to relying on large proprietary LMs.\n","authors":["Tianduo Wang","Shichen Li","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2407.18248v1.pdf","comment":"ACL 2024. Code and data are available at\n https://github.com/TianduoWang/DPO-ST"},{"id":"http://arxiv.org/abs/2407.18242v1","updated":"2024-07-25T17:57:12Z","published":"2024-07-25T17:57:12Z","title":"LoRA-Pro: Are Low-Rank Adapters Properly Optimized?","summary":" Low-Rank Adaptation, also known as LoRA, has emerged as a prominent method\nfor parameter-efficient fine-tuning foundation models by re-parameterizing the\noriginal matrix into the product of two low-rank matrices. Despite its\nefficiency, LoRA often yields inferior performance compared to full\nfine-tuning. In this paper, we propose LoRA-Pro to bridge this performance gap.\nFirstly, we delve into the optimization processes in LoRA and full fine-tuning.\nWe reveal that while LoRA employs low-rank approximation, it neglects to\napproximate the optimization process of full fine-tuning. To address this, we\nintroduce a novel concept called the \"equivalent gradient.\" This virtual\ngradient makes the optimization process on the re-parameterized matrix\nequivalent to LoRA, which can be used to quantify the differences between LoRA\nand full fine-tuning. The equivalent gradient is derived from the gradients of\nmatrices $A$ and $B$. To narrow the performance gap, our approach minimizes the\ndifferences between the equivalent gradient and the gradient obtained from full\nfine-tuning during the optimization process. By solving this objective, we\nderive optimal closed-form solutions for updating matrices $A$ and $B$. Our\nmethod constrains the optimization process, shrinking the performance gap\nbetween LoRA and full fine-tuning. Extensive experiments on natural language\nprocessing tasks validate the effectiveness of our method.\n","authors":["Zhengbo Wang","Jian Liang"],"pdf_url":"https://arxiv.org/pdf/2407.18242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10444v2","updated":"2024-07-25T17:51:50Z","published":"2024-03-15T16:28:22Z","title":"Block Verification Accelerates Speculative Decoding","summary":" Speculative decoding is an effective method for lossless acceleration of\nlarge language models during inference. It uses a fast model to draft a block\nof tokens which are then verified in parallel by the target model, and provides\na guarantee that the output is distributed identically to a sample from the\ntarget model. In prior works, draft verification is performed independently\ntoken-by-token. Surprisingly, we show that this approach is not optimal. We\npropose Block Verification, a simple draft verification algorithm that verifies\nthe entire block jointly and provides additional wall-clock speedup. We prove\nthat the proposed mechanism is optimal in the expected number of tokens\nproduced each iteration and specifically is never worse than the standard\ntoken-level verification. 
Empirically, block verification provides modest but\nconsistent wall-clock speedups over the standard token verification algorithm\nof 5%-8% in a range of tasks and datasets. Given that block verification does\nnot increase code complexity, maintains the strong lossless guarantee of the\nstandard speculative decoding verification algorithm, cannot deteriorate\nperformance, and, in fact, consistently improves it, it can be used as a good\ndefault in speculative decoding implementations.\n","authors":["Ziteng Sun","Uri Mendlovic","Yaniv Leviathan","Asaf Aharoni","Ahmad Beirami","Jae Hun Ro","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2403.10444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18219v1","updated":"2024-07-25T17:35:59Z","published":"2024-07-25T17:35:59Z","title":"Recursive Introspection: Teaching Language Model Agents How to\n Self-Improve","summary":" A central piece in enabling intelligent agentic behavior in foundation models\nis to make them capable of introspecting upon their behavior, reasoning, and\ncorrecting their mistakes as more computation or interaction is available. Even\nthe strongest proprietary large language models (LLMs) do not quite exhibit the\nability of continually improving their responses sequentially, even in\nscenarios where they are explicitly told that they are making a mistake. In\nthis paper, we develop RISE: Recursive IntroSpEction, an approach for\nfine-tuning LLMs to introduce this capability, despite prior work hypothesizing\nthat this capability may not be possible to attain. Our approach prescribes an\niterative fine-tuning procedure, which attempts to teach the model how to alter\nits response after having executed previously unsuccessful attempts to solve a\nhard test-time problem, with optionally additional environment feedback. RISE\nposes fine-tuning for a single-turn prompt as solving a multi-turn Markov\ndecision process (MDP), where the initial state is the prompt. Inspired by\nprinciples in online imitation learning and reinforcement learning, we propose\nstrategies for multi-turn data collection and training so as to imbue an LLM\nwith the capability to recursively detect and correct its previous mistakes in\nsubsequent iterations. Our experiments show that RISE enables Llama2, Llama3,\nand Mistral models to improve themselves with more turns on math reasoning\ntasks, outperforming several single-turn strategies given an equal amount of\ninference-time computation. We also find that RISE scales well, often attaining\nlarger benefits with more capable models. Our analysis shows that RISE makes\nmeaningful improvements to responses to arrive at the correct solution for\nchallenging prompts, without disrupting one-turn abilities as a result of\nexpressing more complex distributions.\n","authors":["Yuxiao Qu","Tianjun Zhang","Naman Garg","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.18219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18213v1","updated":"2024-07-25T17:26:41Z","published":"2024-07-25T17:26:41Z","title":"Exploring Scaling Trends in LLM Robustness","summary":" Language model capabilities predictably improve from scaling a model's size\nand training data. Motivated by this, increasingly large language models have\nbeen trained, yielding an array of impressive capabilities. Yet these models\nare vulnerable to adversarial prompts, such as \"jailbreaks\" that hijack models\nto perform undesired behaviors, posing a significant risk of misuse. 
Prior work\nindicates that computer vision models become more robust with model and data\nscaling, raising the question: does language model robustness also improve with\nscale? We study this question empirically, finding that larger models respond\nsubstantially better to adversarial training, but there is little to no benefit\nfrom model scale in the absence of explicit defenses.\n","authors":["Nikolhaus Howe","Michał Zajac","Ian McKenzie","Oskar Hollinsworth","Tom Tseng","Pierre-Luc Bacon","Adam Gleave"],"pdf_url":"https://arxiv.org/pdf/2407.18213v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2406.05981v3","updated":"2024-07-25T17:20:48Z","published":"2024-06-10T02:47:55Z","title":"ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training\n Multiplication-Less Reparameterization","summary":" Large language models (LLMs) have shown impressive performance on language\ntasks but face challenges when deployed on resource-constrained devices due to\ntheir extensive parameters and reliance on dense multiplications, resulting in\nhigh memory demands and latency bottlenecks. Shift-and-add reparameterization\noffers a promising solution by replacing costly multiplications with\nhardware-friendly primitives in both the attention and multi-layer perceptron\n(MLP) layers of an LLM. However, current reparameterization techniques require\ntraining from scratch or full parameter fine-tuning to restore accuracy, which\nis resource-intensive for LLMs. To address this, we propose accelerating\npretrained LLMs through post-training shift-and-add reparameterization,\ncreating efficient multiplication-free models, dubbed ShiftAddLLM.\nSpecifically, we quantize each weight matrix into binary matrices paired with\ngroup-wise scaling factors. The associated multiplications are reparameterized\ninto (1) shifts between activations and scaling factors and (2) queries and\nadds according to the binary matrices. To reduce accuracy loss, we present a\nmulti-objective optimization method to minimize both weight and output\nactivation reparameterization errors. Additionally, based on varying\nsensitivity across layers to reparameterization, we develop an automated bit\nallocation strategy to further reduce memory usage and latency. Experiments on\nfive LLM families and eight tasks consistently validate the effectiveness of\nShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points\nat comparable or lower latency compared to the most competitive quantized LLMs\nat 3 and 2 bits, respectively, and more than 80% memory and energy reductions\nover the original LLMs. Codes and models are available at\nhttps://github.com/GATECH-EIC/ShiftAddLLM.\n","authors":["Haoran You","Yipin Guo","Yichao Fu","Wei Zhou","Huihong Shi","Xiaofan Zhang","Souvik Kundu","Amir Yazdanbakhsh","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2406.05981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07368v2","updated":"2024-07-25T17:18:01Z","published":"2024-06-11T15:34:43Z","title":"When Linear Attention Meets Autoregressive Decoding: Towards More\n Effective and Efficient Linearized Large Language Models","summary":" Autoregressive Large Language Models (LLMs) have achieved impressive\nperformance in language tasks but face two significant bottlenecks: (1)\nquadratic complexity in the attention module as the number of tokens increases,\nand (2) limited efficiency due to the sequential processing nature of\nautoregressive LLMs during generation. 
While linear attention and speculative\ndecoding offer potential solutions, their applicability and synergistic\npotential for enhancing autoregressive LLMs remain uncertain. We conduct the\nfirst comprehensive study on the efficacy of existing linear attention methods\nfor autoregressive LLMs, integrating them with speculative decoding. We\nintroduce an augmentation technique for linear attention that ensures\ncompatibility with speculative decoding, enabling more efficient training and\nserving of LLMs. Extensive experiments and ablation studies involving seven\nexisting linear attention models and five encoder/decoder-based LLMs\nconsistently validate the effectiveness of our augmented linearized LLMs.\nNotably, our approach achieves up to a 6.67 reduction in perplexity on the\nLLaMA model and up to a 2$\\times$ speedup during generation compared to prior\nlinear attention methods. Codes and models are available at\nhttps://github.com/GATECH-EIC/Linearized-LLM.\n","authors":["Haoran You","Yichao Fu","Zheng Wang","Amir Yazdanbakhsh","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2406.07368v2.pdf","comment":"Accepted by ICML 2024; 17 pages; 10 figures; 16 tables"},{"id":"http://arxiv.org/abs/2403.14236v4","updated":"2024-07-25T16:52:15Z","published":"2024-03-21T08:54:24Z","title":"A Unified Framework for Model Editing","summary":" ROME and MEMIT are largely believed to be two different model editing\nalgorithms, with the major difference between them being the ability to perform\nbatched edits. In this paper, we unify these two algorithms under a single\nconceptual umbrella, optimizing for the same goal, which we call the\npreservation-memorization objective. ROME uses an equality constraint to\noptimize this objective to perform one edit at a time, whereas MEMIT employs a\nmore flexible least-square constraint that allows for batched edits. We\ngeneralize ROME and enable batched editing with equality constraint in the form\nof EMMET - an Equality-constrained Mass Model Editing algorithm for\nTransformers, a new batched memory-editing algorithm. EMMET can perform\nbatched-edits up to a batch-size of 10,000, with very similar performance to\nMEMIT across multiple dimensions. With the introduction of EMMET, we truly\nunify ROME and MEMIT and show that both algorithms are equivalent in terms of\ntheir optimization objective, their abilities (singular and batched editing),\ntheir model editing performance and their limitations.\n","authors":["Akshat Gupta","Dev Sajnani","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2403.14236v4.pdf","comment":"Under review. To appear as poster at KnowledgeableLM Workshop\n co-located with ACL 2024"},{"id":"http://arxiv.org/abs/2407.12835v2","updated":"2024-07-25T16:50:58Z","published":"2024-07-03T18:42:55Z","title":"Regurgitative Training: The Value of Real Data in Training Large\n Language Models","summary":" What happens if we train a new Large Language Model (LLM) using data that are\nat least partially generated by other LLMs? The explosive success of LLMs means\nthat a substantial amount of content online will be generated by LLMs rather\nthan humans, which will inevitably enter the training datasets of\nnext-generation LLMs. We evaluate the implications of such \"regurgitative\ntraining\" on LLM performance. 
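The linear-attention entry above relies on replacing softmax attention with a kernel feature map so that autoregressive decoding keeps a constant-size running state per step. A minimal causal linear-attention decoder is sketched below; the ELU+1 feature map is one common choice, not necessarily the augmentation the paper proposes.

```python
import numpy as np

def phi(x):
    """Simple positive feature map (ELU + 1), a common choice for linear attention."""
    return np.where(x > 0, x + 1.0, np.exp(x))

class LinearAttentionDecoder:
    """Keeps running sums S = sum_j phi(k_j) v_j^T and z = sum_j phi(k_j),
    so each decoding step costs O(d^2) instead of attending over all past tokens."""
    def __init__(self, d):
        self.S = np.zeros((d, d))
        self.z = np.zeros(d)

    def step(self, q, k, v):
        fk = phi(k)
        self.S += np.outer(fk, v)
        self.z += fk
        fq = phi(q)
        return (fq @ self.S) / (fq @ self.z + 1e-9)  # attention output for this step

dec = LinearAttentionDecoder(d=4)
for _ in range(3):
    out = dec.step(*np.random.randn(3, 4))
print(out.shape)  # (4,)
```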
Through fine-tuning GPT-3.5 with data generated\neither by itself or by other LLMs in a machine translation task, we find strong\nevidence that regurgitative training clearly handicaps the performance of LLMs.\nThe same performance loss of regurgitative training is observed on transformer\nmodels that we train from scratch. We find suggestive evidence that the\nperformance disadvantage of regurgitative training can be attributed to at\nleast two mechanisms: (1) higher error rates and (2) lower lexical diversity in\nLLM-generated data as compared to real data. Based on these mechanisms, we\npropose and evaluate three different strategies to mitigate the performance\nloss of regurgitative training. First, we devise data-driven metrics to gauge\nthe quality of each LLM-generated data instance, and then carry out an ordered\ntraining process where high-quality data are added before low-quality ones.\nSecond, we combine data generated by multiple different LLMs (as an attempt to\nincrease lexical diversity). Third, we train an AI detection classifier to\ndifferentiate between LLM- and human-generated data, and include LLM-generated\ndata in the order of resemblance to human-generated data. All three strategies\ncan improve the performance of regurgitative training to some extent but are\nnot always able to fully close the gap from training with real data. Our\nresults highlight the value of real, human-generated data in training LLMs,\nwhich cannot be easily substituted by synthetic, LLM-generated data.\n","authors":["Jinghui Zhang","Dandan Qiao","Mochen Yang","Qiang Wei"],"pdf_url":"https://arxiv.org/pdf/2407.12835v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16470v2","updated":"2024-07-25T16:31:39Z","published":"2024-07-23T13:40:54Z","title":"Machine Translation Hallucination Detection for Low and High Resource\n Languages using Large Language Models","summary":" Recent advancements in massively multilingual machine translation systems\nhave significantly enhanced translation accuracy; however, even the best\nperforming systems still generate hallucinations, severely impacting user\ntrust. Detecting hallucinations in Machine Translation (MT) remains a critical\nchallenge, particularly since existing methods excel with High-Resource\nLanguages (HRLs) but exhibit substantial limitations when applied to\nLow-Resource Languages (LRLs). This paper evaluates hallucination detection\napproaches using Large Language Models (LLMs) and semantic similarity within\nmassively multilingual embeddings. Our study spans 16 language directions,\ncovering HRLs, LRLs, with diverse scripts. We find that the choice of model is\nessential for performance. On average, for HRLs, Llama3-70B outperforms the\nprevious state of the art by as much as 0.16 MCC (Matthews Correlation\nCoefficient). However, for LRLs we observe that Claude Sonnet outperforms other\nLLMs on average by 0.03 MCC. The key takeaway from our study is that LLMs can\nachieve performance comparable or even better than previously proposed models,\ndespite not being explicitly trained for any machine translation task. 
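The first mitigation strategy in the regurgitative-training entry above trains on high-quality synthetic instances before low-quality ones. A toy version of that ordered curriculum is sketched below; the quality metric and the data are placeholders, not the paper's actual data-driven metrics.

```python
def quality_score(example):
    """Placeholder quality metric for a synthetic example; the paper's data-driven
    metrics (e.g., error rate, lexical diversity) would go here instead."""
    words = example["target"].split()
    return len(set(words)) / max(len(words), 1)      # crude lexical-diversity proxy

def ordered_curriculum(synthetic_data):
    """Sort synthetic examples so higher-quality ones are trained on first."""
    return sorted(synthetic_data, key=quality_score, reverse=True)

data = [
    {"source": "hello world", "target": "bonjour le monde"},
    {"source": "hello world", "target": "bonjour bonjour bonjour"},
]
for ex in ordered_curriculum(data):
    print(round(quality_score(ex), 2), ex["target"])
```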
However,\ntheir advantage is less significant for LRLs.\n","authors":["Kenza Benkirane","Laura Gongas","Shahar Pelles","Naomi Fuchs","Joshua Darmon","Pontus Stenetorp","David Ifeoluwa Adelani","Eduardo Sánchez"],"pdf_url":"https://arxiv.org/pdf/2407.16470v2.pdf","comment":"Authors Kenza Benkirane and Laura Gongas contributed equally to this\n work"},{"id":"http://arxiv.org/abs/2404.19708v2","updated":"2024-07-25T16:16:46Z","published":"2024-04-30T17:00:32Z","title":"Harmonic LLMs are Trustworthy","summary":" We introduce an intuitive method to test the robustness (stability and\nexplainability) of any black-box LLM in real-time via its local deviation from\nharmoniticity, denoted as $\\gamma$. To the best of our knowledge this is the\nfirst completely model-agnostic and unsupervised method of measuring the\nrobustness of any given response from an LLM, based upon the model itself\nconforming to a purely mathematical standard. To show general application and\nimmediacy of results, we measure $\\gamma$ in 10 popular LLMs (ChatGPT,\nClaude-2.1, Claude3.0, GPT-4, GPT-4o, Smaug-72B, Mixtral-8x7B, Llama2-7B,\nMistral-7B and MPT-7B) across thousands of queries in three objective domains:\nWebQA, ProgrammingQA, and TruthfulQA. Across all models and domains tested,\nhuman annotation confirms that $\\gamma \\to 0$ indicates trustworthiness, and\nconversely searching higher values of $\\gamma$ easily exposes examples of\nhallucination, a fact that enables efficient adversarial prompt generation\nthrough stochastic gradient ascent in $\\gamma$. The low-$\\gamma$ leaders among\nthe models in the respective domains are GPT-4o, GPT-4, and Smaug-72B,\nproviding evidence that mid-size open-source models can win out against large\ncommercial models.\n","authors":["Nicholas S. Kersting","Mohammad Rahman","Suchismitha Vedala","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.19708v2.pdf","comment":"15 pages, 2 figures, 16 tables; added Claude-3.0, GPT-4o, Mistral-7B,\n Mixtral-8x7B, and more annotation for other models"},{"id":"http://arxiv.org/abs/2407.18147v1","updated":"2024-07-25T15:58:19Z","published":"2024-07-25T15:58:19Z","title":"The FIGNEWS Shared Task on News Media Narratives","summary":" We present an overview of the FIGNEWS shared task, organized as part of the\nArabicNLP 2024 conference co-located with ACL 2024. The shared task addresses\nbias and propaganda annotation in multilingual news posts. We focus on the\nearly days of the Israel War on Gaza as a case study. The task aims to foster\ncollaboration in developing annotation guidelines for subjective tasks by\ncreating frameworks for analyzing diverse narratives highlighting potential\nbias and propaganda. In a spirit of fostering and encouraging diversity, we\naddress the problem from a multilingual perspective, namely within five\nlanguages: English, French, Arabic, Hebrew, and Hindi. A total of 17 teams\nparticipated in two annotation subtasks: bias (16 teams) and propaganda (6\nteams). The teams competed in four evaluation tracks: guidelines development,\nannotation quality, annotation quantity, and consistency. Collectively, the\nteams produced 129,800 data points. Key findings and implications for the field\nare discussed.\n","authors":["Wajdi Zaghouani","Mustafa Jarrar","Nizar Habash","Houda Bouamor","Imed Zitouni","Mona Diab","Samhaa R. 
El-Beltagy","Muhammed AbuOdeh"],"pdf_url":"https://arxiv.org/pdf/2407.18147v1.pdf","comment":"18 pages, 10 tables, 1 figure, accepted to ArabicNLP 2024 co-located\n with ACL 2024"},{"id":"http://arxiv.org/abs/2407.18129v1","updated":"2024-07-25T15:36:48Z","published":"2024-07-25T15:36:48Z","title":"Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic","summary":" Recent advancements have significantly enhanced the capabilities of\nMultimodal Large Language Models (MLLMs) in generating and understanding\nimage-to-text content. Despite these successes, progress is predominantly\nlimited to English due to the scarcity of high quality multimodal resources in\nother languages. This limitation impedes the development of competitive models\nin languages such as Arabic. To alleviate this situation, we introduce an\nefficient Arabic multimodal assistant, dubbed Dallah, that utilizes an advanced\nlanguage model based on LLaMA-2 to facilitate multimodal interactions. Dallah\ndemonstrates state-of-the-art performance in Arabic MLLMs. Through fine-tuning\nsix Arabic dialects, Dallah showcases its capability to handle complex\ndialectal interactions incorporating both textual and visual elements. The\nmodel excels in two benchmark tests: one evaluating its performance on Modern\nStandard Arabic (MSA) and another specifically designed to assess dialectal\nresponses. Beyond its robust performance in multimodal interaction tasks,\nDallah has the potential to pave the way for further development of\ndialect-aware Arabic MLLMs.\n","authors":["Fakhraddin Alwajih","Gagan Bhatia","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2407.18129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18119v1","updated":"2024-07-25T15:27:08Z","published":"2024-07-25T15:27:08Z","title":"Tracking linguistic information in transformer-based sentence embeddings\n through targeted sparsification","summary":" Analyses of transformer-based models have shown that they encode a variety of\nlinguistic information from their textual input. While these analyses have shed\na light on the relation between linguistic information on one side, and\ninternal architecture and parameters on the other, a question remains\nunanswered: how is this linguistic information reflected in sentence\nembeddings? Using datasets consisting of sentences with known structure, we\ntest to what degree information about chunks (in particular noun, verb or\nprepositional phrases), such as grammatical number, or semantic role, can be\nlocalized in sentence embeddings. Our results show that such information is not\ndistributed over the entire sentence embedding, but rather it is encoded in\nspecific regions. Understanding how the information from an input text is\ncompressed into sentence embeddings helps understand current transformer models\nand help build future explainable neural models.\n","authors":["Vivi Nastase","Paola Merlo"],"pdf_url":"https://arxiv.org/pdf/2407.18119v1.pdf","comment":"12 pages, 9 figures, 1 table, published in RepL4NLP 2024"},{"id":"http://arxiv.org/abs/2407.18078v1","updated":"2024-07-25T14:36:18Z","published":"2024-07-25T14:36:18Z","title":"PEFT-U: Parameter-Efficient Fine-Tuning for User Personalization","summary":" The recent emergence of Large Language Models (LLMs) has heralded a new era\nof human-AI interaction. These sophisticated models, exemplified by Chat-GPT\nand its successors, have exhibited remarkable capabilities in language\nunderstanding. 
However, as these LLMs have undergone exponential growth, a\ncrucial dimension that remains understudied is the personalization of these\nmodels. Large foundation models such as GPT-3 etc. focus on creating a\nuniversal model that serves a broad range of tasks and users. This approach\nemphasizes the model's generalization capabilities, treating users as a\ncollective rather than as distinct individuals. While practical for many common\napplications, this one-size-fits-all approach often fails to address the rich\ntapestry of human diversity and individual needs. To explore this issue we\nintroduce the PEFT-U Benchmark: a new dataset for building and evaluating NLP\nmodels for user personalization. \\datasetname{} consists of a series of\nuser-centered tasks containing diverse and individualized expressions where the\npreferences of users can potentially differ for the same input. Using PEFT-U,\nwe explore the challenge of efficiently personalizing LLMs to accommodate\nuser-specific preferences in the context of diverse user-centered tasks.\n","authors":["Christopher Clarke","Yuzhao Heng","Lingjia Tang","Jason Mars"],"pdf_url":"https://arxiv.org/pdf/2407.18078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18061v1","updated":"2024-07-25T14:16:08Z","published":"2024-07-25T14:16:08Z","title":"Difficulty Estimation and Simplification of French Text Using LLMs","summary":" We leverage generative large language models for language learning\napplications, focusing on estimating the difficulty of foreign language texts\nand simplifying them to lower difficulty levels. We frame both tasks as\nprediction problems and develop a difficulty classification model using labeled\nexamples, transfer learning, and large language models, demonstrating superior\naccuracy compared to previous approaches. For simplification, we evaluate the\ntrade-off between simplification quality and meaning preservation, comparing\nzero-shot and fine-tuned performances of large language models. We show that\nmeaningful text simplifications can be obtained with limited fine-tuning. Our\nexperiments are conducted on French texts, but our methods are\nlanguage-agnostic and directly applicable to other foreign languages.\n","authors":["Henri Jamet","Yash Raj Shrestha","Michalis Vlachos"],"pdf_url":"https://arxiv.org/pdf/2407.18061v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.18058v1","updated":"2024-07-25T14:15:05Z","published":"2024-07-25T14:15:05Z","title":"I can listen but cannot read: An evaluation of two-tower multimodal\n systems for instrument recognition","summary":" Music two-tower multimodal systems integrate audio and text modalities into a\njoint audio-text space, enabling direct comparison between songs and their\ncorresponding labels. These systems enable new approaches for classification\nand retrieval, leveraging both modalities. Despite the promising results they\nhave shown for zero-shot classification and retrieval tasks, closer inspection\nof the embeddings is needed. This paper evaluates the inherent zero-shot\nproperties of joint audio-text spaces for the case-study of instrument\nrecognition. We present an evaluation and analysis of two-tower systems for\nzero-shot instrument recognition and a detailed analysis of the properties of\nthe pre-joint and joint embeddings spaces. Our findings suggest that audio\nencoders alone demonstrate good quality, while challenges remain within the\ntext encoder or joint space projection. 
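The PEFT-U entry above studies parameter-efficient personalization of an LLM per user. One common route consistent with the benchmark's framing is a frozen backbone with a small low-rank adapter per user; the PyTorch sketch below shows that pattern in miniature, and the adapter design and per-user dictionary are illustrative assumptions rather than the benchmark's prescribed method.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base linear layer plus a small trainable low-rank update."""
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                          # backbone stays frozen
        self.A = nn.Parameter(torch.zeros(rank, base.in_features))
        self.B = nn.Parameter(torch.randn(base.out_features, rank) * 0.01)

    def forward(self, x):
        return self.base(x) + x @ self.A.T @ self.B.T        # base output + low-rank delta

base = nn.Linear(32, 32)                                     # shared frozen backbone layer
user_adapters = {u: LoRALinear(base) for u in ["user_a", "user_b"]}  # one adapter per user
x = torch.randn(1, 32)
print(user_adapters["user_a"](x).shape)                      # personalized forward pass
```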
Specifically, two-tower systems exhibit\nsensitivity towards specific words, favoring generic prompts over musically\ninformed ones. Despite the large size of textual encoders, they do not yet\nleverage additional textual context or infer instruments accurately from their\ndescriptions. Lastly, a novel approach for quantifying the semantic\nmeaningfulness of the textual space leveraging an instrument ontology is\nproposed. This method reveals deficiencies in the systems' understanding of\ninstruments and provides evidence of the need for fine-tuning text encoders on\nmusical data.\n","authors":["Yannis Vasilakis","Rachel Bittner","Johan Pauwels"],"pdf_url":"https://arxiv.org/pdf/2407.18058v1.pdf","comment":"Accepted to ISMIR 2024"},{"id":"http://arxiv.org/abs/2407.18035v1","updated":"2024-07-25T13:29:37Z","published":"2024-07-25T13:29:37Z","title":"RestoreAgent: Autonomous Image Restoration Agent via Multimodal Large\n Language Models","summary":" Natural images captured by mobile devices often suffer from multiple types of\ndegradation, such as noise, blur, and low light. Traditional image restoration\nmethods require manual selection of specific tasks, algorithms, and execution\nsequences, which is time-consuming and may yield suboptimal results. All-in-one\nmodels, though capable of handling multiple tasks, typically support only a\nlimited range and often produce overly smooth, low-fidelity outcomes due to\ntheir broad data distribution fitting. To address these challenges, we first\ndefine a new pipeline for restoring images with multiple degradations, and then\nintroduce RestoreAgent, an intelligent image restoration system leveraging\nmultimodal large language models. RestoreAgent autonomously assesses the type\nand extent of degradation in input images and performs restoration through (1)\ndetermining the appropriate restoration tasks, (2) optimizing the task\nsequence, (3) selecting the most suitable models, and (4) executing the\nrestoration. Experimental results demonstrate the superior performance of\nRestoreAgent in handling complex degradation, surpassing human experts.\nFurthermore, the system modular design facilitates the fast integration of new\ntasks and models, enhancing its flexibility and scalability for various\napplications.\n","authors":["Haoyu Chen","Wenbo Li","Jinjin Gu","Jingjing Ren","Sixiang Chen","Tian Ye","Renjing Pei","Kaiwen Zhou","Fenglong Song","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.18035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06543v2","updated":"2024-07-25T13:27:08Z","published":"2022-12-13T12:56:55Z","title":"Improving Stance Detection by Leveraging Measurement Knowledge from\n Social Sciences: A Case Study of Dutch Political Tweets and Traditional\n Gender Role Division","summary":" Stance detection (SD) concerns automatically determining the viewpoint (i.e.,\nin favour of, against, or neutral) of a text's author towards a target. SD has\nbeen applied to many research topics, among which the detection of stances\nbehind political tweets is an important one. In this paper, we apply SD to a\ndataset of tweets from official party accounts in the Netherlands between 2017\nand 2021, with a focus on stances towards traditional gender role division, a\ndividing issue between (some) Dutch political parties. 
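The two-tower entry above performs zero-shot instrument recognition by comparing a song's audio embedding with text-prompt embeddings in the joint space. The comparison step is sketched below with placeholder encoders; the prompt template and encoders are assumptions, since the paper probes several existing systems.

```python
import numpy as np

def encode_audio(waveform):     # placeholder for the audio tower of a two-tower system
    rng = np.random.default_rng(abs(hash(waveform.tobytes())) % 2**32)
    return rng.normal(size=128)

def encode_text(prompt):        # placeholder for the text tower
    rng = np.random.default_rng(abs(hash(prompt)) % 2**32)
    return rng.normal(size=128)

def zero_shot_instrument(waveform, instruments):
    """Pick the instrument whose text prompt is closest to the audio embedding."""
    a = encode_audio(waveform)
    a /= np.linalg.norm(a)
    scores = {}
    for inst in instruments:
        t = encode_text(f"a recording of a {inst}")   # generic prompt template
        scores[inst] = float(a @ (t / np.linalg.norm(t)))
    return max(scores, key=scores.get), scores

pred, _ = zero_shot_instrument(np.zeros(16000, dtype=np.float32),
                               ["piano", "violin", "flute"])
print(pred)
```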
To implement and improve\nSD of traditional gender role division, we propose to leverage an established\nsurvey instrument from social sciences, which has been validated for the\npurpose of measuring attitudes towards traditional gender role division. Based\non our experiments, we show that using such a validated survey instrument helps\nto improve SD performance.\n","authors":["Qixiang Fang","Anastasia Giachanou","Ayoub Bagheri"],"pdf_url":"https://arxiv.org/pdf/2212.06543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01799v2","updated":"2024-07-25T13:12:47Z","published":"2024-04-02T09:58:57Z","title":"PATCH! Psychometrics-AssisTed benCHmarking of Large Language Models: A\n Case Study of Proficiency in 8th Grade Mathematics","summary":" Many existing benchmarks of large (multimodal) language models (LLMs) focus\non measuring LLMs' academic proficiency, often with also an interest in\ncomparing model performance with human test takers. While these benchmarks have\nproven key to the development of LLMs, they suffer from several limitations,\nincluding questionable measurement quality (e.g., Do they measure what they are\nsupposed to in a reliable way?), lack of quality assessment on the item level\n(e.g., Are some items more important or difficult than others?) and unclear\nhuman population reference (e.g., To whom can the model be compared?). In\nresponse to these challenges, we propose leveraging knowledge from\npsychometrics - a field dedicated to the measurement of latent variables like\nacademic proficiency - into LLM benchmarking. We make three primary\ncontributions. First, we introduce PATCH: a novel framework for\n{P}sychometrics-{A}ssis{T}ed ben{CH}marking of LLMs. PATCH addresses the\naforementioned limitations, presenting a new direction for LLM benchmark\nresearch. Second, we implement PATCH by measuring GPT-4 and Gemini-Pro-Vision's\nproficiency in 8th grade mathematics against 56 human populations. We show that\nadopting a psychometrics-based approach yields evaluation outcomes that diverge\nfrom those based on existing benchmarking practices. Third, we release 4\nhigh-quality datasets to support measuring and comparing LLM proficiency in\ngrade school mathematics and science against human populations.\n","authors":["Qixiang Fang","Daniel L. Oberski","Dong Nguyen"],"pdf_url":"https://arxiv.org/pdf/2404.01799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19146v2","updated":"2024-07-25T13:09:18Z","published":"2024-06-27T13:02:43Z","title":"Resolving Discrepancies in Compute-Optimal Scaling of Language Models","summary":" Kaplan et al. and Hoffmann et al. developed influential scaling laws for the\noptimal model size as a function of the compute budget, but these laws yield\nsubstantially different predictions. We explain the discrepancy by reproducing\nthe Kaplan scaling law on two datasets (OpenWebText2 and RefinedWeb) and\nidentifying three factors causing the difference: last layer computational\ncost, warmup duration, and scale-dependent optimizer tuning. With these factors\ncorrected, we obtain excellent agreement with the Hoffmann et al. (i.e.,\n\"Chinchilla\") scaling law. Counter to a hypothesis of Hoffmann et al., we find\nthat careful learning rate decay is not essential for the validity of their\nscaling law. 
As a secondary result, we derive scaling laws for the optimal\nlearning rate and batch size, finding that tuning the AdamW $\\beta_2$ parameter\nis essential at lower batch sizes.\n","authors":["Tomer Porian","Mitchell Wortsman","Jenia Jitsev","Ludwig Schmidt","Yair Carmon"],"pdf_url":"https://arxiv.org/pdf/2406.19146v2.pdf","comment":"Fixing bug in small models with tuned LR"},{"id":"http://arxiv.org/abs/2407.18008v1","updated":"2024-07-25T13:04:25Z","published":"2024-07-25T13:04:25Z","title":"GermanPartiesQA: Benchmarking Commercial Large Language Models for\n Political Bias and Sycophancy","summary":" LLMs are changing the way humans create and interact with content,\npotentially affecting citizens' political opinions and voting decisions. As\nLLMs increasingly shape our digital information ecosystems, auditing to\nevaluate biases, sycophancy, or steerability has emerged as an active field of\nresearch. In this paper, we evaluate and compare the alignment of six LLMs by\nOpenAI, Anthropic, and Cohere with German party positions and evaluate\nsycophancy based on a prompt experiment. We contribute to evaluating political\nbias and sycophancy in multi-party systems across major commercial LLMs. First,\nwe develop the benchmark dataset GermanPartiesQA based on the Voting Advice\nApplication Wahl-o-Mat covering 10 state and 1 national elections between 2021\nand 2023. In our study, we find a left-green tendency across all examined LLMs.\nWe then conduct our prompt experiment for which we use the benchmark and\nsociodemographic data of leading German parliamentarians to evaluate changes in\nLLMs responses. To differentiate between sycophancy and steerabilty, we use 'I\nam [politician X], ...' and 'You are [politician X], ...' prompts. Against our\nexpectations, we do not observe notable differences between prompting 'I am'\nand 'You are'. While our findings underscore that LLM responses can be\nideologically steered with political personas, they suggest that observed\nchanges in LLM outputs could be better described as personalization to the\ngiven context rather than sycophancy.\n","authors":["Jan Batzner","Volker Stocker","Stefan Schmid","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2407.18008v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2407.18003v1","updated":"2024-07-25T12:56:22Z","published":"2024-07-25T12:56:22Z","title":"Keep the Cost Down: A Review on Methods to Optimize LLM' s KV-Cache\n Consumption","summary":" Large Language Models (LLMs), epitomized by ChatGPT' s release in late 2022,\nhave revolutionized various industries with their advanced language\ncomprehension. However, their efficiency is challenged by the Transformer\narchitecture' s struggle with handling long texts. KV-Cache has emerged as a\npivotal solution to this issue, converting the time complexity of token\ngeneration from quadratic to linear, albeit with increased GPU memory overhead\nproportional to conversation length. With the development of the LLM community\nand academia, various KV-Cache compression methods have been proposed. In this\nreview, we dissect the various properties of KV-Cache and elaborate on various\nmethods currently used to optimize the KV-Cache space usage of LLMs. These\nmethods span the pre-training phase, deployment phase, and inference phase, and\nwe summarize the commonalities and differences among these methods.\nAdditionally, we list some metrics for evaluating the long-text capabilities of\nlarge language models, from both efficiency and capability perspectives. 
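The compute-optimal scaling entry above fits power laws (for optimal model size, and secondarily for learning rate and batch size). In practice such exponents are usually estimated by least squares in log-log space, as in the minimal sketch below; the data points are synthetic and purely illustrative.

```python
import numpy as np

# Toy (compute, optimal model size) pairs; real fits use many training runs.
compute = np.array([1e18, 1e19, 1e20, 1e21])
n_opt   = np.array([4e8, 1.3e9, 4.1e9, 1.3e10])

# Fit log N = a * log C + b, i.e. N_opt ≈ exp(b) * C**a.
a, b = np.polyfit(np.log(compute), np.log(n_opt), deg=1)
print(f"estimated exponent a ≈ {a:.2f}, prefactor ≈ {np.exp(b):.3g}")
```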
Our\nreview thus sheds light on the evolving landscape of LLM optimization, offering\ninsights into future advancements in this dynamic field.\n","authors":["Shi Luohe","Zhang Hongyi","Yao Yao","Li Zuchao","Zhao Hai"],"pdf_url":"https://arxiv.org/pdf/2407.18003v1.pdf","comment":"to be published in CoLM 2024"},{"id":"http://arxiv.org/abs/2407.17997v1","updated":"2024-07-25T12:44:45Z","published":"2024-07-25T12:44:45Z","title":"On the Effect of Purely Synthetic Training Data for Different Automatic\n Speech Recognition Architectures","summary":" In this work we evaluate the utility of synthetic data for training automatic\nspeech recognition (ASR). We use the ASR training data to train a\ntext-to-speech (TTS) system similar to FastSpeech-2. With this TTS we reproduce\nthe original training data, training ASR systems solely on synthetic data. For\nASR, we use three different architectures, attention-based encoder-decoder,\nhybrid deep neural network hidden Markov model and a Gaussian mixture hidden\nMarkov model, showing the different sensitivity of the models to synthetic data\ngeneration. In order to extend previous work, we present a number of ablation\nstudies on the effectiveness of synthetic vs. real training data for ASR. In\nparticular we focus on how the gap between training on synthetic and real data\nchanges by varying the speaker embedding or by scaling the model size. For the\nlatter we show that the TTS models generalize well, even when training scores\nindicate overfitting.\n","authors":["Nick Rossenbach","Benedikt Hilmes","Ralf Schlüter"],"pdf_url":"https://arxiv.org/pdf/2407.17997v1.pdf","comment":"Accepted at the SynData4GenAI 2024 workshop"},{"id":"http://arxiv.org/abs/2407.17974v1","updated":"2024-07-25T12:09:41Z","published":"2024-07-25T12:09:41Z","title":"What does Kiki look like? Cross-modal associations between speech sounds\n and visual shapes in vision-and-language models","summary":" Humans have clear cross-modal preferences when matching certain novel words\nto visual shapes. Evidence suggests that these preferences play a prominent\nrole in our linguistic processing, language learning, and the origins of\nsignal-meaning mappings. With the rise of multimodal models in AI, such as\nvision- and-language (VLM) models, it becomes increasingly important to uncover\nthe kinds of visio-linguistic associations these models encode and whether they\nalign with human representations. Informed by experiments with humans, we probe\nand compare four VLMs for a well-known human cross-modal preference, the\nbouba-kiki effect. We do not find conclusive evidence for this effect but\nsuggest that results may depend on features of the models, such as architecture\ndesign, model size, and training details. Our findings inform discussions on\nthe origins of the bouba-kiki effect in human cognition and future developments\nof VLMs that align well with human cross-modal associations.\n","authors":["Tessa Verhoef","Kiana Shahrasbi","Tom Kouwenhoven"],"pdf_url":"https://arxiv.org/pdf/2407.17974v1.pdf","comment":"Appeared at the 13th edition of the Workshop on Cognitive Modeling\n and Computational Linguistics (CMCL 2024)"},{"id":"http://arxiv.org/abs/2404.00725v2","updated":"2024-07-25T11:37:54Z","published":"2024-03-31T15:55:49Z","title":"The Larger the Better? Improved LLM Code-Generation via Budget\n Reallocation","summary":" It is a common belief that large language models (LLMs) are better than\nsmaller-sized ones. 
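The KV-cache review above rests on the basic mechanism every surveyed method optimizes: caching keys and values makes each generation step append-and-attend rather than recompute, at the cost of memory that grows with context length. A minimal single-head decode step with a cache is sketched below.

```python
import numpy as np

def decode_step(q, k_new, v_new, k_cache, v_cache):
    """One autoregressive decoding step with a KV cache (single head).

    Without the cache, all previous keys/values would be recomputed every step
    (quadratic overall); with it, each step only appends and attends once.
    """
    k_cache.append(k_new)
    v_cache.append(v_new)
    K = np.stack(k_cache)                      # (t, d) — grows with conversation length
    V = np.stack(v_cache)
    scores = K @ q / np.sqrt(q.shape[-1])
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()
    return weights @ V                         # attention output for the new token

k_cache, v_cache = [], []
for _ in range(5):
    q, k, v = np.random.randn(3, 8)
    out = decode_step(q, k, v, k_cache, v_cache)
print(out.shape, len(k_cache))                 # (8,) 5
```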
However, larger models also require significantly more time\nand compute during inference. This begs the question: what happens when both\nmodels operate under the same budget? (e.g., compute, run-time). To address\nthis question, we analyze code generation LLMs of various sizes and make\ncomparisons such as running a 70B model once vs. generating five outputs from a\n13B model. We consider a standard unit-test setup, which can be used to select\nthe correct output from the smaller model. Our findings reveal that the\nrepeated use of smaller models can yield consistent improvements, with gains of\nup to 15% across five tasks. On the other hand, in scenarios where unit-tests\nare unavailable, a ranking-based selection of candidates from the smaller model\nfalls short of the performance of a single output from larger ones. Our results\nhighlight the potential of using smaller models instead of larger ones, and the\nimportance of studying approaches for ranking LLM outputs.\n","authors":["Michael Hassid","Tal Remez","Jonas Gehring","Roy Schwartz","Yossi Adi"],"pdf_url":"https://arxiv.org/pdf/2404.00725v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2407.17960v1","updated":"2024-07-25T11:29:27Z","published":"2024-07-25T11:29:27Z","title":"The Curious Case of Representational Alignment: Unravelling\n Visio-Linguistic Tasks in Emergent Communication","summary":" Natural language has the universal properties of being compositional and\ngrounded in reality. The emergence of linguistic properties is often\ninvestigated through simulations of emergent communication in referential\ngames. However, these experiments have yielded mixed results compared to\nsimilar experiments addressing linguistic properties of human language. Here we\naddress representational alignment as a potential contributing factor to these\nresults. Specifically, we assess the representational alignment between agent\nimage representations and between agent representations and input images. Doing\nso, we confirm that the emergent language does not appear to encode human-like\nconceptual visual features, since agent image representations drift away from\ninputs whilst inter-agent alignment increases. We moreover identify a strong\nrelationship between inter-agent alignment and topographic similarity, a common\nmetric for compositionality, and address its consequences. To address these\nissues, we introduce an alignment penalty that prevents representational drift\nbut interestingly does not improve performance on a compositional\ndiscrimination task. Together, our findings emphasise the key role\nrepresentational alignment plays in simulations of language emergence.\n","authors":["Tom Kouwenhoven","Max Peeperkorn","Bram van Dijk","Tessa Verhoef"],"pdf_url":"https://arxiv.org/pdf/2407.17960v1.pdf","comment":"Appeared at the 13th edition of the Workshop on Cognitive Modeling\n and Computational Linguistics (CMCL 2024)"},{"id":"http://arxiv.org/abs/2407.17940v1","updated":"2024-07-25T10:58:42Z","published":"2024-07-25T10:58:42Z","title":"Positive Text Reframing under Multi-strategy Optimization","summary":" Differing from sentiment transfer, positive reframing seeks to substitute\nnegative perspectives with positive expressions while preserving the original\nmeaning. With the emergence of pre-trained language models (PLMs), it is\npossible to achieve acceptable results by fine-tuning PLMs. Nevertheless,\ngenerating fluent, diverse and task-constrained reframing text remains a\nsignificant challenge. 
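The budget-reallocation entry above compares one sample from a large model against several samples from a smaller one, with unit tests selecting a correct candidate when available. The selection loop it relies on is sketched below with placeholder generation and test functions; the model name and candidates are illustrative only.

```python
def generate_candidates(prompt, k, model="small-13b"):
    """Placeholder: return k code completions from the smaller model."""
    return [f"def solution():\n    return {i}  # candidate from {model}" for i in range(k)]

def passes_unit_tests(code, tests):
    """Run a candidate against the task's unit tests."""
    namespace = {}
    exec(code, namespace)
    return all(test(namespace["solution"]) for test in tests)

def best_of_k(prompt, tests, k=5):
    """Return the first candidate that passes all tests, else None."""
    for cand in generate_candidates(prompt, k):
        if passes_unit_tests(cand, tests):
            return cand
    return None

tests = [lambda f: f() == 3]
print(best_of_k("return the number three", tests))
```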
To tackle this issue, a \\textbf{m}ulti-\\textbf{s}trategy\n\\textbf{o}ptimization \\textbf{f}ramework (MSOF) is proposed in this paper.\nStarting from the objective of positive reframing, we first design positive\nsentiment reward and content preservation reward to encourage the model to\ntransform the negative expressions of the original text while ensuring the\nintegrity and consistency of the semantics. Then, different decoding\noptimization approaches are introduced to improve the quality of text\ngeneration. Finally, based on the modeling formula of positive reframing, we\npropose a multi-dimensional re-ranking method that further selects candidate\nsentences from three dimensions: strategy consistency, text similarity and\nfluency. Extensive experiments on two Seq2Seq PLMs, BART and T5, demonstrate\nour framework achieves significant improvements on unconstrained and controlled\npositive reframing tasks.\n","authors":["Shutong Jia","Biwei Cao","Qingqing Gao","Jiuxin Cao","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17914v1","updated":"2024-07-25T10:08:37Z","published":"2024-07-25T10:08:37Z","title":"Modelling Multimodal Integration in Human Concept Processing with\n Vision-and-Language Models","summary":" Representations from deep neural networks (DNNs) have proven remarkably\npredictive of neural activity involved in both visual and linguistic\nprocessing. Despite these successes, most studies to date concern unimodal\nDNNs, encoding either visual or textual input but not both. Yet, there is\ngrowing evidence that human meaning representations integrate linguistic and\nsensory-motor information. Here we investigate whether the integration of\nmultimodal information operated by current vision-and-language DNN models\n(VLMs) leads to representations that are more aligned with human brain activity\nthan those obtained by language-only and vision-only DNNs. We focus on fMRI\nresponses recorded while participants read concept words in the context of\neither a full sentence or an accompanying picture. Our results reveal that VLM\nrepresentations correlate more strongly than language- and vision-only DNNs\nwith activations in brain areas functionally related to language processing. A\ncomparison between different types of visuo-linguistic architectures shows that\nrecent generative VLMs tend to be less brain-aligned than previous\narchitectures with lower performance on downstream applications. Moreover,\nthrough an additional analysis comparing brain vs. behavioural alignment across\nmultiple VLMs, we show that -- with one remarkable exception -- representations\nthat strongly align with behavioural judgments do not correlate highly with\nbrain responses. This indicates that brain similarity does not go hand in hand\nwith behavioural similarity, and vice versa.\n","authors":["Anna Bavaresco","Marianne de Heer Kloots","Sandro Pezzelle","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2407.17914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17900v1","updated":"2024-07-25T09:42:24Z","published":"2024-07-25T09:42:24Z","title":"The Power of Combining Data and Knowledge: GPT-4o is an Effective\n Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of\n Lung Cancer","summary":" Lymph node metastasis (LNM) is a crucial factor in determining the initial\ntreatment for patients with lung cancer, yet accurate preoperative diagnosis of\nLNM remains challenging. 
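The MSOF entry above re-ranks candidate reframings along three dimensions (strategy consistency, text similarity, fluency). A minimal weighted re-ranking is sketched below; the scorers and equal weights are placeholders standing in for the paper's actual scoring functions.

```python
def strategy_consistency(candidate, strategy):
    return 1.0 if strategy in candidate.lower() else 0.0      # placeholder scorer

def text_similarity(candidate, source):
    a, b = set(candidate.lower().split()), set(source.lower().split())
    return len(a & b) / max(len(a | b), 1)                    # Jaccard overlap as a stand-in

def fluency(candidate):
    return min(len(candidate.split()) / 10.0, 1.0)            # crude length-based proxy

def rerank(candidates, source, strategy, weights=(1.0, 1.0, 1.0)):
    def score(c):
        parts = (strategy_consistency(c, strategy),
                 text_similarity(c, source),
                 fluency(c))
        return sum(w * p for w, p in zip(weights, parts))
    return sorted(candidates, key=score, reverse=True)

src = "this meeting was a waste of time"
cands = ["at least the meeting gave us a chance to reflect on our time",
         "the meeting was bad"]
print(rerank(cands, src, strategy="chance")[0])
```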
Recently, large language models (LLMs) have garnered\nsignificant attention due to their remarkable text generation capabilities.\nLeveraging the extensive medical knowledge learned from vast corpora, LLMs can\nestimate probabilities for clinical problems, though their performance has\nhistorically been inferior to data-driven machine learning models. In this\npaper, we propose a novel ensemble method that combines the medical knowledge\nacquired by LLMs with the latent patterns identified by machine learning models\nto enhance LNM prediction performance. Initially, we developed machine learning\nmodels using patient data. We then designed a prompt template to integrate the\npatient data with the predicted probability from the machine learning model.\nSubsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,\nto estimate the likelihood of LNM based on patient data and then adjust the\nestimate using the machine learning output. Finally, we collected three outputs\nfrom the GPT-4o using the same prompt and ensembled these results as the final\nprediction. Using the proposed method, our models achieved an AUC value of\n0.765 and an AP value of 0.415 for LNM prediction, significantly improving\npredictive performance compared to baseline machine learning models. The\nexperimental results indicate that GPT-4o can effectively leverage its medical\nknowledge and the probabilities predicted by machine learning models to achieve\nmore accurate LNM predictions. These findings demonstrate that LLMs can perform\nwell in clinical risk prediction tasks, offering a new paradigm for integrating\nmedical knowledge and patient data in clinical predictions.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14888v2","updated":"2024-07-25T09:19:06Z","published":"2024-03-21T23:48:21Z","title":"AutoRE: Document-Level Relation Extraction with Large Language Models","summary":" Large Language Models (LLMs) have demonstrated exceptional abilities in\ncomprehending and generating text, motivating numerous researchers to utilize\nthem for Information Extraction (IE) purposes, including Relation Extraction\n(RE). Nonetheless, most existing methods are predominantly designed for\nSentence-level Relation Extraction (SentRE) tasks, which typically encompass a\nrestricted set of relations and triplet facts within a single sentence.\nFurthermore, certain approaches resort to treating relations as candidate\nchoices integrated into prompt templates, leading to inefficient processing and\nsuboptimal performance when tackling Document-Level Relation Extraction (DocRE)\ntasks, which entail handling multiple relations and triplet facts distributed\nacross a given document, posing distinct challenges. To overcome these\nlimitations, we introduce AutoRE, an end-to-end DocRE model that adopts a novel\nRE extraction paradigm named RHF (Relation-Head-Facts). Unlike existing\napproaches, AutoRE does not rely on the assumption of known relation options,\nmaking it more reflective of real-world scenarios. Additionally, we have\ndeveloped an easily extensible RE framework using a Parameters Efficient Fine\nTuning (PEFT) algorithm (QLoRA). Our experiments on the RE-DocRED dataset\nshowcase AutoRE's best performance, achieving state-of-the-art results,\nsurpassing TAG by 10.03\\% and 9.03\\% respectively on the dev and test set. 
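The lymph-node-metastasis entry above feeds the machine-learning model's predicted probability into a prompt, queries the LLM three times, and ensembles the answers. That flow is sketched below with a stub `call_llm` function; the prompt wording and averaging rule are assumptions, not the paper's exact template.

```python
from statistics import mean

def call_llm(prompt: str) -> float:
    """Placeholder for an LLM call that returns a probability in [0, 1]."""
    return 0.42  # deterministic stub; a real call would vary across samples

def predict_lnm(patient_features: dict, ml_probability: float, n_samples: int = 3) -> float:
    prompt = (
        "Patient data: " + ", ".join(f"{k}={v}" for k, v in patient_features.items()) + ".\n"
        f"A trained machine learning model estimates the probability of lymph node "
        f"metastasis at {ml_probability:.2f}.\n"
        "Using both the patient data and this estimate, give your own probability (0-1)."
    )
    samples = [call_llm(prompt) for _ in range(n_samples)]    # collect several generations
    return mean(samples)                                      # ensemble by averaging

print(predict_lnm({"age": 63, "tumor_size_mm": 21}, ml_probability=0.37))
```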
The\ncode is available\\url{https://github.com/THUDM/AutoRE} and the demonstration\nvideo is provided https://www.youtube.com/watch?v=IhKRsZUAxKk\n","authors":["Lilong Xue","Dan Zhang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2403.14888v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.05750v2","updated":"2024-07-25T09:17:21Z","published":"2024-07-08T09:03:12Z","title":"Large Language Models Understand Layout","summary":" Large language models (LLMs) demonstrate extraordinary abilities in a wide\nrange of natural language processing (NLP) tasks. In this paper, we show that,\nbeyond text understanding capability, LLMs are capable of processing text\nlayouts that are denoted by spatial markers. They are able to answer questions\nthat require explicit spatial perceiving and reasoning, while a drastic\nperformance drop is observed when the spatial markers from the original data\nare excluded. We perform a series of experiments with the GPT-3.5, Baichuan2,\nLlama2 and ChatGLM3 models on various types of layout-sensitive datasets for\nfurther analysis. The experimental results reveal that the layout understanding\nability of LLMs is mainly introduced by the coding data for pretraining, which\nis further enhanced at the instruction-tuning stage. In addition, layout\nunderstanding can be enhanced by integrating low-cost, auto-generated data\napproached by a novel text game. Finally, we show that layout understanding\nability is beneficial for building efficient visual question-answering (VQA)\nsystems.\n","authors":["Weiming Li","Manni Duan","Dong An","Yan Shao"],"pdf_url":"https://arxiv.org/pdf/2407.05750v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02750v2","updated":"2024-07-25T09:16:05Z","published":"2024-02-05T06:06:47Z","title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache","summary":" Efficiently serving large language models (LLMs) requires batching of many\nrequests to reduce the cost per request. Yet, with larger batch sizes and\nlonger context lengths, the key-value (KV) cache, which stores attention keys\nand values to avoid re-computations, significantly increases memory demands and\nbecomes the new bottleneck in speed and memory usage. Additionally, the loading\nof the KV cache causes the computational core to be idle, which limits the\ninference speed. A straightforward and effective solution to reduce KV cache\nsize is quantization, which decreases the total bytes taken by KV cache.\nHowever, there is a lack of in-depth studies that explore the element\ndistribution of KV cache to understand the hardness and limitation of KV cache\nquantization. To fill the gap, we conducted a comprehensive study on the\nelement distribution in KV cache of popular LLMs. Our findings indicate that\nthe key cache should be quantized per-channel, i.e., group elements along the\nchannel dimension and quantize them together. In contrast, the value cache\nshould be quantized per-token. From this analysis, we developed a tuning-free\n2bit KV cache quantization algorithm named KIVI. With hardware-friendly\nimplementation, KIVI can enable Llama, Falcon, and Mistral models to maintain\nalmost the same quality while using $\\mathbf{2.6\\times}$ less peak memory\n(including model weight). This reduction in memory usage enables up to\n$\\mathbf{4\\times}$ larger batch size, bringing $\\mathbf{2.35\\times \\sim\n3.47\\times}$ throughput on real LLM inference workload. 
The source code is\navailable at https://github.com/jy-yuan/KIVI.\n","authors":["Zirui Liu","Jiayi Yuan","Hongye Jin","Shaochen Zhong","Zhaozhuo Xu","Vladimir Braverman","Beidi Chen","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2402.02750v2.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2407.17876v1","updated":"2024-07-25T08:46:49Z","published":"2024-07-25T08:46:49Z","title":"A Large-Scale Sensitivity Analysis on Latent Embeddings and\n Dimensionality Reductions for Text Spatializations","summary":" The semantic similarity between documents of a text corpus can be visualized\nusing map-like metaphors based on two-dimensional scatterplot layouts. These\nlayouts result from a dimensionality reduction on the document-term matrix or a\nrepresentation within a latent embedding, including topic models. Thereby, the\nresulting layout depends on the input data and hyperparameters of the\ndimensionality reduction and is therefore affected by changes in them.\nFurthermore, the resulting layout is affected by changes in the input data and\nhyperparameters of the dimensionality reduction. However, such changes to the\nlayout require additional cognitive efforts from the user. In this work, we\npresent a sensitivity study that analyzes the stability of these layouts\nconcerning (1) changes in the text corpora, (2) changes in the hyperparameter,\nand (3) randomness in the initialization. Our approach has two stages: data\nmeasurement and data analysis. First, we derived layouts for the combination of\nthree text corpora and six text embeddings and a grid-search-inspired\nhyperparameter selection of the dimensionality reductions. Afterward, we\nquantified the similarity of the layouts through ten metrics, concerning local\nand global structures and class separation. Second, we analyzed the resulting\n42817 tabular data points in a descriptive statistical analysis. From this, we\nderived guidelines for informed decisions on the layout algorithm and highlight\nspecific hyperparameter settings. We provide our implementation as a Git\nrepository at\nhttps://github.com/hpicgs/Topic-Models-and-Dimensionality-Reduction-Sensitivity-Study\nand results as Zenodo archive at https://doi.org/10.5281/zenodo.12772898.\n","authors":["Daniel Atzberger","Tim Cech","Willy Scheibel","Jürgen Döllner","Michael Behrisch","Tobias Schreck"],"pdf_url":"https://arxiv.org/pdf/2407.17876v1.pdf","comment":"To be published at IEEE VIS 2024 conference"},{"id":"http://arxiv.org/abs/2407.17874v1","updated":"2024-07-25T08:44:04Z","published":"2024-07-25T08:44:04Z","title":"Improving Domain-Specific ASR with LLM-Generated Contextual Descriptions","summary":" End-to-end automatic speech recognition (E2E ASR) systems have significantly\nimproved speech recognition through training on extensive datasets. Despite\nthese advancements, they still struggle to accurately recognize domain specific\nwords, such as proper nouns and technical terminologies. To address this\nproblem, we propose a method to utilize the state-of-the-art Whisper without\nmodifying its architecture, preserving its generalization performance while\nenabling it to leverage descriptions effectively. Moreover, we propose two\nadditional training techniques to improve the domain specific ASR: decoder\nfine-tuning, and context perturbation. We also propose a method to use a Large\nLanguage Model (LLM) to generate descriptions with simple metadata, when\ndescriptions are unavailable. 
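KIVI's central observation above is that the key cache should be quantized per-channel and the value cache per-token. The NumPy sketch below applies simple asymmetric min-max quantization at 2 bits along those two different axes to make the distinction concrete; grouping and the exact quantizer are simplifications of the real algorithm.

```python
import numpy as np

def asym_quantize(x, bits=2, axis=0):
    """Asymmetric min-max quantization of x, with statistics along the given axis."""
    levels = 2 ** bits - 1
    mn = x.min(axis=axis, keepdims=True)
    mx = x.max(axis=axis, keepdims=True)
    scale = (mx - mn) / levels + 1e-12
    q = np.round((x - mn) / scale).astype(np.uint8)        # 2-bit codes in {0,...,3}
    return q, scale, mn

def dequantize(q, scale, mn):
    return q * scale + mn

T, d = 6, 8                                 # tokens x head dimension
K, V = np.random.randn(T, d), np.random.randn(T, d)

qK, sK, mK = asym_quantize(K, axis=0)       # keys: statistics per channel (reduce over tokens)
qV, sV, mV = asym_quantize(V, axis=1)       # values: statistics per token (reduce over channels)

print("key error:  ", np.abs(K - dequantize(qK, sK, mK)).mean().round(3))
print("value error:", np.abs(V - dequantize(qV, sV, mV)).mean().round(3))
```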
Our experiments demonstrate that proposed methods\nnotably enhance domain-specific ASR accuracy on real-life datasets, with\nLLM-generated descriptions outperforming human-crafted ones in effectiveness.\n","authors":["Jiwon Suh","Injae Na","Woohwan Jung"],"pdf_url":"https://arxiv.org/pdf/2407.17874v1.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.17870v1","updated":"2024-07-25T08:42:53Z","published":"2024-07-25T08:42:53Z","title":"Is the Digital Forensics and Incident Response Pipeline Ready for\n Text-Based Threats in LLM Era?","summary":" In the era of generative AI, the widespread adoption of Neural Text\nGenerators (NTGs) presents new cybersecurity challenges, particularly within\nthe realms of Digital Forensics and Incident Response (DFIR). These challenges\nprimarily involve the detection and attribution of sources behind advanced\nattacks like spearphishing and disinformation campaigns. As NTGs evolve, the\ntask of distinguishing between human and NTG-authored texts becomes critically\ncomplex. This paper rigorously evaluates the DFIR pipeline tailored for\ntext-based security systems, specifically focusing on the challenges of\ndetecting and attributing authorship of NTG-authored texts. By introducing a\nnovel human-NTG co-authorship text attack, termed CS-ACT, our study uncovers\nsignificant vulnerabilities in traditional DFIR methodologies, highlighting\ndiscrepancies between ideal scenarios and real-world conditions. Utilizing 14\ndiverse datasets and 43 unique NTGs, up to the latest GPT-4, our research\nidentifies substantial vulnerabilities in the forensic profiling phase,\nparticularly in attributing authorship to NTGs. Our comprehensive evaluation\npoints to factors such as model sophistication and the lack of distinctive\nstyle within NTGs as significant contributors for these vulnerabilities. Our\nfindings underscore the necessity for more sophisticated and adaptable\nstrategies, such as incorporating adversarial learning, stylizing NTGs, and\nimplementing hierarchical attribution through the mapping of NTG lineages to\nenhance source attribution. This sets the stage for future research and the\ndevelopment of more resilient text-based security systems.\n","authors":["Avanti Bhandarkar","Ronald Wilson","Anushka Swarup","Mengdi Zhu","Damon Woodard"],"pdf_url":"https://arxiv.org/pdf/2407.17870v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.17866v1","updated":"2024-07-25T08:36:58Z","published":"2024-07-25T08:36:58Z","title":"Financial Statement Analysis with Large Language Models","summary":" We investigate whether an LLM can successfully perform financial statement\nanalysis in a way similar to a professional human analyst. We provide\nstandardized and anonymous financial statements to GPT4 and instruct the model\nto analyze them to determine the direction of future earnings. Even without any\nnarrative or industry-specific information, the LLM outperforms financial\nanalysts in its ability to predict earnings changes. The LLM exhibits a\nrelative advantage over human analysts in situations when the analysts tend to\nstruggle. Furthermore, we find that the prediction accuracy of the LLM is on\npar with the performance of a narrowly trained state-of-the-art ML model. LLM\nprediction does not stem from its training memory. 
Instead, we find that the\nLLM generates useful narrative insights about a company's future performance.\nLastly, our trading strategies based on GPT's predictions yield a higher Sharpe\nratio and alphas than strategies based on other models. Taken together, our\nresults suggest that LLMs may take a central role in decision-making.\n","authors":["Alex Kim","Maximilian Muhn","Valeri Nikolaev"],"pdf_url":"https://arxiv.org/pdf/2407.17866v1.pdf","comment":"Previously posted on SSRN (May 21, 2024). See\n http://papers.ssrn.com/sol3/papers.cfm?abstract_id=4835311"},{"id":"http://arxiv.org/abs/2407.17863v1","updated":"2024-07-25T08:33:23Z","published":"2024-07-25T08:33:23Z","title":"factgenie: A Framework for Span-based Evaluation of Generated Texts","summary":" We present factgenie: a framework for annotating and visualizing word spans\nin textual model outputs. Annotations can capture various span-based phenomena\nsuch as semantic inaccuracies or irrelevant text. With factgenie, the\nannotations can be collected both from human crowdworkers and large language\nmodels. Our framework consists of a web interface for data visualization and\ngathering text annotations, powered by an easily extensible codebase.\n","authors":["Zdeněk Kasner","Ondřej Plátek","Patrícia Schmidtová","Simone Balloccu","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2407.17863v1.pdf","comment":"Accepted to INLG 2024 (System Demonstrations)"},{"id":"http://arxiv.org/abs/2407.17862v1","updated":"2024-07-25T08:31:57Z","published":"2024-07-25T08:31:57Z","title":"Exploring Description-Augmented Dataless Intent Classification","summary":" In this work, we introduce several schemes to leverage description-augmented\nembedding similarity for dataless intent classification using current\nstate-of-the-art (SOTA) text embedding models. We report results of our methods\non four commonly used intent classification datasets and compare against\nprevious works of a similar nature. Our work shows promising results for\ndataless classification scaling to a large number of unseen intents. We show\ncompetitive results and significant improvements (+6.12\\% Avg.) over strong\nzero-shot baselines, all without training on labelled or task-specific data.\nFurthermore, we provide qualitative error analysis of the shortfalls of this\nmethodology to help guide future research in this area.\n","authors":["Ruoyu Hu","Foaad Khosmood","Abbas Edalat"],"pdf_url":"https://arxiv.org/pdf/2407.17862v1.pdf","comment":"Accepted to the 6th NLP for Conversational AI Workshop at ACL\n 2024(NLP4ConvAI)"},{"id":"http://arxiv.org/abs/2407.17854v1","updated":"2024-07-25T08:15:43Z","published":"2024-07-25T08:15:43Z","title":"Shapley Value-based Contrastive Alignment for Multimodal Information\n Extraction","summary":" The rise of social media and the exponential growth of multimodal\ncommunication necessitates advanced techniques for Multimodal Information\nExtraction (MIE). However, existing methodologies primarily rely on direct\nImage-Text interactions, a paradigm that often faces significant challenges due\nto semantic and modality gaps between images and text. In this paper, we\nintroduce a new paradigm of Image-Context-Text interaction, where large\nmultimodal models (LMMs) are utilized to generate descriptive textual context\nto bridge these gaps. In line with this paradigm, we propose a novel Shapley\nValue-based Contrastive Alignment (Shap-CA) method, which aligns both\ncontext-text and context-image pairs. 
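The dataless intent classification entry above scores an utterance against intent descriptions using an off-the-shelf text embedding model. The essential similarity step is sketched below with a placeholder embedding function; the descriptions and embedder are assumptions, since the paper evaluates several state-of-the-art models.

```python
import numpy as np

def embed(text: str) -> np.ndarray:
    """Placeholder for a sentence-embedding model; returns a unit-norm vector."""
    rng = np.random.default_rng(abs(hash(text)) % 2**32)
    v = rng.normal(size=256)
    return v / np.linalg.norm(v)

INTENT_DESCRIPTIONS = {
    "book_flight": "The user wants to reserve a plane ticket to travel somewhere.",
    "check_weather": "The user asks about the weather forecast for a place or time.",
    "play_music": "The user wants to hear a song, artist, or playlist.",
}

def classify(utterance: str) -> str:
    """Pick the intent whose description embedding is most similar to the utterance."""
    u = embed(utterance)
    sims = {intent: float(u @ embed(desc)) for intent, desc in INTENT_DESCRIPTIONS.items()}
    return max(sims, key=sims.get)

print(classify("I need a ticket to Berlin next Friday"))
```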
Shap-CA initially applies the Shapley\nvalue concept from cooperative game theory to assess the individual\ncontribution of each element in the set of contexts, texts and images towards\ntotal semantic and modality overlaps. Following this quantitative evaluation, a\ncontrastive learning strategy is employed to enhance the interactive\ncontribution within context-text/image pairs, while minimizing the influence\nacross these pairs. Furthermore, we design an adaptive fusion module for\nselective cross-modal fusion. Extensive experiments across four MIE datasets\ndemonstrate that our method significantly outperforms existing state-of-the-art\nmethods.\n","authors":["Wen Luo","Yu Xia","Shen Tianshu","Sujian Li"],"pdf_url":"https://arxiv.org/pdf/2407.17854v1.pdf","comment":"Accepted at ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.17852v1","updated":"2024-07-25T08:08:55Z","published":"2024-07-25T08:08:55Z","title":"Scaling A Simple Approach to Zero-Shot Speech Recognition","summary":" Despite rapid progress in increasing the language coverage of automatic\nspeech recognition, the field is still far from covering all languages with a\nknown writing script. Recent work showed promising results with a zero-shot\napproach requiring only a small amount of text data, however, accuracy heavily\ndepends on the quality of the used phonemizer which is often weak for unseen\nlanguages. In this paper, we present MMS Zero-shot a conceptually simpler\napproach based on romanization and an acoustic model trained on data in 1,078\ndifferent languages or three orders of magnitude more than prior art. MMS\nZero-shot reduces the average character error rate by a relative 46% over 100\nunseen languages compared to the best previous work. Moreover, the error rate\nof our approach is only 2.5x higher compared to in-domain supervised baselines,\nwhile our approach uses no labeled data for the evaluation languages at all.\n","authors":["Jinming Zhao","Vineel Pratap","Michael Auli"],"pdf_url":"https://arxiv.org/pdf/2407.17852v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2402.13055v2","updated":"2024-07-25T08:07:39Z","published":"2024-02-20T14:43:39Z","title":"Identifying Semantic Induction Heads to Understand In-Context Learning","summary":" Although large language models (LLMs) have demonstrated remarkable\nperformance, the lack of transparency in their inference logic raises concerns\nabout their trustworthiness. To gain a better understanding of LLMs, we conduct\na detailed analysis of the operations of attention heads and aim to better\nunderstand the in-context learning of LLMs. Specifically, we investigate\nwhether attention heads encode two types of relationships between tokens\npresent in natural languages: the syntactic dependency parsed from sentences\nand the relation within knowledge graphs. We find that certain attention heads\nexhibit a pattern where, when attending to head tokens, they recall tail tokens\nand increase the output logits of those tail tokens. More crucially, the\nformulation of such semantic induction heads has a close correlation with the\nemergence of the in-context learning ability of language models. 
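Shap-CA above assigns each context, text, and image element a contribution via Shapley values from cooperative game theory. For a small set of players the Shapley value can be computed exactly by enumerating coalitions, as in the generic sketch below; the toy value function stands in for the paper's semantic/modality-overlap measure.

```python
from itertools import combinations
from math import factorial

def shapley_values(players, value_fn):
    """Exact Shapley values: for each player i, sum over coalitions S not containing i
    of |S|!(n-|S|-1)!/n! * (v(S ∪ {i}) - v(S)). Feasible only for small n."""
    n = len(players)
    phi = {p: 0.0 for p in players}
    for p in players:
        others = [q for q in players if q != p]
        for r in range(len(others) + 1):
            for S in combinations(others, r):
                weight = factorial(len(S)) * factorial(n - len(S) - 1) / factorial(n)
                phi[p] += weight * (value_fn(set(S) | {p}) - value_fn(set(S)))
    return phi

# Toy value function standing in for the total semantic/modality overlap of a coalition.
contributions = {"context": 0.5, "text": 0.3, "image": 0.2}
value = lambda S: sum(contributions[m] for m in S)

print(shapley_values(list(contributions), value))  # additive game: phi_i equals its own share
```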
The study of\nsemantic attention heads advances our understanding of the intricate operations\nof attention heads in transformers, and further provides new insights into the\nin-context learning of LLMs.\n","authors":["Jie Ren","Qipeng Guo","Hang Yan","Dongrui Liu","Quanshi Zhang","Xipeng Qiu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2402.13055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17844v1","updated":"2024-07-25T07:58:19Z","published":"2024-07-25T07:58:19Z","title":"Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease\n Classification: A Systematic Review","summary":" Parkinson's disease (PD), the second most prevalent neurodegenerative\ndisorder worldwide, frequently presents with early-stage speech impairments.\nRecent advancements in Artificial Intelligence (AI), particularly deep learning\n(DL), have significantly enhanced PD diagnosis through the analysis of speech\ndata. Nevertheless, the progress of research is restricted by the limited\navailability of publicly accessible speech-based PD datasets, primarily due to\nprivacy and ethical concerns. This review covers the latest DL-based AI\napproaches for speech-based PD classification, focusing on performance,\navailable resources and associated challenges of 33 scientific works published\nbetween 2020 and March 2024. These DL approaches are categorized into\nend-to-end (E2E) learning, transfer learning (TL) and deep acoustic features\n(DAF) extraction. Among E2E approaches, Convolutional Neural Networks (CNNs)\nare prevalent, though Transformers are increasingly popular. E2E approaches\nface challenges such as limited data and computational resources, especially\nwith Transformers. TL addresses these issues by providing more robust PD\ndiagnosis and better generalizability across languages. DAF extraction aims to\nimprove the explainability and interpretability of results by examining the\nspecific effects of deep features on both other DL approaches and more\ntraditional machine learning (ML) methods. However, it often underperforms\ncompared to E2E and TL approaches. This review also discusses unresolved issues\nrelated to bias, explainability and privacy, highlighting the need for future\nresearch.\n","authors":["Lisanne van Gelderen","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2407.17844v1.pdf","comment":"Submitted in Applied Sciences - peer reviewed Open Access journal.\n This research was funded by the NWO research programme AiNed Fellowship\n Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant\n number NGF.1607.22.013"},{"id":"http://arxiv.org/abs/2407.17075v2","updated":"2024-07-25T07:50:46Z","published":"2024-07-24T08:04:00Z","title":"SAFETY-J: Evaluating Safety with Critique","summary":" The deployment of Large Language Models (LLMs) in content generation raises\nsignificant safety concerns, particularly regarding the transparency and\ninterpretability of content evaluations. Current methods, primarily focused on\nbinary safety classifications, lack mechanisms for detailed critique, limiting\ntheir utility for model improvement and user trust. To address these\nlimitations, we introduce SAFETY-J, a bilingual generative safety evaluator for\nEnglish and Chinese with critique-based judgment. SAFETY-J utilizes a robust\ntraining dataset that includes diverse dialogues and augmented query-response\npairs to assess safety across various scenarios comprehensively. 
We establish\nan automated meta-evaluation benchmark that objectively assesses the quality of\ncritiques with minimal human intervention, facilitating scalable and continuous\nimprovement. Additionally, SAFETY-J employs an iterative preference learning\ntechnique to dynamically refine safety assessments based on meta-evaluations\nand critiques. Our evaluations demonstrate that SAFETY-J provides more nuanced\nand accurate safety evaluations, thereby enhancing both critique quality and\npredictive reliability in complex content scenarios. To facilitate further\nresearch and application, we open-source SAFETY-J's training protocols,\ndatasets, and code at \\url{https://github.com/GAIR-NLP/Safety-J}.\n","authors":["Yixiu Liu","Yuxiang Zheng","Shijie Xia","Yuan Guo","Jiajun Li","Yi Tu","Chaoling Song","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17125v2","updated":"2024-07-25T07:39:44Z","published":"2024-07-24T09:48:48Z","title":"Behavioral Testing: Can Large Language Models Implicitly Resolve\n Ambiguous Entities?","summary":" One of the major aspects contributing to the striking performance of large\nlanguage models (LLMs) is the vast amount of factual knowledge accumulated\nduring pre-training. Yet, many LLMs suffer from self-inconsistency, which\nraises doubts about their trustworthiness and reliability. In this paper, we\nfocus on entity type ambiguity and analyze current state-of-the-art LLMs for\ntheir proficiency and consistency in applying their factual knowledge when\nprompted for entities under ambiguity. To do so, we propose an evaluation\nprotocol that disentangles knowing from applying knowledge, and test\nstate-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform\npoorly with ambiguous prompts, achieving only 80% accuracy. Our results further\ndemonstrate systematic discrepancies in LLM behavior and their failure to\nconsistently apply information, indicating that the models can exhibit\nknowledge without being able to utilize it, significant biases for preferred\nreadings, as well as self-inconsistencies. Our study highlights the importance\nof handling entity ambiguity in the future for more trustworthy LLMs.\n","authors":["Anastasiia Sedova","Robert Litschko","Diego Frassinelli","Benjamin Roth","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2407.17125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17827v1","updated":"2024-07-25T07:35:27Z","published":"2024-07-25T07:35:27Z","title":"Unified Lexical Representation for Interpretable Visual-Language\n Alignment","summary":" Visual-Language Alignment (VLA) has gained a lot of attention since CLIP's\ngroundbreaking work. Although CLIP performs well, the typical direct latent\nfeature alignment lacks clarity in its representation and similarity scores. On\nthe other hand, lexical representation, a vector whose element represents the\nsimilarity between the sample and a word from the vocabulary, is a natural\nsparse representation and interpretable, providing exact matches for individual\nwords. However, lexical representations are difficult to learn due to the lack of\nground-truth supervision and false-discovery issues, and thus require complex\ndesigns to train effectively. In this paper, we introduce LexVLA, a more\ninterpretable VLA framework by learning a unified lexical representation for\nboth modalities without complex design. 
We use DINOv2 as our visual model for\nits local-inclined features and Llama 2, a generative language model, to\nleverage its in-context lexical prediction ability. To avoid the false\ndiscovery, we propose an overuse penalty to refrain the lexical representation\nfrom falsely frequently activating meaningless words. We demonstrate that these\ntwo pre-trained uni-modal models can be well-aligned by fine-tuning on modest\nmulti-modal dataset and avoid intricate training configurations. On cross-modal\nretrieval benchmarks, LexVLA, trained on the CC-12M multi-modal dataset,\noutperforms baselines fine-tuned on larger datasets (e.g., YFCC15M) and those\ntrained from scratch on even bigger datasets (e.g., 1.1B data, including\nCC-12M). We conduct extensive experiments to analyze LexVLA.\n","authors":["Yifan Li","Yikai Wang","Yanwei Fu","Dongyu Ru","Zheng Zhang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2407.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17817v1","updated":"2024-07-25T07:10:31Z","published":"2024-07-25T07:10:31Z","title":"Demystifying Verbatim Memorization in Large Language Models","summary":" Large Language Models (LLMs) frequently memorize long sequences verbatim,\noften with serious legal and privacy implications. Much prior work has studied\nsuch verbatim memorization using observational data. To complement such work,\nwe develop a framework to study verbatim memorization in a controlled setting\nby continuing pre-training from Pythia checkpoints with injected sequences. We\nfind that (1) non-trivial amounts of repetition are necessary for verbatim\nmemorization to happen; (2) later (and presumably better) checkpoints are more\nlikely to verbatim memorize sequences, even for out-of-distribution sequences;\n(3) the generation of memorized sequences is triggered by distributed model\nstates that encode high-level features and makes important use of general\nlanguage modeling capabilities. Guided by these insights, we develop stress\ntests to evaluate unlearning methods and find they often fail to remove the\nverbatim memorized information, while also degrading the LM. Overall, these\nfindings challenge the hypothesis that verbatim memorization stems from\nspecific model weights or mechanisms. Rather, verbatim memorization is\nintertwined with the LM's general capabilities and thus will be very difficult\nto isolate and suppress without degrading model quality.\n","authors":["Jing Huang","Diyi Yang","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2407.17817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16228v2","updated":"2024-07-25T07:05:30Z","published":"2023-09-28T08:09:33Z","title":"Brand Network Booster: A new system for improving brand connectivity","summary":" This paper presents a new decision support system offered for an in-depth\nanalysis of semantic networks, which can provide insights for a better\nexploration of a brand's image and the improvement of its connectivity. In\nterms of network analysis, we show that this goal is achieved by solving an\nextended version of the Maximum Betweenness Improvement problem, which includes\nthe possibility of considering adversarial nodes, constrained budgets, and\nweighted networks - where connectivity improvement can be obtained by adding\nlinks or increasing the weight of existing connections. 
Our contribution\nincludes a new algorithmic framework and the integration of this framework into\na software system called Brand Network Booster (BNB), which supports brand\nconnectivity evaluation and improvement. We present this new system together\nwith three case studies, and we also discuss its performance. Our tool and\napproach are valuable to both network scholars and in facilitating strategic\ndecision-making processes for marketing and communication managers across\nvarious sectors, be it public or private.\n","authors":["J. Cancellieri","W. Didimo","A. Fronzetti Colladon","F. Montecchiani","R. Vestrelli"],"pdf_url":"https://arxiv.org/pdf/2309.16228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06851v4","updated":"2024-07-25T06:41:43Z","published":"2023-11-12T14:01:38Z","title":"Automatic Textual Normalization for Hate Speech Detection","summary":" Social media data is a valuable resource for research, yet it contains a wide\nrange of non-standard words (NSW). These irregularities hinder the effective\noperation of NLP tools. Current state-of-the-art methods for the Vietnamese\nlanguage address this issue as a problem of lexical normalization, involving\nthe creation of manual rules or the implementation of multi-staged deep\nlearning frameworks, which necessitate extensive efforts to craft intricate\nrules. In contrast, our approach is straightforward, employing solely a\nsequence-to-sequence (Seq2Seq) model. In this research, we provide a dataset\nfor textual normalization, comprising 2,181 human-annotated comments with an\ninter-annotator agreement of 0.9014. By leveraging the Seq2Seq model for\ntextual normalization, our results reveal that the accuracy achieved falls\nslightly short of 70%. Nevertheless, textual normalization enhances the\naccuracy of the Hate Speech Detection (HSD) task by approximately 2%,\ndemonstrating its potential to improve the performance of complex NLP tasks.\nOur dataset is accessible for research purposes.\n","authors":["Anh Thi-Hoang Nguyen","Dung Ha Nguyen","Nguyet Thi Nguyen","Khanh Thanh-Duy Ho","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2311.06851v4.pdf","comment":"2023 International Conference on Intelligent Systems Design and\n Applications (ISDA2023)"},{"id":"http://arxiv.org/abs/2306.17103v4","updated":"2024-07-25T06:15:20Z","published":"2023-06-29T17:01:51Z","title":"LyricWhiz: Robust Multilingual Zero-shot Lyrics Transcription by\n Whispering to ChatGPT","summary":" We introduce LyricWhiz, a robust, multilingual, and zero-shot automatic\nlyrics transcription method achieving state-of-the-art performance on various\nlyrics transcription datasets, even in challenging genres such as rock and\nmetal. Our novel, training-free approach utilizes Whisper, a weakly supervised\nrobust speech recognition model, and GPT-4, today's most performant chat-based\nlarge language model. In the proposed method, Whisper functions as the \"ear\" by\ntranscribing the audio, while GPT-4 serves as the \"brain,\" acting as an\nannotator with a strong performance for contextualized output selection and\ncorrection. Our experiments show that LyricWhiz significantly reduces Word\nError Rate compared to existing methods in English and can effectively\ntranscribe lyrics across multiple languages. 
Furthermore, we use LyricWhiz to\ncreate the first publicly available, large-scale, multilingual lyrics\ntranscription dataset with a CC-BY-NC-SA copyright license, based on\nMTG-Jamendo, and offer a human-annotated subset for noise level estimation and\nevaluation. We anticipate that our proposed method and dataset will advance the\ndevelopment of multilingual lyrics transcription, a challenging and emerging\ntask.\n","authors":["Le Zhuo","Ruibin Yuan","Jiahao Pan","Yinghao Ma","Yizhi LI","Ge Zhang","Si Liu","Roger Dannenberg","Jie Fu","Chenghua Lin","Emmanouil Benetos","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2306.17103v4.pdf","comment":"9 pages, 2 figures, 5 tables, accepted by ISMIR 2023"},{"id":"http://arxiv.org/abs/2407.17773v1","updated":"2024-07-25T05:02:39Z","published":"2024-07-25T05:02:39Z","title":"KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models","summary":" This paper investigates visual analogical reasoning in large multimodal\nmodels (LMMs) compared to human adults and children. A \"visual analogy\" is an\nabstract rule inferred from one image and applied to another. While benchmarks\nexist for testing visual reasoning in LMMs, they require advanced skills and\nomit basic visual analogies that even young children can make. Inspired by\ndevelopmental psychology, we propose a new benchmark of 1,400 visual\ntransformations of everyday objects to test LMMs on visual analogical reasoning\nand compare them to children and adults. We structure the evaluation into three\nstages: identifying what changed (e.g., color, number, etc.), how it changed\n(e.g., added one object), and applying the rule to new scenarios. Our findings\nshow that while models like GPT-4V, LLaVA-1.5, and MANTIS identify the \"what\"\neffectively, they struggle with quantifying the \"how\" and extrapolating this\nrule to new objects. In contrast, children and adults exhibit much stronger\nanalogical reasoning at all three stages. Additionally, the strongest tested\nmodel, GPT-4V, performs better in tasks involving simple visual attributes like\ncolor and size, correlating with quicker human adult response times.\nConversely, more complex tasks such as number, rotation, and reflection, which\nnecessitate extensive cognitive processing and understanding of the 3D physical\nworld, present more significant challenges. Altogether, these findings\nhighlight the limitations of training models on data that primarily consists of\n2D images and text.\n","authors":["Eunice Yiu","Maan Qraitem","Charlie Wong","Anisa Noor Majhi","Yutong Bai","Shiry Ginosar","Alison Gopnik","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2407.17773v1.pdf","comment":"9 pages. For the KiVA benchmark, see https://github.com/ey242/KiVA"},{"id":"http://arxiv.org/abs/2407.17772v1","updated":"2024-07-25T05:02:27Z","published":"2024-07-25T05:02:27Z","title":"ERIT Lightweight Multimodal Dataset for Elderly Emotion Recognition and\n Multimodal Fusion Evaluation","summary":" ERIT is a novel multimodal dataset designed to facilitate research in a\nlightweight multimodal fusion. It contains text and image data collected from\nvideos of elderly individuals reacting to various situations, as well as seven\nemotion labels for each data sample. Because of the use of labeled images of\nelderly users reacting emotionally, it is also facilitating research on emotion\nrecognition in an underrepresented age group in machine learning visual emotion\nrecognition. 
The dataset is validated through comprehensive experiments\nindicating its importance in neural multimodal fusion research.\n","authors":["Rita Frieske","Bertrand E. Shi"],"pdf_url":"https://arxiv.org/pdf/2407.17772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17771v1","updated":"2024-07-25T04:58:08Z","published":"2024-07-25T04:58:08Z","title":"Banyan: Improved Representation Learning with Explicit Structure","summary":" We present Banyan, an improved model to learn semantic representations by\ninducing explicit structure over data. In contrast to prior approaches using\nstructure spanning single sentences, Banyan learns by resolving multiple\nconstituent structures into a shared one explicitly incorporating global\ncontext. Combined with an improved message-passing scheme inspired by Griffin,\nBanyan learns significantly better representations, avoids spurious false\nnegatives with contrastive learning, and drastically improves memory efficiency\nin such explicit-structured models. Using the Self-StrAE framework, we show\nthat Banyan (a) outperforms baselines using sentential structure across various\nsettings (b) matches or outperforms unstructured baselines like GloVe\n(+augmentations) and a RoBERTa medium (+simcse) pre-trained on 100M tokens,\ndespite having just a handful of (non-embedding) parameters, and (c) also\nlearns effective representations across several low resource (Asian and\nAfrican) languages as measured on SemRel tasks.\n","authors":["Mattia Opper","N. Siddharth"],"pdf_url":"https://arxiv.org/pdf/2407.17771v1.pdf","comment":"First Draft"},{"id":"http://arxiv.org/abs/2407.17770v1","updated":"2024-07-25T04:57:31Z","published":"2024-07-25T04:57:31Z","title":"BotEval: Facilitating Interactive Human Evaluation","summary":" Following the rapid progress in natural language processing (NLP) models,\nlanguage models are applied to increasingly more complex interactive tasks such\nas negotiations and conversation moderations. Having human evaluators directly\ninteract with these NLP models is essential for adequately evaluating the\nperformance on such interactive tasks. We develop BotEval, an easily\ncustomizable, open-source, evaluation toolkit that focuses on enabling\nhuman-bot interactions as part of the evaluation process, as opposed to human\nevaluators making judgements for a static input. BotEval balances flexibility\nfor customization and user-friendliness by providing templates for common use\ncases that span various degrees of complexity and built-in compatibility with\npopular crowdsourcing platforms. We showcase the numerous useful features of\nBotEval through a study that evaluates the performance of various chatbots on\ntheir effectiveness for conversational moderation and discuss how BotEval\ndiffers from other annotation tools.\n","authors":["Hyundong Cho","Thamme Gowda","Yuyang Huang","Zixun Lu","Tianli Tong","Jonathan May"],"pdf_url":"https://arxiv.org/pdf/2407.17770v1.pdf","comment":"ACL 2024 SDT, 10 pages"},{"id":"http://arxiv.org/abs/2407.10499v2","updated":"2024-07-25T04:44:54Z","published":"2024-07-15T07:43:55Z","title":"CIBench: Evaluating Your LLMs with a Code Interpreter Plugin","summary":" While LLM-Based agents, which use external tools to solve complex problems,\nhave made significant progress, benchmarking their ability is challenging,\nthereby hindering a clear understanding of their limitations. 
In this paper, we\npropose an interactive evaluation framework, named CIBench, to comprehensively\nassess LLMs' ability to utilize code interpreters for data science tasks. Our\nevaluation framework includes an evaluation dataset and two evaluation modes.\nThe evaluation dataset is constructed using an LLM-human cooperative approach\nand simulates an authentic workflow by leveraging consecutive and interactive\nIPython sessions. The two evaluation modes assess LLMs' ability with and\nwithout human assistance. We conduct extensive experiments to analyze the\nability of 24 LLMs on CIBench and provide valuable insights for future LLMs in\ncode interpreter utilization.\n","authors":["Songyang Zhang","Chuyu Zhang","Yingfan Hu","Haowen Shen","Kuikun Liu","Zerun Ma","Fengzhe Zhou","Wenwei Zhang","Xuming He","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2407.10499v2.pdf","comment":"Under review. The first three authors contribute equally, and\n Songyang Zhang is the project leader"},{"id":"http://arxiv.org/abs/2308.16884v2","updated":"2024-07-25T04:30:15Z","published":"2023-08-31T17:43:08Z","title":"The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122\n Language Variants","summary":" We present Belebele, a multiple-choice machine reading comprehension (MRC)\ndataset spanning 122 language variants. Significantly expanding the language\ncoverage of natural language understanding (NLU) benchmarks, this dataset\nenables the evaluation of text models in high-, medium-, and low-resource\nlanguages. Each question is based on a short passage from the Flores-200\ndataset and has four multiple-choice answers. The questions were carefully\ncurated to discriminate between models with different levels of general\nlanguage comprehension. The English dataset on its own proves difficult enough\nto challenge state-of-the-art language models. Being fully parallel, this\ndataset enables direct comparison of model performance across all languages. We\nuse this dataset to evaluate the capabilities of multilingual masked language\nmodels (MLMs) and large language models (LLMs). We present extensive results\nand find that despite significant cross-lingual transfer in English-centric\nLLMs, much smaller MLMs pretrained on balanced multilingual data still\nunderstand far more languages. We also observe that larger vocabulary size and\nconscious vocabulary construction correlate with better performance on\nlow-resource languages. Overall, Belebele opens up new avenues for evaluating\nand analyzing the multilingual capabilities of NLP systems.\n","authors":["Lucas Bandarkar","Davis Liang","Benjamin Muller","Mikel Artetxe","Satya Narayan Shukla","Donald Husa","Naman Goyal","Abhinandan Krishnan","Luke Zettlemoyer","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2308.16884v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.17745v1","updated":"2024-07-25T03:40:09Z","published":"2024-07-25T03:40:09Z","title":"Beyond Entity Alignment: Towards Complete Knowledge Graph Alignment via\n Entity-Relation Synergy","summary":" Knowledge Graph Alignment (KGA) aims to integrate knowledge from multiple\nsources to address the limitations of individual Knowledge Graphs (KGs) in\nterms of coverage and depth. However, current KGA models fall short in\nachieving a ``complete'' knowledge graph alignment. Existing models primarily\nemphasize the linkage of cross-graph entities but overlook aligning relations\nacross KGs, thereby providing only a partial solution to KGA. 
The semantic\ncorrelations embedded in relations are largely overlooked, potentially\nrestricting a comprehensive understanding of cross-KG signals. In this paper,\nwe propose to conceptualize relation alignment as an independent task and\nconduct KGA by decomposing it into two distinct but highly correlated\nsub-tasks: entity alignment and relation alignment. To capture the mutually\nreinforcing correlations between these objectives, we propose a novel\nExpectation-Maximization-based model, EREM, which iteratively optimizes both\nsub-tasks. Experimental results on real-world datasets demonstrate that EREM\nconsistently outperforms state-of-the-art models in both entity alignment and\nrelation alignment tasks.\n","authors":["Xiaohan Fang","Chaozhuo Li","Yi Zhao","Qian Zang","Litian Zhang","Jiquan Peng","Xi Zhang","Jibing Gong"],"pdf_url":"https://arxiv.org/pdf/2407.17745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11686v3","updated":"2024-07-25T03:34:56Z","published":"2024-07-16T13:03:58Z","title":"CCoE: A Compact LLM with Collaboration of Experts","summary":" In the domain of Large Language Model (LLM), LLMs demonstrate significant\ncapabilities in natural language understanding and generation. With the growing\nneeds of applying LLMs on various domains, it is a research question that how\nto efficiently train and build a model that has expertise in different domains\nbut with a low training cost. We propose CCoE architecture, a framework of\neasily coupling multiple strong domain experts together to fuse into a big LLM,\nprovides a collective way of utilizing the different domain expert LLMs.\nBesides, training a large collaborative of multiple expert LLMs requires a high\nrequirements on training sources. CCoE bypasses this problem through isolating\nother experts and train each expert separately. The design of CCoE assembles\nmultiple expert LLMs through the CoE (Collaboration of Experts) layer. Each CoE\nlayer could have one or more expert LLMs. Expert LLMs have different number of\nlayers and have been well-trained for different domain tasks. Each expert is\nfine-tuned to be able to achieve the comparable results with SOTA domain LLMs.\nWe start from 5 experts in the domain of Code, Math, Law, text-to-SQL and\nMedical. The results indicate that our CCoE framework can easily and\nefficiently boost nearly 10%-20% performance on original base model in\ndifferent domains but using less resources on training, as well as inference.\n","authors":["Shaomang Huang","Jianfeng Pan","Hanzhong Zheng"],"pdf_url":"https://arxiv.org/pdf/2407.11686v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01686v2","updated":"2024-07-25T03:29:09Z","published":"2024-05-02T19:20:11Z","title":"Automatically Extracting Numerical Results from Randomized Controlled\n Trials with Large Language Models","summary":" Meta-analyses statistically aggregate the findings of different randomized\ncontrolled trials (RCTs) to assess treatment effectiveness. Because this yields\nrobust estimates of treatment effectiveness, results from meta-analyses are\nconsidered the strongest form of evidence. However, rigorous evidence syntheses\nare time-consuming and labor-intensive, requiring manual extraction of data\nfrom individual trials to be synthesized. Ideally, language technologies would\npermit fully automatic meta-analysis, on demand. This requires accurately\nextracting numerical results from individual trials, which has been beyond the\ncapabilities of natural language processing (NLP) models to date. 
In this work,\nwe evaluate whether modern large language models (LLMs) can reliably perform\nthis task. We annotate (and release) a modest but granular evaluation dataset\nof clinical trial reports with numerical findings attached to interventions,\ncomparators, and outcomes. Using this dataset, we evaluate the performance of\nseven LLMs applied zero-shot for the task of conditionally extracting numerical\nfindings from trial reports. We find that massive LLMs that can accommodate\nlengthy inputs are tantalizingly close to realizing fully automatic\nmeta-analysis, especially for dichotomous (binary) outcomes (e.g., mortality).\nHowever, LLMs -- including ones trained on biomedical texts -- perform poorly\nwhen the outcome measures are complex and tallying the results requires\ninference. This work charts a path toward fully automatic meta-analysis of RCTs\nvia LLMs, while also highlighting the limitations of existing models for this\naim.\n","authors":["Hye Sun Yun","David Pogrebitskiy","Iain J. Marshall","Byron C. Wallace"],"pdf_url":"https://arxiv.org/pdf/2405.01686v2.pdf","comment":"25 pages, 7 figures, 6 tables, MLHC 2024"},{"id":"http://arxiv.org/abs/2311.07052v3","updated":"2024-07-25T03:20:15Z","published":"2023-11-13T03:36:18Z","title":"Towards the Law of Capacity Gap in Distilling Language Models","summary":" Language model (LM) distillation is a trending area that aims to distil the\nknowledge residing in a large teacher LM to a small student one. While various\nmethods have been proposed to maximize the effectiveness of the distillation,\nsignificant challenges persist, particularly when there is a substantial\ncapacity gap between the teacher and student LMs. This issue, often referred to\nas the \\textit{curse} of capacity gap, suggests that a larger teacher does not\nnecessarily result in a superior student compared to one distilled from a\nsmaller teacher. In other words, there is likely an optimal teacher yielding\nthe best student along the scaling course of the teacher. However, the curse of\ncapacity gap cannot be tackled without notable compute overhead, as indicated\nin previous studies. In the context of large LMs (LLMs), previously viable\napproaches become much less meaningful, as it is an impossible triangle to\ndistill an expected student from an optimal teacher with small compute\noverhead. Fortunately, the impossible triangle can become possible\nprovided an inducted \\textit{law} of capacity gap. In this paper, we take the\nspirit of scaling laws and reveal that the optimal teacher scale almost\nconsistently follows a linear scaling with the student scale across different\nmodel architectures and data scales. The law later guides us to distil a 3B\nstudent LM (termed \\textsc{MiniMA}) from LLaMA2-7B. \\textsc{MiniMA} is\ndemonstrated to outperform a wide range of 3B competitors and could even\ncompete with several 7B models.\n","authors":["Chen Zhang","Dawei Song","Zheyu Ye","Yan Gao"],"pdf_url":"https://arxiv.org/pdf/2311.07052v3.pdf","comment":"32 pages, 10 figures, 15 tables, work in progress. Code and\n checkpoints are available at https://github.com/GeneZC/MiniMA"},{"id":"http://arxiv.org/abs/2407.17734v1","updated":"2024-07-25T03:12:57Z","published":"2024-07-25T03:12:57Z","title":"Cost-effective Instruction Learning for Pathology Vision and Language\n Analysis","summary":" The advent of vision-language models fosters interactive conversations\nbetween AI-enabled models and humans. 
Yet applying these models into clinics\nmust deal with daunting challenges around large-scale training data, financial,\nand computational resources. Here we propose a cost-effective instruction\nlearning framework for conversational pathology named as CLOVER. CLOVER only\ntrains a lightweight module and uses instruction tuning while freezing the\nparameters of the large language model. Instead of using costly GPT-4, we\npropose well-designed prompts on GPT-3.5 for building generation-based\ninstructions, emphasizing the utility of pathological knowledge derived from\nthe Internet source. To augment the use of instructions, we construct a\nhigh-quality set of template-based instructions in the context of digital\npathology. From two benchmark datasets, our findings reveal the strength of\nhybrid-form instructions in the visual question-answer in pathology. Extensive\nresults show the cost-effectiveness of CLOVER in answering both open-ended and\nclosed-ended questions, where CLOVER outperforms strong baselines that possess\n37 times more training parameters and use instruction data generated from\nGPT-4. Through the instruction tuning, CLOVER exhibits robustness of few-shot\nlearning in the external clinical dataset. These findings demonstrate that\ncost-effective modeling of CLOVER could accelerate the adoption of rapid\nconversational applications in the landscape of digital pathology.\n","authors":["Kaitao Chen","Mianxin Liu","Fang Yan","Lei Ma","Xiaoming Shi","Lilong Wang","Xiaosong Wang","Lifeng Zhu","Zhe Wang","Mu Zhou","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.17734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09530v4","updated":"2024-07-25T03:08:18Z","published":"2023-09-18T07:17:52Z","title":"Adapting Large Language Models to Domains via Reading Comprehension","summary":" We explore how continued pre-training on domain-specific corpora influences\nlarge language models, revealing that training on the raw corpora endows the\nmodel with domain knowledge, but drastically hurts its prompting ability for\nquestion answering. Taken inspiration from human learning via reading\ncomprehension--practice after reading improves the ability to answer questions\nbased on the learned knowledge--we propose a simple method for transforming raw\ncorpora into reading comprehension texts. Each raw text is enriched with a\nseries of tasks related to its content. Our method, highly scalable and\napplicable to any pre-training corpora, consistently enhances performance\nacross various tasks in three different domains: biomedicine, finance, and law.\nNotably, our 7B language model achieves competitive performance with\ndomain-specific models of much larger scales, such as BloombergGPT-50B.\nFurthermore, we demonstrate that domain-specific reading comprehension texts\ncan improve the model's performance even on general benchmarks, showing the\npotential to develop a general model across even more domains. 
Our model, code,\nand data are available at https://github.com/microsoft/LMOps.\n","authors":["Daixuan Cheng","Shaohan Huang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2309.09530v4.pdf","comment":"ICLR 2024 Conference"},{"id":"http://arxiv.org/abs/2407.17730v1","updated":"2024-07-25T03:01:47Z","published":"2024-07-25T03:01:47Z","title":"Are Large Language Models Possible to Conduct Cognitive Behavioral\n Therapy?","summary":" In contemporary society, the issue of psychological health has become\nincreasingly prominent, characterized by the diversification, complexity, and\nuniversality of mental disorders. Cognitive Behavioral Therapy (CBT), currently\nthe most influential and clinically effective psychological treatment method\nwith no side effects, has limited coverage and poor quality in most countries.\nIn recent years, researches on the recognition and intervention of emotional\ndisorders using large language models (LLMs) have been validated, providing new\npossibilities for psychological assistance therapy. However, are LLMs truly\npossible to conduct cognitive behavioral therapy? Many concerns have been\nraised by mental health experts regarding the use of LLMs for therapy. Seeking\nto answer this question, we collected real CBT corpus from online video\nwebsites, designed and conducted a targeted automatic evaluation framework\ninvolving the evaluation of emotion tendency of generated text, structured\ndialogue pattern and proactive inquiry ability. For emotion tendency, we\ncalculate the emotion tendency score of the CBT dialogue text generated by each\nmodel. For structured dialogue pattern, we use a diverse range of automatic\nevaluation metrics to compare speaking style, the ability to maintain\nconsistency of topic and the use of technology in CBT between different models\n. As for inquiring to guide the patient, we utilize PQA (Proactive Questioning\nAbility) metric. We also evaluated the CBT ability of the LLM after integrating\na CBT knowledge base to explore the help of introducing additional knowledge to\nenhance the model's CBT counseling ability. Four LLM variants with excellent\nperformance on natural language processing are evaluated, and the experimental\nresult shows the great potential of LLMs in psychological counseling realm,\nespecially after combining with other technological means.\n","authors":["Hao Shen","Zihan Li","Minqiang Yang","Minghui Ni","Yongfeng Tao","Zhengyang Yu","Weihao Zheng","Chen Xu","Bin Hu"],"pdf_url":"https://arxiv.org/pdf/2407.17730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07386v2","updated":"2024-07-25T02:46:50Z","published":"2024-02-12T03:05:54Z","title":"Chain-of-Layer: Iteratively Prompting Large Language Models for Taxonomy\n Induction from Limited Examples","summary":" Automatic taxonomy induction is crucial for web search, recommendation\nsystems, and question answering. Manual curation of taxonomies is expensive in\nterms of human effort, making automatic taxonomy construction highly desirable.\nIn this work, we introduce Chain-of-Layer which is an in-context learning\nframework designed to induct taxonomies from a given set of entities.\nChain-of-Layer breaks down the task into selecting relevant candidate entities\nin each layer and gradually building the taxonomy from top to bottom. To\nminimize errors, we introduce the Ensemble-based Ranking Filter to reduce the\nhallucinated content generated at each iteration. 
Through extensive\nexperiments, we demonstrate that Chain-of-Layer achieves state-of-the-art\nperformance on four real-world benchmarks.\n","authors":["Qingkai Zeng","Yuyang Bai","Zhaoxuan Tan","Shangbin Feng","Zhenwen Liang","Zhihan Zhang","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.07386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17716v1","updated":"2024-07-25T02:30:40Z","published":"2024-07-25T02:30:40Z","title":"Describe Where You Are: Improving Noise-Robustness for Speech Emotion\n Recognition with Text Description of the Environment","summary":" Speech emotion recognition (SER) systems often struggle in real-world\nenvironments, where ambient noise severely degrades their performance. This\npaper explores a novel approach that exploits prior knowledge of testing\nenvironments to maximize SER performance under noisy conditions. To address\nthis task, we propose a text-guided, environment-aware training where an SER\nmodel is trained with contaminated speech samples and their paired noise\ndescription. We use a pre-trained text encoder to extract the text-based\nenvironment embedding and then fuse it to a transformer-based SER model during\ntraining and inference. We demonstrate the effectiveness of our approach\nthrough our experiment with the MSP-Podcast corpus and real-world additive\nnoise samples collected from the Freesound repository. Our experiment indicates\nthat the text-based environment descriptions processed by a large language\nmodel (LLM) produce representations that improve the noise-robustness of the\nSER system. In addition, our proposed approach with an LLM yields better\nperformance than our environment-agnostic baselines, especially in low\nsignal-to-noise ratio (SNR) conditions. When testing at -5dB SNR level, our\nproposed method shows better performance than our best baseline model by 31.8 %\n(arousal), 23.5% (dominance), and 9.5% (valence).\n","authors":["Seong-Gyun Leem","Daniel Fulford","Jukka-Pekka Onnela","David Gard","Carlos Busso"],"pdf_url":"https://arxiv.org/pdf/2407.17716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01599v2","updated":"2024-07-25T02:25:11Z","published":"2024-06-26T02:20:23Z","title":"JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large\n Language and Vision-Language Models","summary":" The rapid evolution of artificial intelligence (AI) through developments in\nLarge Language Models (LLMs) and Vision-Language Models (VLMs) has brought\nsignificant advancements across various technological domains. While these\nmodels enhance capabilities in natural language processing and visual\ninteractive tasks, their growing adoption raises critical concerns regarding\nsecurity and ethical alignment. This survey provides an extensive review of the\nemerging field of jailbreaking--deliberately circumventing the ethical and\noperational boundaries of LLMs and VLMs--and the consequent development of\ndefense mechanisms. Our study categorizes jailbreaks into seven distinct types\nand elaborates on defense strategies that address these vulnerabilities.\nThrough this comprehensive examination, we identify research gaps and propose\ndirections for future studies to enhance the security frameworks of LLMs and\nVLMs. Our findings underscore the necessity for a unified perspective that\nintegrates both jailbreak strategies and defensive solutions to foster a\nrobust, secure, and reliable environment for the next generation of language\nmodels. 
More details can be found on our website:\n\\url{https://chonghan-chen.com/llm-jailbreak-zoo-survey/}.\n","authors":["Haibo Jin","Leyang Hu","Xinuo Li","Peiyan Zhang","Chonghan Chen","Jun Zhuang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.01599v2.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2407.17695v1","updated":"2024-07-25T01:32:41Z","published":"2024-07-25T01:32:41Z","title":"Enhancing Agent Learning through World Dynamics Modeling","summary":" While large language models (LLMs) have been increasingly deployed across\ntasks in language understanding and interactive decision-making, their\nimpressive performance is largely due to the comprehensive and in-depth domain\nknowledge embedded within them. However, the extent of this knowledge can vary\nacross different domains. Existing methods often assume that LLMs already\npossess such comprehensive and in-depth knowledge of their environment,\noverlooking potential gaps in their understanding of actual world dynamics. To\naddress this gap, we introduce Discover, Verify, and Evolve (DiVE), a framework\nthat discovers world dynamics from a small number of demonstrations, verifies\nthe correctness of these dynamics, and evolves new, advanced dynamics tailored\nto the current situation. Through extensive evaluations, we analyze the impact\nof each component on performance and compare the automatically generated\ndynamics from DiVE with human-annotated world dynamics. Our results demonstrate\nthat LLMs guided by DiVE can make better decisions, achieving rewards\ncomparable to human players in the Crafter environment.\n","authors":["Zhiyuan Sun","Haochen Shi","Marc-Alexandre Côté","Glen Berseth","Xingdi Yuan","Bang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17688v1","updated":"2024-07-25T01:11:38Z","published":"2024-07-25T01:11:38Z","title":"Examining the Influence of Political Bias on Large Language Model\n Performance in Stance Classification","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nexecuting tasks based on natural language queries. However, these models,\ntrained on curated datasets, inherently embody biases ranging from racial to\nnational and gender biases. It remains uncertain whether these biases impact\nthe performance of LLMs for certain tasks. In this study, we investigate the\npolitical biases of LLMs within the stance classification task, specifically\nexamining whether these models exhibit a tendency to more accurately classify\npolitically-charged stances. Utilizing three datasets, seven LLMs, and four\ndistinct prompting schemes, we analyze the performance of LLMs on politically\noriented statements and targets. Our findings reveal a statistically\nsignificant difference in the performance of LLMs across various politically\noriented stance classification tasks. Furthermore, we observe that this\ndifference primarily manifests at the dataset level, with models and prompting\nschemes showing statistically similar performances across different stance\nclassification datasets. 
Lastly, we observe that when there is greater\nambiguity in the target the statement is directed towards, LLMs have poorer\nstance classification accuracy.\n","authors":["Lynnette Hui Xian Ng","Iain Cruickshank","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2407.17688v1.pdf","comment":"Accepted at ICWSM 2025"},{"id":"http://arxiv.org/abs/2302.00509v2","updated":"2024-07-25T01:09:57Z","published":"2023-02-01T15:28:55Z","title":"Exploring Semantic Perturbations on Grover","summary":" With news and information being as easy to access as they currently are, it\nis more important than ever to ensure that people are not misled by what they\nread. Recently, the rise of neural fake news (AI-generated fake news) and its\ndemonstrated effectiveness at fooling humans has prompted the development of\nmodels to detect it. One such model is the Grover model, which can both detect\nneural fake news to prevent it, and generate it to demonstrate how a model\ncould be misused to fool human readers. In this work we explore the Grover\nmodel's fake news detection capabilities by performing targeted attacks through\nperturbations on input news articles. Through this we test Grover's resilience\nto these adversarial attacks and expose some potential vulnerabilities which\nshould be addressed in further iterations to ensure it can detect all types of\nfake news accurately.\n","authors":["Ziqing Ji","Pranav Kulkarni","Marko Neskovic","Kevin Nolan","Yan Xu"],"pdf_url":"https://arxiv.org/pdf/2302.00509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17686v1","updated":"2024-07-25T01:07:09Z","published":"2024-07-25T01:07:09Z","title":"Transformers on Markov Data: Constant Depth Suffices","summary":" Attention-based transformers have been remarkably successful at modeling\ngenerative processes across various domains and modalities. In this paper, we\nstudy the behavior of transformers on data drawn from $k$-th order Markov processes,\nwhere the conditional distribution of the next symbol in a sequence depends on\nthe previous $k$ symbols observed. We observe a surprising phenomenon\nempirically which contradicts previous findings: when trained for sufficiently\nlong, a transformer with a fixed depth and $1$ head per layer is able to\nachieve low test loss on sequences drawn from $k$-th order Markov sources, even as $k$\ngrows. Furthermore, this low test loss is achieved by the transformer's ability\nto represent and learn the in-context conditional empirical distribution. On\nthe theoretical side, our main result is that a transformer with a single head\nand three layers can represent the in-context conditional empirical\ndistribution for $k$-th order Markov sources, concurring with our empirical\nobservations. Along the way, we prove that \\textit{attention-only} transformers\nwith $O(\\log_2(k))$ layers can represent the in-context conditional empirical\ndistribution by composing induction heads to track the previous $k$ symbols in\nthe sequence. 
These results provide more insight into our current understanding\nof the mechanisms by which transformers learn to capture context, by\nunderstanding their behavior on Markov sources.\n","authors":["Nived Rajaraman","Marco Bondaschi","Kannan Ramchandran","Michael Gastpar","Ashok Vardhan Makkuva"],"pdf_url":"https://arxiv.org/pdf/2407.17686v1.pdf","comment":"29 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.17678v1","updated":"2024-07-25T00:27:07Z","published":"2024-07-25T00:27:07Z","title":"Efficient LLM Training and Serving with Heterogeneous Context Sharding\n among Attention Heads","summary":" Existing LLM training and inference frameworks struggle in boosting\nefficiency with sparsity while maintaining the integrity of context and model\narchitecture. Inspired by the sharding concept in databases and the fact that\nattention parallelizes over heads on accelerators, we propose Sparsely-Sharded\n(S2) Attention, an attention algorithm that allocates heterogeneous context\npartitions for different attention heads to divide and conquer. S2-Attention\nenforces each attention head to only attend to a partition of contexts\nfollowing a strided sparsity pattern, while the full context is preserved as\nthe union of all the shards. As attention heads are processed in separate\nthread blocks, the context reduction for each head can thus produce end-to-end\nspeed-up and memory reduction. At inference, LLMs trained with S2-Attention can\nthen take the KV cache reduction as free meals with guaranteed model quality\npreservation. In experiments, we show S2-Attention can provide as much as (1) 25.3X\nwall-clock attention speed-up over FlashAttention-2, resulting in 6X reduction\nin end-to-end training time and 10X inference latency, (2) on-par model\ntraining quality compared to default attention, (3) perfect needle retrieval\naccuracy over 32K context window. On top of the algorithm, we build DKernel, an\nLLM training and inference kernel library that allows users to customize\nsparsity patterns for their own models. We open-sourced DKernel and make it\ncompatible with Megatron, PyTorch, and vLLM.\n","authors":["Xihui Lin","Yunan Zhang","Suyu Ge","Barun Patra","Vishrav Chaudhary","Xia Song"],"pdf_url":"https://arxiv.org/pdf/2407.17678v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2111.06971v2","updated":"2024-07-25T01:19:48Z","published":"2021-11-12T22:31:47Z","title":"Exploiting All Samples in Low-Resource Sentence Classification: Early\n Stopping and Initialization Parameters","summary":" To improve deep-learning performance in low-resource settings, many\nresearchers have redesigned model architectures or applied additional data\n(e.g., external resources, unlabeled samples). However, there have been\nrelatively few discussions on how to make good use of small amounts of labeled\nsamples, although it is potentially beneficial and should be done before\napplying additional data or redesigning models. In this study, we assume a\nlow-resource setting in which only a few labeled samples (i.e., 30-100 per\nclass) are available, and we discuss how to exploit them without additional\ndata or model redesigns. We explore possible approaches in the following three\naspects: training-validation splitting, early stopping, and weight\ninitialization. Extensive experiments are conducted on six public sentence\nclassification datasets. 
Performance on various evaluation metrics (e.g.,\naccuracy, loss, and calibration error) significantly varied depending on the\napproaches that were combined in the three aspects. Based on the results, we\npropose an integrated method, which is to initialize the model with a weight\naveraging method and use a non-validation stop method to train all samples.\nThis simple integrated method consistently outperforms the competitive methods;\ne.g., the average accuracy of six datasets of this method was 1.8% higher than\nthose of conventional validation-based methods. In addition, the integrated\nmethod further improves the performance when adapted to several\nstate-of-the-art models that use additional data or redesign the network\narchitecture (e.g., self-training and enhanced structural models). Our results\nhighlight the importance of the training strategy and suggest that the\nintegrated method can be the first step in the low-resource setting. This study\nprovides empirical knowledge that will be helpful when dealing with\nlow-resource data in future efforts.\n","authors":["Hongseok Choi","Hyunju Lee"],"pdf_url":"https://arxiv.org/pdf/2111.06971v2.pdf","comment":"15 pages, 8 figures, published in IEEE Access"},{"id":"http://arxiv.org/abs/2402.06782v4","updated":"2024-07-25T23:32:21Z","published":"2024-02-09T21:05:01Z","title":"Debating with More Persuasive LLMs Leads to More Truthful Answers","summary":" Common methods for aligning large language models (LLMs) with desired\nbehaviour heavily rely on human-labelled data. However, as models grow\nincreasingly sophisticated, they will surpass human expertise, and the role of\nhuman evaluation will evolve into non-experts overseeing experts. In\nanticipation of this, we ask: can weaker models assess the correctness of\nstronger models? We investigate this question in an analogous setting, where\nstronger models (experts) possess the necessary information to answer questions\nand weaker models (non-experts) lack this information. The method we evaluate\nis debate, where two LLM experts each argue for a different answer, and a\nnon-expert selects the answer. We find that debate consistently helps both\nnon-expert models and humans answer questions, achieving 76% and 88% accuracy\nrespectively (naive baselines obtain 48% and 60%). Furthermore, optimising\nexpert debaters for persuasiveness in an unsupervised manner improves\nnon-expert ability to identify the truth in debates. Our results provide\nencouraging empirical evidence for the viability of aligning models with debate\nin the absence of ground truth.\n","authors":["Akbir Khan","John Hughes","Dan Valentine","Laura Ruis","Kshitij Sachan","Ansh Radhakrishnan","Edward Grefenstette","Samuel R. Bowman","Tim Rocktäschel","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2402.06782v4.pdf","comment":"For code please check: https://github.com/ucl-dark/llm_debate"},{"id":"http://arxiv.org/abs/2402.08983v4","updated":"2024-07-25T22:59:44Z","published":"2024-02-14T06:54:31Z","title":"SafeDecoding: Defending against Jailbreak Attacks via Safety-Aware\n Decoding","summary":" As large language models (LLMs) become increasingly integrated into\nreal-world applications such as code generation and chatbot assistance,\nextensive efforts have been made to align LLM behavior with human values,\nincluding safety. Jailbreak attacks, aiming to provoke unintended and unsafe\nbehaviors from LLMs, remain a significant/leading LLM safety threat. 
In this\npaper, we aim to defend LLMs against jailbreak attacks by introducing\nSafeDecoding, a safety-aware decoding strategy for LLMs to generate helpful and\nharmless responses to user queries. Our insight in developing SafeDecoding is\nbased on the observation that, even though probabilities of tokens representing\nharmful contents outweigh those representing harmless responses, safety\ndisclaimers still appear among the top tokens after sorting tokens by\nprobability in descending order. This allows us to mitigate jailbreak attacks\nby identifying safety disclaimers and amplifying their token probabilities,\nwhile simultaneously attenuating the probabilities of token sequences that are\naligned with the objectives of jailbreak attacks. We perform extensive\nexperiments on five LLMs using six state-of-the-art jailbreak attacks and four\nbenchmark datasets. Our results show that SafeDecoding significantly reduces\nthe attack success rate and harmfulness of jailbreak attacks without\ncompromising the helpfulness of responses to benign user queries. SafeDecoding\noutperforms six defense methods.\n","authors":["Zhangchen Xu","Fengqing Jiang","Luyao Niu","Jinyuan Jia","Bill Yuchen Lin","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2402.08983v4.pdf","comment":"To appear in ACL 2024"},{"id":"http://arxiv.org/abs/2407.12869v2","updated":"2024-07-25T22:51:39Z","published":"2024-07-13T21:09:38Z","title":"Bilingual Adaptation of Monolingual Foundation Models","summary":" We present an efficient method for adapting a monolingual Large Language\nModel (LLM) to another language, addressing challenges of catastrophic\nforgetting and tokenizer limitations. We focus this study on adapting Llama 2\nto Arabic. Our two-stage approach begins with expanding the vocabulary and\ntraining only the embeddings matrix, followed by full model continual\npre-training on a bilingual corpus. By continually pre-training on a mix of\nArabic and English corpora, the model retains its proficiency in English while\nacquiring capabilities in Arabic. Our approach results in significant\nimprovements in Arabic and slight enhancements in English, demonstrating\ncost-effective cross-lingual transfer. We perform ablations on embedding\ninitialization techniques, data mix ratios, and learning rates and release a\ndetailed training recipe. To demonstrate generalizability of this approach we\nalso adapted Llama 3 8B to Arabic and Llama 2 13B to Hindi.\n","authors":["Gurpreet Gosal","Yishi Xu","Gokul Ramakrishnan","Rituraj Joshi","Avraham Sheinin"," Zhiming"," Chen","Biswajit Mishra","Natalia Vassilieva","Joel Hestness","Neha Sengupta","Sunil Kumar Sahu","Bokang Jia","Onkar Pandit","Satheesh Katipomu","Samta Kamboj","Samujjwal Ghosh","Rahul Pal","Parvez Mullah","Soundar Doraiswamy","Mohamed El Karim Chami","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2407.12869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14644v2","updated":"2024-07-25T22:44:54Z","published":"2024-07-19T19:47:26Z","title":"Human-Interpretable Adversarial Prompt Attack on Large Language Models\n with Situational Context","summary":" Previous research on testing the vulnerabilities in Large Language Models\n(LLMs) using adversarial attacks has primarily focused on nonsensical prompt\ninjections, which are easily detected upon manual or automated review (e.g.,\nvia byte entropy). However, the exploration of innocuous human-understandable\nmalicious prompts augmented with adversarial injections remains limited. 
In\nthis research, we explore converting a nonsensical suffix attack into a\nsensible prompt via a situation-driven contextual re-writing. This allows us to\nshow suffix conversion without any gradients, using only LLMs to perform the\nattacks, and thus better understand the scope of possible risks. We combine an\nindependent, meaningful adversarial insertion and situations derived from\nmovies to check if this can trick an LLM. The situations are extracted from the\nIMDB dataset, and prompts are defined following a few-shot chain-of-thought\nprompting. Our approach demonstrates that a successful situation-driven attack\ncan be executed on both open-source and proprietary LLMs. We find that across\nmany LLMs, as few as 1 attempt produces an attack and that these attacks\ntransfer between LLMs.\n","authors":["Nilanjana Das","Edward Raff","Manas Gaur"],"pdf_url":"https://arxiv.org/pdf/2407.14644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18421v1","updated":"2024-07-25T22:42:36Z","published":"2024-07-25T22:42:36Z","title":"Self-Directed Synthetic Dialogues and Revisions Technical Report","summary":" Synthetic data has become an important tool in the fine-tuning of language\nmodels to follow instructions and solve complex problems. Nevertheless, the\nmajority of open data to date is often lacking multi-turn data and collected on\nclosed models, limiting progress on advancing open fine-tuning methods. We\nintroduce Self Directed Synthetic Dialogues (SDSD), an experimental dataset\nconsisting of guided conversations of language models talking to themselves.\nThe dataset consists of multi-turn conversations generated with DBRX, Llama 2\n70B, and Mistral Large, all instructed to follow a conversation plan generated\nprior to the conversation. We also explore including principles from\nConstitutional AI and other related works to create synthetic preference data\nvia revisions to the final conversation turn. We hope this work encourages\nfurther exploration in multi-turn data and the use of open models for expanding\nthe impact of synthetic data.\n","authors":["Nathan Lambert","Hailey Schoelkopf","Aaron Gokaslan","Luca Soldaini","Valentina Pyatkin","Louis Castricato"],"pdf_url":"https://arxiv.org/pdf/2407.18421v1.pdf","comment":"25 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.18418v1","updated":"2024-07-25T22:31:50Z","published":"2024-07-25T22:31:50Z","title":"The Art of Refusal: A Survey of Abstention in Large Language Models","summary":" Abstention, the refusal of large language models (LLMs) to provide an answer,\nis increasingly recognized for its potential to mitigate hallucinations and\nenhance safety in building LLM systems. In this survey, we introduce a\nframework to examine abstention behavior from three perspectives: the query,\nthe model, and human values. We review the literature on abstention methods\n(categorized based on the development stages of LLMs), benchmarks, and\nevaluation metrics, and discuss the merits and limitations of prior work. We\nfurther identify and motivate areas for future research, such as encouraging\nthe study of abstention as a meta-capability across tasks and customizing\nabstention abilities based on context. 
In doing so, we aim to broaden the scope\nand impact of abstention methodologies in AI systems.\n","authors":["Bingbing Wen","Jihan Yao","Shangbin Feng","Chenjun Xu","Yulia Tsvetkov","Bill Howe","Lucy Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18418v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2407.18416v1","updated":"2024-07-25T22:24:45Z","published":"2024-07-25T22:24:45Z","title":"PersonaGym: Evaluating Persona Agents and LLMs","summary":" Persona agents, which are LLM agents that act according to an assigned\npersona, have demonstrated impressive contextual response capabilities across\nvarious applications. These persona agents offer significant enhancements\nacross diverse sectors, such as education, healthcare, and entertainment, where\nmodel developers can align agent responses to different user requirements,\nthereby broadening the scope of agent applications. However, evaluating persona\nagent performance is incredibly challenging due to the complexity of assessing\npersona adherence in free-form interactions across various environments that\nare relevant to each persona agent. We introduce PersonaGym, the first dynamic\nevaluation framework for assessing persona agents, and PersonaScore, the first\nautomated human-aligned metric grounded in decision theory for comprehensive\nlarge-scale evaluation of persona agents. Our evaluation of 6 open and\nclosed-source LLMs, using a benchmark encompassing 200 personas and 10,000\nquestions, reveals significant opportunities for advancement in persona agent\ncapabilities across state-of-the-art models. For example, Claude 3.5 Sonnet\nshows only a 2.97% relative improvement in PersonaScore over GPT 3.5 despite\nbeing a much more advanced model. Importantly, we find that increased model\nsize and complexity do not necessarily imply enhanced persona agent\ncapabilities, thereby highlighting the pressing need for algorithmic and\narchitectural invention towards faithful and performant persona agents.\n","authors":["Vinay Samuel","Henry Peng Zou","Yue Zhou","Shreyas Chaudhari","Ashwin Kalyan","Tanmay Rajpurohit","Ameet Deshpande","Karthik Narasimhan","Vishvak Murahari"],"pdf_url":"https://arxiv.org/pdf/2407.18416v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.01603v2","updated":"2024-07-25T21:23:15Z","published":"2024-06-26T17:33:21Z","title":"A Review of Large Language Models and Autonomous Agents in Chemistry","summary":" Large language models (LLMs) have emerged as powerful tools in chemistry,\nsignificantly impacting molecule design, property prediction, and synthesis\noptimization. This review highlights LLM capabilities in these domains and\ntheir potential to accelerate scientific discovery through automation. We also\nreview LLM-based autonomous agents: LLMs with a broader set of tools to\ninteract with their surrounding environment. These agents perform diverse tasks\nsuch as paper scraping, interfacing with automated laboratories, and synthesis\nplanning. As agents are an emerging topic, we extend the scope of our review of\nagents beyond chemistry and discuss them across scientific domains more broadly. This review\ncovers the recent history, current capabilities, and design of LLMs and\nautonomous agents, addressing specific challenges, opportunities, and future\ndirections in chemistry.
Key challenges include data quality and integration,\nmodel interpretability, and the need for standard benchmarks, while future\ndirections point towards more sophisticated multi-modal agents and enhanced\ncollaboration between agents and experimental methods. Due to the quick pace of\nthis field, a repository has been built to keep track of the latest studies:\nhttps://github.com/ur-whitelab/LLMs-in-science.\n","authors":["Mayk Caldas Ramos","Christopher J. Collison","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2407.01603v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18376v1","updated":"2024-07-25T20:19:29Z","published":"2024-07-25T20:19:29Z","title":"Exploring Bengali Religious Dialect Biases in Large Language Models with\n Evaluation Perspectives","summary":" While Large Language Models (LLMs) have created a massive technological impact\nin the past decade, allowing for human-enabled applications, they can produce\noutput that contains stereotypes and biases, especially when using low-resource\nlanguages. This can be of great ethical concern when dealing with sensitive\ntopics such as religion. As a means toward making LLMs more fair, we explore\nbias from a religious perspective in Bengali, focusing specifically on two main\nreligious dialects: Hindu and Muslim-majority dialects. Here, we perform\ndifferent experiments and audits showing a comparative analysis of different\nsentences using three commonly used LLMs: ChatGPT, Gemini, and Microsoft\nCopilot, pertaining to the Hindu and Muslim dialects of specific words, and\nshowcasing which ones capture social biases and which do not. Furthermore, we\nanalyze our findings and relate them to potential reasons and evaluation\nperspectives, considering their global impact with over 300 million speakers\nworldwide. With this work, we hope to establish the rigor for creating more\nfairness in LLMs, as these are widely used as creative writing agents.\n","authors":["Azmine Toushik Wasi","Raima Islam","Mst Rafia Islam","Taki Hasan Rafi","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2407.18376v1.pdf","comment":"10 Pages, 4 Figures. Accepted to the 1st Human-centered Evaluation\n and Auditing of Language Models Workshop at CHI 2024 (Workshop website:\n https://heal-workshop.github.io/#:~:text=Exploring%20Bengali%20Religious%20Dialect%20Biases%20in%20Large%20Language%20Models%20with%20Evaluation%20Perspectives)"},{"id":"http://arxiv.org/abs/2407.18370v1","updated":"2024-07-25T20:04:59Z","published":"2024-07-25T20:04:59Z","title":"Trust or Escalate: LLM Judges with Provable Guarantees for Human\n Agreement","summary":" We present a principled approach to provide LLM-based evaluation with a\nrigorous guarantee of human agreement. We first propose that a reliable\nevaluation method should not uncritically rely on model preferences for\npairwise evaluation, but rather assess the confidence of judge models and\nselectively decide when to trust their judgement. We then show that under this\nselective evaluation framework, human agreement can be provably guaranteed --\nsuch that the model evaluation aligns with that of humans to a user-specified\nagreement level.
As part of our framework, we also introduce Simulated\nAnnotators, a novel confidence estimation method that significantly improves\njudge calibration and thus enables high coverage of evaluated instances.\nFinally, we propose Cascaded Selective Evaluation, where we use cheaper models\nas initial judges and escalate to stronger models only when necessary -- again,\nwhile still providing a provable guarantee of human agreement. Experimental\nresults show that Cascaded Selective Evaluation guarantees strong alignment\nwith humans, far beyond what LLM judges could achieve without selective\nevaluation. For example, on a subset of Chatbot Arena where GPT-4 almost never\nachieves 80% human agreement, our method, even while employing substantially\ncost-effective models such as Mistral-7B, guarantees over 80% human agreement\nwith almost 80% test coverage.\n","authors":["Jaehun Jung","Faeze Brahman","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2407.18370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18367v1","updated":"2024-07-25T20:03:43Z","published":"2024-07-25T20:03:43Z","title":"Robust Claim Verification Through Fact Detection","summary":" Claim verification can be a challenging task. In this paper, we present a\nmethod to enhance the robustness and reasoning capabilities of automated claim\nverification through the extraction of short facts from evidence. Our novel\napproach, FactDetect, leverages Large Language Models (LLMs) to generate\nconcise factual statements from evidence and label these facts based on their\nsemantic relevance to the claim and evidence. The generated facts are then\ncombined with the claim and evidence. To train a lightweight supervised model,\nwe incorporate a fact-detection task into the claim verification process as a\nmultitasking approach to improve both performance and explainability. We also\nshow that augmenting FactDetect in the claim verification prompt enhances\nperformance in zero-shot claim verification using LLMs. Our method demonstrates\ncompetitive results in the supervised claim verification model by 15% on the F1\nscore when evaluated for challenging scientific claim verification datasets. We\nalso demonstrate that FactDetect can be augmented with claim and evidence for\nzero-shot prompting (AugFactDetect) in LLMs for verdict prediction. We show\nthat AugFactDetect outperforms the baseline with statistical significance on\nthree challenging scientific claim verification datasets with an average of\n17.3% performance gain compared to the best performing baselines.\n","authors":["Nazanin Jafari","James Allan"],"pdf_url":"https://arxiv.org/pdf/2407.18367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01509v3","updated":"2024-07-25T19:50:32Z","published":"2024-07-01T17:53:35Z","title":"MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal\n LLMs","summary":" We introduce MIA-Bench, a new benchmark designed to evaluate multimodal large\nlanguage models (MLLMs) on their ability to strictly adhere to complex\ninstructions. Our benchmark comprises a diverse set of 400 image-prompt pairs,\neach crafted to challenge the models' compliance with layered instructions in\ngenerating accurate responses that satisfy specific requested patterns.\nEvaluation results from a wide array of state-of-the-art MLLMs reveal\nsignificant variations in performance, highlighting areas for improvement in\ninstruction fidelity. 
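The Cascaded Selective Evaluation procedure summarized in the "Trust or Escalate" abstract above boils down to a simple control flow: accept a cheap judge's verdict only when its estimated confidence clears a calibrated threshold, otherwise escalate to a stronger judge, and abstain if no judge is confident enough. A minimal sketch of that flow follows; the judge functions and thresholds are hypothetical stand-ins, not the paper's calibrated values.

    def cascaded_judgement(example, judges, thresholds):
        """Walk through (judge, threshold) pairs from cheapest to strongest judge.

        judges     : callables returning (verdict, confidence) for an example
        thresholds : per-judge confidence required to accept that judge's verdict
        """
        for judge, tau in zip(judges, thresholds):
            verdict, confidence = judge(example)
            if confidence >= tau:        # trust this judge and stop escalating
                return verdict, judge.__name__
        return None, "abstain"           # no judge was confident enough

    # toy judges standing in for a cheap and an expensive evaluator
    def weak_judge(example):   return ("A", 0.55)
    def strong_judge(example): return ("B", 0.92)

    print(cascaded_judgement({"pair": "..."},
                             judges=[weak_judge, strong_judge],
                             thresholds=[0.80, 0.80]))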
Additionally, we create extra training data and explore\nsupervised fine-tuning to enhance the models' ability to strictly follow\ninstructions without compromising performance on other tasks. We hope this\nbenchmark not only serves as a tool for measuring MLLM adherence to\ninstructions, but also guides future developments in MLLM training methods.\n","authors":["Yusu Qian","Hanrong Ye","Jean-Philippe Fauconnier","Peter Grasch","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2407.01509v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04400v2","updated":"2024-07-25T19:32:16Z","published":"2024-03-07T10:54:27Z","title":"Exploring Continual Learning of Compositional Generalization in NLI","summary":" Compositional Natural Language Inference has been explored to assess the true\nabilities of neural models to perform NLI. Yet, current evaluations assume\nmodels to have full access to all primitive inferences in advance, in contrast\nto humans that continuously acquire inference knowledge. In this paper, we\nintroduce the Continual Compositional Generalization in Inference (C2Gen NLI)\nchallenge, where a model continuously acquires knowledge of constituting\nprimitive inference tasks as a basis for compositional inferences. We explore\nhow continual learning affects compositional generalization in NLI, by\ndesigning a continual learning setup for compositional NLI inference tasks. Our\nexperiments demonstrate that models fail to compositionally generalize in a\ncontinual scenario. To address this problem, we first benchmark various\ncontinual learning algorithms and verify their efficacy. We then further\nanalyze C2Gen, focusing on how to order primitives and compositional inference\ntypes and examining correlations between subtasks. Our analyses show that by\nlearning subtasks continuously while observing their dependencies and\nincreasing degrees of difficulty, continual learning can enhance composition\ngeneralization ability.\n","authors":["Xiyan Fu","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2403.04400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10962v4","updated":"2024-07-25T18:58:51Z","published":"2024-02-13T20:10:29Z","title":"Measuring and Controlling Instruction (In)Stability in Language Model\n Dialogs","summary":" System-prompting is a standard tool for customizing language-model chatbots,\nenabling them to follow a specific instruction. An implicit assumption in the\nuse of system prompts is that they will be stable, so the chatbot will continue\nto generate text according to the stipulated instructions for the duration of a\nconversation. We propose a quantitative benchmark to test this assumption,\nevaluating instruction stability via self-chats between two instructed\nchatbots. Testing popular models like LLaMA2-chat-70B and GPT-3.5, we reveal a\nsignificant instruction drift within eight rounds of conversations. 
An\nempirical and theoretical analysis of this phenomenon suggests the transformer\nattention mechanism plays a role, due to attention decay over long exchanges.\nTo combat attention decay and instruction drift, we propose a lightweight\nmethod called split-softmax, which compares favorably against two strong\nbaselines.\n","authors":["Kenneth Li","Tianle Liu","Naomi Bashkansky","David Bau","Fernanda Viégas","Hanspeter Pfister","Martin Wattenberg"],"pdf_url":"https://arxiv.org/pdf/2402.10962v4.pdf","comment":"COLM 2024; Code and data: https://github.com/likenneth/persona_drift"},{"id":"http://arxiv.org/abs/2108.08614v9","updated":"2024-07-25T18:51:23Z","published":"2021-08-19T10:50:52Z","title":"UNIQORN: Unified Question Answering over RDF Knowledge Graphs and\n Natural Language Text","summary":" Question answering over RDF data like knowledge graphs has been greatly\nadvanced, with a number of good systems providing crisp answers for natural\nlanguage questions or telegraphic queries. Some of these systems incorporate\ntextual sources as additional evidence for the answering process, but cannot\ncompute answers that are present in text alone. Conversely, the IR and NLP\ncommunities have addressed QA over text, but such systems barely utilize\nsemantic data and knowledge. This paper presents a method for complex questions\nthat can seamlessly operate over a mixture of RDF datasets and text corpora, or\nindividual sources, in a unified framework. Our method, called UNIQORN, builds\na context graph on-the-fly, by retrieving question-relevant evidences from the\nRDF data and/or a text corpus, using fine-tuned BERT models. The resulting\ngraph typically contains all question-relevant evidences but also a lot of\nnoise. UNIQORN copes with this input by a graph algorithm for Group Steiner\nTrees, that identifies the best answer candidates in the context graph.\nExperimental results on several benchmarks of complex questions with multiple\nentities and relations, show that UNIQORN significantly outperforms\nstate-of-the-art methods for heterogeneous QA -- in a full training mode, as\nwell as in zero-shot settings. The graph-based methodology provides\nuser-interpretable evidence for the complete answering process.\n","authors":["Soumajit Pramanik","Jesujoba Alabi","Rishiraj Saha Roy","Gerhard Weikum"],"pdf_url":"https://arxiv.org/pdf/2108.08614v9.pdf","comment":"27 pages"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.18251v1","updated":"2024-07-25T17:59:48Z","published":"2024-07-25T17:59:48Z","title":"Sparse vs Contiguous Adversarial Pixel Perturbations in Multimodal\n Models: An Empirical Analysis","summary":" Assessing the robustness of multimodal models against adversarial examples is\nan important aspect for the safety of its users. We craft L0-norm perturbation\nattacks on the preprocessed input images. We launch them in a black-box setup\nagainst four multimodal models and two unimodal DNNs, considering both targeted\nand untargeted misclassification. Our attacks target less than 0.04% of\nperturbed image area and integrate different spatial positioning of perturbed\npixels: sparse positioning and pixels arranged in different contiguous shapes\n(row, column, diagonal, and patch). To the best of our knowledge, we are the\nfirst to assess the robustness of three state-of-the-art multimodal models\n(ALIGN, AltCLIP, GroupViT) against different sparse and contiguous pixel\ndistribution perturbations. 
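The L0-norm attack study summarized above perturbs under 0.04% of the image area, placing the perturbed pixels either sparsely or in contiguous shapes (row, column, diagonal, patch). The sketch below illustrates how such pixel-selection masks could be built; the budget mirrors the abstract, but the function name and shape handling are illustrative assumptions rather than the authors' code.

    import numpy as np

    def perturbation_mask(h, w, budget=0.0004, shape="sparse", seed=0):
        """Boolean mask selecting roughly budget * h * w pixels to perturb."""
        rng = np.random.default_rng(seed)
        k = max(1, int(h * w * budget))
        mask = np.zeros((h, w), dtype=bool)
        if shape == "sparse":                    # scattered individual pixels
            mask.flat[rng.choice(h * w, size=k, replace=False)] = True
        elif shape == "row":                     # contiguous horizontal run
            mask[rng.integers(h), :k] = True
        elif shape == "column":                  # contiguous vertical run
            mask[:k, rng.integers(w)] = True
        elif shape == "diagonal":                # contiguous diagonal run
            start = rng.integers(min(h, w) - k)
            for i in range(k):
                mask[start + i, start + i] = True
        elif shape == "patch":                   # roughly square block
            side = int(np.sqrt(k))
            r, c = rng.integers(h - side), rng.integers(w - side)
            mask[r:r + side, c:c + side] = True
        return mask

    print(perturbation_mask(224, 224, shape="patch").sum(), "pixels selected")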
The obtained results indicate that unimodal DNNs\nare more robust than multimodal models. Furthermore, models using CNN-based\nImage Encoder are more vulnerable than models with ViT - for untargeted\nattacks, we obtain a 99% success rate by perturbing less than 0.02% of the\nimage area.\n","authors":["Cristian-Alexandru Botocan","Raphael Meier","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2407.18251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18249v1","updated":"2024-07-25T17:59:31Z","published":"2024-07-25T17:59:31Z","title":"Trajectory-aligned Space-time Tokens for Few-shot Action Recognition","summary":" We propose a simple yet effective approach for few-shot action recognition,\nemphasizing the disentanglement of motion and appearance representations. By\nharnessing recent progress in tracking, specifically point trajectories and\nself-supervised representation learning, we build trajectory-aligned tokens\n(TATs) that capture motion and appearance information. This approach\nsignificantly reduces the data requirements while retaining essential\ninformation. To process these representations, we use a Masked Space-time\nTransformer that effectively learns to aggregate information to facilitate\nfew-shot action recognition. We demonstrate state-of-the-art results on\nfew-shot action recognition across multiple datasets. Our project page is\navailable at https://www.cs.umd.edu/~pulkit/tats\n","authors":["Pulkit Kumar","Namitha Padmanabhan","Luke Luo","Sai Saketh Rambhatla","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2407.18249v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18247v1","updated":"2024-07-25T17:59:13Z","published":"2024-07-25T17:59:13Z","title":"RegionDrag: Fast Region-Based Image Editing with Diffusion Models","summary":" Point-drag-based image editing methods, like DragDiffusion, have attracted\nsignificant attention. However, point-drag-based approaches suffer from\ncomputational overhead and misinterpretation of user intentions due to the\nsparsity of point-based editing instructions. In this paper, we propose a\nregion-based copy-and-paste dragging method, RegionDrag, to overcome these\nlimitations. RegionDrag allows users to express their editing instructions in\nthe form of handle and target regions, enabling more precise control and\nalleviating ambiguity. In addition, region-based operations complete editing in\none iteration and are much faster than point-drag-based methods. We also\nincorporate the attention-swapping technique for enhanced stability during\nediting. To validate our approach, we extend existing point-drag-based datasets\nwith region-based dragging instructions. Experimental results demonstrate that\nRegionDrag outperforms existing point-drag-based approaches in terms of speed,\naccuracy, and alignment with user intentions. Remarkably, RegionDrag completes\nthe edit on an image with a resolution of 512x512 in less than 2 seconds, which\nis more than 100x faster than DragDiffusion, while achieving better\nperformance. 
Project page: https://visual-ai.github.io/regiondrag.\n","authors":["Jingyi Lu","Xinghui Li","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2407.18247v1.pdf","comment":"ECCV 2024, Project page: https://visual-ai.github.io/regiondrag"},{"id":"http://arxiv.org/abs/2407.18245v1","updated":"2024-07-25T17:58:17Z","published":"2024-07-25T17:58:17Z","title":"VGGHeads: A Large-Scale Synthetic Dataset for 3D Human Heads","summary":" Human head detection, keypoint estimation, and 3D head model fitting are\nimportant tasks with many applications. However, traditional real-world\ndatasets often suffer from bias, privacy, and ethical concerns, and they have\nbeen recorded in laboratory environments, which makes it difficult for trained\nmodels to generalize. Here, we introduce VGGHeads -- a large-scale synthetic\ndataset generated with diffusion models for human head detection and 3D mesh\nestimation. Our dataset comprises over 1 million high-resolution images, each\nannotated with detailed 3D head meshes, facial landmarks, and bounding boxes.\nUsing this dataset, we introduce a new model architecture capable of\nsimultaneous head detection and head mesh reconstruction from a single image\nin a single step. Through extensive experimental evaluations, we demonstrate\nthat models trained on our synthetic data achieve strong performance on real\nimages. Furthermore, the versatility of our dataset makes it applicable across\na broad spectrum of tasks, offering a general and comprehensive representation\nof human heads. Additionally, we provide detailed information about the\nsynthetic data generation pipeline, enabling it to be re-used for other tasks\nand domains.\n","authors":["Orest Kupyn","Eugene Khvedchenia","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2407.18245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18244v1","updated":"2024-07-25T17:58:03Z","published":"2024-07-25T17:58:03Z","title":"RefMask3D: Language-Guided Transformer for 3D Referring Segmentation","summary":" 3D referring segmentation is an emerging and challenging vision-language task\nthat aims to segment the object described by a natural language expression in a\npoint cloud scene. The key challenge behind this task is vision-language\nfeature fusion and alignment. In this work, we propose RefMask3D to explore the\ncomprehensive multi-modal feature interaction and understanding. First, we\npropose a Geometry-Enhanced Group-Word Attention to integrate language with\ngeometrically coherent sub-clouds through cross-modal group-word attention,\nwhich effectively addresses the challenges posed by the sparse and irregular\nnature of point clouds. Then, we introduce a Linguistic Primitives Construction\nto produce semantic primitives representing distinct semantic attributes, which\ngreatly enhance the vision-language understanding at the decoding stage.\nFurthermore, we introduce an Object Cluster Module that analyzes the\ninterrelationships among linguistic primitives to consolidate their insights\nand pinpoint common characteristics, helping to capture holistic information\nand enhance the precision of target identification. The proposed RefMask3D\nachieves new state-of-the-art performance on 3D referring segmentation, 3D\nvisual grounding, and also 2D referring image segmentation. Especially,\nRefMask3D outperforms the previous state-of-the-art method by a large margin of\n3.16% mIoU on the challenging ScanRefer dataset.
Code is available at\nhttps://github.com/heshuting555/RefMask3D.\n","authors":["Shuting He","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2407.18244v1.pdf","comment":"ACM MM 2024, Code: https://github.com/heshuting555/RefMask3D"},{"id":"http://arxiv.org/abs/2407.18243v1","updated":"2024-07-25T17:57:48Z","published":"2024-07-25T17:57:48Z","title":"BIV-Priv-Seg: Locating Private Content in Images Taken by People With\n Visual Impairments","summary":" Individuals who are blind or have low vision (BLV) are at a heightened risk\nof sharing private information if they share photographs they have taken. To\nfacilitate developing technologies that can help preserve privacy, we introduce\nBIV-Priv-Seg, the first localization dataset originating from people with\nvisual impairments that shows private content. It contains 1,028 images with\nsegmentation annotations for 16 private object categories. We first\ncharacterize BIV-Priv-Seg and then evaluate modern models' performance for\nlocating private content in the dataset. We find modern models struggle most\nwith locating private objects that are not salient, are small, and lack text, as\nwell as with recognizing when private content is absent from an image. We facilitate\nfuture extensions by sharing our new dataset with the evaluation server at\nhttps://vizwiz.org/tasks-and-datasets/object-localization.\n","authors":["Yu-Yun Tseng","Tanusree Sharma","Lotus Zhang","Abigale Stangl","Leah Findlater","Yang Wang","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2407.18243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18240v1","updated":"2024-07-25T17:54:58Z","published":"2024-07-25T17:54:58Z","title":"CodedVO: Coded Visual Odometry","summary":" Autonomous robots often rely on monocular cameras for odometry estimation and\nnavigation. However, the scale ambiguity problem presents a critical barrier to\neffective monocular visual odometry. In this paper, we present CodedVO, a novel\nmonocular visual odometry method that overcomes the scale ambiguity problem by\nemploying custom optics to physically encode metric depth information into\nimagery. By incorporating this information into our odometry pipeline, we\nachieve state-of-the-art performance in monocular visual odometry with a known\nscale. We evaluate our method in diverse indoor environments and demonstrate\nits robustness and adaptability. We achieve a 0.08m average trajectory error in\nodometry evaluation on the ICL-NUIM indoor odometry dataset.\n","authors":["Sachin Shah","Naitri Rajyaguru","Chahat Deep Singh","Christopher Metzler","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2407.18240v1.pdf","comment":"7 pages, 4 figures, IEEE ROBOTICS AND AUTOMATION LETTERS"},{"id":"http://arxiv.org/abs/2407.18232v1","updated":"2024-07-25T17:50:32Z","published":"2024-07-25T17:50:32Z","title":"LION: Linear Group RNN for 3D Object Detection in Point Clouds","summary":" The benefit of transformers in large-scale 3D point cloud perception tasks,\nsuch as 3D object detection, is limited by their quadratic computation cost\nwhen modeling long-range relationships. In contrast, linear RNNs have low\ncomputational complexity and are suitable for long-range modeling. Toward this\ngoal, we propose a simple and effective window-based framework built on LInear\ngrOup RNN (i.e., perform linear RNN for grouped features) for accurate 3D\nobject detection, called LION.
The key property is to allow sufficient feature\ninteraction in a much larger group than transformer-based methods. However,\neffectively applying linear group RNN to 3D object detection in highly sparse\npoint clouds is not trivial due to its limitation in handling spatial modeling.\nTo tackle this problem, we simply introduce a 3D spatial feature descriptor and\nintegrate it into the linear group RNN operators to enhance their spatial\nfeatures rather than blindly increasing the number of scanning orders for voxel\nfeatures. To further address the challenge in highly sparse point clouds, we\npropose a 3D voxel generation strategy to densify foreground features, taking\nadvantage of the natural auto-regressive property of linear group RNNs. Extensive\nexperiments verify the effectiveness of the proposed components and the\ngeneralization of our LION on different linear group RNN operators including\nMamba, RWKV, and RetNet. Furthermore, it is worth mentioning that our\nLION-Mamba achieves state-of-the-art on the Waymo, nuScenes, Argoverse V2, and ONCE\ndatasets. Last but not least, our method supports various advanced linear RNN\noperators (e.g., RetNet, RWKV, Mamba, xLSTM, and TTT) on the small but popular KITTI\ndataset for a quick experience with our linear RNN-based framework.\n","authors":["Zhe Liu","Jinghua Hou","Xinyu Wang","Xiaoqing Ye","Jingdong Wang","Hengshuang Zhao","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2407.18232v1.pdf","comment":"Project page: https://happinesslz.github.io/projects/LION/"},{"id":"http://arxiv.org/abs/2407.13759v2","updated":"2024-07-25T17:47:20Z","published":"2024-07-18T17:56:30Z","title":"Streetscapes: Large-scale Consistent Street View Generation Using\n Autoregressive Video Diffusion","summary":" We present a method for generating Streetscapes: long sequences of views\nthrough an on-the-fly synthesized city-scale scene. Our generation is\nconditioned by language input (e.g., city name, weather), as well as an\nunderlying map/layout hosting the desired trajectory. Compared to recent models\nfor video generation or 3D view synthesis, our method can scale to much\nlonger-range camera trajectories, spanning several city blocks, while\nmaintaining visual quality and consistency. To achieve this goal, we build on\nrecent work on video diffusion, used within an autoregressive framework that\ncan easily scale to long sequences. In particular, we introduce a new temporal\nimputation method that prevents our autoregressive approach from drifting from\nthe distribution of realistic city imagery. We train our Streetscapes system on\na compelling source of data: posed imagery from Google Street View, along with\ncontextual map data, which allows users to generate city views conditioned on\nany desired city layout, with controllable camera poses.
Please see more\nresults at our project page at https://boyangdeng.com/streetscapes.\n","authors":["Boyang Deng","Richard Tucker","Zhengqi Li","Leonidas Guibas","Noah Snavely","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2407.13759v2.pdf","comment":"*Equal Contributions; Fixed few duplicated references from 1st\n upload; Project Page: https://boyangdeng.com/streetscapes"},{"id":"http://arxiv.org/abs/2211.10526v5","updated":"2024-07-25T17:29:22Z","published":"2022-11-18T22:49:04Z","title":"Castling-ViT: Compressing Self-Attention via Switching Towards\n Linear-Angular Attention at Vision Transformer Inference","summary":" Vision Transformers (ViTs) have shown impressive performance but still\nrequire a high computation cost as compared to convolutional neural networks\n(CNNs), one reason is that ViTs' attention measures global similarities and\nthus has a quadratic complexity with the number of input tokens. Existing\nefficient ViTs adopt local attention (e.g., Swin) or linear attention (e.g.,\nPerformer), which sacrifice ViTs' capabilities of capturing either global or\nlocal context. In this work, we ask an important research question: Can ViTs\nlearn both global and local context while being more efficient during\ninference? To this end, we propose a framework called Castling-ViT, which\ntrains ViTs using both linear-angular attention and masked softmax-based\nquadratic attention, but then switches to having only linear angular attention\nduring ViT inference. Our Castling-ViT leverages angular kernels to measure the\nsimilarities between queries and keys via spectral angles. And we further\nsimplify it with two techniques: (1) a novel linear-angular attention\nmechanism: we decompose the angular kernels into linear terms and high-order\nresiduals, and only keep the linear terms; and (2) we adopt two parameterized\nmodules to approximate high-order residuals: a depthwise convolution and an\nauxiliary masked softmax attention to help learn both global and local\ninformation, where the masks for softmax attention are regularized to gradually\nbecome zeros and thus incur no overhead during ViT inference. Extensive\nexperiments and ablation studies on three tasks consistently validate the\neffectiveness of the proposed Castling-ViT, e.g., achieving up to a 1.8% higher\naccuracy or 40% MACs reduction on ImageNet classification and 1.2 higher mAP on\nCOCO detection under comparable FLOPs, as compared to ViTs with vanilla\nsoftmax-based attentions.\n","authors":["Haoran You","Yunyang Xiong","Xiaoliang Dai","Bichen Wu","Peizhao Zhang","Haoqi Fan","Peter Vajda","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2211.10526v5.pdf","comment":"CVPR 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2407.18207v1","updated":"2024-07-25T17:17:10Z","published":"2024-07-25T17:17:10Z","title":"Geometry Fidelity for Spherical Images","summary":" Spherical or omni-directional images offer an immersive visual format\nappealing to a wide range of computer vision applications. However, geometric\nproperties of spherical images pose a major challenge for models and metrics\ndesigned for ordinary 2D images. Here, we show that direct application of\nFr\\'echet Inception Distance (FID) is insufficient for quantifying geometric\nfidelity in spherical images. We introduce two quantitative metrics accounting\nfor geometric constraints, namely Omnidirectional FID (OmniFID) and\nDiscontinuity Score (DS). 
OmniFID is an extension of FID tailored to\nadditionally capture field-of-view requirements of the spherical format by\nleveraging cubemap projections. DS is a kernel-based seam alignment score of\ncontinuity across borders of 2D representations of spherical images. In\nexperiments, OmniFID and DS quantify geometry fidelity issues that are\nundetected by FID.\n","authors":["Anders Christensen","Nooshin Mojab","Khushman Patel","Karan Ahuja","Zeynep Akata","Ole Winther","Mar Gonzalez-Franco","Andrea Colaco"],"pdf_url":"https://arxiv.org/pdf/2407.18207v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2404.03613v4","updated":"2024-07-25T17:15:58Z","published":"2024-04-04T17:34:41Z","title":"Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian\n Splatting","summary":" As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view\nsynthesis, it is a natural extension to deform a canonical 3DGS to multiple\nframes for representing a dynamic scene. However, previous works fail to\naccurately reconstruct complex dynamic scenes. We attribute the failure to the\ndesign of the deformation field, which is built as a coordinate-based function.\nThis approach is problematic because 3DGS is a mixture of multiple fields\ncentered at the Gaussians, not just a single coordinate-based framework. To\nresolve this problem, we define the deformation as a function of per-Gaussian\nembeddings and temporal embeddings. Moreover, we decompose deformations as\ncoarse and fine deformations to model slow and fast movements, respectively.\nAlso, we introduce a local smoothness regularization for per-Gaussian embedding\nto improve the details in dynamic regions. Project page:\nhttps://jeongminb.github.io/e-d3dgs/\n","authors":["Jeongmin Bae","Seoha Kim","Youngsik Yun","Hahyun Lee","Gun Bang","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2404.03613v4.pdf","comment":"ECCV 2024. Project page: https://jeongminb.github.io/e-d3dgs/"},{"id":"http://arxiv.org/abs/2407.18178v1","updated":"2024-07-25T16:37:07Z","published":"2024-07-25T16:37:07Z","title":"PianoMime: Learning a Generalist, Dexterous Piano Player from Internet\n Demonstrations","summary":" In this work, we introduce PianoMime, a framework for training a\npiano-playing agent using internet demonstrations. The internet is a promising\nsource of large-scale demonstrations for training our robot agents. In\nparticular, for the case of piano-playing, Youtube is full of videos of\nprofessional pianists playing a wide myriad of songs. In our work, we leverage\nthese demonstrations to learn a generalist piano-playing agent capable of\nplaying any arbitrary song. Our framework is divided into three parts: a data\npreparation phase to extract the informative features from the Youtube videos,\na policy learning phase to train song-specific expert policies from the\ndemonstrations and a policy distillation phase to distil the policies into a\nsingle generalist agent. We explore different policy designs to represent the\nagent and evaluate the influence of the amount of training data on the\ngeneralization capability of the agent to novel songs not available in the\ndataset. 
We show that we are able to learn a policy with up to 56\\% F1 score on\nunseen songs.\n","authors":["Cheng Qian","Julen Urain","Kevin Zakka","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2407.18178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18175v1","updated":"2024-07-25T16:35:46Z","published":"2024-07-25T16:35:46Z","title":"Quasar-ViT: Hardware-Oriented Quantization-Aware Architecture Search for\n Vision Transformers","summary":" Vision transformers (ViTs) have demonstrated their superior accuracy for\ncomputer vision tasks compared to convolutional neural networks (CNNs).\nHowever, ViT models are often computation-intensive for efficient deployment on\nresource-limited edge devices. This work proposes Quasar-ViT, a\nhardware-oriented quantization-aware architecture search framework for ViTs, to\ndesign efficient ViT models for hardware implementation while preserving the\naccuracy. First, Quasar-ViT trains a supernet using our row-wise flexible\nmixed-precision quantization scheme, mixed-precision weight entanglement, and\nsupernet layer scaling techniques. Then, it applies an efficient\nhardware-oriented search algorithm, integrated with hardware latency and\nresource modeling, to determine a series of optimal subnets from supernet under\ndifferent inference latency targets. Finally, we propose a series of\nmodel-adaptive designs on the FPGA platform to support the architecture search\nand mitigate the gap between the theoretical computation reduction and the\npractical inference speedup. Our searched models achieve 101.5, 159.6, and\n251.6 frames-per-second (FPS) inference speed on the AMD/Xilinx ZCU102 FPGA\nwith 80.4%, 78.6%, and 74.9% top-1 accuracy, respectively, for the ImageNet\ndataset, consistently outperforming prior works.\n","authors":["Zhengang Li","Alec Lu","Yanyue Xie","Zhenglun Kong","Mengshu Sun","Hao Tang","Zhong Jia Xue","Peiyan Dong","Caiwen Ding","Yanzhi Wang","Xue Lin","Zhenman Fang"],"pdf_url":"https://arxiv.org/pdf/2407.18175v1.pdf","comment":"Accepted by ICS 2024"},{"id":"http://arxiv.org/abs/2407.18145v1","updated":"2024-07-25T15:49:26Z","published":"2024-07-25T15:49:26Z","title":"Taxonomy-Aware Continual Semantic Segmentation in Hyperbolic Spaces for\n Open-World Perception","summary":" Semantic segmentation models are typically trained on a fixed set of classes,\nlimiting their applicability in open-world scenarios. Class-incremental\nsemantic segmentation aims to update models with emerging new classes while\npreventing catastrophic forgetting of previously learned ones. However,\nexisting methods impose strict rigidity on old classes, reducing their\neffectiveness in learning new incremental classes. In this work, we propose\nTaxonomy-Oriented Poincar\\'e-regularized Incremental-Class Segmentation\n(TOPICS) that learns feature embeddings in hyperbolic space following explicit\ntaxonomy-tree structures. This supervision provides plasticity for old classes,\nupdating ancestors based on new classes while integrating new classes at\nfitting positions. Additionally, we maintain implicit class relational\nconstraints on the geometric basis of the Poincar\\'e ball. This ensures that\nthe latent space can continuously adapt to new constraints while maintaining a\nrobust structure to combat catastrophic forgetting. 
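TOPICS, summarized above, places class embeddings in hyperbolic space on the Poincaré ball so that taxonomy-tree structure can be encoded with little distortion. For readers unfamiliar with that geometry, the snippet below computes the standard Poincaré-ball distance that such regularizers typically operate on; it is textbook geometry, not code from the paper.

    import numpy as np

    def poincare_distance(u, v, eps=1e-9):
        """Geodesic distance between two points strictly inside the unit ball."""
        sq_diff = np.sum((u - v) ** 2)
        denom = (1.0 - np.sum(u ** 2)) * (1.0 - np.sum(v ** 2))
        return np.arccosh(1.0 + 2.0 * sq_diff / max(denom, eps))

    # points near the boundary are far apart even when Euclidean-close,
    # which is what makes the ball convenient for embedding deep taxonomies
    print(poincare_distance(np.array([0.0, 0.10]), np.array([0.0, 0.20])))
    print(poincare_distance(np.array([0.0, 0.95]), np.array([0.0, 0.98])))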
We also establish eight\nrealistic incremental learning protocols for autonomous driving scenarios,\nwhere novel classes can originate from known classes or the background.\nExtensive evaluations of TOPICS on the Cityscapes and Mapillary Vistas 2.0\nbenchmarks demonstrate that it achieves state-of-the-art performance. We make\nthe code and trained models publicly available at\nhttp://topics.cs.uni-freiburg.de.\n","authors":["Julia Hindel","Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2407.18145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08651v3","updated":"2024-07-25T15:46:50Z","published":"2024-03-13T16:06:07Z","title":"HAIFIT: Human-Centered AI for Fashion Image Translation","summary":" In the realm of fashion design, sketches serve as the canvas for expressing\nan artist's distinctive drawing style and creative vision, capturing intricate\ndetails like stroke variations and texture nuances. The advent of\nsketch-to-image cross-modal translation technology has notably aided designers.\nHowever, existing methods often compromise these sketch details during image\ngeneration, resulting in images that deviate from the designer's intended\nconcept. This limitation hampers the ability to offer designers a precise\npreview of the final output. To overcome this challenge, we introduce HAIFIT, a\nnovel approach that transforms sketches into high-fidelity, lifelike clothing\nimages by integrating multi-scale features and capturing extensive feature map\ndependencies from diverse perspectives. Through extensive qualitative and\nquantitative evaluations conducted on our self-collected dataset, our method\ndemonstrates superior performance compared to existing methods in generating\nphotorealistic clothing images. Our method excels in preserving the distinctive\nstyle and intricate details essential for fashion design applications. In\naddition, our method also has obvious advantages in model training and\ninference speed, contributing to reducing designers' time costs and improving\ndesign efficiency.\n","authors":["Jianan Jiang","Xinglin Li","Weiren Yu","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2403.08651v3.pdf","comment":"10 pages,7 figures"},{"id":"http://arxiv.org/abs/2404.03632v2","updated":"2024-07-25T15:45:58Z","published":"2024-04-04T17:53:33Z","title":"Reference-Based 3D-Aware Image Editing with Triplanes","summary":" Generative Adversarial Networks (GANs) have emerged as powerful tools for\nhigh-quality image generation and real image editing by manipulating their\nlatent spaces. Recent advancements in GANs include 3D-aware models such as\nEG3D, which feature efficient triplane-based architectures capable of\nreconstructing 3D geometry from single images. However, limited attention has\nbeen given to providing an integrated framework for 3D-aware, high-quality,\nreference-based image editing. This study addresses this gap by exploring and\ndemonstrating the effectiveness of the triplane space for advanced\nreference-based edits. Our novel approach integrates encoding, automatic\nlocalization, spatial disentanglement of triplane features, and fusion learning\nto achieve the desired edits. Additionally, our framework demonstrates\nversatility and robustness across various domains, extending its effectiveness\nto animal face edits, partially stylized edits like cartoon faces, full-body\nclothing edits, and 360-degree head edits. 
Our method shows state-of-the-art\nperformance over relevant latent direction, text, and image-guided 2D and\n3D-aware diffusion and GAN methods, both qualitatively and quantitatively.\n","authors":["Bahri Batuhan Bilecen","Yigit Yalin","Ning Yu","Aysegul Dundar"],"pdf_url":"https://arxiv.org/pdf/2404.03632v2.pdf","comment":"20 pages, including supplementary material"},{"id":"http://arxiv.org/abs/2407.18137v1","updated":"2024-07-25T15:42:46Z","published":"2024-07-25T15:42:46Z","title":"XS-VID: An Extremely Small Video Object Detection Dataset","summary":" Small Video Object Detection (SVOD) is a crucial subfield in modern computer\nvision, essential for early object discovery and detection. However, existing\nSVOD datasets are scarce and suffer from issues such as insufficiently small\nobjects, limited object categories, and lack of scene diversity, leading to\nunitary application scenarios for corresponding methods. To address this gap,\nwe develop the XS-VID dataset, which comprises aerial data from various periods\nand scenes, and annotates eight major object categories. To further evaluate\nexisting methods for detecting extremely small objects, XS-VID extensively\ncollects three types of objects with smaller pixel areas: extremely small\n(\\textit{es}, $0\\sim12^2$), relatively small (\\textit{rs}, $12^2\\sim20^2$), and\ngenerally small (\\textit{gs}, $20^2\\sim32^2$). XS-VID offers unprecedented\nbreadth and depth in covering and quantifying minuscule objects, significantly\nenriching the scene and object diversity in the dataset. Extensive validations\non XS-VID and the publicly available VisDrone2019VID dataset show that existing\nmethods struggle with small object detection and significantly underperform\ncompared to general object detectors. Leveraging the strengths of previous\nmethods and addressing their weaknesses, we propose YOLOFT, which enhances\nlocal feature associations and integrates temporal motion features,\nsignificantly improving the accuracy and stability of SVOD. Our datasets and\nbenchmarks are available at \\url{https://gjhhust.github.io/XS-VID/}.\n","authors":["Jiahao Guo","Ziyang Xu","Lianjun Wu","Fei Gao","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18134v1","updated":"2024-07-25T15:38:16Z","published":"2024-07-25T15:38:16Z","title":"$\\mathbb{X}$-Sample Contrastive Loss: Improving Contrastive Learning\n with Sample Similarity Graphs","summary":" Learning good representations involves capturing the diverse ways in which\ndata samples relate. Contrastive loss - an objective matching related samples -\nunderlies methods from self-supervised to multimodal learning. Contrastive\nlosses, however, can be viewed more broadly as modifying a similarity graph to\nindicate how samples should relate in the embedding space. This view reveals a\nshortcoming in contrastive learning: the similarity graph is binary, as only\none sample is the related positive sample. Crucially, similarities\n\\textit{across} samples are ignored. Based on this observation, we revise the\nstandard contrastive loss to explicitly encode how a sample relates to others.\nWe experiment with this new objective, called $\\mathbb{X}$-Sample Contrastive,\nto train vision models based on similarities in class or text caption\ndescriptions. Our study spans three scales: ImageNet-1k with 1 million, CC3M\nwith 3 million, and CC12M with 12 million samples. 
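The X-Sample Contrastive objective described above replaces the binary, one-positive target of standard contrastive learning with a graph of cross-sample similarities. A minimal PyTorch sketch of that idea follows, treating the loss as cross-entropy against a row-normalized similarity graph; the temperature and the toy graph are illustrative assumptions, not the paper's exact recipe.

    import torch
    import torch.nn.functional as F

    def x_sample_contrastive(embeddings, similarity_graph, temperature=0.1):
        """Cross-entropy between predicted affinities and a soft target graph.

        embeddings       : (N, D) L2-normalised features for a batch
        similarity_graph : (N, N) non-negative relatedness scores between samples
        """
        n = embeddings.size(0)
        eye = torch.eye(n, dtype=torch.bool)
        logits = embeddings @ embeddings.t() / temperature
        logits = logits.masked_fill(eye, -1e9)                 # exclude self-pairs
        targets = similarity_graph.masked_fill(eye, 0.0)
        targets = targets / targets.sum(dim=1, keepdim=True)   # soft target rows
        return -(targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()

    # toy batch: samples 0/1 share a caption, samples 2/3 share another
    emb = F.normalize(torch.randn(4, 16), dim=1)
    graph = torch.tensor([[1., 1., 0., 0.],
                          [1., 1., 0., 0.],
                          [0., 0., 1., 1.],
                          [0., 0., 1., 1.]])
    print(x_sample_contrastive(emb, graph))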
The representations learned\nvia our objective outperform both contrastive self-supervised and\nvision-language models trained on the same data across a range of tasks. When\ntraining on CC12M, we outperform CLIP by $0.6\\%$ on both ImageNet and ImageNet\nReal. Our objective appears to work particularly well in lower-data regimes,\nwith gains over CLIP of $16.8\\%$ on ImageNet and $18.1\\%$ on ImageNet Real when\ntraining with CC3M. Finally, our objective seems to encourage the model to\nlearn representations that separate objects from their attributes and\nbackgrounds, with gains of $3.3$-$5.6$\\% over CLIP on ImageNet9. We hope the\nproposed solution takes a small step towards developing richer learning\nobjectives for understanding sample relations in foundation models.\n","authors":["Vlad Sobal","Mark Ibrahim","Randall Balestriero","Vivien Cabannes","Diane Bouchacourt","Pietro Astolfi","Kyunghyun Cho","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2407.18134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18128v1","updated":"2024-07-25T15:35:44Z","published":"2024-07-25T15:35:44Z","title":"Estimating Earthquake Magnitude in Sentinel-1 Imagery via Ranking","summary":" Earthquakes are commonly estimated using physical seismic stations, however,\ndue to the installation requirements and costs of these stations, global\ncoverage quickly becomes impractical. An efficient and lower-cost alternative\nis to develop machine learning models to globally monitor earth observation\ndata to pinpoint regions impacted by these natural disasters. However, due to\nthe small amount of historically recorded earthquakes, this becomes a low-data\nregime problem requiring algorithmic improvements to achieve peak performance\nwhen learning to regress earthquake magnitude. In this paper, we propose to\npose the estimation of earthquake magnitudes as a metric-learning problem,\ntraining models to not only estimate earthquake magnitude from Sentinel-1\nsatellite imagery but to additionally rank pairwise samples. Our experiments\nshow at max a 30%+ improvement in MAE over prior regression-only based methods,\nparticularly transformer-based architectures.\n","authors":["Daniele Rege Cambrin","Isaac Corley","Paolo Garza","Peyman Najafirad"],"pdf_url":"https://arxiv.org/pdf/2407.18128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17449v2","updated":"2024-07-25T15:33:00Z","published":"2024-07-24T17:30:21Z","title":"Looking at Model Debiasing through the Lens of Anomaly Detection","summary":" It is widely recognized that deep neural networks are sensitive to bias in\nthe data. This means that during training these models are likely to learn\nspurious correlations between data and labels, resulting in limited\ngeneralization abilities and low performance. In this context, model debiasing\napproaches can be devised aiming at reducing the model's dependency on such\nunwanted correlations, either leveraging the knowledge of bias information or\nnot. In this work, we focus on the latter and more realistic scenario, showing\nthe importance of accurately predicting the bias-conflicting and bias-aligned\nsamples to obtain compelling performance in bias mitigation. On this ground, we\npropose to conceive the problem of model bias from an out-of-distribution\nperspective, introducing a new bias identification method based on anomaly\ndetection. 
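The earthquake-magnitude work summarized above frames estimation as metric learning, asking the model both to regress magnitude and to rank pairs of samples consistently. The sketch below shows one generic way to combine an L1 regression term with a pairwise ranking term; the margin and weighting are illustrative assumptions, not the paper's configuration.

    import torch
    import torch.nn.functional as F

    def regression_with_ranking(pred, target, margin=0.0, rank_weight=0.5):
        """L1 regression plus a pairwise ranking penalty over a batch.

        pred, target : (N,) predicted and true magnitudes
        Whenever target[i] > target[j], pred[i] should exceed pred[j] by >= margin.
        """
        reg = F.l1_loss(pred, target)
        diff_t = target.unsqueeze(1) - target.unsqueeze(0)   # pairwise label gaps
        diff_p = pred.unsqueeze(1) - pred.unsqueeze(0)       # pairwise prediction gaps
        pairs = diff_t > 0
        rank = F.relu(margin - diff_p[pairs]).mean() if pairs.any() else pred.new_zeros(())
        return reg + rank_weight * rank

    pred = torch.tensor([5.1, 6.0, 4.2])
    true = torch.tensor([5.0, 6.3, 4.5])
    print(regression_with_ranking(pred, true))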
We claim that when data is mostly biased, bias-conflicting samples\ncan be regarded as outliers with respect to the bias-aligned distribution in\nthe feature space of a biased model, thus allowing for precisely detecting them\nwith an anomaly detection method. Coupling the proposed bias identification\napproach with bias-conflicting data upsampling and augmentation in a two-step\nstrategy, we reach state-of-the-art performance on synthetic and real benchmark\ndatasets. Ultimately, our proposed approach shows that the data bias issue does\nnot necessarily require complex debiasing methods, given that an accurate bias\nidentification procedure is defined.\n","authors":["Vito Paolo Pastore","Massimiliano Ciranni","Davide Marinelli","Francesca Odone","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2407.17449v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.18125v1","updated":"2024-07-25T15:32:59Z","published":"2024-07-25T15:32:59Z","title":"Self-supervised pre-training with diffusion model for few-shot landmark\n detection in x-ray images","summary":" In the last few years, deep neural networks have been extensively applied in\nthe medical domain for different tasks, ranging from image classification and\nsegmentation to landmark detection. However, the application of these\ntechnologies in the medical domain is often hindered by data scarcity, both in\nterms of available annotations and images. This study introduces a new\nself-supervised pre-training protocol based on diffusion models for landmark\ndetection in x-ray images. Our results show that the proposed self-supervised\nframework can provide accurate landmark detection with a minimal number of\navailable annotated training images (up to 50), outperforming ImageNet\nsupervised pre-training and state-of-the-art self-supervised pre-trainings for\nthree popular x-ray benchmark datasets. To our knowledge, this is the first\nexploration of diffusion models for self-supervised learning in landmark\ndetection, which may offer a valuable pre-training approach in few-shot\nregimes, for mitigating data scarcity.\n","authors":["Roberto Di Via","Francesca Odone","Vito Paolo Pastore"],"pdf_url":"https://arxiv.org/pdf/2407.18125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13129v2","updated":"2024-07-25T15:32:39Z","published":"2024-03-19T19:58:54Z","title":"Better Call SAL: Towards Learning to Segment Anything in Lidar","summary":" We propose the SAL (Segment Anything in Lidar) method consisting of a\ntext-promptable zero-shot model for segmenting and classifying any object in\nLidar, and a pseudo-labeling engine that facilitates model training without\nmanual supervision. While the established paradigm for Lidar Panoptic\nSegmentation (LPS) relies on manual supervision for a handful of object classes\ndefined a priori, we utilize 2D vision foundation models to generate 3D\nsupervision ``for free''. Our pseudo-labels consist of instance masks and\ncorresponding CLIP tokens, which we lift to Lidar using calibrated multi-modal\ndata. By training our model on these labels, we distill the 2D foundation\nmodels into our Lidar SAL model. Even without manual labels, our model achieves\n$91\\%$ in terms of class-agnostic segmentation and $54\\%$ in terms of zero-shot\nLidar Panoptic Segmentation of the fully supervised state-of-the-art.\nFurthermore, we outperform several baselines that do not distill but only lift\nimage features to 3D. 
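The debiasing approach summarized above treats bias-conflicting samples as outliers in the feature space of a biased model, detects them with an anomaly detector, and then upsamples them. The snippet below sketches that identification step using scikit-learn's IsolationForest as a stand-in detector; the feature source and contamination rate are illustrative assumptions, not the authors' exact choices.

    import numpy as np
    from sklearn.ensemble import IsolationForest

    def find_bias_conflicting(features, contamination=0.05, seed=0):
        """Flag likely bias-conflicting samples as outliers of a biased model.

        features : (N, D) penultimate-layer features extracted from a biased model
        Returns a boolean mask usable for upsampling the flagged samples.
        """
        detector = IsolationForest(contamination=contamination, random_state=seed)
        labels = detector.fit_predict(features)   # -1 marks outliers, 1 inliers
        return labels == -1

    # toy features: a dense bias-aligned cluster plus a few off-cluster samples
    rng = np.random.default_rng(0)
    feats = np.vstack([rng.normal(0.0, 0.5, size=(95, 8)),
                       rng.normal(4.0, 0.5, size=(5, 8))])
    print(find_bias_conflicting(feats).sum(), "samples flagged for upsampling")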
More importantly, we demonstrate that SAL supports\narbitrary class prompts, can be easily extended to new datasets, and shows\nsignificant potential to improve with increasing amounts of self-labeled data.\nCode and models are available at this\n$\\href{https://github.com/nv-dvl/segment-anything-lidar}{URL}$.\n","authors":["Aljoša Ošep","Tim Meinhardt","Francesco Ferroni","Neehar Peri","Deva Ramanan","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2403.13129v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18121v1","updated":"2024-07-25T15:29:05Z","published":"2024-07-25T15:29:05Z","title":"Efficient Inference of Vision Instruction-Following Models with Elastic\n Cache","summary":" In the field of instruction-following large vision-language models (LVLMs),\nthe efficient deployment of these models faces challenges, notably due to the\nhigh memory demands of their key-value (KV) caches. Conventional cache\nmanagement strategies for LLMs focus on cache eviction, which often fails to\naddress the specific needs of multimodal instruction-following models.\nRecognizing this gap, in this paper, we introduce Elastic Cache, a novel\napproach that benefits from applying distinct acceleration methods for\ninstruction encoding and output generation stages. We investigate the metrics\nof importance in different stages and propose an importance-driven cache\nmerging strategy to prune redundancy caches. Instead of discarding less\nimportant caches, our strategy identifies important key/value vectors as anchor\npoints. Surrounding less important caches are then merged with these anchors,\nenhancing the preservation of contextual information in the KV caches while\nyielding an arbitrary acceleration ratio. For instruction encoding, we utilize\nthe frequency to evaluate the importance of caches. Regarding output\ngeneration, we prioritize tokens based on their distance with an offset, by\nwhich both the initial and most recent tokens are retained. Results on a range\nof LVLMs demonstrate that Elastic Cache not only boosts efficiency but also\nnotably outperforms existing pruning methods in language generation across\nvarious tasks. Code is available at https://github.com/liuzuyan/ElasticCache\n","authors":["Zuyan Liu","Benlin Liu","Jiahui Wang","Yuhao Dong","Guangyi Chen","Yongming Rao","Ranjay Krishna","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2407.18121v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18112v1","updated":"2024-07-25T15:20:58Z","published":"2024-07-25T15:20:58Z","title":"Keypoint Promptable Re-Identification","summary":" Occluded Person Re-Identification (ReID) is a metric learning task that\ninvolves matching occluded individuals based on their appearance. While many\nstudies have tackled occlusions caused by objects, multi-person occlusions\nremain less explored. In this work, we identify and address a critical\nchallenge overlooked by previous occluded ReID methods: the Multi-Person\nAmbiguity (MPA) arising when multiple individuals are visible in the same\nbounding box, making it impossible to determine the intended ReID target among\nthe candidates. Inspired by recent work on prompting in vision, we introduce\nKeypoint Promptable ReID (KPR), a novel formulation of the ReID problem that\nexplicitly complements the input bounding box with a set of semantic keypoints\nindicating the intended target. 
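Elastic Cache, described above, keeps important key/value vectors as anchors and merges surrounding, less important cache entries into them instead of evicting them. The sketch below illustrates one plausible merging step for a single attention head, given an externally supplied importance score; the scoring signal and mean-merge rule are illustrative assumptions, not the released implementation.

    import torch

    def merge_kv_cache(keys, values, importance, keep=8):
        """Merge less important KV entries into their nearest anchor entry.

        keys, values : (T, D) cached key/value vectors for one attention head
        importance   : (T,) importance score per position (e.g. attention frequency)
        keep         : number of anchor positions to retain
        """
        keep = min(keep, keys.size(0))
        anchors = torch.topk(importance, k=keep).indices
        dist = torch.cdist(keys, keys[anchors])          # (T, keep) distances
        assign = dist.argmin(dim=1)                      # nearest anchor per position
        merged_k = torch.stack([keys[assign == a].mean(dim=0) for a in range(keep)])
        merged_v = torch.stack([values[assign == a].mean(dim=0) for a in range(keep)])
        return merged_k, merged_v

    k, v = torch.randn(32, 16), torch.randn(32, 16)
    mk, mv = merge_kv_cache(k, v, importance=torch.rand(32), keep=8)
    print(mk.shape, mv.shape)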
Since promptable re-identification is an\nunexplored paradigm, existing ReID datasets lack the pixel-level annotations\nnecessary for prompting. To bridge this gap and foster further research on this\ntopic, we introduce Occluded-PoseTrack ReID, a novel ReID dataset with\nkeypoints labels, that features strong inter-person occlusions. Furthermore, we\nrelease custom keypoint labels for four popular ReID benchmarks. Experiments on\nperson retrieval, but also on pose tracking, demonstrate that our method\nsystematically surpasses previous state-of-the-art approaches on various\noccluded scenarios. Our code, dataset and annotations are available at\nhttps://github.com/VlSomers/keypoint_promptable_reidentification.\n","authors":["Vladimir Somers","Christophe De Vleeschouwer","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2407.18112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02878v2","updated":"2024-07-25T15:20:48Z","published":"2023-12-05T16:48:17Z","title":"Towards More Practical Group Activity Detection: A New Benchmark and\n Model","summary":" Group activity detection (GAD) is the task of identifying members of each\ngroup and classifying the activity of the group at the same time in a video.\nWhile GAD has been studied recently, there is still much room for improvement\nin both dataset and methodology due to their limited capability to address\npractical GAD scenarios. To resolve these issues, we first present a new\ndataset, dubbed Caf\\'e. Unlike existing datasets, Caf\\'e is constructed\nprimarily for GAD and presents more practical scenarios and metrics, as well as\nbeing large-scale and providing rich annotations. Along with the dataset, we\npropose a new GAD model that deals with an unknown number of groups and latent\ngroup members efficiently and effectively. We evaluated our model on three\ndatasets including Caf\\'e, where it outperformed previous work in terms of both\naccuracy and inference speed.\n","authors":["Dongkeun Kim","Youngkil Song","Minsu Cho","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2312.02878v2.pdf","comment":"Accepted to ECCV 2024, Project page:\n https://cvlab.postech.ac.kr/research/CAFE"},{"id":"http://arxiv.org/abs/2407.18105v1","updated":"2024-07-25T15:08:54Z","published":"2024-07-25T15:08:54Z","title":"Multi-Resolution Histopathology Patch Graphs for Ovarian Cancer\n Subtyping","summary":" Computer vision models are increasingly capable of classifying ovarian\nepithelial cancer subtypes, but they differ from pathologists by processing\nsmall tissue patches at a single resolution. Multi-resolution graph models\nleverage the spatial relationships of patches at multiple magnifications,\nlearning the context for each patch. In this study, we conduct the most\nthorough validation of a graph model for ovarian cancer subtyping to date.\nSeven models were tuned and trained using five-fold cross-validation on a set\nof 1864 whole slide images (WSIs) from 434 patients treated at Leeds Teaching\nHospitals NHS Trust. The cross-validation models were ensembled and evaluated\nusing a balanced hold-out test set of 100 WSIs from 30 patients, and an\nexternal validation set of 80 WSIs from 80 patients in the Transcanadian Study.\nThe best-performing model, a graph model using 10x+20x magnification data, gave\nbalanced accuracies of 73%, 88%, and 99% in cross-validation, hold-out testing,\nand external validation, respectively. 
However, this only exceeded the\nperformance of attention-based multiple instance learning in external\nvalidation, with a 93% balanced accuracy. Graph models benefitted greatly from\nusing the UNI foundation model rather than an ImageNet-pretrained ResNet50 for\nfeature extraction, with this having a much greater effect on performance than\nchanging the subsequent classification approach. The accuracy of the combined\nfoundation model and multi-resolution graph network offers a step towards the\nclinical applicability of these models, with a new highest-reported performance\nfor this task, though further validations are still required to ensure the\nrobustness and usability of the models.\n","authors":["Jack Breen","Katie Allen","Kieran Zucker","Nicolas M. Orsi","Nishant Ravikumar"],"pdf_url":"https://arxiv.org/pdf/2407.18105v1.pdf","comment":"Initially submitted version of a paper which has been accepted in the\n GRAIL workshop at MICCAI 2024"},{"id":"http://arxiv.org/abs/2406.09272v3","updated":"2024-07-25T15:03:37Z","published":"2024-06-13T16:10:19Z","title":"Action2Sound: Ambient-Aware Generation of Action Sounds from Egocentric\n Videos","summary":" Generating realistic audio for human actions is important for many\napplications, such as creating sound effects for films or virtual reality\ngames. Existing approaches implicitly assume total correspondence between the\nvideo and audio during training, yet many sounds happen off-screen and have\nweak to no correspondence with the visuals -- resulting in uncontrolled ambient\nsounds or hallucinations at test time. We propose a novel ambient-aware audio\ngeneration model, AV-LDM. We devise a novel audio-conditioning mechanism to\nlearn to disentangle foreground action sounds from the ambient background\nsounds in in-the-wild training videos. Given a novel silent video, our model\nuses retrieval-augmented generation to create audio that matches the visual\ncontent both semantically and temporally. We train and evaluate our model on\ntwo in-the-wild egocentric video datasets, Ego4D and EPIC-KITCHENS, and we\nintroduce Ego4D-Sounds -- 1.2M curated clips with action-audio correspondence.\nOur model outperforms an array of existing methods, allows controllable\ngeneration of the ambient sound, and even shows promise for generalizing to\ncomputer graphics game clips. Overall, our approach is the first to focus\nvideo-to-audio generation faithfully on the observed visual content despite\ntraining from uncurated clips with natural background sounds.\n","authors":["Changan Chen","Puyuan Peng","Ami Baid","Zihui Xue","Wei-Ning Hsu","David Harwath","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2406.09272v3.pdf","comment":"Project page: https://vision.cs.utexas.edu/projects/action2sound.\n ECCV 2024 camera-ready version"},{"id":"http://arxiv.org/abs/2407.18100v1","updated":"2024-07-25T15:03:36Z","published":"2024-07-25T15:03:36Z","title":"DINOv2 Rocks Geological Image Analysis: Classification, Segmentation,\n and Interpretability","summary":" This study investigates the interpretability, classification, and\nsegmentation of CT-scan images of rock samples, with a particular focus on the\napplication of DINOv2 within Geosciences. We compared various segmentation\ntechniques to evaluate their efficacy, efficiency, and adaptability in\ngeological image analysis. 
The methods assessed include the Otsu thresholding\nmethod, clustering techniques (K-means and fuzzy C-means), a supervised machine\nlearning approach (Random Forest), and deep learning methods (UNet and DINOv2).\nWe tested these methods using ten binary sandstone datasets and three\nmulti-class calcite datasets. To begin, we provide a thorough interpretability\nanalysis of DINOv2's features in the geoscientific context, discussing its\nsuitability and inherent ability to process CT-scanned rock data. In terms of\nclassification, the out-of-the-box DINOv2 demonstrates an impressive capability\nto perfectly classify rock images, even when the CT scans are out of its\noriginal training set. Regarding segmentation, thresholding and unsupervised\nmethods, while fast, perform poorly despite image preprocessing, whereas\nsupervised methods show better results. We underscore the computational demands\nof deep learning but highlight its minimal intervention, superior\ngeneralization, and performance without additional image preprocessing.\nAdditionally, we observe a lack of correlation between a network's depth or the\nnumber of parameters and its performance. Our results show that a LoRA\nfine-tuned DINOv2 excels in out-of-distribution segmentation and significantly\noutperforms other methods in multi-class segmentation. By systematically\ncomparing these methods, we identify the most efficient strategy for meticulous\nand laborious segmentation tasks. DINOv2 proves advantageous, achieving\nsegmentations that could be described as \"better than ground-truth\" against\nrelatively small training sets.\n","authors":["Florent Brondolo","Samuel Beaussant"],"pdf_url":"https://arxiv.org/pdf/2407.18100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18097v1","updated":"2024-07-25T15:02:24Z","published":"2024-07-25T15:02:24Z","title":"SSTD: Stripe-Like Space Target Detection using Single-Point Supervision","summary":" Stripe-like space target detection (SSTD) plays a key role in enhancing space\nsituational awareness and assessing spacecraft behaviour. This domain faces\nthree challenges: the lack of publicly available datasets, interference from\nstray light and stars, and the variability of stripe-like targets, which\ncomplicates pixel-level annotation. In response, we introduces\n`AstroStripeSet', a pioneering dataset designed for SSTD, aiming to bridge the\ngap in academic resources and advance research in SSTD. Furthermore, we propose\na novel pseudo-label evolution teacher-student framework with single-point\nsupervision. This framework starts with generating initial pseudo-labels using\nthe zero-shot capabilities of the Segment Anything Model (SAM) in a\nsingle-point setting, and refines these labels iteratively. In our framework,\nthe fine-tuned StripeSAM serves as the teacher and the newly developed\nStripeNet as the student, consistently improving segmentation performance by\nimproving the quality of pseudo-labels. We also introduce `GeoDice', a new loss\nfunction customized for the linear characteristics of stripe-like targets.\nExtensive experiments show that the performance of our approach matches fully\nsupervised methods on all evaluation metrics, establishing a new\nstate-of-the-art (SOTA) benchmark. 
Our dataset and code will be made publicly\navailable.\n","authors":["Zijian Zhu","Ali Zia","Xuesong Li","Bingbing Dan","Yuebo Ma","Enhai Liu","Rujin Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.18097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00300v2","updated":"2024-07-25T14:48:34Z","published":"2024-02-01T03:27:26Z","title":"Self-supervised learning of video representations from a child's\n perspective","summary":" Children learn powerful internal models of the world around them from a few\nyears of egocentric visual experience. Can such internal models be learned from\na child's visual experience with highly generic learning algorithms or do they\nrequire strong inductive biases? Recent advances in collecting large-scale,\nlongitudinal, developmentally realistic video datasets and generic\nself-supervised learning (SSL) algorithms are allowing us to begin to tackle\nthis nature vs. nurture question. However, existing work typically focuses on\nimage-based SSL algorithms and visual capabilities that can be learned from\nstatic images (e.g. object recognition), thus ignoring temporal aspects of the\nworld. To close this gap, here we train self-supervised video models on\nlongitudinal, egocentric headcam recordings collected from a child over a two\nyear period in their early development (6-31 months). The resulting models are\nhighly effective at facilitating the learning of action concepts from a small\nnumber of labeled examples; they have favorable data size scaling properties;\nand they display emergent video interpolation capabilities. Video models also\nlearn more robust object representations than image-based models trained with\nthe exact same data. These results suggest that important temporal aspects of a\nchild's internal model of the world may be learnable from their visual\nexperience using highly generic learning algorithms and without strong\ninductive biases.\n","authors":["A. Emin Orhan","Wentao Wang","Alex N. Wang","Mengye Ren","Brenden M. Lake"],"pdf_url":"https://arxiv.org/pdf/2402.00300v2.pdf","comment":"Published as a conference paper at CogSci 2024; code & models\n available from https://github.com/eminorhan/video-models"},{"id":"http://arxiv.org/abs/2404.09556v2","updated":"2024-07-25T14:42:11Z","published":"2024-04-15T08:19:08Z","title":"nnU-Net Revisited: A Call for Rigorous Validation in 3D Medical Image\n Segmentation","summary":" The release of nnU-Net marked a paradigm shift in 3D medical image\nsegmentation, demonstrating that a properly configured U-Net architecture could\nstill achieve state-of-the-art results. Despite this, the pursuit of novel\narchitectures, and the respective claims of superior performance over the U-Net\nbaseline, continued. In this study, we demonstrate that many of these recent\nclaims fail to hold up when scrutinized for common validation shortcomings,\nsuch as the use of inadequate baselines, insufficient datasets, and neglected\ncomputational resources. By meticulously avoiding these pitfalls, we conduct a\nthorough and comprehensive benchmarking of current segmentation methods\nincluding CNN-based, Transformer-based, and Mamba-based approaches. In contrast\nto current beliefs, we find that the recipe for state-of-the-art performance is\n1) employing CNN-based U-Net models, including ResNet and ConvNeXt variants, 2)\nusing the nnU-Net framework, and 3) scaling models to modern hardware\nresources. 
These results indicate an ongoing innovation bias towards novel\narchitectures in the field and underscore the need for more stringent\nvalidation standards in the quest for scientific progress.\n","authors":["Fabian Isensee","Tassilo Wald","Constantin Ulrich","Michael Baumgartner","Saikat Roy","Klaus Maier-Hein","Paul F. Jaeger"],"pdf_url":"https://arxiv.org/pdf/2404.09556v2.pdf","comment":"Accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2406.14549v2","updated":"2024-07-25T14:33:33Z","published":"2024-06-20T17:56:17Z","title":"Uncovering Latent Memories: Assessing Data Leakage and Memorization\n Patterns in Frontier AI Models","summary":" Frontier AI systems are making transformative impacts across society, but\nsuch benefits are not without costs: models trained on web-scale datasets\ncontaining personal and private data raise profound concerns about data privacy\nand security. Language models are trained on extensive corpora including\npotentially sensitive or proprietary information, and the risk of data leakage\n- where the model response reveals pieces of such information - remains\ninadequately understood. Prior work has investigated what factors drive\nmemorization and have identified that sequence complexity and the number of\nrepetitions drive memorization. Here, we focus on the evolution of memorization\nover training. We begin by reproducing findings that the probability of\nmemorizing a sequence scales logarithmically with the number of times it is\npresent in the data. We next show that sequences which are apparently not\nmemorized after the first encounter can be \"uncovered\" throughout the course of\ntraining even without subsequent encounters, a phenomenon we term \"latent\nmemorization\". The presence of latent memorization presents a challenge for\ndata privacy as memorized sequences may be hidden at the final checkpoint of\nthe model but remain easily recoverable. To this end, we develop a diagnostic\ntest relying on the cross entropy loss to uncover latent memorized sequences\nwith high accuracy.\n","authors":["Sunny Duan","Mikail Khona","Abhiram Iyer","Rylan Schaeffer","Ila R Fiete"],"pdf_url":"https://arxiv.org/pdf/2406.14549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10885v3","updated":"2024-07-25T14:30:22Z","published":"2024-02-16T18:43:02Z","title":"3D Diffuser Actor: Policy Diffusion with 3D Scene Representations","summary":" Diffusion policies are conditional diffusion models that learn robot action\ndistributions conditioned on the robot and environment state. They have\nrecently shown to outperform both deterministic and alternative action\ndistribution learning formulations. 3D robot policies use 3D scene feature\nrepresentations aggregated from a single or multiple camera views using sensed\ndepth. They have shown to generalize better than their 2D counterparts across\ncamera viewpoints. We unify these two lines of work and present 3D Diffuser\nActor, a neural policy equipped with a novel 3D denoising transformer that\nfuses information from the 3D visual scene, a language instruction and\nproprioception to predict the noise in noised 3D robot pose trajectories. 3D\nDiffuser Actor sets a new state-of-the-art on RLBench with an absolute\nperformance gain of 18.1% over the current SOTA on a multi-view setup and an\nabsolute gain of 13.1% on a single-view setup. On the CALVIN benchmark, it\nimproves over the current SOTA by a 9% relative increase. 
It also learns to\ncontrol a robot manipulator in the real world from a handful of demonstrations.\nThrough thorough comparisons with the current SOTA policies and ablations of\nour model, we show 3D Diffuser Actor's design choices dramatically outperform\n2D representations, regression and classification objectives, absolute\nattentions, and holistic non-tokenized 3D scene embeddings.\n","authors":["Tsung-Wei Ke","Nikolaos Gkanatsios","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2402.10885v3.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2407.18070v1","updated":"2024-07-25T14:25:17Z","published":"2024-07-25T14:25:17Z","title":"CSWin-UNet: Transformer UNet with Cross-Shaped Windows for Medical Image\n Segmentation","summary":" Deep learning, especially convolutional neural networks (CNNs) and\nTransformer architectures, have become the focus of extensive research in\nmedical image segmentation, achieving impressive results. However, CNNs come\nwith inductive biases that limit their effectiveness in more complex, varied\nsegmentation scenarios. Conversely, while Transformer-based methods excel at\ncapturing global and long-range semantic details, they suffer from high\ncomputational demands. In this study, we propose CSWin-UNet, a novel U-shaped\nsegmentation method that incorporates the CSWin self-attention mechanism into\nthe UNet to facilitate horizontal and vertical stripes self-attention. This\nmethod significantly enhances both computational efficiency and receptive field\ninteractions. Additionally, our innovative decoder utilizes a content-aware\nreassembly operator that strategically reassembles features, guided by\npredicted kernels, for precise image resolution restoration. Our extensive\nempirical evaluations on diverse datasets, including synapse multi-organ CT,\ncardiac MRI, and skin lesions, demonstrate that CSWin-UNet maintains low model\ncomplexity while delivering high segmentation accuracy.\n","authors":["Xiao Liu","Peng Gao","Tao Yu","Fei Wang","Ru-Yue Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.18070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18067v1","updated":"2024-07-25T14:21:50Z","published":"2024-07-25T14:21:50Z","title":"HVM-1: Large-scale video models pretrained with nearly 5000 hours of\n human-like video data","summary":" We introduce Human-like Video Models (HVM-1), large-scale video models\npretrained with nearly 5000 hours of curated human-like video data (mostly\negocentric, temporally extended, continuous video recordings), using the\nspatiotemporal masked autoencoder (ST-MAE) algorithm. We release two 633M\nparameter models trained at spatial resolutions of 224x224 and 448x448 pixels.\nWe evaluate the performance of these models in downstream few-shot video and\nimage recognition tasks and compare them against a model pretrained with 1330\nhours of short action-oriented video clips from YouTube (Kinetics-700). HVM-1\nmodels perform competitively against the Kinetics-700 pretrained model in\ndownstream evaluations despite substantial qualitative differences between the\nspatiotemporal characteristics of the corresponding pretraining datasets. HVM-1\nmodels also learn more accurate and more robust object representations compared\nto models pretrained with the image-based MAE algorithm on the same data,\ndemonstrating the potential benefits of learning to predict temporal\nregularities in natural videos for learning better object representations.\n","authors":["A. 
Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2407.18067v1.pdf","comment":"10 pages, 5 figures, 1 table; code & models available from\n https://github.com/eminorhan/hvm-1"},{"id":"http://arxiv.org/abs/2407.18054v1","updated":"2024-07-25T14:07:49Z","published":"2024-07-25T14:07:49Z","title":"LKCell: Efficient Cell Nuclei Instance Segmentation with Large\n Convolution Kernels","summary":" The segmentation of cell nuclei in tissue images stained with the blood dye\nhematoxylin and eosin (H$\\&$E) is essential for various clinical applications\nand analyses. Due to the complex characteristics of cellular morphology, a\nlarge receptive field is considered crucial for generating high-quality\nsegmentation. However, previous methods face challenges in achieving a balance\nbetween the receptive field and computational burden. To address this issue, we\npropose LKCell, a high-accuracy and efficient cell segmentation method. Its\ncore insight lies in unleashing the potential of large convolution kernels to\nachieve computationally efficient large receptive fields. Specifically, (1) We\ntransfer pre-trained large convolution kernel models to the medical domain for\nthe first time, demonstrating their effectiveness in cell segmentation. (2) We\nanalyze the redundancy of previous methods and design a new segmentation\ndecoder based on large convolution kernels. It achieves higher performance\nwhile significantly reducing the number of parameters. We evaluate our method\non the most challenging benchmark and achieve state-of-the-art results (0.5080\nmPQ) in cell nuclei instance segmentation with only 21.6% FLOPs compared with\nthe previous leading method. Our source code and models are available at\nhttps://github.com/hustvl/LKCell.\n","authors":["Ziwei Cui","Jingfeng Yao","Lunbin Zeng","Juan Yang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18046v1","updated":"2024-07-25T13:53:48Z","published":"2024-07-25T13:53:48Z","title":"GaussianSR: High Fidelity 2D Gaussian Splatting for Arbitrary-Scale\n Image Super-Resolution","summary":" Implicit neural representations (INRs) have significantly advanced the field\nof arbitrary-scale super-resolution (ASSR) of images. Most existing INR-based\nASSR networks first extract features from the given low-resolution image using\nan encoder, and then render the super-resolved result via a multi-layer\nperceptron decoder. Although these approaches have shown promising results,\ntheir performance is constrained by the limited representation ability of\ndiscrete latent codes in the encoded features. In this paper, we propose a\nnovel ASSR method named GaussianSR that overcomes this limitation through 2D\nGaussian Splatting (2DGS). Unlike traditional methods that treat pixels as\ndiscrete points, GaussianSR represents each pixel as a continuous Gaussian\nfield. The encoded features are simultaneously refined and upsampled by\nrendering the mutually stacked Gaussian fields. As a result, long-range\ndependencies are established to enhance representation ability. In addition, a\nclassifier is developed to dynamically assign Gaussian kernels to all pixels to\nfurther improve flexibility. 
All components of GaussianSR (i.e., encoder,\nclassifier, Gaussian kernels, and decoder) are jointly learned end-to-end.\nExperiments demonstrate that GaussianSR achieves superior ASSR performance with\nfewer parameters than existing methods while enjoying interpretable and\ncontent-aware feature aggregations.\n","authors":["Jintong Hu","Bin Xia","Bin Chen","Wenming Yang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18046v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.18043v1","updated":"2024-07-25T13:44:49Z","published":"2024-07-25T13:44:49Z","title":"YOCO: You Only Calibrate Once for Accurate Extrinsic Parameter in\n LiDAR-Camera Systems","summary":" In a multi-sensor fusion system composed of cameras and LiDAR, precise\nextrinsic calibration contributes to the system's long-term stability and\naccurate perception of the environment. However, methods based on extracting\nand registering corresponding points still face challenges in terms of\nautomation and precision. This paper proposes a novel fully automatic extrinsic\ncalibration method for LiDAR-camera systems that circumvents the need for\ncorresponding point registration. In our approach, a novel algorithm to extract\nrequired LiDAR correspondence point is proposed. This method can effectively\nfilter out irrelevant points by computing the orientation of plane point clouds\nand extracting points by applying distance- and density-based thresholds. We\navoid the need for corresponding point registration by introducing extrinsic\nparameters between the LiDAR and camera into the projection of extracted points\nand constructing co-planar constraints. These parameters are then optimized to\nsolve for the extrinsic. We validated our method across multiple sets of\nLiDAR-camera systems. In synthetic experiments, our method demonstrates\nsuperior performance compared to current calibration techniques. Real-world\ndata experiments further confirm the precision and robustness of the proposed\nalgorithm, with average rotation and translation calibration errors between\nLiDAR and camera of less than 0.05 degree and 0.015m, respectively. This method\nenables automatic and accurate extrinsic calibration in a single one step,\nemphasizing the potential of calibration algorithms beyond using corresponding\npoint registration to enhance the automation and precision of LiDAR-camera\nsystem calibration.\n","authors":["Tianle Zeng","Dengke He","Feifan Yan","Meixi He"],"pdf_url":"https://arxiv.org/pdf/2407.18043v1.pdf","comment":"IEEE TRANSACTIONS ON INSTRUMENTATION AND MEASUREMENT"},{"id":"http://arxiv.org/abs/2403.15377v2","updated":"2024-07-25T13:42:44Z","published":"2024-03-22T17:57:42Z","title":"InternVideo2: Scaling Foundation Models for Multimodal Video\n Understanding","summary":" We introduce InternVideo2, a new family of video foundation models (ViFM)\nthat achieve the state-of-the-art results in video recognition, video-text\ntasks, and video-centric dialogue. Our core design is a progressive training\napproach that unifies the masked video modeling, crossmodal contrastive\nlearning, and next token prediction, scaling up the video encoder size to 6B\nparameters. At the data level, we prioritize spatiotemporal consistency by\nsemantically segmenting videos and generating video-audio-speech captions. This\nimproves the alignment between video and text. Through extensive experiments,\nwe validate our designs and demonstrate superior performance on over 60 video\nand audio tasks. 
Notably, our model outperforms others on various video-related\ndialogue and long video understanding benchmarks, highlighting its ability to\nreason and comprehend longer contexts. Code and models are available at\nhttps://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2/.\n","authors":["Yi Wang","Kunchang Li","Xinhao Li","Jiashuo Yu","Yinan He","Chenting Wang","Guo Chen","Baoqi Pei","Ziang Yan","Rongkun Zheng","Jilan Xu","Zun Wang","Yansong Shi","Tianxiang Jiang","Songze Li","Hongjie Zhang","Yifei Huang","Yu Qiao","Yali Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15377v2.pdf","comment":"a technical report about video understanding (accepted to ECCV2024)"},{"id":"http://arxiv.org/abs/2407.18038v1","updated":"2024-07-25T13:31:55Z","published":"2024-07-25T13:31:55Z","title":"TiCoSS: Tightening the Coupling between Semantic Segmentation and Stereo\n Matching within A Joint Learning Framework","summary":" Semantic segmentation and stereo matching, respectively analogous to the\nventral and dorsal streams in our human brain, are two key components of\nautonomous driving perception systems. Addressing these two tasks with separate\nnetworks is no longer the mainstream direction in developing computer vision\nalgorithms, particularly with the recent advances in large vision models and\nembodied artificial intelligence. The trend is shifting towards combining them\nwithin a joint learning framework, especially emphasizing feature sharing\nbetween the two tasks. The major contributions of this study lie in\ncomprehensively tightening the coupling between semantic segmentation and\nstereo matching. Specifically, this study introduces three novelties: (1) a\ntightly coupled, gated feature fusion strategy, (2) a hierarchical deep\nsupervision strategy, and (3) a coupling tightening loss function. The combined\nuse of these technical contributions results in TiCoSS, a state-of-the-art\njoint learning framework that simultaneously tackles semantic segmentation and\nstereo matching. Through extensive experiments on the KITTI and vKITTI2\ndatasets, along with qualitative and quantitative analyses, we validate the\neffectiveness of our developed strategies and loss function, and demonstrate\nits superior performance compared to prior arts, with a notable increase in\nmIoU by over 9%. Our source code will be publicly available at\nmias.group/TiCoSS upon publication.\n","authors":["Guanfeng Tang","Zhiyuan Wu","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2407.18038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14242v2","updated":"2024-07-25T13:30:33Z","published":"2024-07-19T12:22:32Z","title":"Continual Panoptic Perception: Towards Multi-modal Incremental\n Interpretation of Remote Sensing Images","summary":" Continual learning (CL) breaks off the one-way training manner and enables a\nmodel to adapt to new data, semantics and tasks continuously. However, current\nCL methods mainly focus on single tasks. Besides, CL models are plagued by\ncatastrophic forgetting and semantic drift since the lack of old data, which\noften occurs in remote-sensing interpretation due to the intricate fine-grained\nsemantics. 
In this paper, we propose Continual Panoptic Perception (CPP), a\nunified continual learning model that leverages multi-task joint learning\ncovering pixel-level classification, instance-level segmentation and\nimage-level perception for universal interpretation in remote sensing images.\nConcretely, we propose a collaborative cross-modal encoder (CCE) to extract the\ninput image features, which supports pixel classification and caption\ngeneration synchronously. To inherit the knowledge from the old model without\nexemplar memory, we propose a task-interactive knowledge distillation (TKD)\nmethod, which leverages cross-modal optimization and task-asymmetric\npseudo-labeling (TPL) to alleviate catastrophic forgetting. Furthermore, we\nalso propose a joint optimization mechanism to achieve end-to-end multi-modal\npanoptic perception. Experimental results on the fine-grained panoptic\nperception dataset validate the effectiveness of the proposed model, and also\nprove that joint optimization can boost sub-task CL efficiency with over 13\\%\nrelative improvement on panoptic quality.\n","authors":["Bo Yuan","Danpei Zhao","Zhuoran Liu","Wentao Li","Tian Li"],"pdf_url":"https://arxiv.org/pdf/2407.14242v2.pdf","comment":"Accepted in ACMMM 2024"},{"id":"http://arxiv.org/abs/2407.18035v1","updated":"2024-07-25T13:29:37Z","published":"2024-07-25T13:29:37Z","title":"RestoreAgent: Autonomous Image Restoration Agent via Multimodal Large\n Language Models","summary":" Natural images captured by mobile devices often suffer from multiple types of\ndegradation, such as noise, blur, and low light. Traditional image restoration\nmethods require manual selection of specific tasks, algorithms, and execution\nsequences, which is time-consuming and may yield suboptimal results. All-in-one\nmodels, though capable of handling multiple tasks, typically support only a\nlimited range and often produce overly smooth, low-fidelity outcomes due to\ntheir broad data distribution fitting. To address these challenges, we first\ndefine a new pipeline for restoring images with multiple degradations, and then\nintroduce RestoreAgent, an intelligent image restoration system leveraging\nmultimodal large language models. RestoreAgent autonomously assesses the type\nand extent of degradation in input images and performs restoration through (1)\ndetermining the appropriate restoration tasks, (2) optimizing the task\nsequence, (3) selecting the most suitable models, and (4) executing the\nrestoration. Experimental results demonstrate the superior performance of\nRestoreAgent in handling complex degradation, surpassing human experts.\nFurthermore, the system modular design facilitates the fast integration of new\ntasks and models, enhancing its flexibility and scalability for various\napplications.\n","authors":["Haoyu Chen","Wenbo Li","Jinjin Gu","Jingjing Ren","Sixiang Chen","Tian Ye","Renjing Pei","Kaiwen Zhou","Fenglong Song","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.18035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18034v1","updated":"2024-07-25T13:29:32Z","published":"2024-07-25T13:29:32Z","title":"AttentionHand: Text-driven Controllable Hand Image Generation for 3D\n Hand Reconstruction in the Wild","summary":" Recently, there has been a significant amount of research conducted on 3D\nhand reconstruction to use various forms of human-computer interaction.\nHowever, 3D hand reconstruction in the wild is challenging due to extreme lack\nof in-the-wild 3D hand datasets. 
Especially, when hands are in complex pose\nsuch as interacting hands, the problems like appearance similarity, self-handed\noccclusion and depth ambiguity make it more difficult. To overcome these\nissues, we propose AttentionHand, a novel method for text-driven controllable\nhand image generation. Since AttentionHand can generate various and numerous\nin-the-wild hand images well-aligned with 3D hand label, we can acquire a new\n3D hand dataset, and can relieve the domain gap between indoor and outdoor\nscenes. Our method needs easy-to-use four modalities (i.e, an RGB image, a hand\nmesh image from 3D label, a bounding box, and a text prompt). These modalities\nare embedded into the latent space by the encoding phase. Then, through the\ntext attention stage, hand-related tokens from the given text prompt are\nattended to highlight hand-related regions of the latent embedding. After the\nhighlighted embedding is fed to the visual attention stage, hand-related\nregions in the embedding are attended by conditioning global and local hand\nmesh images with the diffusion-based pipeline. In the decoding phase, the final\nfeature is decoded to new hand images, which are well-aligned with the given\nhand mesh image and text prompt. As a result, AttentionHand achieved\nstate-of-the-art among text-to-hand image generation models, and the\nperformance of 3D hand mesh reconstruction was improved by additionally\ntraining with hand images generated by AttentionHand.\n","authors":["Junho Park","Kyeongbo Kong","Suk-Ju Kang"],"pdf_url":"https://arxiv.org/pdf/2407.18034v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.09503v2","updated":"2024-07-25T13:29:27Z","published":"2024-06-14T09:39:53Z","title":"PARSE-Ego4D: Personal Action Recommendation Suggestions for Egocentric\n Videos","summary":" Intelligent assistance involves not only understanding but also action.\nExisting ego-centric video datasets contain rich annotations of the videos, but\nnot of actions that an intelligent assistant could perform in the moment. To\naddress this gap, we release PARSE-Ego4D, a new set of personal action\nrecommendation annotations for the Ego4D dataset. We take a multi-stage\napproach to generating and evaluating these annotations. First, we used a\nprompt-engineered large language model (LLM) to generate context-aware action\nsuggestions and identified over 18,000 action suggestions. While these\nsynthetic action suggestions are valuable, the inherent limitations of LLMs\nnecessitate human evaluation. To ensure high-quality and user-centered\nrecommendations, we conducted a large-scale human annotation study that\nprovides grounding in human preferences for all of PARSE-Ego4D. We analyze the\ninter-rater agreement and evaluate subjective preferences of participants.\nBased on our synthetic dataset and complete human annotations, we propose\nseveral new tasks for action suggestions based on ego-centric videos. We\nencourage novel solutions that improve latency and energy requirements. The\nannotations in PARSE-Ego4D will support researchers and developers who are\nworking on building action recommendation systems for augmented and virtual\nreality systems.\n","authors":["Steven Abreu","Tiffany D. Do","Karan Ahuja","Eric J. 
Gonzalez","Lee Payne","Daniel McDuff","Mar Gonzalez-Franco"],"pdf_url":"https://arxiv.org/pdf/2407.09503v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18026v1","updated":"2024-07-25T13:23:57Z","published":"2024-07-25T13:23:57Z","title":"Segmentation-guided MRI reconstruction for meaningfully diverse\n reconstructions","summary":" Inverse problems, such as accelerated MRI reconstruction, are ill-posed and\nan infinite amount of possible and plausible solutions exist. This may not only\nlead to uncertainty in the reconstructed image but also in downstream tasks\nsuch as semantic segmentation. This uncertainty, however, is mostly not\nanalyzed in the literature, even though probabilistic reconstruction models are\ncommonly used. These models can be prone to ignore plausible but unlikely\nsolutions like rare pathologies. Building on MRI reconstruction approaches\nbased on diffusion models, we add guidance to the diffusion process during\ninference, generating two meaningfully diverse reconstructions corresponding to\nan upper and lower bound segmentation. The reconstruction uncertainty can then\nbe quantified by the difference between these bounds, which we coin the\n'uncertainty boundary'. We analyzed the behavior of the upper and lower bound\nsegmentations for a wide range of acceleration factors and found the\nuncertainty boundary to be both more reliable and more accurate compared to\nrepeated sampling. Code is available at https://github.com/NikolasMorshuis/SGR\n","authors":["Jan Nikolas Morshuis","Matthias Hein","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2407.18026v1.pdf","comment":"Accepted at DGM4MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.15396v2","updated":"2024-07-25T12:54:52Z","published":"2024-07-22T05:53:46Z","title":"Semantic Diversity-aware Prototype-based Learning for Unbiased Scene\n Graph Generation","summary":" The scene graph generation (SGG) task involves detecting objects within an\nimage and predicting predicates that represent the relationships between the\nobjects. However, in SGG benchmark datasets, each subject-object pair is\nannotated with a single predicate even though a single predicate may exhibit\ndiverse semantics (i.e., semantic diversity), existing SGG models are trained\nto predict the one and only predicate for each pair. This in turn results in\nthe SGG models to overlook the semantic diversity that may exist in a\npredicate, thus leading to biased predictions. In this paper, we propose a\nnovel model-agnostic Semantic Diversity-aware Prototype-based Learning (DPL)\nframework that enables unbiased predictions based on the understanding of the\nsemantic diversity of predicates. Specifically, DPL learns the regions in the\nsemantic space covered by each predicate to distinguish among the various\ndifferent semantics that a single predicate can represent. 
Extensive\nexperiments demonstrate that our proposed model-agnostic DPL framework brings\nsignificant performance improvement on existing SGG models, and also\neffectively understands the semantic diversity of predicates.\n","authors":["Jaehyeong Jeon","Kibum Kim","Kanghoon Yoon","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2407.15396v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18002v1","updated":"2024-07-25T12:53:21Z","published":"2024-07-25T12:53:21Z","title":"Network Inversion of Convolutional Neural Nets","summary":" Neural networks have emerged as powerful tools across various applications,\nyet their decision-making process often remains opaque, leading to them being\nperceived as \"black boxes.\" This opacity raises concerns about their\ninterpretability and reliability, especially in safety-critical scenarios.\nNetwork inversion techniques offer a solution by allowing us to peek inside\nthese black boxes, revealing the features and patterns learned by the networks\nbehind their decision-making processes and thereby provide valuable insights\ninto how neural networks arrive at their conclusions, making them more\ninterpretable and trustworthy. This paper presents a simple yet effective\napproach to network inversion using a carefully conditioned generator that\nlearns the data distribution in the input space of the trained neural network,\nenabling the reconstruction of inputs that would most likely lead to the\ndesired outputs. To capture the diversity in the input space for a given\noutput, instead of simply revealing the conditioning labels to the generator,\nwe hideously encode the conditioning label information into vectors, further\nexemplified by heavy dropout in the generation process and minimisation of\ncosine similarity between the features corresponding to the generated images.\nThe paper concludes with immediate applications of Network Inversion including\nin interpretability, explainability and generation of adversarial samples.\n","authors":["Pirzada Suhail","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2407.18002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.08158v3","updated":"2024-07-25T12:52:52Z","published":"2021-08-18T14:04:52Z","title":"Practical X-ray Gastric Cancer Screening Using Refined Stochastic Data\n Augmentation and Hard Boundary Box Training","summary":" Endoscopy is widely used to diagnose gastric cancer and has a high diagnostic\nperformance, but because it must be performed by a physician, the number of\npeople who can be diagnosed is limited. Gastric X-ray, on the other hand, can\nbe performed by technicians and can screen a much larger number of patients\nthan endoscopy, but its correct diagnosis requires experience. We propose an\nunprecedented and practical gastric cancer diagnosis support system for gastric\nX-ray images, which will enable more people to be screened. The system is based\non a general deep learning-based object detection model and includes two novel\ntechnical proposals: refined probabilistic stomach image augmentation (R-sGAIA)\nand hard boundary box learning (HBBT). R-sGAIA is a probabilistic gastric fold\nregion enhancement method that provides more learning patterns for cancer\ndetection models. HBBT is an efficient training method for object detection\nmodels that allows the use of unannotated negative (i.e., healthy control)\nsamples that cannot be used for training in conventional detection models,\nthereby improving model performance. 
The sensitivity (SE) of the proposed\nsystem for gastric cancer (90.2%) is higher than that of the expert (85.5%),\nand two out of five candidates detected box are cancerous, achieving a high\nprecision while maintaining a high processing speed of 0.51 seconds/image. The\nproposed system showed 5.9 points higher on the F1 score compared to methods\nusing the same object detection model and state-of-the-art data augmentation.\nIn short, the system quickly and efficiently shows the radiologist where to\nlook, greatly reducing the radiologist's workload.\n","authors":["Hideaki Okamoto","Takakiyo Nomura","Kazuhito Nabeshima","Jun Hashimoto","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2108.08158v3.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.18000v1","updated":"2024-07-25T12:49:24Z","published":"2024-07-25T12:49:24Z","title":"Investigation to answer three key questions concerning plant pest\n identification and development of a practical identification framework","summary":" The development of practical and robust automated diagnostic systems for\nidentifying plant pests is crucial for efficient agricultural production. In\nthis paper, we first investigate three key research questions (RQs) that have\nnot been addressed thus far in the field of image-based plant pest\nidentification. Based on the knowledge gained, we then develop an accurate,\nrobust, and fast plant pest identification framework using 334K images\ncomprising 78 combinations of four plant portions (the leaf front, leaf back,\nfruit, and flower of cucumber, tomato, strawberry, and eggplant) and 20 pest\nspecies captured at 27 farms. The results reveal the following. (1) For an\nappropriate evaluation of the model, the test data should not include images of\nthe field from which the training images were collected, or other\nconsiderations to increase the diversity of the test set should be taken into\naccount. (2) Pre-extraction of ROIs, such as leaves and fruits, helps to\nimprove identification accuracy. (3) Integration of closely related species\nusing the same control methods and cross-crop training methods for the same\npests, are effective. Our two-stage plant pest identification framework,\nenabling ROI detection and convolutional neural network (CNN)-based\nidentification, achieved a highly practical performance of 91.0% and 88.5% in\nmean accuracy and macro F1 score, respectively, for 12,223 instances of test\ndata of 21 classes collected from unseen fields, where 25 classes of images\nfrom 318,971 samples were used for training; the average identification time\nwas 476 ms/image.\n","authors":["Ryosuke Wayama","Yuki Sasaki","Satoshi Kagiwada","Nobusuke Iwasaki","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2407.18000v1.pdf","comment":"40 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.17996v1","updated":"2024-07-25T12:43:41Z","published":"2024-07-25T12:43:41Z","title":"Joint RGB-Spectral Decomposition Model Guided Image Enhancement in\n Mobile Photography","summary":" The integration of miniaturized spectrometers into mobile devices offers new\navenues for image quality enhancement and facilitates novel downstream tasks.\nHowever, the broader application of spectral sensors in mobile photography is\nhindered by the inherent complexity of spectral images and the constraints of\nspectral imaging capabilities. 
To overcome these challenges, we propose a joint\nRGB-Spectral decomposition model guided enhancement framework, which consists\nof two steps: joint decomposition and prior-guided enhancement. Firstly, we\nleverage the complementarity between RGB and Low-resolution Multi-Spectral\nImages (Lr-MSI) to predict shading, reflectance, and material semantic priors.\nSubsequently, these priors are seamlessly integrated into the established\nHDRNet to promote dynamic range enhancement, color mapping, and grid expert\nlearning, respectively. Additionally, we construct a high-quality Mobile-Spec\ndataset to support our research, and our experiments validate the effectiveness\nof Lr-MSI in the tone enhancement task. This work aims to establish a solid\nfoundation for advancing spectral vision in mobile photography. The code is\navailable at \\url{https://github.com/CalayZhou/JDM-HDRNet}.\n","authors":["Kailai Zhou","Lijing Cai","Yibo Wang","Mengya Zhang","Bihan Wen","Qiu Shen","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2407.17996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13675v3","updated":"2024-07-25T12:32:21Z","published":"2024-07-18T16:50:59Z","title":"MeshSegmenter: Zero-Shot Mesh Semantic Segmentation via Texture\n Synthesis","summary":" We present MeshSegmenter, a simple yet effective framework designed for\nzero-shot 3D semantic segmentation. This model successfully extends the\npowerful capabilities of 2D segmentation models to 3D meshes, delivering\naccurate 3D segmentation across diverse meshes and segment descriptions.\nSpecifically, our model leverages the Segment Anything Model (SAM) model to\nsegment the target regions from images rendered from the 3D shape. In light of\nthe importance of the texture for segmentation, we also leverage the pretrained\nstable diffusion model to generate images with textures from 3D shape, and\nleverage SAM to segment the target regions from images with textures. Textures\nsupplement the shape for segmentation and facilitate accurate 3D segmentation\neven in geometrically non-prominent areas, such as segmenting a car door within\na car mesh. To achieve the 3D segments, we render 2D images from different\nviews and conduct segmentation for both textured and untextured images. Lastly,\nwe develop a multi-view revoting scheme that integrates 2D segmentation results\nand confidence scores from various views onto the 3D mesh, ensuring the 3D\nconsistency of segmentation results and eliminating inaccuracies from specific\nperspectives. Through these innovations, MeshSegmenter offers stable and\nreliable 3D segmentation results both quantitatively and qualitatively,\nhighlighting its potential as a transformative tool in the field of 3D\nzero-shot segmentation. The code is available at\n\\url{https://github.com/zimingzhong/MeshSegmenter}.\n","authors":["Ziming Zhong","Yanxu Xu","Jing Li","Jiale Xu","Zhengxin Li","Chaohui Yu","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2407.13675v3.pdf","comment":"The paper was accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2312.07423v2","updated":"2024-07-25T12:24:10Z","published":"2023-12-12T16:45:52Z","title":"Holoported Characters: Real-time Free-viewpoint Rendering of Humans from\n Sparse RGB Cameras","summary":" We present the first approach to render highly realistic free-viewpoint\nvideos of a human actor in general apparel, from sparse multi-view recording to\ndisplay, in real-time at an unprecedented 4K resolution. 
At inference, our\nmethod only requires four camera views of the moving actor and the respective\n3D skeletal pose. It handles actors in wide clothing, and reproduces even\nfine-scale dynamic detail, e.g. clothing wrinkles, face expressions, and hand\ngestures. At training time, our learning-based approach expects dense\nmulti-view video and a rigged static surface scan of the actor. Our method\ncomprises three main stages. Stage 1 is a skeleton-driven neural approach for\nhigh-quality capture of the detailed dynamic mesh geometry. Stage 2 is a novel\nsolution to create a view-dependent texture using four test-time camera views\nas input. Finally, stage 3 comprises a new image-based refinement network\nrendering the final 4K image given the output from the previous stages. Our\napproach establishes a new benchmark for real-time rendering resolution and\nquality using sparse input camera views, unlocking possibilities for immersive\ntelepresence.\n","authors":["Ashwath Shetty","Marc Habermann","Guoxing Sun","Diogo Luvizon","Vladislav Golyanik","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2312.07423v2.pdf","comment":"Project page: https://vcai.mpi-inf.mpg.de/projects/holochar/ 8 pages,\n 2 tables and 8 figures; presented at Computer Vision and Pattern Recognition\n (CVPR) 2024"},{"id":"http://arxiv.org/abs/2406.09126v2","updated":"2024-07-25T11:50:52Z","published":"2024-06-13T13:59:47Z","title":"Auto-Vocabulary Segmentation for LiDAR Points","summary":" Existing perception methods for autonomous driving fall short of recognizing\nunknown entities not covered in the training data. Open-vocabulary methods\noffer promising capabilities in detecting any object but are limited by\nuser-specified queries representing target classes. We propose AutoVoc3D, a\nframework for automatic object class recognition and open-ended segmentation.\nEvaluation on nuScenes showcases AutoVoc3D's ability to generate precise\nsemantic classes and accurate point-wise segmentation. Moreover, we introduce\nText-Point Semantic Similarity, a new metric to assess the semantic similarity\nbetween text and point cloud without eliminating novel classes.\n","authors":["Weijie Wei","Osman Ülger","Fatemeh Karimi Nejadasl","Theo Gevers","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2406.09126v2.pdf","comment":"Accepted by CVPR 2024 OpenSun3D Workshop"},{"id":"http://arxiv.org/abs/2308.12112v4","updated":"2024-07-25T11:49:54Z","published":"2023-08-23T13:02:52Z","title":"Category Adaptation Meets Projected Distillation in Generalized\n Continual Category Discovery","summary":" Generalized Continual Category Discovery (GCCD) tackles learning from\nsequentially arriving, partially labeled datasets while uncovering new\ncategories. Traditional methods depend on feature distillation to prevent\nforgetting the old knowledge. However, this strategy restricts the model's\nability to adapt and effectively distinguish new categories. To address this,\nwe introduce a novel technique integrating a learnable projector with feature\ndistillation, thus enhancing model adaptability without sacrificing past\nknowledge. The resulting distribution shift of the previously learned\ncategories is mitigated with the auxiliary category adaptation network. We\ndemonstrate that while each component offers modest benefits individually,\ntheir combination - dubbed CAMP (Category Adaptation Meets Projected\ndistillation) - significantly improves the balance between learning new\ninformation and retaining old. 
CAMP exhibits superior performance across\nseveral GCCD and Class Incremental Learning scenarios. The code is available at\nhttps://github.com/grypesc/CAMP.\n","authors":["Grzegorz Rypeść","Daniel Marczak","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2308.12112v4.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2407.17967v1","updated":"2024-07-25T11:39:20Z","published":"2024-07-25T11:39:20Z","title":"Lightweight Language-driven Grasp Detection using Conditional\n Consistency Model","summary":" Language-driven grasp detection is a fundamental yet challenging task in\nrobotics with various industrial applications. In this work, we present a new\napproach for language-driven grasp detection that leverages the concept of\nlightweight diffusion models to achieve fast inference time. By integrating\ndiffusion processes with grasping prompts in natural language, our method can\neffectively encode visual and textual information, enabling more accurate and\nversatile grasp positioning that aligns well with the text query. To overcome\nthe long inference time problem in diffusion models, we leverage the image and\ntext features as the condition in the consistency model to reduce the number of\ndenoising timesteps during inference. The intensive experimental results show\nthat our method outperforms other recent grasp detection methods and\nlightweight diffusion models by a clear margin. We further validate our method\nin real-world robotic experiments to demonstrate its fast inference time\ncapability.\n","authors":["Nghia Nguyen","Minh Nhat Vu","Baoru Huang","An Vuong","Ngan Le","Thieu Vo","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.17967v1.pdf","comment":"Accepted at IROS 2024"},{"id":"http://arxiv.org/abs/2309.10527v3","updated":"2024-07-25T11:26:49Z","published":"2023-09-19T11:13:01Z","title":"SPOT: Scalable 3D Pre-training via Occupancy Prediction for Learning\n Transferable 3D Representations","summary":" Annotating 3D LiDAR point clouds for perception tasks is fundamental for many\napplications e.g., autonomous driving, yet it still remains notoriously\nlabor-intensive. Pretraining-finetuning approach can alleviate the labeling\nburden by fine-tuning a pre-trained backbone across various downstream datasets\nas well as tasks. In this paper, we propose SPOT, namely Scalable Pre-training\nvia Occupancy prediction for learning Transferable 3D representations under\nsuch a label-efficient fine-tuning paradigm. SPOT achieves effectiveness on\nvarious public datasets with different downstream tasks, showcasing its general\nrepresentation power, cross-domain robustness and data scalability which are\nthree key factors for real-world application. Specifically, we both\ntheoretically and empirically show, for the first time, that general\nrepresentations learning can be achieved through the task of occupancy\nprediction. Then, to address the domain gap caused by different LiDAR sensors\nand annotation methods, we develop a beam re-sampling technique for point cloud\naugmentation combined with class-balancing strategy. Furthermore, scalable\npre-training is observed, that is, the downstream performance across all the\nexperiments gets better with more pre-training data. Additionally, such\npre-training strategy also remains compatible with unlabeled data. 
The hope is\nthat our findings will facilitate the understanding of LiDAR points and pave\nthe way for future advancements in LiDAR pre-training.\n","authors":["Xiangchao Yan","Runjian Chen","Bo Zhang","Hancheng Ye","Renqiu Xia","Jiakang Yuan","Hongbin Zhou","Xinyu Cai","Botian Shi","Wenqi Shao","Ping Luo","Yu Qiao","Tao Chen","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2309.10527v3.pdf","comment":"15 pages, 8 figures, Code is available at\n https://github.com/PJLab-ADG/3DTrans"},{"id":"http://arxiv.org/abs/2407.17956v1","updated":"2024-07-25T11:22:54Z","published":"2024-07-25T11:22:54Z","title":"SaccadeDet: A Novel Dual-Stage Architecture for Rapid and Accurate\n Detection in Gigapixel Images","summary":" The advancement of deep learning in object detection has predominantly\nfocused on megapixel images, leaving a critical gap in the efficient processing\nof gigapixel images. These super high-resolution images present unique\nchallenges due to their immense size and computational demands. To address\nthis, we introduce 'SaccadeDet', an innovative architecture for gigapixel-level\nobject detection, inspired by the human eye saccadic movement. The cornerstone\nof SaccadeDet is its ability to strategically select and process image regions,\ndramatically reducing computational load. This is achieved through a two-stage\nprocess: the 'saccade' stage, which identifies regions of probable interest,\nand the 'gaze' stage, which refines detection in these targeted areas. Our\napproach, evaluated on the PANDA dataset, not only achieves an 8x speed\nincrease over the state-of-the-art methods but also demonstrates significant\npotential in gigapixel-level pathology analysis through its application to\nWhole Slide Imaging.\n","authors":["Wenxi Li","Ruxin Zhang","Haozhe Lin","Yuchen Guo","Chao Ma","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.17956v1.pdf","comment":"This paper is accepted to ECML-PKDD 2024"},{"id":"http://arxiv.org/abs/2407.17954v1","updated":"2024-07-25T11:19:55Z","published":"2024-07-25T11:19:55Z","title":"Scaling Training Data with Lossy Image Compression","summary":" Empirically-determined scaling laws have been broadly successful in\npredicting the evolution of large machine learning models with training data\nand number of parameters. As a consequence, they have been useful for\noptimizing the allocation of limited resources, most notably compute time.\n In certain applications, storage space is an important constraint, and data\nformat needs to be chosen carefully as a consequence. Computer vision is a\nprominent example: images are inherently analog, but are always stored in a\ndigital format using a finite number of bits. Given a dataset of digital\nimages, the number of bits $L$ to store each of them can be further reduced\nusing lossy data compression. This, however, can degrade the quality of the\nmodel trained on such images, since each example has lower resolution.\n In order to capture this trade-off and optimize storage of training data, we\npropose a `storage scaling law' that describes the joint evolution of test\nerror with sample size and number of bits per image. We prove that this law\nholds within a stylized model for image compression, and verify it empirically\non two computer vision tasks, extracting the relevant parameters. We then show\nthat this law can be used to optimize the lossy compression level. 
At given\nstorage, models trained on optimally compressed images present a significantly\nsmaller test error with respect to models trained on the original data.\nFinally, we investigate the potential benefits of randomizing the compression\nlevel.\n","authors":["Katherine L. Mentzer","Andrea Montanari"],"pdf_url":"https://arxiv.org/pdf/2407.17954v1.pdf","comment":"21 pages, 27 figures"},{"id":"http://arxiv.org/abs/2407.17952v1","updated":"2024-07-25T11:16:37Z","published":"2024-07-25T11:16:37Z","title":"BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular\n Depth Estimation","summary":" By training over large-scale datasets, zero-shot monocular depth estimation\n(MDE) methods show robust performance in the wild but often suffer from\ninsufficiently precise details. Although recent diffusion-based MDE approaches\nexhibit appealing detail extraction ability, they still struggle in\ngeometrically challenging scenes due to the difficulty of gaining robust\ngeometric priors from diverse datasets. To leverage the complementary merits of\nboth worlds, we propose BetterDepth to efficiently achieve geometrically\ncorrect affine-invariant MDE performance while capturing fine-grained details.\nSpecifically, BetterDepth is a conditional diffusion-based refiner that takes\nthe prediction from pre-trained MDE models as depth conditioning, in which the\nglobal depth context is well-captured, and iteratively refines details based on\nthe input image. For the training of such a refiner, we propose global\npre-alignment and local patch masking methods to ensure the faithfulness of\nBetterDepth to depth conditioning while learning to capture fine-grained scene\ndetails. By efficient training on small-scale synthetic datasets, BetterDepth\nachieves state-of-the-art zero-shot MDE performance on diverse public datasets\nand in-the-wild scenes. Moreover, BetterDepth can improve the performance of\nother MDE models in a plug-and-play manner without additional re-training.\n","authors":["Xiang Zhang","Bingxin Ke","Hayko Riemenschneider","Nando Metzger","Anton Obukhov","Markus Gross","Konrad Schindler","Christopher Schroers"],"pdf_url":"https://arxiv.org/pdf/2407.17952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07720v3","updated":"2024-07-25T11:15:37Z","published":"2024-07-10T14:53:37Z","title":"SvANet: A Scale-variant Attention-based Network for Small Medical Object\n Segmentation","summary":" Early detection and accurate diagnosis can predict the risk of malignant\ndisease transformation, thereby increasing the probability of effective\ntreatment. A mild syndrome with small infected regions is an ominous warning\nand is foremost in the early diagnosis of diseases. Deep learning algorithms,\nsuch as convolutional neural networks (CNNs), have been used to segment natural\nor medical objects, showing promising results. However, analyzing medical\nobjects of small areas in images remains a challenge due to information losses\nand compression defects caused by convolution and pooling operations in CNNs.\nThese losses and defects become increasingly significant as the network\ndeepens, particularly for small medical objects. To address these challenges,\nwe propose a novel scale-variant attention-based network (SvANet) for accurate\nsmall-scale object segmentation in medical images. 
The SvANet consists of Monte\nCarlo attention, scale-variant attention, and vision transformer, which\nincorporates cross-scale features and alleviates compression artifacts for\nenhancing the discrimination of small medical objects. Quantitative\nexperimental results demonstrate the superior performance of SvANet, achieving\n96.12%, 96.11%, 89.79%, 84.15%, 80.25%, 73.05%, and 72.58% in mean Dice\ncoefficient for segmenting kidney tumors, skin lesions, hepatic tumors, polyps,\nsurgical excision cells, retinal vasculatures, and sperms, which occupy less\nthan 1% of the image areas in KiTS23, ISIC 2018, ATLAS, PolypGen, TissueNet,\nFIVES, and SpermHealth datasets, respectively.\n","authors":["Wei Dai","Rui Liu","Zixuan Wu","Tianyi Wu","Min Wang","Junxian Zhou","Yixuan Yuan","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2407.07720v3.pdf","comment":"14 pages, 9 figures, under review"},{"id":"http://arxiv.org/abs/2407.17950v1","updated":"2024-07-25T11:11:05Z","published":"2024-07-25T11:11:05Z","title":"Real Time American Sign Language Detection Using Yolo-v9","summary":" This paper focuses on real-time American Sign Language Detection. YOLO is a\nconvolutional neural network (CNN) based model, which was first released in\n2015. In recent years, it gained popularity for its real-time detection\ncapabilities. Our study specifically targets YOLO-v9 model, released in 2024.\nAs the model is newly introduced, not much work has been done on it, especially\nnot in Sign Language Detection. Our paper provides deep insight on how YOLO- v9\nworks and better than previous model.\n","authors":["Amna Imran","Meghana Shashishekhara Hulikal","Hamza A. A. Gardi"],"pdf_url":"https://arxiv.org/pdf/2407.17950v1.pdf","comment":"11 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2407.17938v1","updated":"2024-07-25T10:55:19Z","published":"2024-07-25T10:55:19Z","title":"Analyzing Brain Tumor Connectomics using Graphs and Persistent Homology","summary":" Recent advances in molecular and genetic research have identified a diverse\nrange of brain tumor sub-types, shedding light on differences in their\nmolecular mechanisms, heterogeneity, and origins. The present study performs\nwhole-brain connectome analysis using diffusionweighted images. To achieve\nthis, both graph theory and persistent homology - a prominent approach in\ntopological data analysis are employed in order to quantify changes in the\nstructural connectivity of the wholebrain connectome in subjects with brain\ntumors. Probabilistic tractography is used to map the number of streamlines\nconnecting 84 distinct brain regions, as delineated by the Desikan-Killiany\natlas from FreeSurfer. These streamline mappings form the connectome matrix, on\nwhich persistent homology based analysis and graph theoretical analysis are\nexecuted to evaluate the discriminatory power between tumor sub-types that\ninclude meningioma and glioma. A detailed statistical analysis is conducted on\npersistent homology-derived topological features and graphical features to\nidentify the brain regions where differences between study groups are\nstatistically significant (p < 0.05). For classification purpose, graph-based\nlocal features are utilized, achieving a highest accuracy of 88%. In\nclassifying tumor sub-types, an accuracy of 80% is attained. 
The findings\nobtained from this study underscore the potential of persistent homology and\ngraph theoretical analysis of the whole-brain connectome in detecting\nalterations in structural connectivity patterns specific to different types of\nbrain tumors.\n","authors":["Debanjali Bhattacharya","Ninad Aithal","Manish Jayswal","Neelam Sinha"],"pdf_url":"https://arxiv.org/pdf/2407.17938v1.pdf","comment":"15 Pages, 7 Figures, 2 Tables, TGI3-MICCAI Workshop"},{"id":"http://arxiv.org/abs/2407.13842v2","updated":"2024-07-25T10:51:19Z","published":"2024-07-18T18:24:51Z","title":"Language-Driven 6-DoF Grasp Detection Using Negative Prompt Guidance","summary":" 6-DoF grasp detection has been a fundamental and challenging problem in\nrobotic vision. While previous works have focused on ensuring grasp stability,\nthey often do not consider human intention conveyed through natural language,\nhindering effective collaboration between robots and users in complex 3D\nenvironments. In this paper, we present a new approach for language-driven\n6-DoF grasp detection in cluttered point clouds. We first introduce\nGrasp-Anything-6D, a large-scale dataset for the language-driven 6-DoF grasp\ndetection task with 1M point cloud scenes and more than 200M\nlanguage-associated 3D grasp poses. We further introduce a novel diffusion\nmodel that incorporates a new negative prompt guidance learning strategy. The\nproposed negative prompt strategy directs the detection process toward the\ndesired object while steering away from unwanted ones given the language input.\nOur method enables an end-to-end framework where humans can command the robot\nto grasp desired objects in a cluttered scene using natural language. Intensive\nexperimental results show the effectiveness of our method in both benchmarking\nexperiments and real-world scenarios, surpassing other baselines. In addition,\nwe demonstrate the practicality of our approach in real-world robotic\napplications. Our project is available at\nhttps://airvlab.github.io/grasp-anything.\n","authors":["Toan Nguyen","Minh Nhat Vu","Baoru Huang","An Vuong","Quan Vuong","Ngan Le","Thieu Vo","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.13842v2.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.17933v1","updated":"2024-07-25T10:46:29Z","published":"2024-07-25T10:46:29Z","title":"Segmentation by registration-enabled SAM prompt engineering using five\n reference images","summary":" The recently proposed Segment Anything Model (SAM) is a general tool for\nimage segmentation, but it requires additional adaptation and careful\nfine-tuning for medical image segmentation, especially for small,\nirregularly-shaped, and boundary-ambiguous anatomical structures such as the\nknee cartilage that is of interest in this work. Repaired cartilage, after\ncertain surgical procedures, exhibits imaging patterns unseen to pre-training,\nposing further challenges for using models like SAM with or without\ngeneral-purpose fine-tuning. To address this, we propose a novel\nregistration-based prompt engineering framework for medical image segmentation\nusing SAM. This approach utilises established image registration algorithms to\nalign the new image (to-be-segmented) and a small number of reference images,\nwithout requiring segmentation labels. The spatial transformations generated by\nregistration align either the new image or pre-defined point-based prompts,\nbefore using them as input to SAM. 
This strategy, requiring as few as five\nreference images with defined point prompts, effectively prompts SAM for\ninference on new images, without needing any segmentation labels. Evaluation of\nMR images from patients who received cartilage stem cell therapy yielded Dice\nscores of 0.89, 0.87, 0.53, and 0.52 for segmenting femur, tibia, femoral- and\ntibial cartilages, respectively. This outperforms atlas-based label fusion and\nis comparable to supervised nnUNet, an upper-bound fair baseline in this\napplication, both of which require full segmentation labels for reference\nsamples. The codes are available at:\nhttps://github.com/chrissyinreallife/KneeSegmentWithSAM.git\n","authors":["Yaxi Chen","Aleksandra Ivanova","Shaheer U. Saeed","Rikin Hargunani","Jie Huang","Chaozong Liu","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2407.17933v1.pdf","comment":"Accepted to the 11th International Workshop on Biomedical Image\n Registration (WBIR 2024)"},{"id":"http://arxiv.org/abs/2407.17929v1","updated":"2024-07-25T10:38:32Z","published":"2024-07-25T10:38:32Z","title":"Guided Latent Slot Diffusion for Object-Centric Learning","summary":" Slot attention aims to decompose an input image into a set of meaningful\nobject files (slots). These latent object representations enable various\ndownstream tasks. Yet, these slots often bind to object parts, not objects\nthemselves, especially for real-world datasets. To address this, we introduce\nGuided Latent Slot Diffusion - GLASS, an object-centric model that uses\ngenerated captions as a guiding signal to better align slots with objects. Our\nkey insight is to learn the slot-attention module in the space of generated\nimages. This allows us to repurpose the pre-trained diffusion decoder model,\nwhich reconstructs the images from the slots, as a semantic mask generator\nbased on the generated captions. GLASS learns an object-level representation\nsuitable for multiple tasks simultaneously, e.g., segmentation, image\ngeneration, and property prediction, outperforming previous methods. For object\ndiscovery, GLASS achieves approx. a +35% and +10% relative improvement for mIoU\nover the previous state-of-the-art (SOTA) method on the VOC and COCO datasets,\nrespectively, and establishes a new SOTA FID score for conditional image\ngeneration amongst slot-attention-based methods. For the segmentation task,\nGLASS surpasses SOTA weakly-supervised and language-based segmentation models,\nwhich were specifically designed for the task.\n","authors":["Krishnakant Singh","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2407.17929v1.pdf","comment":"Project Page: https://guided-sa.github.io"},{"id":"http://arxiv.org/abs/2407.17927v1","updated":"2024-07-25T10:24:54Z","published":"2024-07-25T10:24:54Z","title":"Invariance of deep image quality metrics to affine transformations","summary":" Deep architectures are the current state-of-the-art in predicting subjective\nimage quality. Usually, these models are evaluated according to their ability\nto correlate with human opinion in databases with a range of distortions that\nmay appear in digital media. However, these oversee affine transformations\nwhich may represent better the changes in the images actually happening in\nnatural conditions. Humans can be particularly invariant to these natural\ntransformations, as opposed to the digital ones. 
In this work, we evaluate\nstate-of-the-art deep image quality metrics by assessing their invariance to\naffine transformations, specifically: rotation, translation, scaling, and\nchanges in spectral illumination. We propose a methodology to assign\ninvisibility thresholds for any perceptual metric. This methodology involves\ntransforming the distance measured by an arbitrary metric to a common distance\nrepresentation based on available subjectively rated databases. We\npsychophysically measure an absolute detection threshold in that common\nrepresentation and express it in the physical units of each affine transform\nfor each metric. By doing so, we allow the analyzed metrics to be directly\ncomparable with actual human thresholds. We find that none of the\nstate-of-the-art metrics shows human-like results under this strong test based\non invisibility thresholds. This means that tuning the models exclusively to\npredict the visibility of generic distortions may disregard other properties of\nhuman vision as for instance invariances or invisibility thresholds.\n","authors":["Nuria Alabau-Bosque","Paula Daudén-Oliver","Jorge Vila-Tomás","Valero Laparra","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2407.17927v1.pdf","comment":"12 pages 13 figures"},{"id":"http://arxiv.org/abs/2402.16598v4","updated":"2024-07-25T10:10:45Z","published":"2024-02-26T14:28:39Z","title":"PCR-99: A Practical Method for Point Cloud Registration with 99%\n Outliers","summary":" We propose a robust method for point cloud registration that can handle both\nunknown scales and extreme outlier ratios. Our method, dubbed PCR-99, uses a\ndeterministic 3-point sampling approach with two novel mechanisms that\nsignificantly boost the speed: (1) an improved ordering of the samples based on\npairwise scale consistency, prioritizing the point correspondences that are\nmore likely to be inliers, and (2) an efficient outlier rejection scheme based\non triplet scale consistency, prescreening bad samples and reducing the number\nof hypotheses to be tested. Our evaluation shows that, up to 98% outlier ratio,\nthe proposed method achieves comparable performance to the state of the art. At\n99% outlier ratio, however, it outperforms the state of the art for both\nknown-scale and unknown-scale problems. Especially for the latter, we observe a\nclear superiority in terms of robustness and speed.\n","authors":["Seong Hun Lee","Javier Civera","Patrick Vandewalle"],"pdf_url":"https://arxiv.org/pdf/2402.16598v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17911v1","updated":"2024-07-25T10:06:26Z","published":"2024-07-25T10:06:26Z","title":"ReCorD: Reasoning and Correcting Diffusion for HOI Generation","summary":" Diffusion models revolutionize image generation by leveraging natural\nlanguage to guide the creation of multimedia content. Despite significant\nadvancements in such generative models, challenges persist in depicting\ndetailed human-object interactions, especially regarding pose and object\nplacement accuracy. We introduce a training-free method named Reasoning and\nCorrecting Diffusion (ReCorD) to address these challenges. Our model couples\nLatent Diffusion Models with Visual Language Models to refine the generation\nprocess, ensuring precise depictions of HOIs. We propose an interaction-aware\nreasoning module to improve the interpretation of the interaction, along with\nan interaction correcting module to refine the output image for more precise\nHOI generation delicately. 
Through a meticulous process of pose selection and\nobject positioning, ReCorD achieves superior fidelity in generated images while\nefficiently reducing computational requirements. We conduct comprehensive\nexperiments on three benchmarks to demonstrate the significant progress in\nsolving text-to-image generation tasks, showcasing ReCorD's ability to render\ncomplex interactions accurately by outperforming existing methods in HOI\nclassification score, as well as FID and Verb CLIP-Score. Project website is\navailable at https://alberthkyhky.github.io/ReCorD/ .\n","authors":["Jian-Yu Jiang-Lin","Kang-Yang Huang","Ling Lo","Yi-Ning Huang","Terence Lin","Jhih-Ciang Wu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.17911v1.pdf","comment":"Accepted by ACM MM 2024. Project website:\n https://alberthkyhky.github.io/ReCorD/"},{"id":"http://arxiv.org/abs/2403.11761v2","updated":"2024-07-25T10:02:18Z","published":"2024-03-18T13:14:46Z","title":"BEVCar: Camera-Radar Fusion for BEV Map and Object Segmentation","summary":" Semantic scene segmentation from a bird's-eye-view (BEV) perspective plays a\ncrucial role in facilitating planning and decision-making for mobile robots.\nAlthough recent vision-only methods have demonstrated notable advancements in\nperformance, they often struggle under adverse illumination conditions such as\nrain or nighttime. While active sensors offer a solution to this challenge, the\nprohibitively high cost of LiDARs remains a limiting factor. Fusing camera data\nwith automotive radars poses a more inexpensive alternative but has received\nless attention in prior research. In this work, we aim to advance this\npromising avenue by introducing BEVCar, a novel approach for joint BEV object\nand map segmentation. The core novelty of our approach lies in first learning a\npoint-based encoding of raw radar data, which is then leveraged to efficiently\ninitialize the lifting of image features into the BEV space. We perform\nextensive experiments on the nuScenes dataset and demonstrate that BEVCar\noutperforms the current state of the art. Moreover, we show that incorporating\nradar information significantly enhances robustness in challenging\nenvironmental conditions and improves segmentation performance for distant\nobjects. To foster future research, we provide the weather split of the\nnuScenes dataset used in our experiments, along with our code and trained\nmodels at http://bevcar.cs.uni-freiburg.de.\n","authors":["Jonas Schramm","Niclas Vödisch","Kürsat Petek","B Ravi Kiran","Senthil Yogamani","Wolfram Burgard","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2403.11761v2.pdf","comment":"Accepted for the IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS), 2024"},{"id":"http://arxiv.org/abs/2407.17909v1","updated":"2024-07-25T10:00:21Z","published":"2024-07-25T10:00:21Z","title":"Separating Novel Features for Logical Anomaly Detection: A\n Straightforward yet Effective Approach","summary":" Vision-based inspection algorithms have significantly contributed to quality\ncontrol in industrial settings, particularly in addressing structural defects\nlike dent and contamination which are prevalent in mass production. Extensive\nresearch efforts have led to the development of related benchmarks such as\nMVTec AD (Bergmann et al., 2019). However, in industrial settings, there can be\ninstances of logical defects, where acceptable items are found in unsuitable\nlocations or product pairs do not match as expected. 
Recent methods tackling\nlogical defects effectively employ knowledge distillation to generate\ndifference maps. Knowledge distillation (KD) is used to learn normal data\ndistribution in unsupervised manner. Despite their effectiveness, these methods\noften overlook the potential false negatives. Excessive similarity between the\nteacher network and student network can hinder the generation of a suitable\ndifference map for logical anomaly detection. This technical report provides\ninsights on handling potential false negatives by utilizing a simple constraint\nin KD-based logical anomaly detection methods. We select EfficientAD as a\nstate-of-the-art baseline and apply a margin-based constraint to its\nunsupervised learning scheme. Applying this constraint, we can improve the\nAUROC for MVTec LOCO AD by 1.3 %.\n","authors":["Kangil Lee","Geonuk Kim"],"pdf_url":"https://arxiv.org/pdf/2407.17909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17907v1","updated":"2024-07-25T09:53:12Z","published":"2024-07-25T09:53:12Z","title":"Amortized Posterior Sampling with Diffusion Prior Distillation","summary":" We propose a variational inference approach to sample from the posterior\ndistribution for solving inverse problems. From a pre-trained diffusion model,\nour approach trains a conditional flow model to minimize the divergence between\nthe proposal variational distribution and the posterior distribution implicitly\ndefined through the diffusion model. Once trained, the flow model is capable of\nsampling from the posterior distribution with a single NFE, amortized with\nrespect to the measurement. The proposed method paves a new path for distilling\na diffusion prior for efficient posterior sampling. We show that our method is\napplicable to standard signals in Euclidean space, as well as signals on\nmanifold.\n","authors":["Abbas Mammadov","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2407.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17906v1","updated":"2024-07-25T09:51:14Z","published":"2024-07-25T09:51:14Z","title":"Hierarchical Object Detection and Recognition Framework for Practical\n Plant Disease Diagnosis","summary":" Recently, object detection methods (OD; e.g., YOLO-based models) have been\nwidely utilized in plant disease diagnosis. These methods demonstrate\nrobustness to distance variations and excel at detecting small lesions compared\nto classification methods (CL; e.g., CNN models). However, there are issues\nsuch as low diagnostic performance for hard-to-detect diseases and high\nlabeling costs. Additionally, since healthy cases cannot be explicitly trained,\nthere is a risk of false positives. We propose the Hierarchical object\ndetection and recognition framework (HODRF), a sophisticated and highly\nintegrated two-stage system that combines the strengths of both OD and CL for\nplant disease diagnosis. In the first stage, HODRF uses OD to identify regions\nof interest (ROIs) without specifying the disease. In the second stage, CL\ndiagnoses diseases surrounding the ROIs. HODRF offers several advantages: (1)\nSince OD detects only one type of ROI, HODRF can detect diseases with limited\ntraining images by leveraging its ability to identify other lesions. (2) While\nOD over-detects healthy cases, HODRF significantly reduces these errors by\nusing CL in the second stage. (3) CL's accuracy improves in HODRF as it\nidentifies diagnostic targets given as ROIs, making it less vulnerable to size\nchanges. 
(4) HODRF benefits from CL's lower annotation costs, allowing it to\nlearn from a larger number of images. We implemented HODRF using YOLOv7 for OD\nand EfficientNetV2 for CL and evaluated its performance on a large-scale\ndataset (4 crops, 20 diseased and healthy classes, 281K images). HODRF\noutperformed YOLOv7 alone by 5.8 to 21.5 points on healthy data and 0.6 to 7.5\npoints on macro F1 scores, and it improved macro F1 by 1.1 to 7.2 points over\nEfficientNetV2.\n","authors":["Kohei Iwano","Shogo Shibuya","Satoshi Kagiwada","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2407.17906v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.17905v1","updated":"2024-07-25T09:51:09Z","published":"2024-07-25T09:51:09Z","title":"StreamMOS: Streaming Moving Object Segmentation with Multi-View\n Perception and Dual-Span Memory","summary":" Moving object segmentation based on LiDAR is a crucial and challenging task\nfor autonomous driving and mobile robotics. Most approaches explore\nspatio-temporal information from LiDAR sequences to predict moving objects in\nthe current frame. However, they often focus on transferring temporal cues in a\nsingle inference and regard every prediction as independent of others. This may\ncause inconsistent segmentation results for the same object in different\nframes. To overcome this issue, we propose a streaming network with a memory\nmechanism, called StreamMOS, to build the association of features and\npredictions among multiple inferences. Specifically, we utilize a short-term\nmemory to convey historical features, which can be regarded as spatial prior of\nmoving objects and adopted to enhance current inference by temporal fusion.\nMeanwhile, we build a long-term memory to store previous predictions and\nexploit them to refine the present forecast at voxel and instance levels\nthrough voting. Besides, we present multi-view encoder with cascade projection\nand asymmetric convolution to extract motion feature of objects in different\nrepresentations. Extensive experiments validate that our algorithm gets\ncompetitive performance on SemanticKITTI and Sipailou Campus datasets. Code\nwill be released at https://github.com/NEU-REAL/StreamMOS.git.\n","authors":["Zhiheng Li","Yubo Cui","Jiexi Zhong","Zheng Fang"],"pdf_url":"https://arxiv.org/pdf/2407.17905v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.10870v2","updated":"2024-07-25T09:51:06Z","published":"2024-05-17T16:01:11Z","title":"Multicenter Privacy-Preserving Model Training for Deep Learning Brain\n Metastases Autosegmentation","summary":" Objectives: This work aims to explore the impact of multicenter data\nheterogeneity on deep learning brain metastases (BM) autosegmentation\nperformance, and assess the efficacy of an incremental transfer learning\ntechnique, namely learning without forgetting (LWF), to improve model\ngeneralizability without sharing raw data.\n Materials and methods: A total of six BM datasets from University Hospital\nErlangen (UKER), University Hospital Zurich (USZ), Stanford, UCSF, NYU and\nBraTS Challenge 2023 on BM segmentation were used for this evaluation. First,\nthe multicenter performance of a convolutional neural network (DeepMedic) for\nBM autosegmentation was established for exclusive single-center training and\nfor training on pooled data, respectively. 
Subsequently bilateral collaboration\nwas evaluated, where a UKER pretrained model is shared to another center for\nfurther training using transfer learning (TL) either with or without LWF.\n Results: For single-center training, average F1 scores of BM detection range\nfrom 0.625 (NYU) to 0.876 (UKER) on respective single-center test data. Mixed\nmulticenter training notably improves F1 scores at Stanford and NYU, with\nnegligible improvement at other centers. When the UKER pretrained model is\napplied to USZ, LWF achieves a higher average F1 score (0.839) than naive TL\n(0.570) and single-center training (0.688) on combined UKER and USZ test data.\nNaive TL improves sensitivity and contouring accuracy, but compromises\nprecision. Conversely, LWF demonstrates commendable sensitivity, precision and\ncontouring accuracy. When applied to Stanford, similar performance was\nobserved.\n Conclusion: Data heterogeneity results in varying performance in BM\nautosegmentation, posing challenges to model generalizability. LWF is a\npromising approach to peer-to-peer privacy-preserving model training.\n","authors":["Yixing Huang","Zahra Khodabakhshi","Ahmed Gomaa","Manuel Schmidt","Rainer Fietkau","Matthias Guckenberger","Nicolaus Andratschke","Christoph Bert","Stephanie Tanadini-Lang","Florian Putz"],"pdf_url":"https://arxiv.org/pdf/2405.10870v2.pdf","comment":"Official published version in the Green Journal:\n https://doi.org/10.1016/j.radonc.2024.110419"},{"id":"http://arxiv.org/abs/2407.17324v2","updated":"2024-07-25T09:50:03Z","published":"2024-07-24T14:48:40Z","title":"Enhanced Deep Learning Methodologies and MRI Selection Techniques for\n Dementia Diagnosis in the Elderly Population","summary":" Dementia, a debilitating neurological condition affecting millions worldwide,\npresents significant diagnostic challenges. In this work, we introduce a novel\nmethodology for the classification of demented and non-demented elderly\npatients using 3D brain Magnetic Resonance Imaging (MRI) scans. Our approach\nfeatures a unique technique for selectively processing MRI slices, focusing on\nthe most relevant brain regions and excluding less informative sections. This\nmethodology is complemented by a confidence-based classification committee\ncomposed of three custom deep learning models: Dem3D ResNet, Dem3D CNN, and\nDem3D EfficientNet. These models work synergistically to enhance\ndecision-making accuracy, leveraging their collective strengths. Tested on the\nOpen Access Series of Imaging Studies(OASIS) dataset, our method achieved an\nimpressive accuracy of 94.12%, surpassing existing methodologies. Furthermore,\nvalidation on the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset\nconfirmed the robustness and generalizability of our approach. The use of\nexplainable AI (XAI) techniques and comprehensive ablation studies further\nsubstantiate the effectiveness of our techniques, providing insights into the\ndecision-making process and the importance of our methodology. 
This research\noffers a significant advancement in dementia diagnosis, providing a highly\naccurate and efficient tool for clinical applications.\n","authors":["Nikolaos Ntampakis","Konstantinos Diamantaras","Ioanna Chouvarda","Vasileios Argyriou","Panagiotis Sarigianndis"],"pdf_url":"https://arxiv.org/pdf/2407.17324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17904v1","updated":"2024-07-25T09:49:04Z","published":"2024-07-25T09:49:04Z","title":"Exploring the Effect of Dataset Diversity in Self-Supervised Learning\n for Surgical Computer Vision","summary":" Over the past decade, computer vision applications in minimally invasive\nsurgery have rapidly increased. Despite this growth, the impact of surgical\ncomputer vision remains limited compared to other medical fields like pathology\nand radiology, primarily due to the scarcity of representative annotated data.\nWhereas transfer learning from large annotated datasets such as ImageNet has\nbeen conventionally the norm to achieve high-performing models, recent\nadvancements in self-supervised learning (SSL) have demonstrated superior\nperformance. In medical image analysis, in-domain SSL pretraining has already\nbeen shown to outperform ImageNet-based initialization. Although unlabeled data\nin the field of surgical computer vision is abundant, the diversity within this\ndata is limited. This study investigates the role of dataset diversity in SSL\nfor surgical computer vision, comparing procedure-specific datasets against a\nmore heterogeneous general surgical dataset across three different downstream\nsurgical applications. The obtained results show that using solely\nprocedure-specific data can lead to substantial improvements of 13.8%, 9.5%,\nand 36.8% compared to ImageNet pretraining. However, extending this data with\nmore heterogeneous surgical data further increases performance by an additional\n5.0%, 5.2%, and 2.5%, suggesting that increasing diversity within SSL data is\nbeneficial for model performance. The code and pretrained model weights are\nmade publicly available at https://github.com/TimJaspers0801/SurgeNet.\n","authors":["Tim J. M. Jaspers","Ronald L. P. D. de Jong","Yasmina Al Khalil","Tijn Zeelenberg","Carolus H. J. Kusters","Yiping Li","Romy C. van Jaarsveld","Franciscus H. A. Bakker","Jelle P. Ruurda","Willem M. Brinkman","Peter H. N. De With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2407.17904v1.pdf","comment":"accepted - Data Engineering in Medical Imaging (DEMI) Workshop @\n MICCAI2024"},{"id":"http://arxiv.org/abs/2405.16813v2","updated":"2024-07-25T09:47:05Z","published":"2024-05-27T04:14:20Z","title":"SiNGR: Brain Tumor Segmentation via Signed Normalized Geodesic Transform\n Regression","summary":" One of the primary challenges in brain tumor segmentation arises from the\nuncertainty of voxels close to tumor boundaries. However, the conventional\nprocess of generating ground truth segmentation masks fails to treat such\nuncertainties properly. Those \"hard labels\" with 0s and 1s conceptually\ninfluenced the majority of prior studies on brain image segmentation. As a\nresult, tumor segmentation is often solved through voxel classification. In\nthis work, we instead view this problem as a voxel-level regression, where the\nground truth represents a certainty mapping from any pixel to the border of the\ntumor. We propose a novel ground truth label transformation, which is based on\na signed geodesic transform, to capture the uncertainty in brain tumors'\nvicinity. 
We combine this idea with a Focal-like regression L1-loss that\nenables effective regression learning in high-dimensional output space by\nappropriately weighting voxels according to their difficulty. We thoroughly\nconduct an experimental evaluation to validate the components of our proposed\nmethod, compare it to a diverse array of state-of-the-art segmentation models,\nand show that it is architecture-agnostic. The code of our method is made\npublicly available (\\url{https://github.com/Oulu-IMEDS/SiNGR/}).\n","authors":["Trung Dang","Huy Hoang Nguyen","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2405.16813v2.pdf","comment":"Accepted as a conference paper at MICCAI 2024"},{"id":"http://arxiv.org/abs/2405.07987v5","updated":"2024-07-25T09:33:50Z","published":"2024-05-13T17:58:30Z","title":"The Platonic Representation Hypothesis","summary":" We argue that representations in AI models, particularly deep networks, are\nconverging. First, we survey many examples of convergence in the literature:\nover time and across multiple domains, the ways by which different neural\nnetworks represent data are becoming more aligned. Next, we demonstrate\nconvergence across data modalities: as vision models and language models get\nlarger, they measure distance between datapoints in a more and more alike way.\nWe hypothesize that this convergence is driving toward a shared statistical\nmodel of reality, akin to Plato's concept of an ideal reality. We term such a\nrepresentation the platonic representation and discuss several possible\nselective pressures toward it. Finally, we discuss the implications of these\ntrends, their limitations, and counterexamples to our analysis.\n","authors":["Minyoung Huh","Brian Cheung","Tongzhou Wang","Phillip Isola"],"pdf_url":"https://arxiv.org/pdf/2405.07987v5.pdf","comment":"Equal contributions. Project: https://phillipi.github.io/prh/ Code:\n https://github.com/minyoungg/platonic-rep"},{"id":"http://arxiv.org/abs/2405.16815v2","updated":"2024-07-25T09:29:36Z","published":"2024-05-27T04:17:10Z","title":"Image-level Regression for Uncertainty-aware Retinal Image Segmentation","summary":" Accurate retinal vessel (RV) segmentation is a crucial step in the\nquantitative assessment of retinal vasculature, which is needed for the early\ndetection of retinal diseases and other conditions. Numerous studies have been\nconducted to tackle the problem of segmenting vessels automatically using a\npixel-wise classification approach. The common practice of creating ground\ntruth labels is to categorize pixels as foreground and background. This\napproach is, however, biased, and it ignores the uncertainty of a human\nannotator when it comes to annotating e.g. thin vessels. In this work, we\npropose a simple and effective method that casts the RV segmentation task as an\nimage-level regression. For this purpose, we first introduce a novel\nSegmentation Annotation Uncertainty-Aware (SAUNA) transform, which adds pixel\nuncertainty to the ground truth using the pixel's closeness to the annotation\nboundary and vessel thickness. To train our model with soft labels, we\ngeneralize the earlier proposed Jaccard metric loss to arbitrary hypercubes for\nsoft Jaccard index (Intersection-over-Union) optimization. Additionally, we\nemploy a stable version of the Focal-L1 loss for pixel-wise regression. We\nconduct thorough experiments and compare our method to a diverse set of\nbaselines across 5 retinal image datasets. 
Our empirical results indicate that\nthe integration of the SAUNA transform and these segmentation losses led to\nsignificant performance boosts for different segmentation models. Particularly,\nour methodology enables UNet-like architectures to substantially outperform\ncomputational-intensive baselines. Our implementation is available at\n\\url{https://github.com/Oulu-IMEDS/SAUNA}.\n","authors":["Trung Dang","Huy Hoang Nguyen","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2405.16815v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2407.17229v2","updated":"2024-07-25T09:29:21Z","published":"2024-07-24T12:32:24Z","title":"LPGen: Enhancing High-Fidelity Landscape Painting Generation through\n Diffusion Model","summary":" Generating landscape paintings expands the possibilities of artistic\ncreativity and imagination. Traditional landscape painting methods involve\nusing ink or colored ink on rice paper, which requires substantial time and\neffort. These methods are susceptible to errors and inconsistencies and lack\nprecise control over lines and colors. This paper presents LPGen, a\nhigh-fidelity, controllable model for landscape painting generation,\nintroducing a novel multi-modal framework that integrates image prompts into\nthe diffusion model. We extract its edges and contours by computing canny edges\nfrom the target landscape image. These, along with natural language text\nprompts and drawing style references, are fed into the latent diffusion model\nas conditions. We implement a decoupled cross-attention strategy to ensure\ncompatibility between image and text prompts, facilitating multi-modal image\ngeneration. A decoder generates the final image. Quantitative and qualitative\nanalyses demonstrate that our method outperforms existing approaches in\nlandscape painting generation and exceeds the current state-of-the-art. The\nLPGen network effectively controls the composition and color of landscape\npaintings, generates more accurate images, and supports further research in\ndeep learning-based landscape painting generation.\n","authors":["Wanggong Yang","Xiaona Wang","Yingrui Qiu","Yifei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.17229v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14596v3","updated":"2024-07-25T09:20:22Z","published":"2023-06-26T11:13:22Z","title":"Deep Learning for Cancer Prognosis Prediction Using Portrait Photos by\n StyleGAN Embedding","summary":" Survival prediction for cancer patients is critical for optimal treatment\nselection and patient management. Current patient survival prediction methods\ntypically extract survival information from patients' clinical record data or\nbiological and imaging data. In practice, experienced clinicians can have a\npreliminary assessment of patients' health status based on patients' observable\nphysical appearances, which are mainly facial features. However, such\nassessment is highly subjective. In this work, the efficacy of objectively\ncapturing and using prognostic information contained in conventional portrait\nphotographs using deep learning for survival predication purposes is\ninvestigated for the first time. A pre-trained StyleGAN2 model is fine-tuned on\na custom dataset of our cancer patients' photos to empower its generator with\ngenerative ability suitable for patients' photos. The StyleGAN2 is then used to\nembed the photographs to its highly expressive latent space. 
Utilizing the\nstate-of-the-art survival analysis models and based on StyleGAN's latent space\nphoto embeddings, this approach achieved a C-index of 0.677, which is notably\nhigher than chance and evidencing the prognostic value embedded in simple 2D\nfacial images. In addition, thanks to StyleGAN's interpretable latent space,\nour survival prediction model can be validated for relying on essential facial\nfeatures, eliminating any biases from extraneous information like clothing or\nbackground. Moreover, a health attribute is obtained from regression\ncoefficients, which has important potential value for patient care.\n","authors":["Amr Hagag","Ahmed Gomaa","Dominik Kornek","Andreas Maier","Rainer Fietkau","Christoph Bert","Florian Putz","Yixing Huang"],"pdf_url":"https://arxiv.org/pdf/2306.14596v3.pdf","comment":"MICCAI 2024 Early Accept"},{"id":"http://arxiv.org/abs/2312.07537v2","updated":"2024-07-25T09:10:52Z","published":"2023-12-12T18:59:16Z","title":"FreeInit: Bridging Initialization Gap in Video Diffusion Models","summary":" Though diffusion-based video generation has witnessed rapid progress, the\ninference results of existing models still exhibit unsatisfactory temporal\nconsistency and unnatural dynamics. In this paper, we delve deep into the noise\ninitialization of video diffusion models, and discover an implicit\ntraining-inference gap that attributes to the unsatisfactory inference\nquality.Our key findings are: 1) the spatial-temporal frequency distribution of\nthe initial noise at inference is intrinsically different from that for\ntraining, and 2) the denoising process is significantly influenced by the\nlow-frequency components of the initial noise. Motivated by these observations,\nwe propose a concise yet effective inference sampling strategy, FreeInit, which\nsignificantly improves temporal consistency of videos generated by diffusion\nmodels. Through iteratively refining the spatial-temporal low-frequency\ncomponents of the initial latent during inference, FreeInit is able to\ncompensate the initialization gap between training and inference, thus\neffectively improving the subject appearance and temporal consistency of\ngeneration results. Extensive experiments demonstrate that FreeInit\nconsistently enhances the generation quality of various text-to-video diffusion\nmodels without additional training or fine-tuning.\n","authors":["Tianxing Wu","Chenyang Si","Yuming Jiang","Ziqi Huang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.07537v2.pdf","comment":"Project page: https://tianxingwu.github.io/pages/FreeInit/ Code:\n https://github.com/TianxingWu/FreeInit"},{"id":"http://arxiv.org/abs/2407.17877v1","updated":"2024-07-25T08:47:27Z","published":"2024-07-25T08:47:27Z","title":"Advancing 3D Point Cloud Understanding through Deep Transfer Learning: A\n Comprehensive Survey","summary":" The 3D point cloud (3DPC) has significantly evolved and benefited from the\nadvance of deep learning (DL). However, the latter faces various issues,\nincluding the lack of data or annotated data, the existence of a significant\ngap between training data and test data, and the requirement for high\ncomputational resources. To that end, deep transfer learning (DTL), which\ndecreases dependency and costs by utilizing knowledge gained from a source\ndata/task in training a target data/task, has been widely investigated.\nNumerous DTL frameworks have been suggested for aligning point clouds obtained\nfrom several scans of the same scene. 
Additionally, DA, which is a subset of\nDTL, has been modified to enhance the point cloud data's quality by dealing\nwith noise and missing points. Ultimately, fine-tuning and DA approaches have\ndemonstrated their effectiveness in addressing the distinct difficulties\ninherent in point cloud data. This paper presents the first review shedding\nlight on this aspect. it provides a comprehensive overview of the latest\ntechniques for understanding 3DPC using DTL and domain adaptation (DA).\nAccordingly, DTL's background is first presented along with the datasets and\nevaluation metrics. A well-defined taxonomy is introduced, and detailed\ncomparisons are presented, considering different aspects such as different\nknowledge transfer strategies, and performance. The paper covers various\napplications, such as 3DPC object detection, semantic labeling, segmentation,\nclassification, registration, downsampling/upsampling, and denoising.\nFurthermore, the article discusses the advantages and limitations of the\npresented frameworks, identifies open challenges, and suggests potential\nresearch directions.\n","authors":["Shahab Saquib Sohail","Yassine Himeur","Hamza Kheddar","Abbes Amira","Fodil Fadli","Shadi Atalla","Abigail Copiaco","Wathiq Mansoor"],"pdf_url":"https://arxiv.org/pdf/2407.17877v1.pdf","comment":"55 pages, 9 tables, and 15 figures"},{"id":"http://arxiv.org/abs/2309.12865v3","updated":"2024-07-25T08:32:15Z","published":"2023-09-22T13:39:24Z","title":"Bridging Sensor Gaps via Attention Gated Tuning for Hyperspectral Image\n Classification","summary":" Data-hungry HSI classification methods require high-quality labeled HSIs,\nwhich are often costly to obtain. This characteristic limits the performance\npotential of data-driven methods when dealing with limited annotated samples.\nBridging the domain gap between data acquired from different sensors allows us\nto utilize abundant labeled data across sensors to break this bottleneck. In\nthis paper, we propose a novel Attention-Gated Tuning (AGT) strategy and a\ntriplet-structured transformer model, Tri-Former, to address this issue. The\nAGT strategy serves as a bridge, allowing us to leverage existing labeled HSI\ndatasets, even RGB datasets to enhance the performance on new HSI datasets with\nlimited samples. Instead of inserting additional parameters inside the basic\nmodel, we train a lightweight auxiliary branch that takes intermediate features\nas input from the basic model and makes predictions. The proposed AGT resolves\nconflicts between heterogeneous and even cross-modal data by suppressing the\ndisturbing information and enhances the useful information through a soft gate.\nAdditionally, we introduce Tri-Former, a triplet-structured transformer with a\nspectral-spatial separation design that enhances parameter utilization and\ncomputational efficiency, enabling easier and flexible fine-tuning. Comparison\nexperiments conducted on three representative HSI datasets captured by\ndifferent sensors demonstrate the proposed Tri-Former achieves better\nperformance compared to several state-of-the-art methods. Homologous,\nheterologous and cross-modal tuning experiments verified the effectiveness of\nthe proposed AGT. 
Code has been released at:\n\\href{https://github.com/Cecilia-xue/AGT}{https://github.com/Cecilia-xue/AGT}.\n","authors":["Xizhe Xue","Haokui Zhang","Zongwen Bai","Ying Li"],"pdf_url":"https://arxiv.org/pdf/2309.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17857v1","updated":"2024-07-25T08:22:30Z","published":"2024-07-25T08:22:30Z","title":"Mew: Multiplexed Immunofluorescence Image Analysis through an Efficient\n Multiplex Network","summary":" Recent advancements in graph-based approaches for multiplexed\nimmunofluorescence (mIF) images have significantly propelled the field forward,\noffering deeper insights into patient-level phenotyping. However, current\ngraph-based methodologies encounter two primary challenges: (1) Cellular\nHeterogeneity, where existing approaches fail to adequately address the\ninductive biases inherent in graphs, particularly the homophily characteristic\nobserved in cellular connectivity and; (2) Scalability, where handling cellular\ngraphs from high-dimensional images faces difficulties in managing a high\nnumber of cells. To overcome these limitations, we introduce Mew, a novel\nframework designed to efficiently process mIF images through the lens of\nmultiplex network. Mew innovatively constructs a multiplex network comprising\ntwo distinct layers: a Voronoi network for geometric information and a\nCell-type network for capturing cell-wise homogeneity. This framework equips a\nscalable and efficient Graph Neural Network (GNN), capable of processing the\nentire graph during training. Furthermore, Mew integrates an interpretable\nattention module that autonomously identifies relevant layers for image\nclassification. Extensive experiments on a real-world patient dataset from\nvarious institutions highlight Mew's remarkable efficacy and efficiency,\nmarking a significant advancement in mIF image analysis. The source code of Mew\ncan be found here: \\url{https://github.com/UNITES-Lab/Mew}\n","authors":["Sukwon Yun","Jie Peng","Alexandro E. Trevino","Chanyoung Park","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17857v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2403.06461v3","updated":"2024-07-25T08:21:31Z","published":"2024-03-11T06:56:08Z","title":"Reliable Spatial-Temporal Voxels For Multi-Modal Test-Time Adaptation","summary":" Multi-modal test-time adaptation (MM-TTA) is proposed to adapt models to an\nunlabeled target domain by leveraging the complementary multi-modal inputs in\nan online manner. Previous MM-TTA methods for 3D segmentation rely on\npredictions of cross-modal information in each input frame, while they ignore\nthe fact that predictions of geometric neighborhoods within consecutive frames\nare highly correlated, leading to unstable predictions across time. To fulfill\nthis gap, we propose ReLiable Spatial-temporal Voxels (Latte), an MM-TTA method\nthat leverages reliable cross-modal spatial-temporal correspondences for\nmulti-modal 3D segmentation. Motivated by the fact that reliable predictions\nshould be consistent with their spatial-temporal correspondences, Latte\naggregates consecutive frames in a slide window manner and constructs\nSpatial-Temopral (ST) voxels to capture temporally local prediction consistency\nfor each modality. After filtering out ST voxels with high ST entropy, Latte\nconducts cross-modal learning for each point and pixel by attending to those\nwith reliable and consistent predictions among both spatial and temporal\nneighborhoods. 
Experimental results show that Latte achieves state-of-the-art\nperformance on three different MM-TTA benchmarks compared to previous MM-TTA or\nTTA methods. Visit our project site https://sites.google.com/view/eccv24-latte.\n","authors":["Haozhi Cao","Yuecong Xu","Jianfei Yang","Pengyu Yin","Xingyu Ji","Shenghai Yuan","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2403.06461v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06903v2","updated":"2024-07-25T08:19:53Z","published":"2024-04-10T10:46:59Z","title":"DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic\n Gaussian Splatting","summary":" The increasing demand for virtual reality applications has highlighted the\nsignificance of crafting immersive 3D assets. We present a text-to-3D\n360$^{\\circ}$ scene generation pipeline that facilitates the creation of\ncomprehensive 360$^{\\circ}$ scenes for in-the-wild environments in a matter of\nminutes. Our approach utilizes the generative power of a 2D diffusion model and\nprompt self-refinement to create a high-quality and globally coherent panoramic\nimage. This image acts as a preliminary \"flat\" (2D) scene representation.\nSubsequently, it is lifted into 3D Gaussians, employing splatting techniques to\nenable real-time exploration. To produce consistent 3D geometry, our pipeline\nconstructs a spatially coherent structure by aligning the 2D monocular depth\ninto a globally optimized point cloud. This point cloud serves as the initial\nstate for the centroids of 3D Gaussians. In order to address invisible issues\ninherent in single-view inputs, we impose semantic and geometric constraints on\nboth synthesized and input camera views as regularizations. These guide the\noptimization of Gaussians, aiding in the reconstruction of unseen regions. In\nsummary, our method offers a globally consistent 3D scene within a\n360$^{\\circ}$ perspective, providing an enhanced immersive experience over\nexisting techniques. Project website at: http://dreamscene360.github.io/\n","authors":["Shijie Zhou","Zhiwen Fan","Dejia Xu","Haoran Chang","Pradyumna Chari","Tejas Bharadwaj","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2404.06903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12832v2","updated":"2024-07-25T08:09:12Z","published":"2024-04-19T12:09:49Z","title":"COIN: Counterfactual inpainting for weakly supervised semantic\n segmentation for medical images","summary":" Deep learning is dramatically transforming the field of medical imaging and\nradiology, enabling the identification of pathologies in medical images,\nincluding computed tomography (CT) and X-ray scans. However, the performance of\ndeep learning models, particularly in segmentation tasks, is often limited by\nthe need for extensive annotated datasets. To address this challenge, the\ncapabilities of weakly supervised semantic segmentation are explored through\nthe lens of Explainable AI and the generation of counterfactual explanations.\nThe scope of this research is development of a novel counterfactual inpainting\napproach (COIN) that flips the predicted classification label from abnormal to\nnormal by using a generative model. For instance, if the classifier deems an\ninput medical image X as abnormal, indicating the presence of a pathology, the\ngenerative model aims to inpaint the abnormal region, thus reversing the\nclassifier's original prediction label. 
The approach enables us to produce\nprecise segmentations for pathologies without depending on pre-existing\nsegmentation masks. Crucially, image-level labels are utilized, which are\nsubstantially easier to acquire than creating detailed segmentation masks. The\neffectiveness of the method is demonstrated by segmenting synthetic targets and\nactual kidney tumors from CT images acquired from Tartu University Hospital in\nEstonia. The findings indicate that COIN greatly surpasses established\nattribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an\nalternative counterfactual explanation method introduced by Singla et al. This\nevidence suggests that COIN is a promising approach for semantic segmentation\nof tumors in CT images, and presents a step forward in making deep learning\napplications more accessible and effective in healthcare, where annotated data\nis scarce.\n","authors":["Dmytro Shvetsov","Joonas Ariva","Marharyta Domnich","Raul Vicente","Dmytro Fishman"],"pdf_url":"https://arxiv.org/pdf/2404.12832v2.pdf","comment":"This work has been accepted to be presented to The 2nd World\n Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19,\n 2024 - Valletta, Malta"},{"id":"http://arxiv.org/abs/2407.17850v1","updated":"2024-07-25T08:07:40Z","published":"2024-07-25T08:07:40Z","title":"FlexiEdit: Frequency-Aware Latent Refinement for Enhanced Non-Rigid\n Editing","summary":" Current image editing methods primarily utilize DDIM Inversion, employing a\ntwo-branch diffusion approach to preserve the attributes and layout of the\noriginal image. However, these methods encounter challenges with non-rigid\nedits, which involve altering the image's layout or structure. Our\ncomprehensive analysis reveals that the high-frequency components of DDIM\nlatent, crucial for retaining the original image's key features and layout,\nsignificantly contribute to these limitations. Addressing this, we introduce\nFlexiEdit, which enhances fidelity to input text prompts by refining DDIM\nlatent, by reducing high-frequency components in targeted editing areas.\nFlexiEdit comprises two key components: (1) Latent Refinement, which modifies\nDDIM latent to better accommodate layout adjustments, and (2) Edit Fidelity\nEnhancement via Re-inversion, aimed at ensuring the edits more accurately\nreflect the input text prompts. Our approach represents notable progress in\nimage editing, particularly in performing complex non-rigid edits, showcasing\nits enhanced capability through comparative experiments.\n","authors":["Gwanhyeong Koo","Sunjae Yoon","Ji Woo Hong","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2407.17850v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.17847v1","updated":"2024-07-25T08:00:49Z","published":"2024-07-25T08:00:49Z","title":"Move and Act: Enhanced Object Manipulation and Background Integrity for\n Image Editing","summary":" Current methods commonly utilize three-branch structures of inversion,\nreconstruction, and editing, to tackle consistent image editing task. However,\nthese methods lack control over the generation position of the edited object\nand have issues with background preservation. To overcome these limitations, we\npropose a tuning-free method with only two branches: inversion and editing.\nThis approach allows users to simultaneously edit the object's action and\ncontrol the generation position of the edited object. Additionally, it achieves\nimproved background preservation. 
Specifically, we transfer the edited object\ninformation to the target area and repair or preserve the background of other\nareas during the inversion process at a specific time step. In the editing\nstage, we use the image features in self-attention to query the key and value\nof the corresponding time step in the inversion to achieve consistent image\nediting. Impressive image editing results and quantitative evaluation\ndemonstrate the effectiveness of our method. The code is available at\nhttps://github.com/mobiushy/move-act.\n","authors":["Pengfei Jiang","Mingbao Lin","Fei Chao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.17847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17843v1","updated":"2024-07-25T07:57:55Z","published":"2024-07-25T07:57:55Z","title":"DragText: Rethinking Text Embedding in Point-based Image Editing","summary":" Point-based image editing enables accurate and flexible control through\ncontent dragging. However, the role of text embedding in the editing process\nhas not been thoroughly investigated. A significant aspect that remains\nunexplored is the interaction between text and image embeddings. In this study,\nwe show that during the progressive editing of an input image in a diffusion\nmodel, the text embedding remains constant. As the image embedding increasingly\ndiverges from its initial state, the discrepancy between the image and text\nembeddings presents a significant challenge. Moreover, we found that the text\nprompt significantly influences the dragging process, particularly in\nmaintaining content integrity and achieving the desired manipulation. To\nutilize these insights, we propose DragText, which optimizes text embedding in\nconjunction with the dragging process to pair with the modified image\nembedding. Simultaneously, we regularize the text optimization process to\npreserve the integrity of the original text prompt. Our approach can be\nseamlessly integrated with existing diffusion-based drag methods with only a\nfew lines of code.\n","authors":["Gayoon Choi","Taejin Jeong","Sujung Hong","Jaehoon Joo","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.17843v1.pdf","comment":"22 pages, 18 figures"},{"id":"http://arxiv.org/abs/2407.17838v1","updated":"2024-07-25T07:52:11Z","published":"2024-07-25T07:52:11Z","title":"UMono: Physical Model Informed Hybrid CNN-Transformer Framework for\n Underwater Monocular Depth Estimation","summary":" Underwater monocular depth estimation serves as the foundation for tasks such\nas 3D reconstruction of underwater scenes. However, due to the influence of\nlight and medium, the underwater environment undergoes a distinctive imaging\nprocess, which presents challenges in accurately estimating depth from a single\nimage. The existing methods fail to consider the unique characteristics of\nunderwater environments, leading to inadequate estimation results and limited\ngeneralization performance. Furthermore, underwater depth estimation requires\nextracting and fusing both local and global features, which is not fully\nexplored in existing methods. In this paper, an end-to-end learning framework\nfor underwater monocular depth estimation called UMono is presented, which\nincorporates underwater image formation model characteristics into network\narchitecture, and effectively utilize both local and global features of\nunderwater image. 
Experimental results demonstrate that the proposed method is\neffective for underwater monocular depth estimation and outperforms the\nexisting methods in both quantitative and qualitative analyses.\n","authors":["Jian Wang","Jing Wang","Shenghui Rong","Bo He"],"pdf_url":"https://arxiv.org/pdf/2407.17838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16087v3","updated":"2024-07-25T07:50:58Z","published":"2024-06-23T12:02:17Z","title":"Imperative Learning: A Self-supervised Neural-Symbolic Learning\n Framework for Robot Autonomy","summary":" Data-driven methods such as reinforcement and imitation learning have\nachieved remarkable success in robot autonomy. However, their data-centric\nnature still hinders them from generalizing well to ever-changing environments.\nMoreover, collecting large datasets for robotic tasks is often impractical and\nexpensive. To overcome these challenges, we introduce a new self-supervised\nneural-symbolic (NeSy) computational framework, imperative learning (IL), for\nrobot autonomy, leveraging the generalization abilities of symbolic reasoning.\nThe framework of IL consists of three primary components: a neural module, a\nreasoning engine, and a memory system. We formulate IL as a special bilevel\noptimization (BLO), which enables reciprocal learning over the three modules.\nThis overcomes the label-intensive obstacles associated with data-driven\napproaches and takes advantage of symbolic reasoning concerning logical\nreasoning, physical principles, geometric analysis, etc. We discuss several\noptimization techniques for IL and verify their effectiveness in five distinct\nrobot autonomy tasks including path planning, rule induction, optimal control,\nvisual odometry, and multi-robot routing. Through various experiments, we show\nthat IL can significantly enhance robot autonomy capabilities and we anticipate\nthat it will catalyze further research across diverse domains.\n","authors":["Chen Wang","Kaiyi Ji","Junyi Geng","Zhongqiang Ren","Taimeng Fu","Fan Yang","Yifan Guo","Haonan He","Xiangyu Chen","Zitong Zhan","Qiwei Du","Shaoshu Su","Bowen Li","Yuheng Qiu","Yi Du","Qihang Li","Yifan Yang","Xiao Lin","Zhipeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.16087v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17310v2","updated":"2024-07-25T07:47:11Z","published":"2024-07-24T14:22:55Z","title":"LangOcc: Self-Supervised Open Vocabulary Occupancy Estimation via Volume\n Rendering","summary":" The 3D occupancy estimation task has become an important challenge in the\narea of vision-based autonomous driving recently. However, most existing\ncamera-based methods rely on costly 3D voxel labels or LiDAR scans for\ntraining, limiting their practicality and scalability. Moreover, most methods\nare tied to a predefined set of classes which they can detect. In this work we\npresent a novel approach for open vocabulary occupancy estimation called\nLangOcc, that is trained only via camera images, and can detect arbitrary\nsemantics via vision-language alignment. In particular, we distill the\nknowledge of the strong vision-language aligned encoder CLIP into a 3D\noccupancy model via differentiable volume rendering. Our model estimates\nvision-language aligned features in a 3D voxel grid using only images. It is\ntrained in a self-supervised manner by rendering our estimations back to 2D\nspace, where ground-truth features can be computed. 
This training mechanism\nautomatically supervises the scene geometry, allowing for a straight-forward\nand powerful training method without any explicit geometry supervision. LangOcc\noutperforms LiDAR-supervised competitors in open vocabulary occupancy by a\nlarge margin, solely relying on vision-based training. We also achieve\nstate-of-the-art results in self-supervised semantic occupancy estimation on\nthe Occ3D-nuScenes dataset, despite not being limited to a specific set of\ncategories, thus demonstrating the effectiveness of our proposed\nvision-language training.\n","authors":["Simon Boeder","Fabian Gigengack","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2407.17310v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17834v1","updated":"2024-07-25T07:45:28Z","published":"2024-07-25T07:45:28Z","title":"Towards the Spectral bias Alleviation by Normalizations in Coordinate\n Networks","summary":" Representing signals using coordinate networks dominates the area of inverse\nproblems recently, and is widely applied in various scientific computing tasks.\nStill, there exists an issue of spectral bias in coordinate networks, limiting\nthe capacity to learn high-frequency components. This problem is caused by the\npathological distribution of the neural tangent kernel's (NTK's) eigenvalues of\ncoordinate networks. We find that, this pathological distribution could be\nimproved using classical normalization techniques (batch normalization and\nlayer normalization), which are commonly used in convolutional neural networks\nbut rarely used in coordinate networks. We prove that normalization techniques\ngreatly reduces the maximum and variance of NTK's eigenvalues while slightly\nmodifies the mean value, considering the max eigenvalue is much larger than the\nmost, this variance change results in a shift of eigenvalues' distribution from\na lower one to a higher one, therefore the spectral bias could be alleviated.\nFurthermore, we propose two new normalization techniques by combining these two\ntechniques in different ways. The efficacy of these normalization techniques is\nsubstantiated by the significant improvements and new state-of-the-arts\nachieved by applying normalization-based coordinate networks to various tasks,\nincluding the image compression, computed tomography reconstruction, shape\nrepresentation, magnetic resonance imaging, novel view synthesis and multi-view\nstereo reconstruction.\n","authors":["Zhicheng Cai","Hao Zhu","Qiu Shen","Xinran Wang","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2407.17834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v7","updated":"2024-07-25T07:42:15Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. 
The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. Code is released at\nhttps://github.com/Ivan-Tang-3D/Point-PEFT.\n","authors":["Yiwen Tang","Ray Zhang","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v7.pdf","comment":"The specialized PEFT framework for 3D pre-trained models, which\n achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Ivan-Tang-3D/Point-PEFT"},{"id":"http://arxiv.org/abs/2407.17829v1","updated":"2024-07-25T07:38:27Z","published":"2024-07-25T07:38:27Z","title":"Image Segmentation via Divisive Normalization: dealing with\n environmental diversity","summary":" Autonomous driving is a challenging scenario for image segmentation due to\nthe presence of uncontrolled environmental conditions and the eventually\ncatastrophic consequences of failures. Previous work suggested that a\nbiologically motivated computation, the so-called Divisive Normalization, could\nbe useful to deal with image variability, but its effects have not been\nsystematically studied over different data sources and environmental factors.\nHere we put segmentation U-nets augmented with Divisive Normalization to work\nfar from training conditions to find where this adaptation is more critical. We\ncategorize the scenes according to their radiance level and dynamic range\n(day/night), and according to their achromatic/chromatic contrasts. We also\nconsider video game (synthetic) images to broaden the range of environments. We\ncheck the performance in the extreme percentiles of such categorization. Then,\nwe push the limits further by artificially modifying the images in\nperceptually/environmentally relevant dimensions: luminance, contrasts and\nspectral radiance. Results show that neural networks with Divisive\nNormalization get better results in all the scenarios and their performance\nremains more stable with regard to the considered environmental factors and\nnature of the source. Finally, we explain the improvements in segmentation\nperformance in two ways: (1) by quantifying the invariance of the responses\nthat incorporate Divisive Normalization, and (2) by illustrating the adaptive\nnonlinearity of the different layers that depends on the local activity.\n","authors":["Pablo Hernández-Cámara","Jorge Vila-Tomás","Paula Dauden-Oliver","Nuria Alabau-Bosque","Valero Laparra","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2407.17829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17827v1","updated":"2024-07-25T07:35:27Z","published":"2024-07-25T07:35:27Z","title":"Unified Lexical Representation for Interpretable Visual-Language\n Alignment","summary":" Visual-Language Alignment (VLA) has gained a lot of attention since CLIP's\ngroundbreaking work. 
Although CLIP performs well, the typical direct latent\nfeature alignment lacks clarity in its representation and similarity scores. On\nthe other hand, lexical representation, a vector whose element represents the\nsimilarity between the sample and a word from the vocabulary, is a naturally\nsparse and interpretable representation, providing exact matches for individual\nwords. However, lexical representations are difficult to learn due to the lack of\nground-truth supervision and false-discovery issues, and thus require a complex\ndesign to train effectively. In this paper, we introduce LexVLA, a more\ninterpretable VLA framework by learning a unified lexical representation for\nboth modalities without complex design. We use DINOv2 as our visual model for\nits local-inclined features and Llama 2, a generative language model, to\nleverage its in-context lexical prediction ability. To avoid the false\ndiscovery, we propose an overuse penalty to restrain the lexical representation\nfrom falsely and frequently activating meaningless words. We demonstrate that these\ntwo pre-trained uni-modal models can be well-aligned by fine-tuning on a modest\nmulti-modal dataset without intricate training configurations. On cross-modal\nretrieval benchmarks, LexVLA, trained on the CC-12M multi-modal dataset,\noutperforms baselines fine-tuned on larger datasets (e.g., YFCC15M) and those\ntrained from scratch on even bigger datasets (e.g., 1.1B data, including\nCC-12M). We conduct extensive experiments to analyze LexVLA.\n","authors":["Yifan Li","Yikai Wang","Yanwei Fu","Dongyu Ru","Zheng Zhang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2407.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10172v2","updated":"2024-07-25T07:26:40Z","published":"2024-07-14T11:59:22Z","title":"Restoring Images in Adverse Weather Conditions via Histogram Transformer","summary":" Transformer-based image restoration methods in adverse weather have achieved\nsignificant progress. Most of them use self-attention along the channel\ndimension or within spatially fixed-range blocks to reduce computational load.\nHowever, such a compromise results in limitations in capturing long-range\nspatial features. Inspired by the observation that the weather-induced\ndegradation factors mainly cause similar occlusion and brightness, in this\nwork, we propose an efficient Histogram Transformer (Histoformer) for restoring\nimages affected by adverse weather. It is powered by a mechanism dubbed\nhistogram self-attention, which sorts and segments spatial features into\nintensity-based bins. Self-attention is then applied across bins or within each\nbin to selectively focus on spatial features of dynamic range and process\nsimilar degraded pixels of the long range together. To boost histogram\nself-attention, we present a dynamic-range convolution enabling conventional\nconvolution to conduct operations over similar pixels rather than neighboring\npixels. We also observe that the common pixel-wise losses neglect linear\nassociation and correlation between output and ground-truth. Thus, we propose\nto leverage the Pearson correlation coefficient as a loss function to enforce\nthe recovered pixels to follow the same order as the ground truth. Extensive\nexperiments demonstrate the efficacy and superiority of our proposed method.
We\nhave released the codes in Github.\n","authors":["Shangquan Sun","Wenqi Ren","Xinwei Gao","Rui Wang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2407.10172v2.pdf","comment":"19 pages, 7 figures, 10MB"},{"id":"http://arxiv.org/abs/2310.05483v5","updated":"2024-07-25T07:08:24Z","published":"2023-10-09T07:42:33Z","title":"HarmonicNeRF: Geometry-Informed Synthetic View Augmentation for 3D Scene\n Reconstruction in Driving Scenarios","summary":" In the realm of autonomous driving, achieving precise 3D reconstruction of\nthe driving environment is critical for ensuring safety and effective\nnavigation. Neural Radiance Fields (NeRF) have shown promise in creating highly\ndetailed and accurate models of complex environments. However, the application\nof NeRF in autonomous driving scenarios encounters several challenges,\nprimarily due to the sparsity of viewpoints inherent in camera trajectories and\nthe constraints on data collection in unbounded outdoor scenes, which typically\noccur along predetermined paths. This limitation not only reduces the available\nscene information but also poses significant challenges for NeRF training, as\nthe sparse and path-distributed observational data leads to\nunder-representation of the scene's geometry. In this paper, we introduce\nHarmonicNeRF, a novel approach for outdoor self-supervised monocular scene\nreconstruction. HarmonicNeRF capitalizes on the strengths of NeRF and enhances\nsurface reconstruction accuracy by augmenting the input space with\ngeometry-informed synthetic views. This is achieved through the application of\nspherical harmonics to generate novel radiance values, taking into careful\nconsideration the color observations from the limited available real-world\nviews. Additionally, our method incorporates proxy geometry to effectively\nmanage occlusion, generating radiance pseudo-labels that circumvent the\nlimitations of traditional image-warping techniques, which often fail in sparse\ndata conditions typical of autonomous driving environments. Extensive\nexperiments conducted on the KITTI, Argoverse, and NuScenes datasets\ndemonstrate our approach establishes new benchmarks in synthesizing novel depth\nviews and reconstructing scenes, significantly outperforming existing methods.\nProject page: https://github.com/Jiawei-Yao0812/HarmonicNeRF\n","authors":["Xiaochao Pan","Jiawei Yao","Hongrui Kou","Tong Wu","Canran Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.05483v5.pdf","comment":"Accepted by ACM MM 2024, project page:\n https://github.com/Jiawei-Yao0812/HarmonicNeRF"},{"id":"http://arxiv.org/abs/2407.11652v2","updated":"2024-07-25T07:04:32Z","published":"2024-07-16T12:18:20Z","title":"CCVA-FL: Cross-Client Variations Adaptive Federated Learning for Medical\n Imaging","summary":" Federated Learning (FL) offers a privacy-preserving approach to train models\non decentralized data. Its potential in healthcare is significant, but\nchallenges arise due to cross-client variations in medical image data,\nexacerbated by limited annotations. This paper introduces Cross-Client\nVariations Adaptive Federated Learning (CCVA-FL) to address these issues.\nCCVA-FL aims to minimize cross-client variations by transforming images into a\ncommon feature space. It involves expert annotation of a subset of images from\neach client, followed by the selection of a client with the least data\ncomplexity as the target. 
Synthetic medical images are then generated using\nScalable Diffusion Models with Transformers (DiT) based on the target client's\nannotated images. These synthetic images, capturing diversity and representing\nthe original data, are shared with other clients. Each client then translates\nits local images into the target image space using image-to-image translation.\nThe translated images are subsequently used in a federated learning setting to\ndevelop a server model. Our results demonstrate that CCVA-FL outperforms\nVanilla Federated Averaging by effectively addressing data distribution\ndifferences across clients without compromising privacy.\n","authors":["Sunny Gupta","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2407.11652v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.17813v1","updated":"2024-07-25T06:59:15Z","published":"2024-07-25T06:59:15Z","title":"Enhancing Model Performance: Another Approach to Vision-Language\n Instruction Tuning","summary":" The integration of large language models (LLMs) with vision-language (VL)\ntasks has been a transformative development in the realm of artificial\nintelligence, highlighting the potential of LLMs as a versatile general-purpose\nchatbot. However, the current trend in this evolution focuses on the\nintegration of vision and language to create models that can operate in more\ndiverse and real-world contexts. We present a novel approach, termed Bottleneck\nAdapter, specifically crafted for enhancing the multimodal functionalities of\nthese complex models, enabling joint optimization of the entire multimodal LLM\nframework through a process known as Multimodal Model Tuning (MMT). Our\napproach utilizes lightweight adapters to connect the image encoder and LLM\nwithout the need for large, complex neural networks. Unlike the conventional\nmodular training schemes, our approach adopts an end-to-end optimization\nregime, which, when combined with the adapters, facilitates the joint\noptimization using a significantly smaller parameter set. Our method exhibits\nrobust performance with 90.12\\% accuracy, outperforming both human-level\nperformance (88.4\\%) and LaVIN-7B (89.41\\%).\n","authors":[" Vedanshu","MM Tripathi","Bhavnesh Jaint"],"pdf_url":"https://arxiv.org/pdf/2407.17813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01926v3","updated":"2024-07-25T06:54:01Z","published":"2024-07-02T03:43:39Z","title":"Chemical Shift Encoding based Double Bonds Quantification in\n Triglycerides using Deep Image Prior","summary":" This study evaluated a deep learning-based method using Deep Image Prior\n(DIP) to quantify triglyceride double bonds from chemical-shift encoded\nmulti-echo gradient echo images without network training. We employed a cost\nfunction based on signal constraints to iteratively update the neural network\non a single dataset. The method was validated using phantom experiments and in\nvivo scans. Results showed close alignment between measured and reference\ndouble bond values, with phantom experiments yielding a Pearson correlation\ncoefficient of 0.96 (p = .0005). In vivo results demonstrated good agreement in\nsubcutaneous fat. 
We conclude that Deep Image Prior shows feasibility for\nquantifying double bonds and fatty acid content from chemical-shift encoded\nmulti-echo MRI.\n","authors":["Chaoxing Huang","Ziqiang Yu","Zijian Gao","Qiuyi Shen","Queenie Chan","Vincent Wai-Sun Wong","Winnie Chiu-Wing Chu","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2407.01926v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17170v2","updated":"2024-07-25T06:40:39Z","published":"2024-07-24T11:22:02Z","title":"Domain Generalized Recaptured Screen Image Identification Using SWIN\n Transformer","summary":" An increasing number of classification approaches have been developed to\naddress the issue of image rebroadcast and recapturing, a standard attack\nstrategy in insurance fraud, face spoofing, and video piracy. However, most of\nthem neglect scale variations and domain generalization scenarios, performing\npoorly in instances involving domain shifts, typically made worse by\ninter-domain and cross-domain scale variances. To overcome these issues, we\npropose a cascaded data augmentation and SWIN transformer domain generalization\nframework (DAST-DG) in the current research work. Initially, we examine the\ndisparity in dataset representation. A feature generator is trained to make\nauthentic images from various domains indistinguishable. This process is then\napplied to recaptured images, creating a dual adversarial learning setup.\nExtensive experiments demonstrate that our approach is practical and surpasses\nstate-of-the-art methods across different databases. Our model achieves an\naccuracy of approximately 82\\% with a precision of 95\\% on high-variance\ndatasets.\n","authors":["Preeti Mehta","Aman Sagar","Suchi Kumari"],"pdf_url":"https://arxiv.org/pdf/2407.17170v2.pdf","comment":"11 pages, 10 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.17797v1","updated":"2024-07-25T06:10:33Z","published":"2024-07-25T06:10:33Z","title":"A Unified Understanding of Adversarial Vulnerability Regarding Unimodal\n Models and Vision-Language Pre-training Models","summary":" With Vision-Language Pre-training (VLP) models demonstrating powerful\nmultimodal interaction capabilities, the application scenarios of neural\nnetworks are no longer confined to unimodal domains but have expanded to more\ncomplex multimodal V+L downstream tasks. The security vulnerabilities of\nunimodal models have been extensively examined, whereas those of VLP models\nremain challenging. We note that in CV models, the understanding of images\ncomes from annotated information, while VLP models are designed to learn image\nrepresentations directly from raw text. Motivated by this discrepancy, we\ndeveloped the Feature Guidance Attack (FGA), a novel method that uses text\nrepresentations to direct the perturbation of clean images, resulting in the\ngeneration of adversarial images. FGA is orthogonal to many advanced attack\nstrategies in the unimodal domain, facilitating the direct application of rich\nresearch findings from the unimodal to the multimodal scenario. By\nappropriately introducing text attack into FGA, we construct Feature Guidance\nwith Text Attack (FGA-T). Through the interaction of attacking two modalities,\nFGA-T achieves superior attack effects against VLP models. Moreover,\nincorporating data augmentation and momentum mechanisms significantly improves\nthe black-box transferability of FGA-T.
Our method demonstrates stable and\neffective attack capabilities across various datasets, downstream tasks, and\nboth black-box and white-box settings, offering a unified baseline for\nexploring the robustness of VLP models.\n","authors":["Haonan Zheng","Xinyang Deng","Wen Jiang","Wenrui Li"],"pdf_url":"https://arxiv.org/pdf/2407.17797v1.pdf","comment":"14 pages, 9 figures, published in ACMMM2024(oral)"},{"id":"http://arxiv.org/abs/2407.17792v1","updated":"2024-07-25T06:03:02Z","published":"2024-07-25T06:03:02Z","title":"Harnessing Temporal Causality for Advanced Temporal Action Detection","summary":" As a fundamental task in long-form video understanding, temporal action\ndetection (TAD) aims to capture inherent temporal relations in untrimmed videos\nand identify candidate actions with precise boundaries. Over the years, various\nnetworks, including convolutions, graphs, and transformers, have been explored\nfor effective temporal modeling for TAD. However, these modules typically treat\npast and future information equally, overlooking the crucial fact that changes\nin action boundaries are essentially causal events. Inspired by this insight,\nwe propose leveraging the temporal causality of actions to enhance TAD\nrepresentation by restricting the model's access to only past or future\ncontext. We introduce CausalTAD, which combines causal attention and causal\nMamba to achieve state-of-the-art performance on multiple benchmarks. Notably,\nwith CausalTAD, we ranked 1st in the Action Recognition, Action Detection, and\nAudio-Based Interaction Detection tracks at the EPIC-Kitchens Challenge 2024,\nas well as 1st in the Moment Queries track at the Ego4D Challenge 2024. Our\ncode is available at https://github.com/sming256/OpenTAD/causaltad.\n","authors":["Shuming Liu","Lin Sui","Chen-Lin Zhang","Fangzhou Mu","Chen Zhao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2407.17792v1.pdf","comment":"1st in Moment Queries track at the Ego4D Challenge 2024; 1st in\n Action Recognition, Action Detection, and Audio-Based Interaction Detection\n tracks at the EPIC-Kitchens Challenge 2024"},{"id":"http://arxiv.org/abs/2310.05989v3","updated":"2024-07-25T06:02:18Z","published":"2023-10-07T21:55:29Z","title":"QE-BEV: Query Evolution for Bird's Eye View Object Detection in Varied\n Contexts","summary":" 3D object detection plays a pivotal role in autonomous driving and robotics,\ndemanding precise interpretation of Bird's Eye View (BEV) images. The dynamic\nnature of real-world environments necessitates the use of dynamic query\nmechanisms in 3D object detection to adaptively capture and process the complex\nspatio-temporal relationships present in these scenes. However, prior\nimplementations of dynamic queries have often faced difficulties in effectively\nleveraging these relationships, particularly when it comes to integrating\ntemporal information in a computationally efficient manner. Addressing this\nlimitation, we introduce a framework utilizing dynamic query evolution\nstrategy, harnesses K-means clustering and Top-K attention mechanisms for\nrefined spatio-temporal data processing. By dynamically segmenting the BEV\nspace and prioritizing key features through Top-K attention, our model achieves\na real-time, focused analysis of pertinent scene elements. Our extensive\nevaluation on the nuScenes and Waymo dataset showcases a marked improvement in\ndetection accuracy, setting a new benchmark in the domain of query-based BEV\nobject detection. 
Our dynamic query evolution strategy has the potential to\npush the boundaries of current BEV methods with enhanced adaptability and\ncomputational efficiency. Project page:\nhttps://github.com/Jiawei-Yao0812/QE-BEV\n","authors":["Jiawei Yao","Yingxin Lai","Hongrui Kou","Tong Wu","Ruixi Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05989v3.pdf","comment":"Accepted by ACM MM 2024, project page:\n https://github.com/Jiawei-Yao0812/QE-BEV"},{"id":"http://arxiv.org/abs/2407.17791v1","updated":"2024-07-25T05:58:58Z","published":"2024-07-25T05:58:58Z","title":"Investigating learning-independent abstract reasoning in artificial\n neural networks","summary":" Humans are capable of solving complex abstract reasoning tests. Whether this\nability reflects a learning-independent inference mechanism applicable to any\nnovel unlearned problem or whether it is a manifestation of extensive training\nthroughout life is an open question. Addressing this question in humans is\nchallenging because it is impossible to control their prior training. However,\nassuming a similarity between the cognitive processing of Artificial Neural\nNetworks (ANNs) and humans, the extent to which training is required for ANNs'\nabstract reasoning is informative about this question in humans. Previous\nstudies demonstrated that ANNs can solve abstract reasoning tests. However,\nthis success required extensive training. In this study, we examined the\nlearning-independent abstract reasoning of ANNs. Specifically, we evaluated\ntheir performance without any pretraining, with the ANNs' weights being\nrandomly-initialized, and only change in the process of problem solving. We\nfound that naive ANN models can solve non-trivial visual reasoning tests,\nsimilar to those used to evaluate human learning-independent reasoning. We\nfurther studied the mechanisms that support this ability. Our results suggest\nthe possibility of learning-independent abstract reasoning that does not\nrequire extensive training.\n","authors":["Tomer Barak","Yonatan Loewenstein"],"pdf_url":"https://arxiv.org/pdf/2407.17791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17786v1","updated":"2024-07-25T05:30:09Z","published":"2024-07-25T05:30:09Z","title":"Topology-Preserving Downsampling of Binary Images","summary":" We present a novel discrete optimization-based approach to generate\ndownsampled versions of binary images that are guaranteed to have the same\ntopology as the original, measured by the zeroth and first Betti numbers of the\nblack regions, while having good similarity to the original image as measured\nby IoU and Dice scores. To our best knowledge, all existing binary image\ndownsampling methods do not have such topology-preserving guarantees. We also\nimplemented a baseline morphological operation (dilation)-based approach that\nalways generates topologically correct results. However, we found the\nsimilarity scores to be much worse. We demonstrate several applications of our\napproach. First, generating smaller versions of medical image segmentation\nmasks for easier human inspection. Second, improving the efficiency of binary\nimage operations, including persistent homology computation and shortest path\ncomputation, by substituting the original images with smaller ones. 
In\nparticular, the latter is a novel application that is made feasible only by the\nfull topology-preservation guarantee of our method.\n","authors":["Chia-Chia Chen","Chi-Han Peng"],"pdf_url":"https://arxiv.org/pdf/2407.17786v1.pdf","comment":"Accepted to The 18th European Conference on Computer Vision (ECCV)\n 2024"},{"id":"http://arxiv.org/abs/2406.19407v4","updated":"2024-07-25T05:24:41Z","published":"2024-06-12T06:41:23Z","title":"YOLOv10 to Its Genesis: A Decadal and Comprehensive Review of The You\n Only Look Once (YOLO) Series","summary":" This review systematically examines the progression of the You Only Look Once\n(YOLO) object detection algorithms from YOLOv1 to the recently unveiled\nYOLOv10. Employing a reverse chronological analysis, this study examines the\nadvancements introduced by YOLO algorithms, beginning with YOLOv10 and\nprogressing through YOLOv9, YOLOv8, and subsequent versions to explore each\nversion's contributions to enhancing speed, accuracy, and computational\nefficiency in real-time object detection. The study highlights the\ntransformative impact of YOLO across five critical application areas:\nautomotive safety, healthcare, industrial manufacturing, surveillance, and\nagriculture. By detailing the incremental technological advancements in\nsubsequent YOLO versions, this review chronicles the evolution of YOLO, and\ndiscusses the challenges and limitations in each earlier versions. The\nevolution signifies a path towards integrating YOLO with multimodal,\ncontext-aware, and General Artificial Intelligence (AGI) systems for the next\nYOLO decade, promising significant implications for future developments in\nAI-driven applications.\n","authors":["Ranjan Sapkota","Rizwan Qureshi","Marco Flores Calero","Chetan Badjugar","Upesh Nepal","Alwin Poulose","Peter Zeno","Uday Bhanu Prakash Vaddevolu","Sheheryar Khan","Maged Shoman","Hong Yan","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2406.19407v4.pdf","comment":"11 Figures, 7 Tables"},{"id":"http://arxiv.org/abs/2407.17028v2","updated":"2024-07-25T05:23:24Z","published":"2024-07-24T06:15:28Z","title":"Enhancing Environmental Monitoring through Multispectral Imaging: The\n WasteMS Dataset for Semantic Segmentation of Lakeside Waste","summary":" Environmental monitoring of lakeside green areas is crucial for environmental\nprotection. Compared to manual inspections, computer vision technologies offer\na more efficient solution when deployed on-site. Multispectral imaging provides\ndiverse information about objects under different spectrums, aiding in the\ndifferentiation between waste and lakeside lawn environments. This study\nintroduces WasteMS, the first multispectral dataset established for the\nsemantic segmentation of lakeside waste. WasteMS includes a diverse range of\nwaste types in lawn environments, captured under various lighting conditions.\nWe implemented a rigorous annotation process to label waste in images.\nRepresentative semantic segmentation frameworks were used to evaluate\nsegmentation accuracy using WasteMS. Challenges encountered when using WasteMS\nfor segmenting waste on lakeside lawns were discussed. 
The WasteMS dataset is\navailable at https://github.com/zhuqinfeng1999/WasteMS.\n","authors":["Qinfeng Zhu","Ningxin Weng","Lei Fan","Yuanzhi Cai"],"pdf_url":"https://arxiv.org/pdf/2407.17028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17783v1","updated":"2024-07-25T05:23:20Z","published":"2024-07-25T05:23:20Z","title":"How Lightweight Can A Vision Transformer Be","summary":" In this paper, we explore a strategy that uses Mixture-of-Experts (MoE) to\nstreamline, rather than augment, vision transformers. Each expert in an MoE\nlayer is a SwiGLU feedforward network, where V and W2 are shared across the\nlayer. No complex attention or convolutional mechanisms are employed.\nDepth-wise scaling is applied to progressively reduce the size of the hidden\nlayer and the number of experts is increased in stages. Grouped query attention\nis used. We studied the proposed approach with and without pre-training on\nsmall datasets and investigated whether transfer learning works at this scale.\nWe found that the architecture is competitive even at a size of 0.67M\nparameters.\n","authors":["Jen Hong Tan"],"pdf_url":"https://arxiv.org/pdf/2407.17783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17780v1","updated":"2024-07-25T05:21:48Z","published":"2024-07-25T05:21:48Z","title":"HF-Fed: Hierarchical based customized Federated Learning Framework for\n X-Ray Imaging","summary":" In clinical applications, X-ray technology is vital for noninvasive\nexaminations like mammography, providing essential anatomical information.\nHowever, the radiation risk associated with X-ray procedures raises concerns.\nX-ray reconstruction is crucial in medical imaging for detailed visual\nrepresentations of internal structures, aiding diagnosis and treatment without\ninvasive procedures. Recent advancements in deep learning (DL) have shown\npromise in X-ray reconstruction, but conventional DL methods often require\ncentralized aggregation of large datasets, leading to domain shifts and privacy\nissues. To address these challenges, we introduce the Hierarchical\nFramework-based Federated Learning method (HF-Fed) for customized X-ray\nimaging. HF-Fed tackles X-ray imaging optimization by decomposing the problem\ninto local data adaptation and holistic X-ray imaging. It employs a\nhospital-specific hierarchical framework and a shared common imaging network\ncalled Network of Networks (NoN) to acquire stable features from diverse data\ndistributions. The hierarchical hypernetwork extracts domain-specific\nhyperparameters, conditioning the NoN for customized X-ray reconstruction.\nExperimental results demonstrate HF-Fed's competitive performance, offering a\npromising solution for enhancing X-ray imaging without data sharing. This study\nsignificantly contributes to the literature on federated learning in\nhealthcare, providing valuable insights for policymakers and healthcare\nproviders. The source code and pre-trained HF-Fed model are available at\n\\url{https://tisharepo.github.io/Webpage/}.\n","authors":["Tajamul Ashraf","Tisha Madame"],"pdf_url":"https://arxiv.org/pdf/2407.17780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17779v1","updated":"2024-07-25T05:18:18Z","published":"2024-07-25T05:18:18Z","title":"DAC: 2D-3D Retrieval with Noisy Labels via Divide-and-Conquer Alignment\n and Correction","summary":" With the recent burst of 2D and 3D data, cross-modal retrieval has attracted\nincreasing attention recently. 
However, manual labeling by non-experts will\ninevitably introduce corrupted annotations given ambiguous 2D/3D content.\nThough previous works have addressed this issue by designing a naive division\nstrategy with hand-crafted thresholds, their performance generally exhibits\ngreat sensitivity to the threshold value. Besides, they fail to fully utilize\nthe valuable supervisory signals within each divided subset. To tackle this\nproblem, we propose a Divide-and-conquer 2D-3D cross-modal Alignment and\nCorrection framework (DAC), which comprises Multimodal Dynamic Division (MDD)\nand Adaptive Alignment and Correction (AAC). Specifically, the former performs\naccurate sample division by adaptive credibility modeling for each sample based\non the compensation information within the multimodal loss distribution. Then in\nAAC, samples in distinct subsets are exploited with different alignment\nstrategies to fully enhance the semantic compactness and meanwhile alleviate\nover-fitting to noisy labels, where a self-correction strategy is introduced to\nimprove the quality of representation. Moreover, to evaluate the effectiveness\nin real-world scenarios, we introduce a challenging noisy benchmark, namely\nObjaverse-N200, which comprises 200k-level samples annotated with 1156\nrealistic noisy labels. Extensive experiments on both traditional and the newly\nproposed benchmarks demonstrate the generality and superiority of our DAC,\nwhere DAC outperforms state-of-the-art models by a large margin (i.e., with a\n+5.9% gain on ModelNet40 and +5.8% on Objaverse-N200).\n","authors":["Chaofan Gan","Yuanpeng Tu","Yuxi Li","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17779v1.pdf","comment":"accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.17773v1","updated":"2024-07-25T05:02:39Z","published":"2024-07-25T05:02:39Z","title":"KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models","summary":" This paper investigates visual analogical reasoning in large multimodal\nmodels (LMMs) compared to human adults and children. A \"visual analogy\" is an\nabstract rule inferred from one image and applied to another. While benchmarks\nexist for testing visual reasoning in LMMs, they require advanced skills and\nomit basic visual analogies that even young children can make. Inspired by\ndevelopmental psychology, we propose a new benchmark of 1,400 visual\ntransformations of everyday objects to test LMMs on visual analogical reasoning\nand compare them to children and adults. We structure the evaluation into three\nstages: identifying what changed (e.g., color, number, etc.), how it changed\n(e.g., added one object), and applying the rule to new scenarios. Our findings\nshow that while models like GPT-4V, LLaVA-1.5, and MANTIS identify the \"what\"\neffectively, they struggle with quantifying the \"how\" and extrapolating this\nrule to new objects. In contrast, children and adults exhibit much stronger\nanalogical reasoning at all three stages. Additionally, the strongest tested\nmodel, GPT-4V, performs better in tasks involving simple visual attributes like\ncolor and size, correlating with quicker human adult response times.\nConversely, more complex tasks such as number, rotation, and reflection, which\nnecessitate extensive cognitive processing and understanding of the 3D physical\nworld, present more significant challenges.
Altogether, these findings\nhighlight the limitations of training models on data that primarily consists of\n2D images and text.\n","authors":["Eunice Yiu","Maan Qraitem","Charlie Wong","Anisa Noor Majhi","Yutong Bai","Shiry Ginosar","Alison Gopnik","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2407.17773v1.pdf","comment":"9 pages. For the KiVA benchmark, see https://github.com/ey242/KiVA"},{"id":"http://arxiv.org/abs/2407.17772v1","updated":"2024-07-25T05:02:27Z","published":"2024-07-25T05:02:27Z","title":"ERIT Lightweight Multimodal Dataset for Elderly Emotion Recognition and\n Multimodal Fusion Evaluation","summary":" ERIT is a novel multimodal dataset designed to facilitate research in a\nlightweight multimodal fusion. It contains text and image data collected from\nvideos of elderly individuals reacting to various situations, as well as seven\nemotion labels for each data sample. Because of the use of labeled images of\nelderly users reacting emotionally, it is also facilitating research on emotion\nrecognition in an underrepresented age group in machine learning visual emotion\nrecognition. The dataset is validated through comprehensive experiments\nindicating its importance in neural multimodal fusion research.\n","authors":["Rita Frieske","Bertrand E. Shi"],"pdf_url":"https://arxiv.org/pdf/2407.17772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02307v2","updated":"2024-07-25T04:40:04Z","published":"2024-03-04T18:44:30Z","title":"Harnessing Intra-group Variations Via a Population-Level Context for\n Pathology Detection","summary":" Realizing sufficient separability between the distributions of healthy and\npathological samples is a critical obstacle for pathology detection\nconvolutional models. Moreover, these models exhibit a bias for contrast-based\nimages, with diminished performance on texture-based medical images. This study\nintroduces the notion of a population-level context for pathology detection and\nemploys a graph theoretic approach to model and incorporate it into the latent\ncode of an autoencoder via a refinement module we term PopuSense. PopuSense\nseeks to capture additional intra-group variations inherent in biomedical data\nthat a local or global context of the convolutional model might miss or smooth\nout. Proof-of-concept experiments on contrast-based and texture-based images,\nwith minimal adaptation, encounter the existing preference for intensity-based\ninput. Nevertheless, PopuSense demonstrates improved separability in\ncontrast-based images, presenting an additional avenue for refining\nrepresentations learned by a model.\n","authors":["P. Bilha Githinji","Xi Yuan","Zhenglin Chen","Ijaz Gul","Dingqi Shang","Wen Liang","Jianming Deng","Dan Zeng","Dongmei yu","Chenggang Yan","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2403.02307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17762v1","updated":"2024-07-25T04:33:19Z","published":"2024-07-25T04:33:19Z","title":"Mpox Detection Advanced: Rapid Epidemic Response Through Synthetic Data","summary":" Rapid development of disease detection models using computer vision is\ncrucial in responding to medical emergencies, such as epidemics or bioterrorism\nevents. Traditional data collection methods are often too slow in these\nscenarios, requiring innovative approaches for quick, reliable model generation\nfrom minimal data. Our study introduces a novel approach by constructing a\ncomprehensive computer vision model to detect Mpox lesions using only synthetic\ndata. 
Initially, these models generated a diverse set of synthetic images\nrepresenting Mpox lesions on various body parts (face, back, chest, leg, neck,\narm) across different skin tones as defined by the Fitzpatrick scale (fair,\nbrown, dark skin). Subsequently, we trained and tested a vision model with this\nsynthetic dataset to evaluate the diffusion models' efficacy in producing\nhigh-quality training data and its impact on the vision model's medical image\nrecognition performance. The results were promising; the vision model achieved\na 97% accuracy rate, with 96% precision and recall for Mpox cases, and\nsimilarly high metrics for normal and other skin disorder cases, demonstrating\nits ability to correctly identify true positives and minimize false positives.\nThe model achieved an F1-Score of 96% for Mpox cases and 98% for normal and\nother skin disorders, reflecting a balanced precision-recall relationship, thus\nensuring reliability and robustness in its predictions. Our proposed\nSynthVision methodology indicates the potential to develop accurate computer\nvision models with minimal data input for future medical emergencies.\n","authors":["Yudara Kularathne","Prathapa Janitha","Sithira Ambepitiya","Prarththanan Sothyrajah","Thanveer Ahamed","Dinuka Wijesundara"],"pdf_url":"https://arxiv.org/pdf/2407.17762v1.pdf","comment":"8 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2407.06581v4","updated":"2024-07-25T04:19:58Z","published":"2024-07-09T06:20:17Z","title":"Vision language models are blind","summary":" While large language models with vision capabilities (VLMs), e.g., GPT-4o and\nGemini 1.5 Pro, are powering various image-text applications and scoring high\non many vision-understanding benchmarks, we find that they are surprisingly\nstill struggling with low-level vision tasks that are easy to humans.\nSpecifically, on BlindTest, our suite of 7 very simple tasks such as\nidentifying (a) whether two circles overlap; (b) whether two lines intersect;\n(c) which letter is being circled in a word; and (d) counting circles in an\nOlympic-like logo, four state-of-the-art VLMs are only 58.57% accurate on\naverage. Claude 3.5 Sonnet performs the best at 74.01% accuracy, but this is\nstill far from the human expected accuracy of 100%. Across different image\nresolutions and line widths, VLMs consistently struggle with tasks that require\nprecise spatial information and recognizing geometric primitives that overlap\nor are close together. Code and data are available at:\nhttps://vlmsareblind.github.io\n","authors":["Pooyan Rahmanzadehgervi","Logan Bolton","Mohammad Reza Taesiri","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.06581v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17757v1","updated":"2024-07-25T04:12:49Z","published":"2024-07-25T04:12:49Z","title":"CRASH: Crash Recognition and Anticipation System Harnessing with\n Context-Aware and Temporal Focus Attentions","summary":" Accurately and promptly predicting accidents among surrounding traffic agents\nfrom camera footage is crucial for the safety of autonomous vehicles (AVs).\nThis task presents substantial challenges stemming from the unpredictable\nnature of traffic accidents, their long-tail distribution, the intricacies of\ntraffic scene dynamics, and the inherently constrained field of vision of\nonboard cameras. To address these challenges, this study introduces a novel\naccident anticipation framework for AVs, termed CRASH. 
It seamlessly integrates\nfive components: object detector, feature extractor, object-aware module,\ncontext-aware module, and multi-layer fusion. Specifically, we develop the\nobject-aware module to prioritize high-risk objects in complex and ambiguous\nenvironments by calculating the spatial-temporal relationships between traffic\nagents. In parallel, the context-aware is also devised to extend global visual\ninformation from the temporal to the frequency domain using the Fast Fourier\nTransform (FFT) and capture fine-grained visual features of potential objects\nand broader context cues within traffic scenes. To capture a wider range of\nvisual cues, we further propose a multi-layer fusion that dynamically computes\nthe temporal dependencies between different scenes and iteratively updates the\ncorrelations between different visual features for accurate and timely accident\nprediction. Evaluated on real-world datasets--Dashcam Accident Dataset (DAD),\nCar Crash Dataset (CCD), and AnAn Accident Detection (A3D) datasets--our model\nsurpasses existing top baselines in critical evaluation metrics like Average\nPrecision (AP) and mean Time-To-Accident (mTTA). Importantly, its robustness\nand adaptability are particularly evident in challenging driving scenarios with\nmissing or limited training data, demonstrating significant potential for\napplication in real-world autonomous driving systems.\n","authors":["Haicheng Liao","Haoyu Sun","Huanming Shen","Chengyue Wang","Kahou Tam","Chunlin Tian","Li Li","Chengzhong Xu","Zhenning Li"],"pdf_url":"https://arxiv.org/pdf/2407.17757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17755v1","updated":"2024-07-25T04:09:17Z","published":"2024-07-25T04:09:17Z","title":"Enhancing Eye Disease Diagnosis with Deep Learning and Synthetic Data\n Augmentation","summary":" In recent years, the focus is on improving the diagnosis of diabetic\nretinopathy (DR) using machine learning and deep learning technologies.\nResearchers have explored various approaches, including the use of\nhigh-definition medical imaging, AI-driven algorithms such as convolutional\nneural networks (CNNs) and generative adversarial networks (GANs). Among all\nthe available tools, CNNs have emerged as a preferred tool due to their\nsuperior classification accuracy and efficiency. Although the accuracy of CNNs\nis comparatively better but it can be improved by introducing some hybrid\nmodels by combining various machine learning and deep learning models.\nTherefore, in this paper, an ensemble learning technique is proposed for early\ndetection and management of DR with higher accuracy. The proposed model is\ntested on the APTOS dataset and it is showing supremacy on the validation\naccuracy ($99\\%)$ in comparison to the previous models. Hence, the model can be\nhelpful for early detection and treatment of the DR, thereby enhancing the\noverall quality of care for affected individuals.\n","authors":["Saideep Kilaru","Kothamasu Jayachandra","Tanishka Yagneshwar","Suchi Kumari"],"pdf_url":"https://arxiv.org/pdf/2407.17755v1.pdf","comment":"18 pages, 7 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2407.15706v3","updated":"2024-07-25T03:51:18Z","published":"2024-07-22T15:16:47Z","title":"Multi-Modality Co-Learning for Efficient Skeleton-based Action\n Recognition","summary":" Skeleton-based action recognition has garnered significant attention due to\nthe utilization of concise and resilient skeletons. 
Nevertheless, the absence\nof detailed body information in skeletons restricts performance, while other\nmultimodal methods require substantial inference resources and are inefficient\nwhen using multimodal data during both training and inference stages. To\naddress this and fully harness the complementary multimodal features, we\npropose a novel multi-modality co-learning (MMCL) framework by leveraging the\nmultimodal large language models (LLMs) as auxiliary networks for efficient\nskeleton-based action recognition, which engages in multi-modality co-learning\nduring the training stage and keeps efficiency by employing only concise\nskeletons in inference. Our MMCL framework primarily consists of two modules.\nFirst, the Feature Alignment Module (FAM) extracts rich RGB features from video\nframes and aligns them with global skeleton features via contrastive learning.\nSecond, the Feature Refinement Module (FRM) uses RGB images with temporal\ninformation and text instruction to generate instructive features based on the\npowerful generalization of multimodal LLMs. These instructive text features\nwill further refine the classification scores and the refined scores will\nenhance the model's robustness and generalization in a manner similar to soft\nlabels. Extensive experiments on NTU RGB+D, NTU RGB+D 120 and Northwestern-UCLA\nbenchmarks consistently verify the effectiveness of our MMCL, which outperforms\nthe existing skeleton-based action recognition methods. Meanwhile, experiments\non UTD-MHAD and SYSU-Action datasets demonstrate the commendable generalization\nof our MMCL in zero-shot and domain-adaptive action recognition. Our code is\npublicly available at: https://github.com/liujf69/MMCL-Action.\n","authors":["Jinfu Liu","Chen Chen","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.15706v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17744v1","updated":"2024-07-25T03:35:24Z","published":"2024-07-25T03:35:24Z","title":"Balancing Complementarity and Consistency via Delayed Activation in\n Incomplete Multi-view Clustering","summary":" This paper studies one challenging issue in incomplete multi-view clustering,\nwhere valuable complementary information from other views is always ignored. To\nbe specific, we propose a framework that effectively balances Complementarity\nand Consistency information in Incomplete Multi-view Clustering (CoCo-IMC).\nSpecifically, we design a dual network of delayed activation, which achieves a\nbalance of complementarity and consistency among different views. The delayed\nactivation could enrich the complementarity information that was ignored\nduring consistency learning. Then, we recover the incomplete information and\nenhance the consistency learning by minimizing the conditional entropy and\nmaximizing the mutual information across different views. This could be the\nfirst theoretical attempt to incorporate delayed activation into incomplete\ndata recovery and the balance of complementarity and consistency. 
We have\nproved the effectiveness of CoCo-IMC in extensive comparative experiments with\n12 state-of-the-art baselines on four publicly available datasets.\n","authors":["Bo Li"],"pdf_url":"https://arxiv.org/pdf/2407.17744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17738v1","updated":"2024-07-25T03:26:41Z","published":"2024-07-25T03:26:41Z","title":"Enhancing Fine-grained Object Detection in Aerial Images via Orthogonal\n Mapping","summary":" Fine-Grained Object Detection (FGOD) is a critical task in high-resolution\naerial image analysis. This letter introduces Orthogonal Mapping (OM), a simple\nyet effective method aimed at addressing the challenge of semantic confusion\ninherent in FGOD. OM introduces orthogonal constraints in the feature space by\ndecoupling features from the last layer of the classification branch with a\nclass-wise orthogonal vector basis. This effectively mitigates semantic\nconfusion and enhances classification accuracy. Moreover, OM can be seamlessly\nintegrated into mainstream object detectors. Extensive experiments conducted on\nthree FGOD datasets (FAIR1M, ShipRSImageNet, and MAR20) demonstrate the\neffectiveness and superiority of the proposed approach. Notably, with just one\nline of code, OM achieves a 4.08% improvement in mean Average Precision (mAP)\nover FCOS on the ShipRSImageNet dataset. Codes are released at\nhttps://github.com/ZhuHaoranEIS/Orthogonal-FGOD.\n","authors":["Haoran Zhu","Yifan Zhou","Chang Xu","Ruixiang Zhang","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2407.17738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17734v1","updated":"2024-07-25T03:12:57Z","published":"2024-07-25T03:12:57Z","title":"Cost-effective Instruction Learning for Pathology Vision and Language\n Analysis","summary":" The advent of vision-language models fosters the interactive conversations\nbetween AI-enabled models and humans. Yet applying these models into clinics\nmust deal with daunting challenges around large-scale training data, financial,\nand computational resources. Here we propose a cost-effective instruction\nlearning framework for conversational pathology named as CLOVER. CLOVER only\ntrains a lightweight module and uses instruction tuning while freezing the\nparameters of the large language model. Instead of using costly GPT-4, we\npropose well-designed prompts on GPT-3.5 for building generation-based\ninstructions, emphasizing the utility of pathological knowledge derived from\nthe Internet source. To augment the use of instructions, we construct a\nhigh-quality set of template-based instructions in the context of digital\npathology. From two benchmark datasets, our findings reveal the strength of\nhybrid-form instructions in the visual question-answer in pathology. Extensive\nresults show the cost-effectiveness of CLOVER in answering both open-ended and\nclosed-ended questions, where CLOVER outperforms strong baselines that possess\n37 times more training parameters and use instruction data generated from\nGPT-4. Through the instruction tuning, CLOVER exhibits robustness of few-shot\nlearning in the external clinical dataset. 
These findings demonstrate that\ncost-effective modeling of CLOVER could accelerate the adoption of rapid\nconversational applications in the landscape of digital pathology.\n","authors":["Kaitao Chen","Mianxin Liu","Fang Yan","Lei Ma","Xiaoming Shi","Lilong Wang","Xiaosong Wang","Lifeng Zhu","Zhe Wang","Mu Zhou","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.17734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17726v1","updated":"2024-07-25T02:55:39Z","published":"2024-07-25T02:55:39Z","title":"Multi-modal Data Binding for Survival Analysis Modeling with Incomplete\n Data and Annotations","summary":" Survival analysis stands as a pivotal process in cancer treatment research,\ncrucial for predicting patient survival rates accurately. Recent advancements\nin data collection techniques have paved the way for enhancing survival\npredictions by integrating information from multiple modalities. However,\nreal-world scenarios often present challenges with incomplete data,\nparticularly when dealing with censored survival labels. Prior works have\naddressed missing modalities but have overlooked incomplete labels, which can\nintroduce bias and limit model efficacy. To bridge this gap, we introduce a\nnovel framework that simultaneously handles incomplete data across modalities\nand censored survival labels. Our approach employs advanced foundation models\nto encode individual modalities and align them into a universal representation\nspace for seamless fusion. By generating pseudo labels and incorporating\nuncertainty, we significantly enhance predictive accuracy. The proposed method\ndemonstrates outstanding prediction accuracy in two survival analysis tasks on\nboth employed datasets. This innovative approach overcomes limitations\nassociated with disparate modalities and improves the feasibility of\ncomprehensive survival analysis using multiple large foundation models.\n","authors":["Linhao Qu","Dan Huang","Shaoting Zhang","Xiaosong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.17726v1.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.01599v2","updated":"2024-07-25T02:25:11Z","published":"2024-06-26T02:20:23Z","title":"JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large\n Language and Vision-Language Models","summary":" The rapid evolution of artificial intelligence (AI) through developments in\nLarge Language Models (LLMs) and Vision-Language Models (VLMs) has brought\nsignificant advancements across various technological domains. While these\nmodels enhance capabilities in natural language processing and visual\ninteractive tasks, their growing adoption raises critical concerns regarding\nsecurity and ethical alignment. This survey provides an extensive review of the\nemerging field of jailbreaking--deliberately circumventing the ethical and\noperational boundaries of LLMs and VLMs--and the consequent development of\ndefense mechanisms. Our study categorizes jailbreaks into seven distinct types\nand elaborates on defense strategies that address these vulnerabilities.\nThrough this comprehensive examination, we identify research gaps and propose\ndirections for future studies to enhance the security frameworks of LLMs and\nVLMs. Our findings underscore the necessity for a unified perspective that\nintegrates both jailbreak strategies and defensive solutions to foster a\nrobust, secure, and reliable environment for the next generation of language\nmodels. 
More details can be found on our website:\n\\url{https://chonghan-chen.com/llm-jailbreak-zoo-survey/}.\n","authors":["Haibo Jin","Leyang Hu","Xinuo Li","Peiyan Zhang","Chonghan Chen","Jun Zhuang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.01599v2.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2403.17839v2","updated":"2024-07-25T02:08:30Z","published":"2024-03-26T16:27:37Z","title":"ReMamber: Referring Image Segmentation with Mamba Twister","summary":" Referring Image Segmentation~(RIS) leveraging transformers has achieved great\nsuccess on the interpretation of complex visual-language tasks. However, the\nquadratic computation cost makes it resource-consuming in capturing long-range\nvisual-language dependencies. Fortunately, Mamba addresses this with efficient\nlinear complexity in processing. However, directly applying Mamba to\nmulti-modal interactions presents challenges, primarily due to inadequate\nchannel interactions for the effective fusion of multi-modal data. In this\npaper, we propose ReMamber, a novel RIS architecture that integrates the power\nof Mamba with a multi-modal Mamba Twister block. The Mamba Twister explicitly\nmodels image-text interaction, and fuses textual and visual features through\nits unique channel and spatial twisting mechanism. We achieve competitive\nresults on three challenging benchmarks with a simple and efficient\narchitecture. Moreover, we conduct thorough analyses of ReMamber and discuss\nother fusion designs using Mamba. These provide valuable perspectives for\nfuture research. The code has been released at:\nhttps://github.com/yyh-rain-song/ReMamber.\n","authors":["Yuhuan Yang","Chaofan Ma","Jiangchao Yao","Zhun Zhong","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.17839v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.05623v3","updated":"2024-07-25T01:59:16Z","published":"2024-07-08T05:31:51Z","title":"Momentum Auxiliary Network for Supervised Local Learning","summary":" Deep neural networks conventionally employ end-to-end backpropagation for\ntheir training process, which lacks biological credibility and triggers a\nlocking dilemma during network parameter updates, leading to significant GPU\nmemory use. Supervised local learning, which segments the network into multiple\nlocal blocks updated by independent auxiliary networks. However, these methods\ncannot replace end-to-end training due to lower accuracy, as gradients only\npropagate within their local block, creating a lack of information exchange\nbetween blocks. To address this issue and establish information transfer across\nblocks, we propose a Momentum Auxiliary Network (MAN) that establishes a\ndynamic interaction mechanism. The MAN leverages an exponential moving average\n(EMA) of the parameters from adjacent local blocks to enhance information flow.\nThis auxiliary network, updated through EMA, helps bridge the informational gap\nbetween blocks. Nevertheless, we observe that directly applying EMA parameters\nhas certain limitations due to feature discrepancies among local blocks. To\novercome this, we introduce learnable biases, further boosting performance. We\nhave validated our method on four image classification datasets (CIFAR-10,\nSTL-10, SVHN, ImageNet), attaining superior performance and substantial memory\nsavings. Notably, our method can reduce GPU memory usage by more than 45\\% on\nthe ImageNet dataset compared to end-to-end training, while achieving higher\nperformance. 
The Momentum Auxiliary Network thus offers a new perspective for\nsupervised local learning. Our code is available at:\nhttps://github.com/JunhaoSu0/MAN.\n","authors":["Junhao Su","Changpeng Cai","Feiyu Zhu","Chenghao He","Xiaojie Xu","Dongzhi Guan","Chenyang Si"],"pdf_url":"https://arxiv.org/pdf/2407.05623v3.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2407.17705v1","updated":"2024-07-25T01:58:10Z","published":"2024-07-25T01:58:10Z","title":"ALMRR: Anomaly Localization Mamba on Industrial Textured Surface with\n Feature Reconstruction and Refinement","summary":" Unsupervised anomaly localization on industrial textured images has achieved\nremarkable results through reconstruction-based methods, yet existing\napproaches based on image reconstruction and feature reconstruction each have\ntheir own shortcomings. Firstly, image-based methods tend to reconstruct both\nnormal and anomalous regions well, which leads to over-generalization.\nFeature-based methods contain a large amount of distinguishable semantic\ninformation; however, their feature structure is redundant and lacks anomalous\ninformation, which leads to significant reconstruction errors. In this paper,\nwe propose an Anomaly Localization method based on Mamba with Feature\nReconstruction and Refinement (ALMRR), which reconstructs semantic features\nbased on Mamba and then refines them through a feature refinement module. To\nequip the model with prior knowledge of anomalies, we enhance it by adding\nartificially simulated anomalies to the original images. Unlike image\nreconstruction or repair, the features of synthesized defects are repaired\nalong with those of normal areas. Finally, the aligned features containing rich\nsemantic information are fed into the refinement module to obtain the anomaly\nmap. Extensive experiments have been conducted on the MVTec-AD-Textured dataset\nand other real-world industrial datasets, demonstrating superior\nperformance compared to state-of-the-art (SOTA) methods.\n","authors":["Shichen Qu","Xian Tao","Zhen Qu","Xinyi Gong","Zhengtao Zhang","Mukesh Prasad"],"pdf_url":"https://arxiv.org/pdf/2407.17705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19919v4","updated":"2024-07-25T01:50:46Z","published":"2024-03-29T02:10:38Z","title":"Diff-Reg v1: Diffusion Matching Model for Registration Problem","summary":" Establishing reliable correspondences is essential for registration tasks\nsuch as 3D and 2D3D registration. Existing methods commonly leverage geometric\nor semantic point features to generate potential correspondences. However,\nthese features may face challenges such as large deformation, scale\ninconsistency, and ambiguous matching problems (e.g., symmetry). Additionally,\nmany previous methods, which rely on single-pass prediction, may struggle with\nlocal minima in complex scenarios. To mitigate these challenges, we introduce a\ndiffusion matching model for robust correspondence construction. Our approach\ntreats correspondence estimation as a denoising diffusion process within the\ndoubly stochastic matrix space, which gradually denoises (refines) a doubly\nstochastic matching matrix to the ground-truth one for high-quality\ncorrespondence estimation. 
It involves a forward diffusion process that\ngradually introduces Gaussian noise into the ground truth matching matrix and a\nreverse denoising process that iteratively refines the noisy matching matrix.\nIn particular, the feature extraction from the backbone occurs only once during\nthe inference phase. Our lightweight denoising module utilizes the same feature\nat each reverse sampling step. Evaluation of our method on both 3D and 2D3D\nregistration tasks confirms its effectiveness. The code is available at\nhttps://github.com/wuqianliang/Diff-Reg.\n","authors":["Qianliang Wu","Haobo Jiang","Lei Luo","Jun Li","Yaqing Ding","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.19919v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.00436"},{"id":"http://arxiv.org/abs/2402.17228v4","updated":"2024-07-25T01:20:23Z","published":"2024-02-27T05:42:38Z","title":"Feature Re-Embedding: Towards Foundation Model-Level Performance in\n Computational Pathology","summary":" Multiple instance learning (MIL) is the most widely used framework in\ncomputational pathology, encompassing sub-typing, diagnosis, prognosis, and\nmore. However, the existing MIL paradigm typically requires an offline instance\nfeature extractor, such as a pre-trained ResNet or a foundation model. This\napproach lacks the capability for feature fine-tuning within the specific\ndownstream tasks, limiting its adaptability and performance. To address this\nissue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding\nthe instance features online, which captures fine-grained local features and\nestablishes connections across different regions. Unlike existing works that\nfocus on pre-training powerful feature extractor or designing sophisticated\ninstance aggregator, R$^2$T is tailored to re-embed instance features online.\nIt serves as a portable module that can seamlessly integrate into mainstream\nMIL models. Extensive experimental results on common computational pathology\ntasks validate that: 1) feature re-embedding improves the performance of MIL\nmodels based on ResNet-50 features to the level of foundation model features,\nand further enhances the performance of foundation model features; 2) the\nR$^2$T can introduce more significant performance improvements to various MIL\nmodels; 3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other latest\nmethods by a large margin.The code is available at:\nhttps://github.com/DearCaat/RRT-MIL.\n","authors":["Wenhao Tang","Fengtao Zhou","Sheng Huang","Xiang Zhu","Yi Zhang","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17228v4.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2407.17689v1","updated":"2024-07-25T01:12:48Z","published":"2024-07-25T01:12:48Z","title":"SAM-MIL: A Spatial Contextual Aware Multiple Instance Learning Approach\n for Whole Slide Image Classification","summary":" Multiple Instance Learning (MIL) represents the predominant framework in\nWhole Slide Image (WSI) classification, covering aspects such as sub-typing,\ndiagnosis, and beyond. Current MIL models predominantly rely on instance-level\nfeatures derived from pretrained models such as ResNet. These models segment\neach WSI into independent patches and extract features from these local\npatches, leading to a significant loss of global spatial context and\nrestricting the model's focus to merely local features. 
To address this issue,\nwe propose a novel MIL framework, named SAM-MIL, that emphasizes spatial\ncontextual awareness and explicitly incorporates spatial context by extracting\ncomprehensive, image-level information. The Segment Anything Model (SAM)\nrepresents a pioneering visual segmentation foundational model that can capture\nsegmentation features without the need for additional fine-tuning, rendering it\nan outstanding tool for extracting spatial context directly from raw WSIs. Our\napproach includes the design of group feature extraction based on spatial\ncontext and a SAM-Guided Group Masking strategy to mitigate class imbalance\nissues. We implement a dynamic mask ratio for different segmentation categories\nand supplement these with representative group features of categories.\nMoreover, SAM-MIL divides instances to generate additional pseudo-bags, thereby\naugmenting the training set, and introduces consistency of spatial context\nacross pseudo-bags to further enhance the model's performance. Experimental\nresults on the CAMELYON-16 and TCGA Lung Cancer datasets demonstrate that our\nproposed SAM-MIL model outperforms existing mainstream methods in WSIs\nclassification. Our open-source implementation code is available at\nhttps://github.com/FangHeng/SAM-MIL.\n","authors":["Heng Fang","Sheng Huang","Wenhao Tang","Luwen Huangfu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17689v1.pdf","comment":"accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2302.00509v2","updated":"2024-07-25T01:09:57Z","published":"2023-02-01T15:28:55Z","title":"Exploring Semantic Perturbations on Grover","summary":" With news and information being as easy to access as they currently are, it\nis more important than ever to ensure that people are not misled by what they\nread. Recently, the rise of neural fake news (AI-generated fake news) and its\ndemonstrated effectiveness at fooling humans has prompted the development of\nmodels to detect it. One such model is the Grover model, which can both detect\nneural fake news to prevent it, and generate it to demonstrate how a model\ncould be misused to fool human readers. In this work we explore the Grover\nmodel's fake news detection capabilities by performing targeted attacks through\nperturbations on input news articles. Through this we test Grover's resilience\nto these adversarial attacks and expose some potential vulnerabilities which\nshould be addressed in further iterations to ensure it can detect all types of\nfake news accurately.\n","authors":["Ziqing Ji","Pranav Kulkarni","Marko Neskovic","Kevin Nolan","Yan Xu"],"pdf_url":"https://arxiv.org/pdf/2302.00509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12736v3","updated":"2024-07-25T00:00:18Z","published":"2024-07-17T16:56:06Z","title":"CHOSEN: Compilation to Hardware Optimization Stack for Efficient Vision\n Transformer Inference","summary":" Vision Transformers (ViTs) represent a groundbreaking shift in machine\nlearning approaches to computer vision. Unlike traditional approaches, ViTs\nemploy the self-attention mechanism, which has been widely used in natural\nlanguage processing, to analyze image patches. Despite their advantages in\nmodeling visual tasks, deploying ViTs on hardware platforms, notably\nField-Programmable Gate Arrays (FPGAs), introduces considerable challenges.\nThese challenges stem primarily from the non-linear calculations and high\ncomputational and memory demands of ViTs. 
This paper introduces CHOSEN, a\nsoftware-hardware co-design framework to address these challenges and offer an\nautomated framework for ViT deployment on the FPGAs in order to maximize\nperformance. Our framework is built upon three fundamental contributions:\nmulti-kernel design to maximize the bandwidth, mainly targeting benefits of\nmulti DDR memory banks, approximate non-linear functions that exhibit minimal\naccuracy degradation, and efficient use of available logic blocks on the FPGA,\nand efficient compiler to maximize the performance and memory-efficiency of the\ncomputing kernels by presenting a novel algorithm for design space exploration\nto find optimal hardware configuration that achieves optimal throughput and\nlatency. Compared to the state-of-the-art ViT accelerators, CHOSEN achieves a\n1.5x and 1.42x improvement in the throughput on the DeiT-S and DeiT-B models.\n","authors":["Mohammad Erfan Sadeghi","Arash Fayyazi","Suhas Somashekar","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2407.12736v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18428v1","updated":"2024-07-25T23:27:10Z","published":"2024-07-25T23:27:10Z","title":"Weighted Risk Invariance: Domain Generalization under Invariant Feature\n Shift","summary":" Learning models whose predictions are invariant under multiple environments\nis a promising approach for out-of-distribution generalization. Such models are\ntrained to extract features $X_{\\text{inv}}$ where the conditional distribution\n$Y \\mid X_{\\text{inv}}$ of the label given the extracted features does not\nchange across environments. Invariant models are also supposed to generalize to\nshifts in the marginal distribution $p(X_{\\text{inv}})$ of the extracted\nfeatures $X_{\\text{inv}}$, a type of shift we call an $\\textit{invariant\ncovariate shift}$. However, we show that proposed methods for learning\ninvariant models underperform under invariant covariate shift, either failing\nto learn invariant models$\\unicode{x2014}$even for data generated from simple\nand well-studied linear-Gaussian models$\\unicode{x2014}$or having poor\nfinite-sample performance. To alleviate these problems, we propose\n$\\textit{weighted risk invariance}$ (WRI). Our framework is based on imposing\ninvariance of the loss across environments subject to appropriate reweightings\nof the training examples. We show that WRI provably learns invariant models,\ni.e. discards spurious correlations, in linear-Gaussian settings. We propose a\npractical algorithm to implement WRI by learning the density\n$p(X_{\\text{inv}})$ and the model parameters simultaneously, and we demonstrate\nempirically that WRI outperforms previous invariant learning methods under\ninvariant covariate shift.\n","authors":["Gina Wong","Joshua Gleason","Rama Chellappa","Yoav Wald","Anqi Liu"],"pdf_url":"https://arxiv.org/pdf/2407.18428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15931v4","updated":"2024-07-25T22:45:41Z","published":"2024-03-23T20:30:28Z","title":"X-Portrait: Expressive Portrait Animation with Hierarchical Motion\n Attention","summary":" We propose X-Portrait, an innovative conditional diffusion model tailored for\ngenerating expressive and temporally coherent portrait animation. Specifically,\ngiven a single portrait as appearance reference, we aim to animate it with\nmotion derived from a driving video, capturing both highly dynamic and subtle\nfacial expressions along with wide-range head movements. 
As its core, we\nleverage the generative prior of a pre-trained diffusion model as the rendering\nbackbone, while achieve fine-grained head pose and expression control with\nnovel controlling signals within the framework of ControlNet. In contrast to\nconventional coarse explicit controls such as facial landmarks, our motion\ncontrol module is learned to interpret the dynamics directly from the original\ndriving RGB inputs. The motion accuracy is further enhanced with a patch-based\nlocal control module that effectively enhance the motion attention to\nsmall-scale nuances like eyeball positions. Notably, to mitigate the identity\nleakage from the driving signals, we train our motion control modules with\nscaling-augmented cross-identity images, ensuring maximized disentanglement\nfrom the appearance reference modules. Experimental results demonstrate the\nuniversal effectiveness of X-Portrait across a diverse range of facial\nportraits and expressive driving sequences, and showcase its proficiency in\ngenerating captivating portrait animations with consistently maintained\nidentity characteristics.\n","authors":["You Xie","Hongyi Xu","Guoxian Song","Chao Wang","Yichun Shi","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2403.15931v4.pdf","comment":"SIGGRAPH 2024"},{"id":"http://arxiv.org/abs/2407.12491v2","updated":"2024-07-25T21:55:44Z","published":"2024-07-17T11:17:20Z","title":"Hierarchical and Decoupled BEV Perception Learning Framework for\n Autonomous Driving","summary":" Perception is essential for autonomous driving system. Recent approaches\nbased on Bird's-eye-view (BEV) and deep learning have made significant\nprogress. However, there exists challenging issues including lengthy\ndevelopment cycles, poor reusability, and complex sensor setups in perception\nalgorithm development process. To tackle the above challenges, this paper\nproposes a novel hierarchical BEV perception paradigm, aiming to provide a\nlibrary of fundamental perception modules and user-friendly graphical\ninterface, enabling swift construction of customized models. We conduct the\nPretrain-Finetune strategy to effectively utilize large scale public datasets\nand streamline development processes. Moreover, we present a Multi-Module\nLearning (MML) approach, enhancing performance through synergistic and\niterative training of multiple models. Extensive experimental results on the\nNuscenes dataset demonstrate that our approach renders significant improvement\nover the traditional training scheme.\n","authors":["Yuqi Dai","Jian Sun","Shengbo Eben Li","Qing Xu","Jianqiang Wang","Lei He","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2407.12491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09114v2","updated":"2024-07-25T21:30:41Z","published":"2024-05-15T06:14:31Z","title":"SOEDiff: Efficient Distillation for Small Object Editing","summary":" In this paper, we delve into a new task known as small object editing (SOE),\nwhich focuses on text-based image inpainting within a constrained, small-sized\narea. Despite the remarkable success have been achieved by current image\ninpainting approaches, their application to the SOE task generally results in\nfailure cases such as Object Missing, Text-Image Mismatch, and Distortion.\nThese failures stem from the limited use of small-sized objects in training\ndatasets and the downsampling operations employed by U-Net models, which\nhinders accurate generation. 
To overcome these challenges, we introduce a novel\ntraining-based approach, SOEDiff, aimed at enhancing the capability of baseline\nmodels like StableDiffusion in editing small-sized objects while minimizing\ntraining costs. Specifically, our method involves two key components: SO-LoRA,\nwhich efficiently fine-tunes low-rank matrices, and Cross-Scale Score\nDistillation loss, which leverages high-resolution predictions from the\npre-trained teacher diffusion model. Our method presents significant\nimprovements on the test dataset collected from MSCOCO and OpenImage,\nvalidating the effectiveness of our proposed method in small object editing. In\nparticular, when comparing SOEDiff with SD-I model on the OpenImage-f dataset,\nwe observe a 0.99 improvement in CLIP-Score and a reduction of 2.87 in FID.\n","authors":["Yiming Wu","Qihe Pan","Zhen Zhao","Zicheng Wang","Sifan Long","Ronghua Liang"],"pdf_url":"https://arxiv.org/pdf/2405.09114v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18392v1","updated":"2024-07-25T20:55:23Z","published":"2024-07-25T20:55:23Z","title":"A Reference-Based 3D Semantic-Aware Framework for Accurate Local Facial\n Attribute Editing","summary":" Facial attribute editing plays a crucial role in synthesizing realistic faces\nwith specific characteristics while maintaining realistic appearances. Despite\nadvancements, challenges persist in achieving precise, 3D-aware attribute\nmodifications, which are crucial for consistent and accurate representations of\nfaces from different angles. Current methods struggle with semantic\nentanglement and lack effective guidance for incorporating attributes while\nmaintaining image integrity. To address these issues, we introduce a novel\nframework that merges the strengths of latent-based and reference-based editing\nmethods. Our approach employs a 3D GAN inversion technique to embed attributes\nfrom the reference image into a tri-plane space, ensuring 3D consistency and\nrealistic viewing from multiple perspectives. We utilize blending techniques\nand predicted semantic masks to locate precise edit regions, merging them with\nthe contextual guidance from the reference image. A coarse-to-fine inpainting\nstrategy is then applied to preserve the integrity of untargeted areas,\nsignificantly enhancing realism. Our evaluations demonstrate superior\nperformance across diverse editing tasks, validating our framework's\neffectiveness in realistic and applicable facial attribute editing.\n","authors":["Yu-Kai Huang","Yutong Zheng","Yen-Shuo Su","Anudeepsekhar Bolimera","Han Zhang","Fangyi Chen","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2407.18392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02737v2","updated":"2024-07-25T20:49:45Z","published":"2023-04-05T20:36:04Z","title":"Efficient OCR for Building a Diverse Digital History","summary":" Thousands of users consult digital archives daily, but the information they\ncan access is unrepresentative of the diversity of documentary history. The\nsequence-to-sequence architecture typically used for optical character\nrecognition (OCR) - which jointly learns a vision and language model - is\npoorly extensible to low-resource document collections, as learning a\nlanguage-vision model requires extensive labeled sequences and compute. This\nstudy models OCR as a character level image retrieval problem, using a\ncontrastively trained vision encoder. 
Because the model only learns characters'\nvisual features, it is more sample efficient and extensible than existing\narchitectures, enabling accurate OCR in settings where existing solutions fail.\nCrucially, the model opens new avenues for community engagement in making\ndigital history more representative of documentary history.\n","authors":["Jacob Carlson","Tom Bryan","Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2304.02737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18391v1","updated":"2024-07-25T20:49:22Z","published":"2024-07-25T20:49:22Z","title":"UOUO: Uncontextualized Uncommon Objects for Measuring Knowledge Horizons\n of Vision Language Models","summary":" Smaller-scale Vision-Langauge Models (VLMs) often claim to perform on par\nwith larger models in general-domain visual grounding and question-answering\nbenchmarks while offering advantages in computational efficiency and storage.\nHowever, their ability to handle rare objects, which fall into the long tail of\ndata distributions, is less understood. To rigorously evaluate this aspect, we\nintroduce the \"Uncontextualized Uncommon Objects\" (UOUO) benchmark. This\nbenchmark focuses on systematically testing VLMs with both large and small\nparameter counts on rare and specialized objects. Our comprehensive analysis\nreveals that while smaller VLMs maintain competitive performance on common\ndatasets, they significantly underperform on tasks involving uncommon objects.\nWe also propose an advanced, scalable pipeline for data collection and\ncleaning, ensuring the UOUO benchmark provides high-quality, challenging\ninstances. These findings highlight the need to consider long-tail\ndistributions when assessing the true capabilities of VLMs.\n","authors":["Xinyu Pi","Mingyuan Wu","Jize Jiang","Haozhen Zheng","Beitong Tian","Chengxiang Zhai","Klara Nahrstedt","Zhiting Hu"],"pdf_url":"https://arxiv.org/pdf/2407.18391v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2407.18390v1","updated":"2024-07-25T20:47:43Z","published":"2024-07-25T20:47:43Z","title":"Adapting Mouse Pathological Model to Human Glomerular Lesion\n Segmentation","summary":" Moving from animal models to human applications in preclinical research\nencompasses a broad spectrum of disciplines in medical science. A fundamental\nelement in the development of new drugs, treatments, diagnostic methods, and in\ndeepening our understanding of disease processes is the accurate measurement of\nkidney tissues. Past studies have demonstrated the viability of translating\nglomeruli segmentation techniques from mouse models to human applications. Yet,\nthese investigations tend to neglect the complexities involved in segmenting\npathological glomeruli affected by different lesions. Such lesions present a\nwider range of morphological variations compared to healthy glomerular tissue,\nwhich are arguably more valuable than normal glomeruli in clinical practice.\nFurthermore, data on lesions from animal models can be more readily scaled up\nfrom disease models and whole kidney biopsies. 
This brings up a question:\n``\\textit{Can a pathological segmentation model trained on mouse models be\neffectively applied to human patients?}\" To answer this question, we introduced\nGLAM, a deep learning study for fine-grained segmentation of human kidney\nlesions using a mouse model, addressing mouse-to-human transfer learning, by\nevaluating different learning strategies for segmenting human pathological\nlesions using zero-shot transfer learning and hybrid learning by leveraging\nmouse samples. From the results, the hybrid learning model achieved superior\nperformance.\n","authors":["Lining Yu","Mengmeng Yin","Ruining Deng","Quan Liu","Tianyuan Yao","Can Cui","Yu Wang","Yaohong Wang","Shilin Zhao","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2407.18390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11238v2","updated":"2024-07-25T20:40:13Z","published":"2024-07-15T21:04:11Z","title":"Evaluating geometric accuracy of NeRF reconstructions compared to SLAM\n method","summary":" As Neural Radiance Field (NeRF) implementations become faster, more efficient\nand accurate, their applicability to real world mapping tasks becomes more\naccessible. Traditionally, 3D mapping, or scene reconstruction, has relied on\nexpensive LiDAR sensing. Photogrammetry can perform image-based 3D\nreconstruction but is computationally expensive and requires extremely dense\nimage representation to recover complex geometry and photorealism. NeRFs\nperform 3D scene reconstruction by training a neural network on sparse image\nand pose data, achieving superior results to photogrammetry with less input\ndata. This paper presents an evaluation of two NeRF scene reconstructions for\nthe purpose of estimating the diameter of a vertical PVC cylinder. One of these\nare trained on commodity iPhone data and the other is trained on robot-sourced\nimagery and poses. This neural-geometry is compared to state-of-the-art\nlidar-inertial SLAM in terms of scene noise and metric-accuracy.\n","authors":["Adam Korycki","Colleen Josephson","Steve McGuire"],"pdf_url":"https://arxiv.org/pdf/2407.11238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18381v1","updated":"2024-07-25T20:31:40Z","published":"2024-07-25T20:31:40Z","title":"Neural Surface Detection for Unsigned Distance Fields","summary":" Extracting surfaces from Signed Distance Fields (SDFs) can be accomplished\nusing traditional algorithms, such as Marching Cubes. However, since they rely\non sign flips across the surface, these algorithms cannot be used directly on\nUnsigned Distance Fields (UDFs). In this work, we introduce a deep-learning\napproach to taking a UDF and turning it locally into an SDF, so that it can be\neffectively triangulated using existing algorithms. We show that it achieves\nbetter accuracy in surface detection than existing methods. Furthermore it\ngeneralizes well to unseen shapes and datasets, while being parallelizable. 
We\nalso demonstrate the flexibility of the method by using it in conjunction with\nDualMeshUDF, a state-of-the-art dual meshing method that can operate on UDFs,\nimproving its results and removing the need to tune its parameters.\n","authors":["Federico Stella","Nicolas Talabot","Hieu Le","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2407.18381v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.16789v2","updated":"2024-07-25T20:20:03Z","published":"2024-07-23T18:42:37Z","title":"What Matters in Range View 3D Object Detection","summary":" Lidar-based perception pipelines rely on 3D object detection models to\ninterpret complex scenes. While multiple representations for lidar exist, the\nrange-view is enticing since it losslessly encodes the entire lidar sensor\noutput. In this work, we achieve state-of-the-art amongst range-view 3D object\ndetection models without using multiple techniques proposed in past range-view\nliterature. We explore range-view 3D object detection across two modern\ndatasets with substantially different properties: Argoverse 2 and Waymo Open.\nOur investigation reveals key insights: (1) input feature dimensionality\nsignificantly influences the overall performance, (2) surprisingly, employing a\nclassification loss grounded in 3D spatial proximity works as well or better\ncompared to more elaborate IoU-based losses, and (3) addressing non-uniform\nlidar density via a straightforward range subsampling technique outperforms\nexisting multi-resolution, range-conditioned networks. Our experiments reveal\nthat techniques proposed in recent range-view literature are not needed to\nachieve state-of-the-art performance. Combining the above findings, we\nestablish a new state-of-the-art model for range-view 3D object detection --\nimproving AP by 2.2% on the Waymo Open dataset while maintaining a runtime of\n10 Hz. We establish the first range-view model on the Argoverse 2 dataset and\noutperform strong voxel-based baselines. All models are multi-class and\nopen-source. Code is available at\nhttps://github.com/benjaminrwilson/range-view-3d-detection.\n","authors":["Benjamin Wilson","Nicholas Autio Mitchell","Jhony Kaesemodel Pontes","James Hays"],"pdf_url":"https://arxiv.org/pdf/2407.16789v2.pdf","comment":"Fixed broken link"},{"id":"http://arxiv.org/abs/2407.15239v2","updated":"2024-07-25T19:52:38Z","published":"2024-07-21T18:08:44Z","title":"Assessing Brittleness of Image-Text Retrieval Benchmarks from\n Vision-Language Models Perspective","summary":" Image-text retrieval (ITR), an important task in information retrieval (IR),\nis driven by pretrained vision-language models (VLMs) that consistently achieve\nstate-of-the-art performance. However, a significant challenge lies in the\nbrittleness of existing ITR benchmarks. In standard datasets for the task,\ncaptions often provide broad summaries of scenes, neglecting detailed\ninformation about specific concepts. Additionally, the current evaluation setup\nassumes simplistic binary matches between images and texts and focuses on\nintra-modality rather than cross-modal relationships, which can lead to\nmisinterpretations of model performance. Motivated by this gap, in this study,\nwe focus on examining the brittleness of the ITR evaluation pipeline with a\nfocus on concept granularity. 
We start by analyzing two common benchmarks,\nMS-COCO and Flickr30k, and compare them with their augmented versions,\nMS-COCO-FG and Flickr30k-FG, given a specified set of linguistic features\ncapturing concept granularity. We discover that Flickr30k-FG and MS COCO-FG\nconsistently achieve higher scores across all the selected features. To\ninvestigate the performance of VLMs on coarse and fine-grained datasets, we\nintroduce a taxonomy of perturbations. We apply these perturbations to the\nselected datasets. We evaluate four state-of-the-art models - ALIGN, AltCLIP,\nCLIP, and GroupViT - on the standard and fine-grained datasets under zero-shot\nconditions, with and without the applied perturbations. The results demonstrate\nthat although perturbations generally degrade model performance, the\nfine-grained datasets exhibit a smaller performance drop than their standard\ncounterparts. Moreover, the relative performance drop across all setups is\nconsistent across all models and datasets, indicating that the issue lies\nwithin the benchmarks. We conclude the paper by providing an agenda for\nimproving ITR evaluation pipelines.\n","authors":["Mariya Hendriksen","Shuo Zhang","Ridho Reinanda","Mohamed Yahya","Edgar Meij","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2407.15239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18362v1","updated":"2024-07-25T19:51:27Z","published":"2024-07-25T19:51:27Z","title":"Retinal IPA: Iterative KeyPoints Alignment for Multimodal Retinal\n Imaging","summary":" We propose a novel framework for retinal feature point alignment, designed\nfor learning cross-modality features to enhance matching and registration\nacross multi-modality retinal images. Our model draws on the success of\nprevious learning-based feature detection and description methods. To better\nleverage unlabeled data and constrain the model to reproduce relevant\nkeypoints, we integrate a keypoint-based segmentation task. It is trained in a\nself-supervised manner by enforcing segmentation consistency between different\naugmentations of the same image. By incorporating a keypoint augmented\nself-supervised layer, we achieve robust feature extraction across modalities.\nExtensive evaluation on two public datasets and one in-house dataset\ndemonstrates significant improvements in performance for modality-agnostic\nretinal feature alignment. Our code and model weights are publicly available at\n\\url{https://github.com/MedICL-VU/RetinaIPA}.\n","authors":["Jiacheng Wang","Hao Li","Dewei Hu","Rui Xu","Xing Yao","Yuankai K. Tao","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2407.18362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01509v3","updated":"2024-07-25T19:50:32Z","published":"2024-07-01T17:53:35Z","title":"MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal\n LLMs","summary":" We introduce MIA-Bench, a new benchmark designed to evaluate multimodal large\nlanguage models (MLLMs) on their ability to strictly adhere to complex\ninstructions. Our benchmark comprises a diverse set of 400 image-prompt pairs,\neach crafted to challenge the models' compliance with layered instructions in\ngenerating accurate responses that satisfy specific requested patterns.\nEvaluation results from a wide array of state-of-the-art MLLMs reveal\nsignificant variations in performance, highlighting areas for improvement in\ninstruction fidelity. 
Additionally, we create extra training data and explore\nsupervised fine-tuning to enhance the models' ability to strictly follow\ninstructions without compromising performance on other tasks. We hope this\nbenchmark not only serves as a tool for measuring MLLM adherence to\ninstructions, but also guides future developments in MLLM training methods.\n","authors":["Yusu Qian","Hanrong Ye","Jean-Philippe Fauconnier","Peter Grasch","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2407.01509v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13052v2","updated":"2024-07-25T19:44:43Z","published":"2023-11-21T23:25:04Z","title":"Novel OCT mosaicking pipeline with Feature- and Pixel-based registration","summary":" High-resolution Optical Coherence Tomography (OCT) images are crucial for\nophthalmology studies but are limited by their relatively narrow field of view\n(FoV). Image mosaicking is a technique for aligning multiple overlapping images\nto obtain a larger FoV. Current mosaicking pipelines often struggle with\nsubstantial noise and considerable displacement between the input sub-fields.\nIn this paper, we propose a versatile pipeline for stitching multi-view\nOCT/OCTA \\textit{en face} projection images. Our method combines the strengths\nof learning-based feature matching and robust pixel-based registration to align\nmultiple images effectively. Furthermore, we advance the application of a\ntrained foundational model, Segment Anything Model (SAM), to validate\nmosaicking results in an unsupervised manner. The efficacy of our pipeline is\nvalidated using an in-house dataset and a large public dataset, where our\nmethod shows superior performance in terms of both accuracy and computational\nefficiency. We also made our evaluation tool for image mosaicking and the\ncorresponding pipeline publicly available at\n\\url{https://github.com/MedICL-VU/OCT-mosaicking}.\n","authors":["Jiacheng Wang","Hao Li","Dewei Hu","Yuankai K. Tao","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2311.13052v2.pdf","comment":"ISBI 2024 Oral"},{"id":"http://arxiv.org/abs/2407.18338v1","updated":"2024-07-25T18:52:10Z","published":"2024-07-25T18:52:10Z","title":"SMiCRM: A Benchmark Dataset of Mechanistic Molecular Images","summary":" Optical chemical structure recognition (OCSR) systems aim to extract the\nmolecular structure information, usually in the form of molecular graph or\nSMILES, from images of chemical molecules. While many tools have been developed\nfor this purpose, challenges still exist due to different types of noises that\nmight exist in the images. Specifically, we focus on the 'arrow-pushing'\ndiagrams, a typical type of chemical images to demonstrate electron flow in\nmechanistic steps. We present Structural molecular identifier of Molecular\nimages in Chemical Reaction Mechanisms (SMiCRM), a dataset designed to\nbenchmark machine recognition capabilities of chemical molecules with\narrow-pushing annotations. Comprising 453 images, it spans a broad array of\norganic chemical reactions, each illustrated with molecular structures and\nmechanistic arrows. SMiCRM offers a rich collection of annotated molecule\nimages for enhancing the benchmarking process for OCSR methods. This dataset\nincludes a machine-readable molecular identity for each image as well as\nmechanistic arrows showing electron flow during chemical reactions. 
It presents\na more authentic and challenging task for testing molecular recognition\ntechnologies, and achieving this task can greatly enrich the mechanistic\ninformation in computer-extracted chemical reaction data.\n","authors":["Ching Ting Leung","Yufan Chen","Hanyu Gao"],"pdf_url":"https://arxiv.org/pdf/2407.18338v1.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2402.17986v3","updated":"2024-07-25T18:47:03Z","published":"2024-02-28T02:06:11Z","title":"PolyOculus: Simultaneous Multi-view Image-based Novel View Synthesis","summary":" This paper considers the problem of generative novel view synthesis (GNVS),\ngenerating novel, plausible views of a scene given a limited number of known\nviews. Here, we propose a set-based generative model that can simultaneously\ngenerate multiple, self-consistent new views, conditioned on any number of\nviews. Our approach is not limited to generating a single image at a time and\ncan condition on a variable number of views. As a result, when generating a\nlarge number of views, our method is not restricted to a low-order\nautoregressive generation approach and is better able to maintain generated\nimage quality over large sets of images. We evaluate our model on standard NVS\ndatasets and show that it outperforms the state-of-the-art image-based GNVS\nbaselines. Further, we show that the model is capable of generating sets of\nviews that have no natural sequential ordering, like loops and binocular\ntrajectories, and significantly outperforms other methods on such tasks.\n","authors":["Jason J. Yu","Tristan Aumentado-Armstrong","Fereshteh Forghani","Konstantinos G. Derpanis","Marcus A. Brubaker"],"pdf_url":"https://arxiv.org/pdf/2402.17986v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18289v1","updated":"2024-07-25T15:18:28Z","published":"2024-07-25T15:18:28Z","title":"MARINE: A Computer Vision Model for Detecting Rare Predator-Prey\n Interactions in Animal Videos","summary":" Encounters between predator and prey play an essential role in ecosystems,\nbut their rarity makes them difficult to detect in video recordings. Although\nadvances in action recognition (AR) and temporal action detection (AD),\nespecially transformer-based models and vision foundation models, have achieved\nhigh performance on human action datasets, animal videos remain relatively\nunder-researched. This thesis addresses this gap by proposing the model MARINE,\nwhich utilizes motion-based frame selection designed for fast animal actions\nand DINOv2 feature extraction with a trainable classification head for action\nrecognition. MARINE outperforms VideoMAE in identifying predator attacks in\nvideos of fish, both on a small and specific coral reef dataset (81.53\\%\nagainst 52.64\\% accuracy), and on a subset of the more extensive Animal Kingdom\ndataset (94.86\\% against 83.14\\% accuracy). In a multi-label setting on a\nrepresentative sample of Animal Kingdom, MARINE achieves 23.79\\% mAP,\npositioning it mid-field among existing benchmarks. Furthermore, in an AD task\non the coral reef dataset, MARINE achieves 80.78\\% AP (against VideoMAE's\n34.89\\%) although at a lowered t-IoU threshold of 25\\%. 
Therefore, despite room\nfor improvement, MARINE offers an effective starter framework to apply to AR\nand AD tasks on animal recordings and thus contribute to the study of natural\necosystems.\n","authors":["Zsófia Katona","Seyed Sahand Mohammadi Ziabari","Fatemeh Karimi Najadasl"],"pdf_url":"https://arxiv.org/pdf/2407.18289v1.pdf","comment":"This is an MSc thesis by Zsofia Katona, supervised by the two other\n authors"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.18058v1","updated":"2024-07-25T14:15:05Z","published":"2024-07-25T14:15:05Z","title":"I can listen but cannot read: An evaluation of two-tower multimodal\n systems for instrument recognition","summary":" Music two-tower multimodal systems integrate audio and text modalities into a\njoint audio-text space, enabling direct comparison between songs and their\ncorresponding labels. These systems enable new approaches for classification\nand retrieval, leveraging both modalities. Despite the promising results they\nhave shown for zero-shot classification and retrieval tasks, closer inspection\nof the embeddings is needed. This paper evaluates the inherent zero-shot\nproperties of joint audio-text spaces for the case-study of instrument\nrecognition. We present an evaluation and analysis of two-tower systems for\nzero-shot instrument recognition and a detailed analysis of the properties of\nthe pre-joint and joint embeddings spaces. Our findings suggest that audio\nencoders alone demonstrate good quality, while challenges remain within the\ntext encoder or joint space projection. Specifically, two-tower systems exhibit\nsensitivity towards specific words, favoring generic prompts over musically\ninformed ones. Despite the large size of textual encoders, they do not yet\nleverage additional textual context or infer instruments accurately from their\ndescriptions. Lastly, a novel approach for quantifying the semantic\nmeaningfulness of the textual space leveraging an instrument ontology is\nproposed. This method reveals deficiencies in the systems' understanding of\ninstruments and provides evidence of the need for fine-tuning text encoders on\nmusical data.\n","authors":["Yannis Vasilakis","Rachel Bittner","Johan Pauwels"],"pdf_url":"https://arxiv.org/pdf/2407.18058v1.pdf","comment":"Accepted to ISMIR 2024"},{"id":"http://arxiv.org/abs/2212.06543v2","updated":"2024-07-25T13:27:08Z","published":"2022-12-13T12:56:55Z","title":"Improving Stance Detection by Leveraging Measurement Knowledge from\n Social Sciences: A Case Study of Dutch Political Tweets and Traditional\n Gender Role Division","summary":" Stance detection (SD) concerns automatically determining the viewpoint (i.e.,\nin favour of, against, or neutral) of a text's author towards a target. SD has\nbeen applied to many research topics, among which the detection of stances\nbehind political tweets is an important one. In this paper, we apply SD to a\ndataset of tweets from official party accounts in the Netherlands between 2017\nand 2021, with a focus on stances towards traditional gender role division, a\ndividing issue between (some) Dutch political parties. To implement and improve\nSD of traditional gender role division, we propose to leverage an established\nsurvey instrument from social sciences, which has been validated for the\npurpose of measuring attitudes towards traditional gender role division. 
Based\non our experiments, we show that using such a validated survey instrument helps\nto improve SD performance.\n","authors":["Qixiang Fang","Anastasia Giachanou","Ayoub Bagheri"],"pdf_url":"https://arxiv.org/pdf/2212.06543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12111v2","updated":"2024-07-25T12:52:54Z","published":"2023-12-19T12:33:38Z","title":"General-Purpose User Modeling with Behavioral Logs: A Snapchat Case\n Study","summary":" Learning general-purpose user representations based on user behavioral logs\nis an increasingly popular user modeling approach. It benefits from easily\navailable, privacy-friendly yet expressive data, and does not require extensive\nre-tuning of the upstream user model for different downstream tasks. While this\napproach has shown promise in search engines and e-commerce applications, its\nfit for instant messaging platforms, a cornerstone of modern digital\ncommunication, remains largely uncharted. We explore this research gap using\nSnapchat data as a case study. Specifically, we implement a Transformer-based\nuser model with customized training objectives and show that the model can\nproduce high-quality user representations across a broad range of evaluation\ntasks, among which we introduce three new downstream tasks that concern pivotal\ntopics in user research: user safety, engagement and churn. We also tackle the\nchallenge of efficient extrapolation of long sequences at inference time, by\napplying a novel positional encoding method.\n","authors":["Qixiang Fang","Zhihan Zhou","Francesco Barbieri","Yozen Liu","Leonardo Neves","Dong Nguyen","Daniel L. Oberski","Maarten W. Bos","Ron Dotsch"],"pdf_url":"https://arxiv.org/pdf/2312.12111v2.pdf","comment":"SIGIR 2024"},{"id":"http://arxiv.org/abs/2310.06491v2","updated":"2024-07-25T07:17:59Z","published":"2023-10-10T09:59:08Z","title":"Bridging Items and Language: A Transition Paradigm for Large Language\n Model-Based Recommendation","summary":" Harnessing Large Language Models (LLMs) for recommendation is rapidly\nemerging, which relies on two fundamental steps to bridge the recommendation\nitem space and the language space: 1) item indexing utilizes identifiers to\nrepresent items in the language space, and 2) generation grounding associates\nLLMs' generated token sequences to in-corpus items. However, previous methods\nexhibit inherent limitations in the two steps. Existing ID-based identifiers\n(e.g., numeric IDs) and description-based identifiers (e.g., titles) either\nlose semantics or lack adequate distinctiveness. Moreover, prior generation\ngrounding methods might generate invalid identifiers, thus misaligning with\nin-corpus items. To address these issues, we propose a novel Transition\nparadigm for LLM-based Recommender (named TransRec) to bridge items and\nlanguage. Specifically, TransRec presents multi-facet identifiers, which\nsimultaneously incorporate ID, title, and attribute for item indexing to pursue\nboth distinctiveness and semantics. Additionally, we introduce a specialized\ndata structure for TransRec to ensure generating valid identifiers only and\nutilize substring indexing to encourage LLMs to generate from any position of\nidentifiers. 
Lastly, TransRec presents an aggregated grounding module to\nleverage generated multi-facet identifiers to rank in-corpus items efficiently.\nWe instantiate TransRec on two backbone models, BART-large and LLaMA-7B.\nExtensive results on three real-world datasets under diverse settings validate\nthe superiority of TransRec.\n","authors":["Xinyu Lin","Wenjie Wang","Yongqi Li","Fuli Feng","See-Kiong Ng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2310.06491v2.pdf","comment":"Accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2407.17802v1","updated":"2024-07-25T06:22:08Z","published":"2024-07-25T06:22:08Z","title":"Sample Enrichment via Temporary Operations on Subsequences for\n Sequential Recommendation","summary":" Sequential recommendation leverages interaction sequences to predict\nforthcoming user behaviors, crucial for crafting personalized recommendations.\nHowever, the true preferences of a user are inherently complex and\nhigh-dimensional, while the observed data is merely a simplified and\nlow-dimensional projection of the rich preferences, which often leads to\nprevalent issues like data sparsity and inaccurate model training. To learn\ntrue preferences from the sparse data, most existing works endeavor to\nintroduce some extra information or design some ingenious models. Although they\nhave shown to be effective, extra information usually increases the cost of\ndata collection, and complex models may result in difficulty in deployment.\nInnovatively, we avoid the use of extra information or alterations to the\nmodel; instead, we fill the transformation space between the observed data and\nthe underlying preferences with randomness. Specifically, we propose a novel\nmodel-agnostic and highly generic framework for sequential recommendation\ncalled sample enrichment via temporary operations on subsequences (SETO), which\ntemporarily and separately enriches the transformation space via sequence\nenhancement operations with rationality constraints in training. The\ntransformation space not only exists in the process from input samples to\npreferences but also in preferences to target samples. We highlight our SETO's\neffectiveness and versatility over multiple representative and state-of-the-art\nsequential recommendation models (including six single-domain sequential models\nand two cross-domain sequential models) across multiple real-world datasets\n(including three single-domain datasets, three cross-domain datasets and a\nlarge-scale industry dataset).\n","authors":["Shu Chen","Jinwei Luo","Weike Pan","Jiangxing Yu","Xin Huang","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2407.17802v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.17722v1","updated":"2024-07-25T02:48:56Z","published":"2024-07-25T02:48:56Z","title":"Text-Driven Neural Collaborative Filtering Model for Paper Source\n Tracing","summary":" Identifying significant references within the complex interrelations of a\ncitation knowledge graph is challenging, which encompasses connections through\ncitations, authorship, keywords, and other relational attributes. The Paper\nSource Tracing (PST) task seeks to automate the identification of pivotal\nreferences for given scholarly articles utilizing advanced data mining\ntechniques. In the KDD CUP 2024, we design a recommendation-based framework\ntailored for the PST task. This framework employs the Neural Collaborative\nFiltering (NCF) model to generate final predictions. 
To process the textual\nattributes of the papers and extract input features for the model, we utilize\nSciBERT, a pre-trained language model. According to the experimental results,\nour method achieved a score of 0.37814 on the Mean Average Precision (MAP)\nmetric, outperforming baseline models and ranking 11th among all participating\nteams. The source code is publicly available at\nhttps://github.com/MyLove-XAB/KDDCupFinal.\n","authors":["Aobo Xu","Bingyu Chang","Qingpeng Liu","Ling Jian"],"pdf_url":"https://arxiv.org/pdf/2407.17722v1.pdf","comment":"KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report\n of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain"},{"id":"http://arxiv.org/abs/2407.18383v1","updated":"2024-07-25T20:36:20Z","published":"2024-07-25T20:36:20Z","title":"Supporting Evidence-Based Medicine by Finding Both Relevant and\n Significant Works","summary":" In this paper, we present a new approach to improving the relevance and\nreliability of medical IR, which builds upon the concept of Level of Evidence\n(LoE). LoE framework categorizes medical publications into 7 distinct levels\nbased on the underlying empirical evidence. Despite LoE framework's relevance\nin medical research and evidence-based practice, only few medical publications\nexplicitly state their LoE. Therefore, we develop a classification model for\nautomatically assigning LoE to medical publications, which successfully\nclassifies over 26 million documents in MEDLINE database into LoE classes. The\nsubsequent retrieval experiments on TREC PM datasets show substantial\nimprovements in retrieval relevance, when LoE is used as a search filter.\n","authors":["Sameh Frihat","Norbert Fuhr"],"pdf_url":"https://arxiv.org/pdf/2407.18383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15239v2","updated":"2024-07-25T19:52:38Z","published":"2024-07-21T18:08:44Z","title":"Assessing Brittleness of Image-Text Retrieval Benchmarks from\n Vision-Language Models Perspective","summary":" Image-text retrieval (ITR), an important task in information retrieval (IR),\nis driven by pretrained vision-language models (VLMs) that consistently achieve\nstate-of-the-art performance. However, a significant challenge lies in the\nbrittleness of existing ITR benchmarks. In standard datasets for the task,\ncaptions often provide broad summaries of scenes, neglecting detailed\ninformation about specific concepts. Additionally, the current evaluation setup\nassumes simplistic binary matches between images and texts and focuses on\nintra-modality rather than cross-modal relationships, which can lead to\nmisinterpretations of model performance. Motivated by this gap, in this study,\nwe focus on examining the brittleness of the ITR evaluation pipeline with a\nfocus on concept granularity. We start by analyzing two common benchmarks,\nMS-COCO and Flickr30k, and compare them with their augmented versions,\nMS-COCO-FG and Flickr30k-FG, given a specified set of linguistic features\ncapturing concept granularity. We discover that Flickr30k-FG and MS COCO-FG\nconsistently achieve higher scores across all the selected features. To\ninvestigate the performance of VLMs on coarse and fine-grained datasets, we\nintroduce a taxonomy of perturbations. We apply these perturbations to the\nselected datasets. We evaluate four state-of-the-art models - ALIGN, AltCLIP,\nCLIP, and GroupViT - on the standard and fine-grained datasets under zero-shot\nconditions, with and without the applied perturbations. 
The results demonstrate\nthat although perturbations generally degrade model performance, the\nfine-grained datasets exhibit a smaller performance drop than their standard\ncounterparts. Moreover, the relative performance drop across all setups is\nconsistent across all models and datasets, indicating that the issue lies\nwithin the benchmarks. We conclude the paper by providing an agenda for\nimproving ITR evaluation pipelines.\n","authors":["Mariya Hendriksen","Shuo Zhang","Ridho Reinanda","Mohamed Yahya","Edgar Meij","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2407.15239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.08614v9","updated":"2024-07-25T18:51:23Z","published":"2021-08-19T10:50:52Z","title":"UNIQORN: Unified Question Answering over RDF Knowledge Graphs and\n Natural Language Text","summary":" Question answering over RDF data like knowledge graphs has been greatly\nadvanced, with a number of good systems providing crisp answers for natural\nlanguage questions or telegraphic queries. Some of these systems incorporate\ntextual sources as additional evidence for the answering process, but cannot\ncompute answers that are present in text alone. Conversely, the IR and NLP\ncommunities have addressed QA over text, but such systems barely utilize\nsemantic data and knowledge. This paper presents a method for complex questions\nthat can seamlessly operate over a mixture of RDF datasets and text corpora, or\nindividual sources, in a unified framework. Our method, called UNIQORN, builds\na context graph on-the-fly, by retrieving question-relevant evidences from the\nRDF data and/or a text corpus, using fine-tuned BERT models. The resulting\ngraph typically contains all question-relevant evidences but also a lot of\nnoise. UNIQORN copes with this input by a graph algorithm for Group Steiner\nTrees, that identifies the best answer candidates in the context graph.\nExperimental results on several benchmarks of complex questions with multiple\nentities and relations, show that UNIQORN significantly outperforms\nstate-of-the-art methods for heterogeneous QA -- in a full training mode, as\nwell as in zero-shot settings. The graph-based methodology provides\nuser-interpretable evidence for the complete answering process.\n","authors":["Soumajit Pramanik","Jesujoba Alabi","Rishiraj Saha Roy","Gerhard Weikum"],"pdf_url":"https://arxiv.org/pdf/2108.08614v9.pdf","comment":"27 pages"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.18251v1","updated":"2024-07-25T17:59:48Z","published":"2024-07-25T17:59:48Z","title":"Sparse vs Contiguous Adversarial Pixel Perturbations in Multimodal\n Models: An Empirical Analysis","summary":" Assessing the robustness of multimodal models against adversarial examples is\nan important aspect for the safety of its users. We craft L0-norm perturbation\nattacks on the preprocessed input images. We launch them in a black-box setup\nagainst four multimodal models and two unimodal DNNs, considering both targeted\nand untargeted misclassification. Our attacks target less than 0.04% of\nperturbed image area and integrate different spatial positioning of perturbed\npixels: sparse positioning and pixels arranged in different contiguous shapes\n(row, column, diagonal, and patch). To the best of our knowledge, we are the\nfirst to assess the robustness of three state-of-the-art multimodal models\n(ALIGN, AltCLIP, GroupViT) against different sparse and contiguous pixel\ndistribution perturbations. 
The obtained results indicate that unimodal DNNs\nare more robust than multimodal models. Furthermore, models using CNN-based\nImage Encoder are more vulnerable than models with ViT - for untargeted\nattacks, we obtain a 99% success rate by perturbing less than 0.02% of the\nimage area.\n","authors":["Cristian-Alexandru Botocan","Raphael Meier","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2407.18251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18245v1","updated":"2024-07-25T17:58:17Z","published":"2024-07-25T17:58:17Z","title":"VGGHeads: A Large-Scale Synthetic Dataset for 3D Human Heads","summary":" Human head detection, keypoint estimation, and 3D head model fitting are\nimportant tasks with many applications. However, traditional real-world\ndatasets often suffer from bias, privacy, and ethical concerns, and they have\nbeen recorded in laboratory environments, which makes it difficult for trained\nmodels to generalize. Here, we introduce VGGHeads -- a large scale synthetic\ndataset generated with diffusion models for human head detection and 3D mesh\nestimation. Our dataset comprises over 1 million high-resolution images, each\nannotated with detailed 3D head meshes, facial landmarks, and bounding boxes.\nUsing this dataset we introduce a new model architecture capable of\nsimultaneous heads detection and head meshes reconstruction from a single image\nin a single step. Through extensive experimental evaluations, we demonstrate\nthat models trained on our synthetic data achieve strong performance on real\nimages. Furthermore, the versatility of our dataset makes it applicable across\na broad spectrum of tasks, offering a general and comprehensive representation\nof human heads. Additionally, we provide detailed information about the\nsynthetic data generation pipeline, enabling it to be re-used for other tasks\nand domains.\n","authors":["Orest Kupyn","Eugene Khvedchenia","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2407.18245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18242v1","updated":"2024-07-25T17:57:12Z","published":"2024-07-25T17:57:12Z","title":"LoRA-Pro: Are Low-Rank Adapters Properly Optimized?","summary":" Low-Rank Adaptation, also known as LoRA, has emerged as a prominent method\nfor parameter-efficient fine-tuning foundation models by re-parameterizing the\noriginal matrix into the product of two low-rank matrices. Despite its\nefficiency, LoRA often yields inferior performance compared to full\nfine-tuning. In this paper, we propose LoRA-Pro to bridge this performance gap.\nFirstly, we delve into the optimization processes in LoRA and full fine-tuning.\nWe reveal that while LoRA employs low-rank approximation, it neglects to\napproximate the optimization process of full fine-tuning. To address this, we\nintroduce a novel concept called the \"equivalent gradient.\" This virtual\ngradient makes the optimization process on the re-parameterized matrix\nequivalent to LoRA, which can be used to quantify the differences between LoRA\nand full fine-tuning. The equivalent gradient is derived from the gradients of\nmatrices $A$ and $B$. To narrow the performance gap, our approach minimizes the\ndifferences between the equivalent gradient and the gradient obtained from full\nfine-tuning during the optimization process. By solving this objective, we\nderive optimal closed-form solutions for updating matrices $A$ and $B$. Our\nmethod constrains the optimization process, shrinking the performance gap\nbetween LoRA and full fine-tuning. 
Extensive experiments on natural language\nprocessing tasks validate the effectiveness of our method.\n","authors":["Zhengbo Wang","Jian Liang"],"pdf_url":"https://arxiv.org/pdf/2407.18242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18241v1","updated":"2024-07-25T17:55:33Z","published":"2024-07-25T17:55:33Z","title":"Numerical Literals in Link Prediction: A Critical Examination of Models\n and Datasets","summary":" Link Prediction(LP) is an essential task over Knowledge Graphs(KGs),\ntraditionally focussed on using and predicting the relations between entities.\nTextual entity descriptions have already been shown to be valuable, but models\nthat incorporate numerical literals have shown minor improvements on existing\nbenchmark datasets. It is unclear whether a model is actually better in using\nnumerical literals, or better capable of utilizing the graph structure. This\nraises doubts about the effectiveness of these methods and about the\nsuitability of the existing benchmark datasets.\n We propose a methodology to evaluate LP models that incorporate numerical\nliterals. We propose i) a new synthetic dataset to better understand how well\nthese models use numerical literals and ii) dataset ablations strategies to\ninvestigate potential difficulties with the existing datasets. We identify a\nprevalent trend: many models underutilize literal information and potentially\nrely on additional parameters for performance gains. Our investigation\nhighlights the need for more extensive evaluations when releasing new models\nand datasets.\n","authors":["Moritz Blum","Basil Ell","Hannes Ill","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2407.18241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03853v4","updated":"2024-07-25T17:54:12Z","published":"2023-12-06T19:07:38Z","title":"Dr. Jekyll and Mr. Hyde: Two Faces of LLMs","summary":" Recently, we have witnessed a rise in the use of Large Language Models\n(LLMs), especially in applications like chatbot assistants. Safety mechanisms\nand specialized training procedures are implemented to prevent improper\nresponses from these assistants. In this work, we bypass these measures for\nChatGPT and Gemini (and, to some extent, Bing chat) by making them impersonate\ncomplex personas with personality characteristics that are not aligned with a\ntruthful assistant. We start by creating elaborate biographies of these\npersonas, which we then use in a new session with the same chatbots. Our\nconversations then follow a role-play style to elicit prohibited responses.\nUsing personas, we show that prohibited responses are actually provided, making\nit possible to obtain unauthorized, illegal, or harmful information. This work\nshows that by using adversarial personas, one can overcome safety mechanisms\nset out by ChatGPT and Gemini. We also introduce several ways of activating\nsuch adversarial personas, which show that both chatbots are vulnerable to this\nkind of attack. With the same principle, we introduce two defenses that push\nthe model to interpret trustworthy personalities and make it more robust\nagainst such attacks.\n","authors":["Matteo Gioele Collu","Tom Janssen-Groesbeek","Stefanos Koffas","Mauro Conti","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2312.03853v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16445v2","updated":"2024-07-25T17:53:38Z","published":"2024-07-23T12:54:06Z","title":"Can time series forecasting be automated? 
A benchmark and analysis","summary":" In the field of machine learning and artificial intelligence, time series\nforecasting plays a pivotal role across various domains such as finance,\nhealthcare, and weather. However, the task of selecting the most suitable\nforecasting method for a given dataset is a complex task due to the diversity\nof data patterns and characteristics. This research aims to address this\nchallenge by proposing a comprehensive benchmark for evaluating and ranking\ntime series forecasting methods across a wide range of datasets. This study\ninvestigates the comparative performance of many methods from two prominent\ntime series forecasting frameworks, AutoGluon-Timeseries, and sktime to shed\nlight on their applicability in different real-world scenarios. This research\ncontributes to the field of time series forecasting by providing a robust\nbenchmarking methodology and facilitating informed decision-making when\nchoosing forecasting methods for achieving optimal prediction.\n","authors":["Anvitha Thirthapura Sreedhara","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2407.16445v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10444v2","updated":"2024-07-25T17:51:50Z","published":"2024-03-15T16:28:22Z","title":"Block Verification Accelerates Speculative Decoding","summary":" Speculative decoding is an effective method for lossless acceleration of\nlarge language models during inference. It uses a fast model to draft a block\nof tokens which are then verified in parallel by the target model, and provides\na guarantee that the output is distributed identically to a sample from the\ntarget model. In prior works, draft verification is performed independently\ntoken-by-token. Surprisingly, we show that this approach is not optimal. We\npropose Block Verification, a simple draft verification algorithm that verifies\nthe entire block jointly and provides additional wall-clock speedup. We prove\nthat the proposed mechanism is optimal in the expected number of tokens\nproduced each iteration and specifically is never worse than the standard\ntoken-level verification. Empirically, block verification provides modest but\nconsistent wall-clock speedups over the standard token verification algorithm\nof 5%-8% in a range of tasks and datasets. Given that block verification does\nnot increase code complexity, maintains the strong lossless guarantee of the\nstandard speculative decoding verification algorithm, cannot deteriorate\nperformance, and, in fact, consistently improves it, it can be used as a good\ndefault in speculative decoding implementations.\n","authors":["Ziteng Sun","Uri Mendlovic","Yaniv Leviathan","Asaf Aharoni","Ahmad Beirami","Jae Hun Ro","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2403.10444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18227v1","updated":"2024-07-25T17:46:38Z","published":"2024-07-25T17:46:38Z","title":"Automated Ensemble Multimodal Machine Learning for Healthcare","summary":" The application of machine learning in medicine and healthcare has led to the\ncreation of numerous diagnostic and prognostic models. However, despite their\nsuccess, current approaches generally issue predictions using data from a\nsingle modality. This stands in stark contrast with clinician decision-making\nwhich employs diverse information from multiple sources. While several\nmultimodal machine learning approaches exist, significant challenges in\ndeveloping multimodal systems remain that are hindering clinical adoption. 
In\nthis paper, we introduce a multimodal framework, AutoPrognosis-M, that enables\nthe integration of structured clinical (tabular) data and medical imaging using\nautomated machine learning. AutoPrognosis-M incorporates 17 imaging models,\nincluding convolutional neural networks and vision transformers, and three\ndistinct multimodal fusion strategies. In an illustrative application using a\nmultimodal skin lesion dataset, we highlight the importance of multimodal\nmachine learning and the power of combining multiple fusion strategies using\nensemble learning. We have open-sourced our framework as a tool for the\ncommunity and hope it will accelerate the uptake of multimodal machine learning\nin healthcare and spur further innovation.\n","authors":["Fergus Imrie","Stefan Denner","Lucas S. Brunschwig","Klaus Maier-Hein","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2407.18227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18219v1","updated":"2024-07-25T17:35:59Z","published":"2024-07-25T17:35:59Z","title":"Recursive Introspection: Teaching Language Model Agents How to\n Self-Improve","summary":" A central piece in enabling intelligent agentic behavior in foundation models\nis to make them capable of introspecting upon their behavior, reasoning, and\ncorrecting their mistakes as more computation or interaction is available. Even\nthe strongest proprietary large language models (LLMs) do not quite exhibit the\nability of continually improving their responses sequentially, even in\nscenarios where they are explicitly told that they are making a mistake. In\nthis paper, we develop RISE: Recursive IntroSpEction, an approach for\nfine-tuning LLMs to introduce this capability, despite prior work hypothesizing\nthat this capability may not be possible to attain. Our approach prescribes an\niterative fine-tuning procedure, which attempts to teach the model how to alter\nits response after having executed previously unsuccessful attempts to solve a\nhard test-time problem, with optionally additional environment feedback. RISE\nposes fine-tuning for a single-turn prompt as solving a multi-turn Markov\ndecision process (MDP), where the initial state is the prompt. Inspired by\nprinciples in online imitation learning and reinforcement learning, we propose\nstrategies for multi-turn data collection and training so as to imbue an LLM\nwith the capability to recursively detect and correct its previous mistakes in\nsubsequent iterations. Our experiments show that RISE enables Llama2, Llama3,\nand Mistral models to improve themselves with more turns on math reasoning\ntasks, outperforming several single-turn strategies given an equal amount of\ninference-time computation. We also find that RISE scales well, often attaining\nlarger benefits with more capable models. Our analysis shows that RISE makes\nmeaningful improvements to responses to arrive at the correct solution for\nchallenging prompts, without disrupting one-turn abilities as a result of\nexpressing more complex distributions.\n","authors":["Yuxiao Qu","Tianjun Zhang","Naman Garg","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.18219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18213v1","updated":"2024-07-25T17:26:41Z","published":"2024-07-25T17:26:41Z","title":"Exploring Scaling Trends in LLM Robustness","summary":" Language model capabilities predictably improve from scaling a model's size\nand training data. 
Motivated by this, increasingly large language models have\nbeen trained, yielding an array of impressive capabilities. Yet these models\nare vulnerable to adversarial prompts, such as \"jailbreaks\" that hijack models\nto perform undesired behaviors, posing a significant risk of misuse. Prior work\nindicates that computer vision models become more robust with model and data\nscaling, raising the question: does language model robustness also improve with\nscale? We study this question empirically, finding that larger models respond\nsubstantially better to adversarial training, but there is little to no benefit\nfrom model scale in the absence of explicit defenses.\n","authors":["Nikolhaus Howe","Michał Zajac","Ian McKenzie","Oskar Hollinsworth","Tom Tseng","Pierre-Luc Bacon","Adam Gleave"],"pdf_url":"https://arxiv.org/pdf/2407.18213v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2406.05981v3","updated":"2024-07-25T17:20:48Z","published":"2024-06-10T02:47:55Z","title":"ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training\n Multiplication-Less Reparameterization","summary":" Large language models (LLMs) have shown impressive performance on language\ntasks but face challenges when deployed on resource-constrained devices due to\ntheir extensive parameters and reliance on dense multiplications, resulting in\nhigh memory demands and latency bottlenecks. Shift-and-add reparameterization\noffers a promising solution by replacing costly multiplications with\nhardware-friendly primitives in both the attention and multi-layer perceptron\n(MLP) layers of an LLM. However, current reparameterization techniques require\ntraining from scratch or full parameter fine-tuning to restore accuracy, which\nis resource-intensive for LLMs. To address this, we propose accelerating\npretrained LLMs through post-training shift-and-add reparameterization,\ncreating efficient multiplication-free models, dubbed ShiftAddLLM.\nSpecifically, we quantize each weight matrix into binary matrices paired with\ngroup-wise scaling factors. The associated multiplications are reparameterized\ninto (1) shifts between activations and scaling factors and (2) queries and\nadds according to the binary matrices. To reduce accuracy loss, we present a\nmulti-objective optimization method to minimize both weight and output\nactivation reparameterization errors. Additionally, based on varying\nsensitivity across layers to reparameterization, we develop an automated bit\nallocation strategy to further reduce memory usage and latency. Experiments on\nfive LLM families and eight tasks consistently validate the effectiveness of\nShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points\nat comparable or lower latency compared to the most competitive quantized LLMs\nat 3 and 2 bits, respectively, and more than 80% memory and energy reductions\nover the original LLMs. Codes and models are available at\nhttps://github.com/GATECH-EIC/ShiftAddLLM.\n","authors":["Haoran You","Yipin Guo","Yichao Fu","Wei Zhou","Huihong Shi","Xiaofan Zhang","Souvik Kundu","Amir Yazdanbakhsh","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2406.05981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06446v6","updated":"2024-07-25T17:19:31Z","published":"2023-06-10T13:53:41Z","title":"ShiftAddViT: Mixture of Multiplication Primitives Towards Efficient\n Vision Transformer","summary":" Vision Transformers (ViTs) have shown impressive performance and have become\na unified backbone for multiple vision tasks. 
However, both the attention\nmechanism and multi-layer perceptrons (MLPs) in ViTs are not sufficiently\nefficient due to dense multiplications, leading to costly training and\ninference. To this end, we propose to reparameterize pre-trained ViTs with a\nmixture of multiplication primitives, e.g., bitwise shifts and additions,\ntowards a new type of multiplication-reduced model, dubbed\n$\\textbf{ShiftAddViT}$, which aims to achieve end-to-end inference speedups on\nGPUs without requiring training from scratch. Specifically, all\n$\\texttt{MatMuls}$ among queries, keys, and values are reparameterized using\nadditive kernels, after mapping queries and keys to binary codes in Hamming\nspace. The remaining MLPs or linear layers are then reparameterized with shift\nkernels. We utilize TVM to implement and optimize those customized kernels for\npractical hardware deployment on GPUs. We find that such a reparameterization\non attention maintains model accuracy, while inevitably leading to accuracy\ndrops when being applied to MLPs. To marry the best of both worlds, we further\npropose a new mixture of experts (MoE) framework to reparameterize MLPs by\ntaking multiplication or its primitives as experts, e.g., multiplication and\nshift, and designing a new latency-aware load-balancing loss. Such a loss helps\nto train a generic router for assigning a dynamic amount of input tokens to\ndifferent experts according to their latency. Extensive experiments on various\n2D/3D Transformer-based vision tasks consistently validate the effectiveness of\nour proposed ShiftAddViT, achieving up to $\\textbf{5.18$\\times$}$ latency\nreductions on GPUs and $\\textbf{42.9}$% energy savings, while maintaining a\ncomparable accuracy as original or efficient ViTs.\n","authors":["Haoran You","Huihong Shi","Yipin Guo","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2306.06446v6.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2406.07368v2","updated":"2024-07-25T17:18:01Z","published":"2024-06-11T15:34:43Z","title":"When Linear Attention Meets Autoregressive Decoding: Towards More\n Effective and Efficient Linearized Large Language Models","summary":" Autoregressive Large Language Models (LLMs) have achieved impressive\nperformance in language tasks but face two significant bottlenecks: (1)\nquadratic complexity in the attention module as the number of tokens increases,\nand (2) limited efficiency due to the sequential processing nature of\nautoregressive LLMs during generation. While linear attention and speculative\ndecoding offer potential solutions, their applicability and synergistic\npotential for enhancing autoregressive LLMs remain uncertain. We conduct the\nfirst comprehensive study on the efficacy of existing linear attention methods\nfor autoregressive LLMs, integrating them with speculative decoding. We\nintroduce an augmentation technique for linear attention that ensures\ncompatibility with speculative decoding, enabling more efficient training and\nserving of LLMs. Extensive experiments and ablation studies involving seven\nexisting linear attention models and five encoder/decoder-based LLMs\nconsistently validate the effectiveness of our augmented linearized LLMs.\nNotably, our approach achieves up to a 6.67 reduction in perplexity on the\nLLaMA model and up to a 2$\\times$ speedup during generation compared to prior\nlinear attention methods. 
Codes and models are available at\nhttps://github.com/GATECH-EIC/Linearized-LLM.\n","authors":["Haoran You","Yichao Fu","Zheng Wang","Amir Yazdanbakhsh","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2406.07368v2.pdf","comment":"Accepted by ICML 2024; 17 pages; 10 figures; 16 tables"},{"id":"http://arxiv.org/abs/2407.18207v1","updated":"2024-07-25T17:17:10Z","published":"2024-07-25T17:17:10Z","title":"Geometry Fidelity for Spherical Images","summary":" Spherical or omni-directional images offer an immersive visual format\nappealing to a wide range of computer vision applications. However, geometric\nproperties of spherical images pose a major challenge for models and metrics\ndesigned for ordinary 2D images. Here, we show that direct application of\nFr\\'echet Inception Distance (FID) is insufficient for quantifying geometric\nfidelity in spherical images. We introduce two quantitative metrics accounting\nfor geometric constraints, namely Omnidirectional FID (OmniFID) and\nDiscontinuity Score (DS). OmniFID is an extension of FID tailored to\nadditionally capture field-of-view requirements of the spherical format by\nleveraging cubemap projections. DS is a kernel-based seam alignment score of\ncontinuity across borders of 2D representations of spherical images. In\nexperiments, OmniFID and DS quantify geometry fidelity issues that are\nundetected by FID.\n","authors":["Anders Christensen","Nooshin Mojab","Khushman Patel","Karan Ahuja","Zeynep Akata","Ole Winther","Mar Gonzalez-Franco","Andrea Colaco"],"pdf_url":"https://arxiv.org/pdf/2407.18207v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18202v1","updated":"2024-07-25T17:11:00Z","published":"2024-07-25T17:11:00Z","title":"Differentiable Quantum Architecture Search in Asynchronous Quantum\n Reinforcement Learning","summary":" The emergence of quantum reinforcement learning (QRL) is propelled by\nadvancements in quantum computing (QC) and machine learning (ML), particularly\nthrough quantum neural networks (QNN) built on variational quantum circuits\n(VQC). These advancements have proven successful in addressing sequential\ndecision-making tasks. However, constructing effective QRL models demands\nsignificant expertise due to challenges in designing quantum circuit\narchitectures, including data encoding and parameterized circuits, which\nprofoundly influence model performance. In this paper, we propose addressing\nthis challenge with differentiable quantum architecture search (DiffQAS),\nenabling trainable circuit parameters and structure weights using\ngradient-based optimization. Furthermore, we enhance training efficiency\nthrough asynchronous reinforcement learning (RL) methods facilitating parallel\ntraining. Through numerical simulations, we demonstrate that our proposed\nDiffQAS-QRL approach achieves performance comparable to manually-crafted\ncircuit architectures across considered environments, showcasing stability\nacross diverse scenarios. 
This methodology offers a pathway for designing QRL\nmodels without extensive quantum knowledge, ensuring robust performance and\nfostering broader application of QRL.\n","authors":["Samuel Yen-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18202v1.pdf","comment":"Accepted by IEEE International Conference on Quantum Computing and\n Engineering - QCE 2024"},{"id":"http://arxiv.org/abs/2407.18200v1","updated":"2024-07-25T17:09:22Z","published":"2024-07-25T17:09:22Z","title":"Sparse Incremental Aggregation in Multi-Hop Federated Learning","summary":" This paper investigates federated learning (FL) in a multi-hop communication\nsetup, such as in constellations with inter-satellite links. In this setup,\npart of the FL clients are responsible for forwarding other client's results to\nthe parameter server. Instead of using conventional routing, the communication\nefficiency can be improved significantly by using in-network model aggregation\nat each intermediate hop, known as incremental aggregation (IA). Prior works\n[1] have indicated diminishing gains for IA under gradient sparsification. Here\nwe study this issue and propose several novel correlated sparsification methods\nfor IA. Numerical results show that, for some of these algorithms, the full\npotential of IA is still available under sparsification without impairing\nconvergence. We demonstrate a 15x improvement in communication efficiency over\nconventional routing and a 11x improvement over state-of-the-art (SoA) sparse\nIA.\n","authors":["Sourav Mukherjee","Nasrin Razmi","Armin Dekorsy","Petar Popovski","Bho Matthiesen"],"pdf_url":"https://arxiv.org/pdf/2407.18200v1.pdf","comment":"This paper is accepted for the 25th IEEE International Workshop on\n Signal Processing Advances in Wireless Communications (SPAWC) conference"},{"id":"http://arxiv.org/abs/2310.09149v2","updated":"2024-07-25T17:05:37Z","published":"2023-10-13T14:43:11Z","title":"Wasserstein approximation schemes based on Voronoi partitions","summary":" We consider structured approximation of measures in Wasserstein space\n$\\mathrm{W}_p(\\mathbb{R}^d)$ for $p\\in[1,\\infty)$ using general measure\napproximants compactly supported on Voronoi regions derived from a scaled\nVoronoi partition of $\\mathbb{R}^d$. We show that if a full rank lattice\n$\\Lambda$ is scaled by a factor of $h\\in(0,1]$, then approximation of a measure\nbased on the Voronoi partition of $h\\Lambda$ is $O(h)$ regardless of $d$ or\n$p$. We then use a covering argument to show that $N$-term approximations of\ncompactly supported measures is $O(N^{-\\frac1d})$ which matches known rates for\noptimal quantizers and empirical measure approximation in most instances.\nAdditionally, we generalize our construction to nonuniform Voronoi partitions,\nhighlighting the flexibility and robustness of our approach for various measure\napproximation scenarios. Finally, we extend these results to noncompactly\nsupported measures with sufficient decay. 
Our findings are pertinent to\napplications in computer vision and machine learning where measures are used to\nrepresent structured data such as images.\n","authors":["Keaton Hamm","Varun Khurana"],"pdf_url":"https://arxiv.org/pdf/2310.09149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14236v4","updated":"2024-07-25T16:52:15Z","published":"2024-03-21T08:54:24Z","title":"A Unified Framework for Model Editing","summary":" ROME and MEMIT are largely believed to be two different model editing\nalgorithms, with the major difference between them being the ability to perform\nbatched edits. In this paper, we unify these two algorithms under a single\nconceptual umbrella, optimizing for the same goal, which we call the\npreservation-memorization objective. ROME uses an equality constraint to\noptimize this objective to perform one edit at a time, whereas MEMIT employs a\nmore flexible least-square constraint that allows for batched edits. We\ngeneralize ROME and enable batched editing with equality constraint in the form\nof EMMET - an Equality-constrained Mass Model Editing algorithm for\nTransformers, a new batched memory-editing algorithm. EMMET can perform\nbatched-edits up to a batch-size of 10,000, with very similar performance to\nMEMIT across multiple dimensions. With the introduction of EMMET, we truly\nunify ROME and MEMIT and show that both algorithms are equivalent in terms of\ntheir optimization objective, their abilities (singular and batched editing),\ntheir model editing performance and their limitations.\n","authors":["Akshat Gupta","Dev Sajnani","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2403.14236v4.pdf","comment":"Under review. To appear as poster at KnowledgeableLM Workshop\n co-located with ACL 2024"},{"id":"http://arxiv.org/abs/2407.18184v1","updated":"2024-07-25T16:43:56Z","published":"2024-07-25T16:43:56Z","title":"AsEP: Benchmarking Deep Learning Methods for Antibody-specific Epitope\n Prediction","summary":" Epitope identification is vital for antibody design yet challenging due to\nthe inherent variability in antibodies. While many deep learning methods have\nbeen developed for general protein binding site prediction tasks, whether they\nwork for epitope prediction remains an understudied research question. The\nchallenge is also heightened by the lack of a consistent evaluation pipeline\nwith sufficient dataset size and epitope diversity. We introduce a filtered\nantibody-antigen complex structure dataset, AsEP (Antibody-specific Epitope\nPrediction). AsEP is the largest of its kind and provides clustered epitope\ngroups, allowing the community to develop and test novel epitope prediction\nmethods. AsEP comes with an easy-to-use interface in Python and pre-built graph\nrepresentations of each antibody-antigen complex while also supporting\ncustomizable embedding methods. Based on this new dataset, we benchmarked\nvarious representative general protein-binding site prediction methods and find\nthat their performances are not satisfactory as expected for epitope\nprediction. We thus propose a new method, WALLE, that leverages both protein\nlanguage models and graph neural networks. WALLE demonstrate about 5X\nperformance gain over existing methods. Our empirical findings evidence that\nepitope prediction benefits from combining sequential embeddings provided by\nlanguage models and geometrical information from graph representations,\nproviding a guideline for future method design. 
In addition, we reformulate the\ntask as bipartite link prediction, allowing easy model performance attribution\nand interpretability. We open-source our data and code at\nhttps://github.com/biochunan/AsEP-dataset.\n","authors":["Chunan Liu","Lilian Denzler","Yihong Chen","Andrew Martin","Brooks Paige"],"pdf_url":"https://arxiv.org/pdf/2407.18184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18181v1","updated":"2024-07-25T16:42:08Z","published":"2024-07-25T16:42:08Z","title":"Gene Regulatory Network Inference from Pre-trained Single-Cell\n Transcriptomics Transformer with Joint Graph Learning","summary":" Inferring gene regulatory networks (GRNs) from single-cell RNA sequencing\n(scRNA-seq) data is a complex challenge that requires capturing the intricate\nrelationships between genes and their regulatory interactions. In this study,\nwe tackle this challenge by leveraging the single-cell BERT-based pre-trained\ntransformer model (scBERT), trained on extensive unlabeled scRNA-seq data, to\naugment structured biological knowledge from existing GRNs. We introduce a\nnovel joint graph learning approach that combines the rich contextual\nrepresentations learned by pre-trained single-cell language models with the\nstructured knowledge encoded in GRNs using graph neural networks (GNNs). By\nintegrating these two modalities, our approach effectively reasons over boththe\ngene expression level constraints provided by the scRNA-seq data and the\nstructured biological knowledge inherent in GRNs. We evaluate our method on\nhuman cell benchmark datasets from the BEELINE study with cell type-specific\nground truth networks. The results demonstrate superior performance over\ncurrent state-of-the-art baselines, offering a deeper understanding of cellular\nregulatory mechanisms.\n","authors":["Sindhura Kommu","Yizhi Wang","Yue Wang","Xuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18181v1.pdf","comment":"Accepted into the ICML 2024 AI for Science workshop"},{"id":"http://arxiv.org/abs/2407.18175v1","updated":"2024-07-25T16:35:46Z","published":"2024-07-25T16:35:46Z","title":"Quasar-ViT: Hardware-Oriented Quantization-Aware Architecture Search for\n Vision Transformers","summary":" Vision transformers (ViTs) have demonstrated their superior accuracy for\ncomputer vision tasks compared to convolutional neural networks (CNNs).\nHowever, ViT models are often computation-intensive for efficient deployment on\nresource-limited edge devices. This work proposes Quasar-ViT, a\nhardware-oriented quantization-aware architecture search framework for ViTs, to\ndesign efficient ViT models for hardware implementation while preserving the\naccuracy. First, Quasar-ViT trains a supernet using our row-wise flexible\nmixed-precision quantization scheme, mixed-precision weight entanglement, and\nsupernet layer scaling techniques. Then, it applies an efficient\nhardware-oriented search algorithm, integrated with hardware latency and\nresource modeling, to determine a series of optimal subnets from supernet under\ndifferent inference latency targets. Finally, we propose a series of\nmodel-adaptive designs on the FPGA platform to support the architecture search\nand mitigate the gap between the theoretical computation reduction and the\npractical inference speedup. 
Our searched models achieve 101.5, 159.6, and\n251.6 frames-per-second (FPS) inference speed on the AMD/Xilinx ZCU102 FPGA\nwith 80.4%, 78.6%, and 74.9% top-1 accuracy, respectively, for the ImageNet\ndataset, consistently outperforming prior works.\n","authors":["Zhengang Li","Alec Lu","Yanyue Xie","Zhenglun Kong","Mengshu Sun","Hao Tang","Zhong Jia Xue","Peiyan Dong","Caiwen Ding","Yanzhi Wang","Xue Lin","Zhenman Fang"],"pdf_url":"https://arxiv.org/pdf/2407.18175v1.pdf","comment":"Accepted by ICS 2024"},{"id":"http://arxiv.org/abs/2407.18170v1","updated":"2024-07-25T16:33:35Z","published":"2024-07-25T16:33:35Z","title":"RIDA: A Robust Attack Framework on Incomplete Graphs","summary":" Graph Neural Networks (GNNs) are vital in data science but are increasingly\nsusceptible to adversarial attacks. To help researchers develop more robust GNN\nmodels, it's essential to focus on designing strong attack models as\nfoundational benchmarks and guiding references. Among adversarial attacks,\ngray-box poisoning attacks are noteworthy due to their effectiveness and fewer\nconstraints. These attacks exploit GNNs' need for retraining on updated data,\nthereby impacting their performance by perturbing these datasets. However,\ncurrent research overlooks the real-world scenario of incomplete graphs.To\naddress this gap, we introduce the Robust Incomplete Deep Attack Framework\n(RIDA). It is the first algorithm for robust gray-box poisoning attacks on\nincomplete graphs. The approach innovatively aggregates distant vertex\ninformation and ensures powerful data utilization.Extensive tests against 9\nSOTA baselines on 3 real-world datasets demonstrate RIDA's superiority in\nhandling incompleteness and high attack performance on the incomplete graph.\n","authors":["Jianke Yu","Hanchen Wang","Chen Chen","Xiaoyang Wang","Wenjie Zhang","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12120v2","updated":"2024-07-25T16:27:49Z","published":"2024-03-18T18:00:00Z","title":"Light Curve Classification with DistClassiPy: a new distance-based\n classifier","summary":" The rise of synoptic sky surveys has ushered in an era of big data in\ntime-domain astronomy, making data science and machine learning essential tools\nfor studying celestial objects. While tree-based models (e.g. Random Forests)\nand deep learning models dominate the field, we explore the use of different\ndistance metrics to aid in the classification of astrophysical objects. We\ndeveloped DistClassiPy, a new distance metric based classifier. The direct use\nof distance metrics is unexplored in time-domain astronomy, but distance-based\nmethods can help make classification more interpretable and decrease\ncomputational costs. In particular, we applied DistClassiPy to classify light\ncurves of variable stars, comparing the distances between objects of different\nclasses. Using 18 distance metrics on a catalog of 6,000 variable stars across\n10 classes, we demonstrate classification and dimensionality reduction. Our\nclassifier meets state-of-the-art performance but has lower computational\nrequirements and improved interpretability. Additionally, DistClassiPy can be\ntailored to specific objects by identifying the most effective distance metric\nfor that classification. 
To facilitate broader applications within and beyond\nastronomy, we have made DistClassiPy open-source and available at\nhttps://pypi.org/project/distclassipy/.\n","authors":["Siddharth Chaini","Ashish Mahabal","Ajit Kembhavi","Federica B. Bianco"],"pdf_url":"https://arxiv.org/pdf/2403.12120v2.pdf","comment":"Accepted for publication in Astronomy and Computing (2024). 24 pages,\n 19 figures"},{"id":"http://arxiv.org/abs/2407.14207v2","updated":"2024-07-25T16:24:59Z","published":"2024-07-19T11:12:08Z","title":"Longhorn: State Space Models are Amortized Online Learners","summary":" The most fundamental capability of modern AI methods such as Large Language\nModels (LLMs) is the ability to predict the next token in a long sequence of\ntokens, known as ``sequence modeling.\" Although the Transformers model is the\ncurrent dominant approach to sequence modeling, its quadratic computational\ncost with respect to sequence length is a significant drawback. State-space\nmodels (SSMs) offer a promising alternative due to their linear decoding\nefficiency and high parallelizability during training. However, existing SSMs\noften rely on seemingly ad hoc linear recurrence designs. In this work, we\nexplore SSM design through the lens of online learning, conceptualizing SSMs as\nmeta-modules for specific online learning problems. This approach links SSM\ndesign to formulating precise online learning objectives, with state transition\nrules derived from optimizing these objectives. Based on this insight, we\nintroduce a novel deep SSM architecture based on the implicit update for\noptimizing an online regression objective. Our experimental results show that\nour models outperform state-of-the-art SSMs, including the Mamba model, on\nstandard sequence modeling benchmarks and language modeling tasks.\n","authors":["Bo Liu","Rui Wang","Lemeng Wu","Yihao Feng","Peter Stone","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.14207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19708v2","updated":"2024-07-25T16:16:46Z","published":"2024-04-30T17:00:32Z","title":"Harmonic LLMs are Trustworthy","summary":" We introduce an intuitive method to test the robustness (stability and\nexplainability) of any black-box LLM in real-time via its local deviation from\nharmoniticity, denoted as $\\gamma$. To the best of our knowledge this is the\nfirst completely model-agnostic and unsupervised method of measuring the\nrobustness of any given response from an LLM, based upon the model itself\nconforming to a purely mathematical standard. To show general application and\nimmediacy of results, we measure $\\gamma$ in 10 popular LLMs (ChatGPT,\nClaude-2.1, Claude3.0, GPT-4, GPT-4o, Smaug-72B, Mixtral-8x7B, Llama2-7B,\nMistral-7B and MPT-7B) across thousands of queries in three objective domains:\nWebQA, ProgrammingQA, and TruthfulQA. Across all models and domains tested,\nhuman annotation confirms that $\\gamma \\to 0$ indicates trustworthiness, and\nconversely searching higher values of $\\gamma$ easily exposes examples of\nhallucination, a fact that enables efficient adversarial prompt generation\nthrough stochastic gradient ascent in $\\gamma$. The low-$\\gamma$ leaders among\nthe models in the respective domains are GPT-4o, GPT-4, and Smaug-72B,\nproviding evidence that mid-size open-source models can win out against large\ncommercial models.\n","authors":["Nicholas S. 
Kersting","Mohammad Rahman","Suchismitha Vedala","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.19708v2.pdf","comment":"15 pages, 2 figures, 16 tables; added Claude-3.0, GPT-4o, Mistral-7B,\n Mixtral-8x7B, and more annotation for other models"},{"id":"http://arxiv.org/abs/2407.18158v1","updated":"2024-07-25T16:13:58Z","published":"2024-07-25T16:13:58Z","title":"Unlocking Tokens as Data Points for Generalization Bounds on Larger\n Language Models","summary":" Large language models (LLMs) with billions of parameters excel at predicting\nthe next token in a sequence. Recent work computes non-vacuous\ncompression-based generalization bounds for LLMs, but these bounds are vacuous\nfor large models at the billion-parameter scale. Moreover, these bounds are\nobtained through restrictive compression techniques, bounding compressed models\nthat generate low-quality text. Additionally, the tightness of these existing\nbounds depends on the number of IID documents in a training set rather than the\nmuch larger number of non-IID constituent tokens, leaving untapped potential\nfor tighter bounds. In this work, we instead use properties of martingales to\nderive generalization bounds that benefit from the vast number of tokens in LLM\ntraining sets. Since a dataset contains far more tokens than documents, our\ngeneralization bounds not only tolerate but actually benefit from far less\nrestrictive compression schemes. With Monarch matrices, Kronecker\nfactorizations, and post-training quantization, we achieve non-vacuous\ngeneralization bounds for LLMs as large as LLaMA2-70B. Unlike previous\napproaches, our work achieves the first non-vacuous bounds for models that are\ndeployed in practice and generate high-quality text.\n","authors":["Sanae Lotfi","Yilun Kuang","Brandon Amos","Micah Goldblum","Marc Finzi","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2407.18158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00662v2","updated":"2024-07-25T16:04:49Z","published":"2024-05-01T17:50:16Z","title":"No Representation, No Trust: Connecting Representation, Collapse, and\n Trust Issues in PPO","summary":" Reinforcement learning (RL) is inherently rife with non-stationarity since\nthe states and rewards the agent observes during training depend on its\nchanging policy. Therefore, networks in deep RL must be capable of adapting to\nnew observations and fitting new targets. However, previous works have observed\nthat networks in off-policy deep value-based methods exhibit a decrease in\nrepresentation rank, often correlated with an inability to continue learning or\na collapse in performance. Although this phenomenon has generally been\nattributed to neural network learning under non-stationarity, it has been\noverlooked in on-policy policy optimization methods which are often thought\ncapable of training indefinitely. In this work, we empirically study\nrepresentation dynamics in Proximal Policy Optimization (PPO) on the Atari and\nMuJoCo environments, revealing that PPO agents are also affected by feature\nrank deterioration and loss of plasticity. We show that this is aggravated with\nstronger non-stationarity, ultimately driving the actor's performance to\ncollapse, regardless of the performance of the critic. 
We ask why the trust\nregion, specific to methods like PPO, cannot alleviate or prevent the collapse.\nWe find that there is a connection between representation collapse and the\ndegradation of the trust region, one exacerbating the other, and present\nProximal Feature Optimization (PFO), a novel auxiliary loss that, along with\nother interventions, shows that regularizing the representation dynamics\nimproves the performance of PPO agents.\n","authors":["Skander Moalla","Andrea Miele","Razvan Pascanu","Caglar Gulcehre"],"pdf_url":"https://arxiv.org/pdf/2405.00662v2.pdf","comment":"ICML ARLET workshop version. Code and run histories are available at\n https://github.com/CLAIRE-Labo/no-representation-no-trust"},{"id":"http://arxiv.org/abs/2406.12839v2","updated":"2024-07-25T16:01:04Z","published":"2024-06-18T17:56:10Z","title":"Evaluating the design space of diffusion-based generative models","summary":" Most existing theoretical investigations of the accuracy of diffusion models,\nalbeit significant, assume the score function has been approximated to a\ncertain accuracy, and then use this a priori bound to control the error of\ngeneration. This article instead provides a first quantitative understanding of\nthe whole generation process, i.e., both training and sampling. More precisely,\nit conducts a non-asymptotic convergence analysis of denoising score matching\nunder gradient descent. In addition, a refined sampling error analysis for\nvariance exploding models is also provided. The combination of these two\nresults yields a full error analysis, which elucidates (again, but this time\ntheoretically) how to design the training and sampling processes for effective\ngeneration. For instance, our theory implies a preference toward noise\ndistribution and loss weighting in training that qualitatively agree with the\nones used in [Karras et al. 2022]. It also provides perspectives on the choices\nof time and variance schedules in sampling: when the score is well trained, the\ndesign in [Song et al. 2020] is more preferable, but when it is less trained,\nthe design in [Karras et al. 2022] becomes more preferable.\n","authors":["Yuqing Wang","Ye He","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2406.12839v2.pdf","comment":"Comments are welcome. Out of admiration we titled our paper after\n EDM, and hoped theorists' humor is not too corny"},{"id":"http://arxiv.org/abs/2407.18148v1","updated":"2024-07-25T15:58:56Z","published":"2024-07-25T15:58:56Z","title":"StraightLine: An End-to-End Resource-Aware Scheduler for Machine\n Learning Application Requests","summary":" The life cycle of machine learning (ML) applications consists of two stages:\nmodel development and model deployment. However, traditional ML systems (e.g.,\ntraining-specific or inference-specific systems) focus on one particular stage\nor phase of the life cycle of ML applications. These systems often aim at\noptimizing model training or accelerating model inference, and they frequently\nassume homogeneous infrastructure, which may not always reflect real-world\nscenarios that include cloud data centers, local servers, containers, and\nserverless platforms. We present StraightLine, an end-to-end resource-aware\nscheduler that schedules the optimal resources (e.g., container, virtual\nmachine, or serverless) for different ML application requests in a hybrid\ninfrastructure. 
The key innovation is an empirical dynamic placing algorithm\nthat intelligently places requests based on their unique characteristics (e.g.,\nrequest frequency, input data size, and data distribution). In contrast to\nexisting ML systems, StraightLine offers end-to-end resource-aware placement,\nthereby it can significantly reduce response time and failure rate for model\ndeployment when facing different computing resources in the hybrid\ninfrastructure.\n","authors":["Cheng-Wei Ching","Boyuan Guan","Hailu Xu","Liting Hu"],"pdf_url":"https://arxiv.org/pdf/2407.18148v1.pdf","comment":"6 pages, 8 figures, to appear in AIoTC'24"},{"id":"http://arxiv.org/abs/2407.18143v1","updated":"2024-07-25T15:48:24Z","published":"2024-07-25T15:48:24Z","title":"Maximum Entropy On-Policy Actor-Critic via Entropy Advantage Estimation","summary":" Entropy Regularisation is a widely adopted technique that enhances policy\noptimisation performance and stability. A notable form of entropy\nregularisation is augmenting the objective with an entropy term, thereby\nsimultaneously optimising the expected return and the entropy. This framework,\nknown as maximum entropy reinforcement learning (MaxEnt RL), has shown\ntheoretical and empirical successes. However, its practical application in\nstraightforward on-policy actor-critic settings remains surprisingly\nunderexplored. We hypothesise that this is due to the difficulty of managing\nthe entropy reward in practice. This paper proposes a simple method of\nseparating the entropy objective from the MaxEnt RL objective, which\nfacilitates the implementation of MaxEnt RL in on-policy settings. Our\nempirical evaluations demonstrate that extending Proximal Policy Optimisation\n(PPO) and Trust Region Policy Optimisation (TRPO) within the MaxEnt framework\nimproves policy optimisation performance in both MuJoCo and Procgen tasks.\nAdditionally, our results highlight MaxEnt RL's capacity to enhance\ngeneralisation.\n","authors":["Jean Seong Bjorn Choe","Jong-Kook Kim"],"pdf_url":"https://arxiv.org/pdf/2407.18143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18141v1","updated":"2024-07-25T15:45:17Z","published":"2024-07-25T15:45:17Z","title":"IRIS: Wireless Ring for Vision-based Smart Home Interaction","summary":" Integrating cameras into wireless smart rings has been challenging due to\nsize and power constraints. We introduce IRIS, the first wireless\nvision-enabled smart ring system for smart home interactions. Equipped with a\ncamera, Bluetooth radio, inertial measurement unit (IMU), and an onboard\nbattery, IRIS meets the small size, weight, and power (SWaP) requirements for\nring devices. IRIS is context-aware, adapting its gesture set to the detected\ndevice, and can last for 16-24 hours on a single charge. IRIS leverages the\nscene semantics to achieve instance-level device recognition. In a study\ninvolving 23 participants, IRIS consistently outpaced voice commands, with a\nhigher proportion of participants expressing a preference for IRIS over voice\ncommands regarding toggling a device's state, granular control, and social\nacceptability. 
Our work pushes the boundary of what is possible with ring\nform-factor devices, addressing system challenges and opening up novel\ninteraction capabilities.\n","authors":["Maruchi Kim","Antonio Glenn","Bandhav Veluri","Yunseo Lee","Eyoel Gebre","Aditya Bagaria","Shwetak Patel","Shyamnath Gollakota"],"pdf_url":"https://arxiv.org/pdf/2407.18141v1.pdf","comment":"15 pages, 17 figures, 6 tables, to be published in UIST 2024"},{"id":"http://arxiv.org/abs/2407.18134v1","updated":"2024-07-25T15:38:16Z","published":"2024-07-25T15:38:16Z","title":"$\\mathbb{X}$-Sample Contrastive Loss: Improving Contrastive Learning\n with Sample Similarity Graphs","summary":" Learning good representations involves capturing the diverse ways in which\ndata samples relate. Contrastive loss - an objective matching related samples -\nunderlies methods from self-supervised to multimodal learning. Contrastive\nlosses, however, can be viewed more broadly as modifying a similarity graph to\nindicate how samples should relate in the embedding space. This view reveals a\nshortcoming in contrastive learning: the similarity graph is binary, as only\none sample is the related positive sample. Crucially, similarities\n\\textit{across} samples are ignored. Based on this observation, we revise the\nstandard contrastive loss to explicitly encode how a sample relates to others.\nWe experiment with this new objective, called $\\mathbb{X}$-Sample Contrastive,\nto train vision models based on similarities in class or text caption\ndescriptions. Our study spans three scales: ImageNet-1k with 1 million, CC3M\nwith 3 million, and CC12M with 12 million samples. The representations learned\nvia our objective outperform both contrastive self-supervised and\nvision-language models trained on the same data across a range of tasks. When\ntraining on CC12M, we outperform CLIP by $0.6\\%$ on both ImageNet and ImageNet\nReal. Our objective appears to work particularly well in lower-data regimes,\nwith gains over CLIP of $16.8\\%$ on ImageNet and $18.1\\%$ on ImageNet Real when\ntraining with CC3M. Finally, our objective seems to encourage the model to\nlearn representations that separate objects from their attributes and\nbackgrounds, with gains of $3.3$-$5.6$\\% over CLIP on ImageNet9. We hope the\nproposed solution takes a small step towards developing richer learning\nobjectives for understanding sample relations in foundation models.\n","authors":["Vlad Sobal","Mark Ibrahim","Randall Balestriero","Vivien Cabannes","Diane Bouchacourt","Pietro Astolfi","Kyunghyun Cho","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2407.18134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17449v2","updated":"2024-07-25T15:33:00Z","published":"2024-07-24T17:30:21Z","title":"Looking at Model Debiasing through the Lens of Anomaly Detection","summary":" It is widely recognized that deep neural networks are sensitive to bias in\nthe data. This means that during training these models are likely to learn\nspurious correlations between data and labels, resulting in limited\ngeneralization abilities and low performance. In this context, model debiasing\napproaches can be devised aiming at reducing the model's dependency on such\nunwanted correlations, either leveraging the knowledge of bias information or\nnot. In this work, we focus on the latter and more realistic scenario, showing\nthe importance of accurately predicting the bias-conflicting and bias-aligned\nsamples to obtain compelling performance in bias mitigation. 
On this ground, we\npropose to conceive the problem of model bias from an out-of-distribution\nperspective, introducing a new bias identification method based on anomaly\ndetection. We claim that when data is mostly biased, bias-conflicting samples\ncan be regarded as outliers with respect to the bias-aligned distribution in\nthe feature space of a biased model, thus allowing for precisely detecting them\nwith an anomaly detection method. Coupling the proposed bias identification\napproach with bias-conflicting data upsampling and augmentation in a two-step\nstrategy, we reach state-of-the-art performance on synthetic and real benchmark\ndatasets. Ultimately, our proposed approach shows that the data bias issue does\nnot necessarily require complex debiasing methods, given that an accurate bias\nidentification procedure is defined.\n","authors":["Vito Paolo Pastore","Massimiliano Ciranni","Davide Marinelli","Francesca Odone","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2407.17449v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.20498v2","updated":"2024-07-25T15:25:27Z","published":"2023-10-31T14:37:37Z","title":"Generative Learning of Continuous Data by Tensor Networks","summary":" Beyond their origin in modeling many-body quantum systems, tensor networks\nhave emerged as a promising class of models for solving machine learning\nproblems, notably in unsupervised generative learning. While possessing many\ndesirable features arising from their quantum-inspired nature, tensor network\ngenerative models have previously been largely restricted to binary or\ncategorical data, limiting their utility in real-world modeling problems. We\novercome this by introducing a new family of tensor network generative models\nfor continuous data, which are capable of learning from distributions\ncontaining continuous random variables. We develop our method in the setting of\nmatrix product states, first deriving a universal expressivity theorem proving\nthe ability of this model family to approximate any reasonably smooth\nprobability density function with arbitrary precision. We then benchmark the\nperformance of this model on several synthetic and real-world datasets, finding\nthat the model learns and generalizes well on distributions of continuous and\ndiscrete variables. We develop methods for modeling different data domains, and\nintroduce a trainable compression layer which is found to increase model\nperformance given limited memory or computational resources. Overall, our\nmethods give important theoretical and empirical evidence of the efficacy of\nquantum-inspired methods for the rapidly growing field of generative learning.\n","authors":["Alex Meiburg","Jing Chen","Jacob Miller","Raphaëlle Tihon","Guillaume Rabusseau","Alejandro Perdomo-Ortiz"],"pdf_url":"https://arxiv.org/pdf/2310.20498v2.pdf","comment":"21 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.18114v1","updated":"2024-07-25T15:21:54Z","published":"2024-07-25T15:21:54Z","title":"Unsupervised Training of Neural Cellular Automata on Edge Devices","summary":" The disparity in access to machine learning tools for medical imaging across\ndifferent regions significantly limits the potential for universal healthcare\ninnovation, particularly in remote areas. Our research addresses this issue by\nimplementing Neural Cellular Automata (NCA) training directly on smartphones\nfor accessible X-ray lung segmentation. 
We confirm the practicality and\nfeasibility of deploying and training these advanced models on five Android\ndevices, improving medical diagnostics accessibility and bridging the tech\ndivide to extend machine learning benefits in medical imaging to low- and\nmiddle-income countries (LMICs). We further enhance this approach with an\nunsupervised adaptation method using the novel Variance-Weighted Segmentation\nLoss (VWSL), which efficiently learns from unlabeled data by minimizing the\nvariance from multiple NCA predictions. This strategy notably improves model\nadaptability and performance across diverse medical imaging contexts without\nthe need for extensive computational resources or labeled datasets, effectively\nlowering the participation threshold. Our methodology, tested on three\nmultisite X-ray datasets -- Padchest, ChestX-ray8, and MIMIC-III --\ndemonstrates improvements in segmentation Dice accuracy by 0.7 to 2.8%,\ncompared to the classic Med-NCA. Additionally, in extreme cases where no\ndigital copy is available and images must be captured by a phone from an X-ray\nlightbox or monitor, VWSL enhances Dice accuracy by 5-20%, demonstrating the\nmethod's robustness even with suboptimal image sources.\n","authors":["John Kalkhof","Amin Ranem","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2407.18114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18108v1","updated":"2024-07-25T15:12:46Z","published":"2024-07-25T15:12:46Z","title":"Graph Neural Ordinary Differential Equations for Coarse-Grained\n Socioeconomic Dynamics","summary":" We present a data-driven machine-learning approach for modeling space-time\nsocioeconomic dynamics. Through coarse-graining fine-scale observations, our\nmodeling framework simplifies these complex systems to a set of tractable\nmechanistic relationships -- in the form of ordinary differential equations --\nwhile preserving critical system behaviors. This approach allows for expedited\n'what if' studies and sensitivity analyses, essential for informed\npolicy-making. Our findings, from a case study of Baltimore, MD, indicate that\nthis machine learning-augmented coarse-grained model serves as a powerful\ninstrument for deciphering the complex interactions between social factors,\ngeography, and exogenous stressors, offering a valuable asset for system\nforecasting and resilience planning.\n","authors":["James Koch","Pranab Roy Chowdhury","Heng Wan","Parin Bhaduri","Jim Yoon","Vivek Srikrishnan","W. Brent Daniel"],"pdf_url":"https://arxiv.org/pdf/2407.18108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18103v1","updated":"2024-07-25T15:07:35Z","published":"2024-07-25T15:07:35Z","title":"Fine-Tuning Large Language Models for Stock Return Prediction Using\n Newsflow","summary":" Large language models (LLMs) and their fine-tuning techniques have\ndemonstrated superior performance in various language understanding and\ngeneration tasks. This paper explores fine-tuning LLMs for stock return\nforecasting with financial newsflow. In quantitative investing, return\nforecasting is fundamental for subsequent tasks like stock picking, portfolio\noptimization, etc. We formulate the model to include text representation and\nforecasting modules. We propose to compare the encoder-only and decoder-only\nLLMs, considering they generate text representations in distinct ways. The\nimpact of these different representations on forecasting performance remains an\nopen question. 
Meanwhile, we compare two simple methods of integrating LLMs'\ntoken-level representations into the forecasting module. The experiments on\nreal news and investment universes reveal that: (1) aggregated representations\nfrom LLMs' token-level embeddings generally produce return predictions that\nenhance the performance of long-only and long-short portfolios; (2) in the\nrelatively large investment universe, the decoder LLMs-based prediction model\nleads to stronger portfolios, whereas in the small universes, there are no\nconsistent winners. Among the three LLMs studied (DeBERTa, Mistral, Llama),\nMistral performs more robustly across different universes; (3) return\npredictions derived from LLMs' text representations are a strong signal for\nportfolio construction, outperforming conventional sentiment scores.\n","authors":["Tian Guo","Emmanuel Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2407.18103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16571v2","updated":"2024-07-25T15:04:31Z","published":"2023-09-28T16:27:07Z","title":"Review of Machine Learning Methods for Additive Manufacturing of\n Functionally Graded Materials","summary":" Additive Manufacturing (AM) is a transformative manufacturing technology\nenabling direct fabrication of complex parts layer-be-layer from 3D modeling\ndata. Among AM applications, the fabrication of Functionally Graded Materials\n(FGMs) has significant importance due to the potential to enhance component\nperformance across several industries. FGMs are manufactured with a gradient\ncomposition transition between dissimilar materials, enabling the design of new\nmaterials with location-dependent mechanical and physical properties. This\nstudy presents a comprehensive review of published literature pertaining to the\nimplementation of Machine Learning (ML) techniques in AM, with an emphasis on\nML-based methods for optimizing FGMs fabrication processes. Through an\nextensive survey of the literature, this review article explores the role of ML\nin addressing the inherent challenges in FGMs fabrication and encompasses\nparameter optimization, defect detection, and real-time monitoring. The article\nalso provides a discussion of future research directions and challenges in\nemploying ML-based methods in AM fabrication of FGMs.\n","authors":["Mohammad Karimzadeh","Deekshith Basvoju","Aleksandar Vakanski","Indrajit Charit","Fei Xu","Xinchang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.16571v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2402.00300v2","updated":"2024-07-25T14:48:34Z","published":"2024-02-01T03:27:26Z","title":"Self-supervised learning of video representations from a child's\n perspective","summary":" Children learn powerful internal models of the world around them from a few\nyears of egocentric visual experience. Can such internal models be learned from\na child's visual experience with highly generic learning algorithms or do they\nrequire strong inductive biases? Recent advances in collecting large-scale,\nlongitudinal, developmentally realistic video datasets and generic\nself-supervised learning (SSL) algorithms are allowing us to begin to tackle\nthis nature vs. nurture question. However, existing work typically focuses on\nimage-based SSL algorithms and visual capabilities that can be learned from\nstatic images (e.g. object recognition), thus ignoring temporal aspects of the\nworld. 
To close this gap, here we train self-supervised video models on\nlongitudinal, egocentric headcam recordings collected from a child over a two\nyear period in their early development (6-31 months). The resulting models are\nhighly effective at facilitating the learning of action concepts from a small\nnumber of labeled examples; they have favorable data size scaling properties;\nand they display emergent video interpolation capabilities. Video models also\nlearn more robust object representations than image-based models trained with\nthe exact same data. These results suggest that important temporal aspects of a\nchild's internal model of the world may be learnable from their visual\nexperience using highly generic learning algorithms and without strong\ninductive biases.\n","authors":["A. Emin Orhan","Wentao Wang","Alex N. Wang","Mengye Ren","Brenden M. Lake"],"pdf_url":"https://arxiv.org/pdf/2402.00300v2.pdf","comment":"Published as a conference paper at CogSci 2024; code & models\n available from https://github.com/eminorhan/video-models"},{"id":"http://arxiv.org/abs/2406.14549v2","updated":"2024-07-25T14:33:33Z","published":"2024-06-20T17:56:17Z","title":"Uncovering Latent Memories: Assessing Data Leakage and Memorization\n Patterns in Frontier AI Models","summary":" Frontier AI systems are making transformative impacts across society, but\nsuch benefits are not without costs: models trained on web-scale datasets\ncontaining personal and private data raise profound concerns about data privacy\nand security. Language models are trained on extensive corpora including\npotentially sensitive or proprietary information, and the risk of data leakage\n- where the model response reveals pieces of such information - remains\ninadequately understood. Prior work has investigated what factors drive\nmemorization and have identified that sequence complexity and the number of\nrepetitions drive memorization. Here, we focus on the evolution of memorization\nover training. We begin by reproducing findings that the probability of\nmemorizing a sequence scales logarithmically with the number of times it is\npresent in the data. We next show that sequences which are apparently not\nmemorized after the first encounter can be \"uncovered\" throughout the course of\ntraining even without subsequent encounters, a phenomenon we term \"latent\nmemorization\". The presence of latent memorization presents a challenge for\ndata privacy as memorized sequences may be hidden at the final checkpoint of\nthe model but remain easily recoverable. To this end, we develop a diagnostic\ntest relying on the cross entropy loss to uncover latent memorized sequences\nwith high accuracy.\n","authors":["Sunny Duan","Mikail Khona","Abhiram Iyer","Rylan Schaeffer","Ila R Fiete"],"pdf_url":"https://arxiv.org/pdf/2406.14549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05679v3","updated":"2024-07-25T14:32:51Z","published":"2023-03-10T03:18:03Z","title":"Clustering with minimum spanning trees: How good can it be?","summary":" Minimum spanning trees (MSTs) provide a convenient representation of datasets\nin numerous pattern recognition activities. Moreover, they are relatively fast\nto compute. In this paper, we quantify the extent to which they are meaningful\nin low-dimensional partitional data clustering tasks. By identifying the upper\nbounds for the agreement between the best (oracle) algorithm and the expert\nlabels from a large battery of benchmark data, we discover that MST methods can\nbe very competitive. 
Next, we review, study, extend, and generalise a few\nexisting, state-of-the-art MST-based partitioning schemes. This leads to some\nnew noteworthy approaches. Overall, the Genie and the information-theoretic\nmethods often outperform the non-MST algorithms such as K-means, Gaussian\nmixtures, spectral clustering, Birch, density-based, and classical hierarchical\nagglomerative procedures. Nevertheless, we identify that there is still some\nroom for improvement, and thus the development of novel algorithms is\nencouraged.\n","authors":["Marek Gagolewski","Anna Cena","Maciej Bartoszuk","Łukasz Brzozowski"],"pdf_url":"https://arxiv.org/pdf/2303.05679v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.02935v4","updated":"2024-07-25T14:31:03Z","published":"2022-09-07T05:08:34Z","title":"Normalised clustering accuracy: An asymmetric external cluster validity\n measure","summary":" There is no, nor will there ever be, single best clustering algorithm.\nNevertheless, we would still like to be able to distinguish between methods\nthat work well on certain task types and those that systematically\nunderperform. Clustering algorithms are traditionally evaluated using either\ninternal or external validity measures. Internal measures quantify different\naspects of the obtained partitions, e.g., the average degree of cluster\ncompactness or point separability. However, their validity is questionable\nbecause the clusterings they endorse can sometimes be meaningless. External\nmeasures, on the other hand, compare the algorithms' outputs to fixed ground\ntruth groupings provided by experts. In this paper, we argue that the commonly\nused classical partition similarity scores, such as the normalised mutual\ninformation, Fowlkes-Mallows, or adjusted Rand index, miss some desirable\nproperties. In particular, they do not identify worst-case scenarios correctly,\nnor are they easily interpretable. As a consequence, the evaluation of\nclustering algorithms on diverse benchmark datasets can be difficult. To remedy\nthese issues, we propose and analyse a new measure: a version of the optimal\nset-matching accuracy, which is normalised, monotonic with respect to some\nsimilarity relation, scale-invariant, and corrected for the imbalancedness of\ncluster sizes (but neither symmetric nor adjusted for chance).\n","authors":["Marek Gagolewski"],"pdf_url":"https://arxiv.org/pdf/2209.02935v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10885v3","updated":"2024-07-25T14:30:22Z","published":"2024-02-16T18:43:02Z","title":"3D Diffuser Actor: Policy Diffusion with 3D Scene Representations","summary":" Diffusion policies are conditional diffusion models that learn robot action\ndistributions conditioned on the robot and environment state. They have\nrecently shown to outperform both deterministic and alternative action\ndistribution learning formulations. 3D robot policies use 3D scene feature\nrepresentations aggregated from a single or multiple camera views using sensed\ndepth. They have shown to generalize better than their 2D counterparts across\ncamera viewpoints. We unify these two lines of work and present 3D Diffuser\nActor, a neural policy equipped with a novel 3D denoising transformer that\nfuses information from the 3D visual scene, a language instruction and\nproprioception to predict the noise in noised 3D robot pose trajectories. 
3D\nDiffuser Actor sets a new state-of-the-art on RLBench with an absolute\nperformance gain of 18.1% over the current SOTA on a multi-view setup and an\nabsolute gain of 13.1% on a single-view setup. On the CALVIN benchmark, it\nimproves over the current SOTA by a 9% relative increase. It also learns to\ncontrol a robot manipulator in the real world from a handful of demonstrations.\nThrough thorough comparisons with the current SOTA policies and ablations of\nour model, we show 3D Diffuser Actor's design choices dramatically outperform\n2D representations, regression and classification objectives, absolute\nattentions, and holistic non-tokenized 3D scene embeddings.\n","authors":["Tsung-Wei Ke","Nikolaos Gkanatsios","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2402.10885v3.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2407.18074v1","updated":"2024-07-25T14:28:58Z","published":"2024-07-25T14:28:58Z","title":"Principal-Agent Reinforcement Learning","summary":" Contracts are the economic framework which allows a principal to delegate a\ntask to an agent -- despite misaligned interests, and even without directly\nobserving the agent's actions. In many modern reinforcement learning settings,\nself-interested agents learn to perform a multi-stage task delegated to them by\na principal. We explore the significant potential of utilizing contracts to\nincentivize the agents. We model the delegated task as an MDP, and study a\nstochastic game between the principal and agent where the principal learns what\ncontracts to use, and the agent learns an MDP policy in response. We present a\nlearning-based algorithm for optimizing the principal's contracts, which\nprovably converges to the subgame-perfect equilibrium of the principal-agent\ngame. A deep RL implementation allows us to apply our method to very large MDPs\nwith unknown transition dynamics. We extend our approach to multiple agents,\nand demonstrate its relevance to resolving a canonical sequential social\ndilemma with minimal intervention to agent rewards.\n","authors":["Dima Ivanov","Paul Dütting","Inbal Talgam-Cohen","Tonghan Wang","David C. Parkes"],"pdf_url":"https://arxiv.org/pdf/2407.18074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18067v1","updated":"2024-07-25T14:21:50Z","published":"2024-07-25T14:21:50Z","title":"HVM-1: Large-scale video models pretrained with nearly 5000 hours of\n human-like video data","summary":" We introduce Human-like Video Models (HVM-1), large-scale video models\npretrained with nearly 5000 hours of curated human-like video data (mostly\negocentric, temporally extended, continuous video recordings), using the\nspatiotemporal masked autoencoder (ST-MAE) algorithm. We release two 633M\nparameter models trained at spatial resolutions of 224x224 and 448x448 pixels.\nWe evaluate the performance of these models in downstream few-shot video and\nimage recognition tasks and compare them against a model pretrained with 1330\nhours of short action-oriented video clips from YouTube (Kinetics-700). HVM-1\nmodels perform competitively against the Kinetics-700 pretrained model in\ndownstream evaluations despite substantial qualitative differences between the\nspatiotemporal characteristics of the corresponding pretraining datasets. 
HVM-1\nmodels also learn more accurate and more robust object representations compared\nto models pretrained with the image-based MAE algorithm on the same data,\ndemonstrating the potential benefits of learning to predict temporal\nregularities in natural videos for learning better object representations.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2407.18067v1.pdf","comment":"10 pages, 5 figures, 1 table; code & models available from\n https://github.com/eminorhan/hvm-1"},{"id":"http://arxiv.org/abs/2407.18066v1","updated":"2024-07-25T14:19:59Z","published":"2024-07-25T14:19:59Z","title":"Multi-Agent Deep Reinforcement Learning for Resilience Optimization in\n 5G RAN","summary":" Resilience is defined as the ability of a network to resist, adapt, and\nquickly recover from disruptions, and to continue to maintain an acceptable\nlevel of services from users' perspective. With the advent of future radio\nnetworks, including advanced 5G and upcoming 6G, critical services become\nintegral to future networks, requiring uninterrupted service delivery for end\nusers. Unfortunately, with the growing network complexity, user mobility and\ndiversity, it becomes challenging to scale current resilience management\ntechniques that rely on local optimizations to large dense network deployments.\nThis paper aims to address this problem by globally optimizing the resilience\nof a dense multi-cell network based on multi-agent deep reinforcement learning.\nSpecifically, our proposed solution can dynamically tilt cell antennas and\nreconfigure transmit power to mitigate outages and increase both coverage and\nservice availability. A multi-objective optimization problem is formulated to\nsimultaneously satisfy resiliency constraints while maximizing the service\nquality in the network area in order to minimize the impact of outages on\nneighbouring cells. Extensive simulations then demonstrate that with our\nproposed solution, the average service availability in terms of user throughput\ncan be increased by up to 50-60% on average, while reaching a coverage\navailability of 99% in best cases.\n","authors":["Soumeya Kaada","Dinh-Hieu Tran","Nguyen Van Huynh","Marie-Line Alberi Morel","Sofiene Jelassi","Gerardo Rubino"],"pdf_url":"https://arxiv.org/pdf/2407.18066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07709v2","updated":"2024-07-25T14:17:40Z","published":"2024-06-11T20:44:04Z","title":"Diagnosing and fixing common problems in Bayesian optimization for\n molecule design","summary":" Bayesian optimization (BO) is a principled approach to molecular design\ntasks. In this paper we explain three pitfalls of BO which can cause poor\nempirical performance: an incorrect prior width, over-smoothing, and inadequate\nacquisition function maximization. We show that with these issues addressed,\neven a basic BO setup is able to achieve the highest overall performance on the\nPMO benchmark for molecule design (Gao et al 2022). These results suggest that\nBO may benefit from more attention in the machine learning for molecules\ncommunity.\n","authors":["Austin Tripp","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2406.07709v2.pdf","comment":"8 pages, 4 figures. ICML 2024 AI for science workshop\n (https://openreview.net/forum?id=V4aG4wsoIt). 
Code at:\n https://github.com/AustinT/basic-mol-bo-workshop2024"},{"id":"http://arxiv.org/abs/2407.18060v1","updated":"2024-07-25T14:16:02Z","published":"2024-07-25T14:16:02Z","title":"Cross-Vendor Reproducibility of Radiomics-based Machine Learning Models\n for Computer-aided Diagnosis","summary":" Background: The reproducibility of machine-learning models in prostate cancer\ndetection across different MRI vendors remains a significant challenge.\nMethods: This study investigates Support Vector Machines (SVM) and Random\nForest (RF) models trained on radiomic features extracted from T2-weighted MRI\nimages using Pyradiomics and MRCradiomics libraries. Feature selection was\nperformed using the maximum relevance minimum redundancy (MRMR) technique. We\naimed to enhance clinical decision support through multimodal learning and\nfeature fusion. Results: Our SVM model, utilizing combined features from\nPyradiomics and MRCradiomics, achieved an AUC of 0.74 on the Multi-Improd\ndataset (Siemens scanner) but decreased to 0.60 on the Philips test set. The RF\nmodel showed similar trends, with notable robustness for models using\nPyradiomics features alone (AUC of 0.78 on Philips). Conclusions: These\nfindings demonstrate the potential of multimodal feature integration to improve\nthe robustness and generalizability of machine-learning models for clinical\ndecision support in prostate cancer detection. This study marks a significant\nstep towards developing reliable AI-driven diagnostic tools that maintain\nefficacy across various imaging platforms.\n","authors":["Jatin Chaudhary","Ivan Jambor","Hannu Aronen","Otto Ettala","Jani Saunavaara","Peter Boström","Jukka Heikkonen","Rajeev Kanth","Harri Merisaari"],"pdf_url":"https://arxiv.org/pdf/2407.18060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18058v1","updated":"2024-07-25T14:15:05Z","published":"2024-07-25T14:15:05Z","title":"I can listen but cannot read: An evaluation of two-tower multimodal\n systems for instrument recognition","summary":" Music two-tower multimodal systems integrate audio and text modalities into a\njoint audio-text space, enabling direct comparison between songs and their\ncorresponding labels. These systems enable new approaches for classification\nand retrieval, leveraging both modalities. Despite the promising results they\nhave shown for zero-shot classification and retrieval tasks, closer inspection\nof the embeddings is needed. This paper evaluates the inherent zero-shot\nproperties of joint audio-text spaces for the case-study of instrument\nrecognition. We present an evaluation and analysis of two-tower systems for\nzero-shot instrument recognition and a detailed analysis of the properties of\nthe pre-joint and joint embeddings spaces. Our findings suggest that audio\nencoders alone demonstrate good quality, while challenges remain within the\ntext encoder or joint space projection. Specifically, two-tower systems exhibit\nsensitivity towards specific words, favoring generic prompts over musically\ninformed ones. Despite the large size of textual encoders, they do not yet\nleverage additional textual context or infer instruments accurately from their\ndescriptions. Lastly, a novel approach for quantifying the semantic\nmeaningfulness of the textual space leveraging an instrument ontology is\nproposed. 
This method reveals deficiencies in the systems' understanding of\ninstruments and provides evidence of the need for fine-tuning text encoders on\nmusical data.\n","authors":["Yannis Vasilakis","Rachel Bittner","Johan Pauwels"],"pdf_url":"https://arxiv.org/pdf/2407.18058v1.pdf","comment":"Accepted to ISMIR 2024"},{"id":"http://arxiv.org/abs/2407.18057v1","updated":"2024-07-25T14:10:42Z","published":"2024-07-25T14:10:42Z","title":"Physics-informed nonlinear vector autoregressive models for the\n prediction of dynamical systems","summary":" Machine learning techniques have recently been of great interest for solving\ndifferential equations. Training these models is classically a data-fitting\ntask, but knowledge of the expression of the differential equation can be used\nto supplement the training objective, leading to the development of\nphysics-informed scientific machine learning. In this article, we focus on one\nclass of models called nonlinear vector autoregression (NVAR) to solve ordinary\ndifferential equations (ODEs). Motivated by connections to numerical\nintegration and physics-informed neural networks, we explicitly derive the\nphysics-informed NVAR (piNVAR) which enforces the right-hand side of the\nunderlying differential equation regardless of NVAR construction. Because NVAR\nand piNVAR completely share their learned parameters, we propose an augmented\nprocedure to jointly train the two models. Then, using both data-driven and\nODE-driven metrics, we evaluate the ability of the piNVAR model to predict\nsolutions to various ODE systems, such as the undamped spring, a Lotka-Volterra\npredator-prey nonlinear model, and the chaotic Lorenz system.\n","authors":["James H. Adler","Samuel Hocking","Xiaozhe Hu","Shafiqul Islam"],"pdf_url":"https://arxiv.org/pdf/2407.18057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18044v1","updated":"2024-07-25T13:47:01Z","published":"2024-07-25T13:47:01Z","title":"The Geometry of Queries: Query-Based Innovations in Retrieval-Augmented\n Generation","summary":" Digital health chatbots powered by Large Language Models (LLMs) have the\npotential to significantly improve personal health management for chronic\nconditions by providing accessible and on-demand health coaching and\nquestion-answering. However, these chatbots risk providing unverified and\ninaccurate information because LLMs generate responses based on patterns\nlearned from diverse internet data. Retrieval Augmented Generation (RAG) can\nhelp mitigate hallucinations and inaccuracies in LLM responses by grounding it\non reliable content. However, efficiently and accurately retrieving most\nrelevant set of content for real-time user questions remains a challenge. In\nthis work, we introduce Query-Based Retrieval Augmented Generation (QB-RAG), a\nnovel approach that pre-computes a database of potential queries from a content\nbase using LLMs. For an incoming patient question, QB-RAG efficiently matches\nit against this pre-generated query database using vector search, improving\nalignment between user questions and the content. We establish a theoretical\nfoundation for QB-RAG and provide a comparative analysis of existing retrieval\nenhancement techniques for RAG systems. 
Finally, our empirical evaluation\ndemonstrates that QB-RAG significantly improves the accuracy of healthcare\nquestion answering, paving the way for robust and trustworthy LLM applications\nin digital health.\n","authors":["Eric Yang","Jonathan Amar","Jong Ha Lee","Bhawesh Kumar","Yugang Jia"],"pdf_url":"https://arxiv.org/pdf/2407.18044v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2407.18042v1","updated":"2024-07-25T13:44:42Z","published":"2024-07-25T13:44:42Z","title":"Lifelong Graph Summarization with Neural Networks: 2012, 2022, and a\n Time Warp","summary":" Summarizing web graphs is challenging due to the heterogeneity of the modeled\ninformation and its changes over time. We investigate the use of neural\nnetworks for lifelong graph summarization. Assuming we observe the web graph at\na certain time, we train the networks to summarize graph vertices. We apply\nthis trained network to summarize the vertices of the changed graph at the next\npoint in time. Subsequently, we continue training and evaluating the network to\nperform lifelong graph summarization. We use the GNNs Graph-MLP and GraphSAINT,\nas well as an MLP baseline, to summarize the temporal graphs. We compare\n$1$-hop and $2$-hop summaries. We investigate the impact of reusing parameters\nfrom a previous snapshot by measuring the backward and forward transfer and the\nforgetting rate of the neural networks. Our extensive experiments on ten weekly\nsnapshots of a web graph with over $100$M edges, sampled in 2012 and 2022, show\nthat all networks predominantly use $1$-hop information to determine the\nsummary, even when performing $2$-hop summarization. Due to the heterogeneity\nof web graphs, in some snapshots, the $2$-hop summary produces over ten times\nmore vertex summaries than the $1$-hop summary. When using the network trained\non the last snapshot from 2012 and applying it to the first snapshot of 2022,\nwe observe a strong drop in accuracy. We attribute this drop over the ten-year\ntime warp to the strongly increased heterogeneity of the web graph in 2022.\n","authors":["Jonatan Frank","Marcel Hoffmann","Nicolas Lell","David Richerby","Ansgar Scherp"],"pdf_url":"https://arxiv.org/pdf/2407.18042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18041v1","updated":"2024-07-25T13:39:11Z","published":"2024-07-25T13:39:11Z","title":"How to Train the Teacher Model for Effective Knowledge Distillation","summary":" Recently, it was shown that the role of the teacher in knowledge distillation\n(KD) is to provide the student with an estimate of the true Bayes conditional\nprobability density (BCPD). Notably, the new findings propose that the\nstudent's error rate can be upper-bounded by the mean squared error (MSE)\nbetween the teacher's output and BCPD. Consequently, to enhance KD efficacy,\nthe teacher should be trained such that its output is close to BCPD in MSE\nsense. This paper elucidates that training the teacher model with MSE loss\nequates to minimizing the MSE between its output and BCPD, aligning with its\ncore responsibility of providing the student with a BCPD estimate closely\nresembling it in MSE terms. 
In this respect, through a comprehensive set of\nexperiments, we demonstrate that substituting the conventional teacher trained\nwith cross-entropy loss with one trained using MSE loss in state-of-the-art KD\nmethods consistently boosts the student's accuracy, resulting in improvements\nof up to 2.6\\%.\n","authors":["Shayan Mohajer Hamidi","Xizhen Deng","Renhao Tan","Linfeng Ye","Ahmed Hussein Salamah"],"pdf_url":"https://arxiv.org/pdf/2407.18041v1.pdf","comment":"The paper was accepted at ECCV2024"},{"id":"http://arxiv.org/abs/2402.05981v2","updated":"2024-07-25T13:37:16Z","published":"2024-02-08T08:02:57Z","title":"Anatomizing Deep Learning Inference in Web Browsers","summary":" Web applications have increasingly adopted Deep Learning (DL) through\nin-browser inference, wherein DL inference performs directly within Web\nbrowsers. The actual performance of in-browser inference and its impacts on the\nquality of experience (QoE) remain unexplored, and urgently require new QoE\nmeasurements beyond traditional ones, e.g., mainly focusing on page load time.\nTo bridge this gap, we make the first comprehensive performance measurement of\nin-browser inference to date. Our approach proposes new metrics to measure\nin-browser inference: responsiveness, smoothness, and inference accuracy. Our\nextensive analysis involves 9 representative DL models across Web browsers of\n50 popular PC devices and 20 mobile devices. The results reveal that in-browser\ninference exhibits a substantial latency gap, averaging 16.9 times slower on\nCPU and 4.9 times slower on GPU compared to native inference on PC devices. The\ngap on mobile CPU and mobile GPU is 15.8 times and 7.8 times, respectively.\nFurthermore, we identify contributing factors to such latency gap, including\nunderutilized hardware instruction sets, inherent overhead in the runtime\nenvironment, resource contention within the browser, and inefficiencies in\nsoftware libraries and GPU abstractions. Additionally, in-browser inference\nimposes significant memory demands, at times exceeding 334.6 times the size of\nthe DL models themselves, partly attributable to suboptimal memory management.\nWe also observe that in-browser inference leads to a significant 67.2% increase\nin the time it takes for GUI components to render within Web browsers,\nsignificantly affecting the overall user QoE of Web applications reliant on\nthis technology\n","authors":["Qipeng Wang","Shiqi Jiang","Zhenpeng Chen","Xu Cao","Yuanchun Li","Aoyu Li","Yun Ma","Ting Cao","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2402.05981v2.pdf","comment":"Accepted by ACM Transactions on Software Engineering and Methodology\n (TOSEM)"},{"id":"http://arxiv.org/abs/2407.18039v1","updated":"2024-07-25T13:36:42Z","published":"2024-07-25T13:36:42Z","title":"Peak-Controlled Logits Poisoning Attack in Federated Distillation","summary":" Federated Distillation (FD) offers an innovative approach to distributed\nmachine learning, leveraging knowledge distillation for efficient and flexible\ncross-device knowledge transfer without necessitating the upload of extensive\nmodel parameters to a central server. While FD has gained popularity, its\nvulnerability to poisoning attacks remains underexplored. To address this gap,\nwe previously introduced FDLA (Federated Distillation Logits Attack), a method\nthat manipulates logits communication to mislead and degrade the performance of\nclient models. 
However, the impact of FDLA on participants with different\nidentities and the effects of malicious modifications at various stages of\nknowledge transfer remain unexplored. To this end, we present PCFDLA\n(Peak-Controlled Federated Distillation Logits Attack), an advanced and more\nstealthy logits poisoning attack method for FD. PCFDLA enhances the\neffectiveness of FDLA by carefully controlling the peak values of logits to\ncreate highly misleading yet inconspicuous modifications. Furthermore, we\nintroduce a novel metric for better evaluating attack efficacy, demonstrating\nthat PCFDLA maintains stealth while being significantly more disruptive to\nvictim models compared to its predecessors. Experimental results across various\ndatasets confirm the superior impact of PCFDLA on model accuracy, solidifying\nits potential threat in federated distillation systems.\n","authors":["Yuhan Tang","Aoxu Zhang","Zhiyuan Wu","Bo Gao","Tian Wen","Yuwei Wang","Sheng Sun"],"pdf_url":"https://arxiv.org/pdf/2407.18039v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.03685"},{"id":"http://arxiv.org/abs/2407.18033v1","updated":"2024-07-25T13:27:10Z","published":"2024-07-25T13:27:10Z","title":"ECG Arrhythmia Detection Using Disease-specific Attention-based Deep\n Learning Model","summary":" The electrocardiogram (ECG) is one of the most commonly-used tools to\ndiagnose cardiovascular disease in clinical practice. Although deep learning\nmodels have achieved very impressive success in the field of automatic ECG\nanalysis, they often lack model interpretability that is significantly\nimportant in the healthcare applications. To this end, many schemes such as\ngeneral-purpose attention mechanism, Grad-CAM technique and ECG knowledge graph\nwere proposed to be integrated with deep learning models. However, they either\nresult in decreased classification performance or do not consist with the one\nin cardiologists' mind when interpreting ECG. In this study, we propose a novel\ndisease-specific attention-based deep learning model (DANet) for arrhythmia\ndetection from short ECG recordings. The novel idea is to introduce a\nsoft-coding or hard-coding waveform enhanced module into existing deep neural\nnetworks, which amends original ECG signals with the guidance of the rule for\ndiagnosis of a given disease type before being fed into the classification\nmodule. For the soft-coding DANet, we also develop a learning framework\ncombining self-supervised pre-training with two-stage supervised training. To\nverify the effectiveness of our proposed DANet, we applied it to the problem of\natrial premature contraction detection and the experimental results shows that\nit demonstrates superior performance compared to the benchmark model. Moreover,\nit also provides the waveform regions that deserve special attention in the\nmodel's decision-making process, allowing it to be a medical diagnostic\nassistant for physicians.\n","authors":["Linpeng Jin"],"pdf_url":"https://arxiv.org/pdf/2407.18033v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.18022v1","updated":"2024-07-25T13:15:25Z","published":"2024-07-25T13:15:25Z","title":"Learning mental states estimation through self-observation: a\n developmental synergy between intentions and beliefs representations in a\n deep-learning model of Theory of Mind","summary":" Theory of Mind (ToM), the ability to attribute beliefs, intentions, or mental\nstates to others, is a crucial feature of human social interaction. 
In complex\nenvironments, where the human sensory system reaches its limits, behaviour is\nstrongly driven by our beliefs about the state of the world around us.\nAccessing others' mental states, e.g., beliefs and intentions, allows for more\neffective social interactions in natural contexts. Yet, these variables are not\ndirectly observable, making understanding ToM a challenging quest of interest\nfor different fields, including psychology, machine learning and robotics. In\nthis paper, we contribute to this topic by showing a developmental synergy\nbetween learning to predict low-level mental states (e.g., intentions, goals)\nand attributing high-level ones (i.e., beliefs). Specifically, we assume that\nlearning beliefs attribution can occur by observing one's own decision\nprocesses involving beliefs, e.g., in a partially observable environment. Using\na simple feed-forward deep learning model, we show that, when learning to\npredict others' intentions and actions, more accurate predictions can be\nacquired earlier if beliefs attribution is learnt simultaneously. Furthermore,\nwe show that the learning performance improves even when observed actors have a\ndifferent embodiment than the observer and the gain is higher when observing\nbeliefs-driven chunks of behaviour. We propose that our computational approach\ncan inform the understanding of human social cognitive development and be\nrelevant for the design of future adaptive social robots able to autonomously\nunderstand, assist, and learn from human interaction partners in novel natural\nenvironments and tasks.\n","authors":["Francesca Bianco","Silvia Rigato","Maria Laura Filippetti","Dimitri Ognibene"],"pdf_url":"https://arxiv.org/pdf/2407.18022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18021v1","updated":"2024-07-25T13:15:16Z","published":"2024-07-25T13:15:16Z","title":"Quadratic Advantage with Quantum Randomized Smoothing Applied to\n Time-Series Analysis","summary":" As quantum machine learning continues to develop at a rapid pace, the\nimportance of ensuring the robustness and efficiency of quantum algorithms\ncannot be overstated. Our research presents an analysis of quantum randomized\nsmoothing, how data encoding and perturbation modeling approaches can be\nmatched to achieve meaningful robustness certificates. By utilizing an\ninnovative approach integrating Grover's algorithm, a quadratic sampling\nadvantage over classical randomized smoothing is achieved. This strategy\nnecessitates a basis state encoding, thus restricting the space of meaningful\nperturbations. We show how constrained $k$-distant Hamming weight perturbations\nare a suitable noise distribution here, and elucidate how they can be\nconstructed on a quantum computer. The efficacy of the proposed framework is\ndemonstrated on a time series classification task employing a Bag-of-Words\npre-processing solution. The advantage of quadratic sample reduction is\nrecovered especially in the regime with large number of samples. 
This may allow\nquantum computers to efficiently scale randomized smoothing to more complex\ntasks beyond the reach of classical methods.\n","authors":["Nicola Franco","Marie Kempkes","Jakob Spiegelberg","Jeanette Miriam Lorenz"],"pdf_url":"https://arxiv.org/pdf/2407.18021v1.pdf","comment":"Accepted at the IEEE International Conference on Quantum Computing\n and Engineering (QCE)"},{"id":"http://arxiv.org/abs/2406.19146v2","updated":"2024-07-25T13:09:18Z","published":"2024-06-27T13:02:43Z","title":"Resolving Discrepancies in Compute-Optimal Scaling of Language Models","summary":" Kaplan et al. and Hoffmann et al. developed influential scaling laws for the\noptimal model size as a function of the compute budget, but these laws yield\nsubstantially different predictions. We explain the discrepancy by reproducing\nthe Kaplan scaling law on two datasets (OpenWebText2 and RefinedWeb) and\nidentifying three factors causing the difference: last layer computational\ncost, warmup duration, and scale-dependent optimizer tuning. With these factors\ncorrected, we obtain excellent agreement with the Hoffmann et al. (i.e.,\n\"Chinchilla\") scaling law. Counter to a hypothesis of Hoffmann et al., we find\nthat careful learning rate decay is not essential for the validity of their\nscaling law. As a secondary result, we derive scaling laws for the optimal\nlearning rate and batch size, finding that tuning the AdamW $\\beta_2$ parameter\nis essential at lower batch sizes.\n","authors":["Tomer Porian","Mitchell Wortsman","Jenia Jitsev","Ludwig Schmidt","Yair Carmon"],"pdf_url":"https://arxiv.org/pdf/2406.19146v2.pdf","comment":"Fixing bug in small models with tuned LR"},{"id":"http://arxiv.org/abs/2407.18013v1","updated":"2024-07-25T13:06:30Z","published":"2024-07-25T13:06:30Z","title":"Self-Supervision Improves Diffusion Models for Tabular Data Imputation","summary":" The ubiquity of missing data has sparked considerable attention and focus on\ntabular data imputation methods. Diffusion models, recognized as the\ncutting-edge technique for data generation, demonstrate significant potential\nin tabular data imputation tasks. However, in pursuit of diversity, vanilla\ndiffusion models often exhibit sensitivity to initialized noises, which hinders\nthe models from generating stable and accurate imputation results.\nAdditionally, the sparsity inherent in tabular data poses challenges for\ndiffusion models in accurately modeling the data manifold, impacting the\nrobustness of these models for data imputation. To tackle these challenges,\nthis paper introduces an advanced diffusion model named Self-supervised\nimputation Diffusion Model (SimpDM for brevity), specifically tailored for\ntabular data imputation tasks. To mitigate sensitivity to noise, we introduce a\nself-supervised alignment mechanism that aims to regularize the model, ensuring\nconsistent and stable imputation predictions. Furthermore, we introduce a\ncarefully devised state-dependent data augmentation strategy within SimpDM,\nenhancing the robustness of the diffusion model when dealing with limited data.\nExtensive experiments demonstrate that SimpDM matches or outperforms\nstate-of-the-art imputation methods across various scenarios.\n","authors":["Yixin Liu","Thalaiyasingam Ajanthan","Hisham Husain","Vu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.18013v1.pdf","comment":"10 pages, 5 figures. 
Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2201.07794v5","updated":"2024-07-25T13:05:25Z","published":"2022-01-18T23:31:06Z","title":"A Non-Expert's Introduction to Data Ethics for Mathematicians","summary":" I give a short introduction to data ethics. I begin with some background\ninformation and societal context for data ethics. I then discuss data ethics in\nmathematical-science education and indicate some available course material. I\nbriefly highlight a few efforts -- at my home institution and elsewhere -- on\ndata ethics, society, and social good. I then discuss open data in research,\nresearch replicability and some other ethical issues in research, and the\ntension between privacy and open data and code, and a few controversial studies\nand reactions to studies. I then discuss ethical principles, institutional\nreview boards, and a few other considerations in the scientific use of human\ndata. I then briefly survey a variety of research and lay articles that are\nrelevant to data ethics and data privacy. I conclude with a brief summary and\nsome closing remarks.\n My focal audience is mathematicians, but I hope that this chapter will also\nbe useful to others. I am not an expert about data ethics, and this chapter\nprovides only a starting point on this wide-ranging topic. I encourage you to\nexamine the resources that I discuss and to reflect carefully on data ethics,\nits role in mathematics education, and the societal implications of data and\ndata analysis. As data and technology continue to evolve, I hope that such\ncareful reflection will continue throughout your life.\n","authors":["Mason A. Porter"],"pdf_url":"https://arxiv.org/pdf/2201.07794v5.pdf","comment":"A few more small tweaks. This is a book chapter. It is associated\n with my data-ethics lecture at the 2021 AMS Short Course on Mathematical and\n Computational Methods for Complex Social Systems"},{"id":"http://arxiv.org/abs/2407.18011v1","updated":"2024-07-25T13:05:00Z","published":"2024-07-25T13:05:00Z","title":"HANNA: Hard-constraint Neural Network for Consistent Activity\n Coefficient Prediction","summary":" We present the first hard-constraint neural network for predicting activity\ncoefficients (HANNA), a thermodynamic mixture property that is the basis for\nmany applications in science and engineering. Unlike traditional neural\nnetworks, which ignore physical laws and result in inconsistent predictions,\nour model is designed to strictly adhere to all thermodynamic consistency\ncriteria. By leveraging deep-set neural networks, HANNA maintains symmetry\nunder the permutation of the components. Furthermore, by hard-coding physical\nconstraints in the network architecture, we ensure consistency with the\nGibbs-Duhem equation and in modeling the pure components. The model was trained\nand evaluated on 317,421 data points for activity coefficients in binary\nmixtures from the Dortmund Data Bank, achieving significantly higher prediction\naccuracies than the current state-of-the-art model UNIFAC. Moreover, HANNA only\nrequires the SMILES of the components as input, making it applicable to any\nbinary mixture of interest. 
HANNA is fully open-source and available for free\nuse.\n","authors":["Thomas Specht","Mayank Nagda","Sophie Fellenz","Stephan Mandt","Hans Hasse","Fabian Jirasek"],"pdf_url":"https://arxiv.org/pdf/2407.18011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12856v2","updated":"2024-07-25T12:56:35Z","published":"2024-03-19T16:01:25Z","title":"Equivariant Ensembles and Regularization for Reinforcement Learning in\n Map-based Path Planning","summary":" In reinforcement learning (RL), exploiting environmental symmetries can\nsignificantly enhance efficiency, robustness, and performance. However,\nensuring that the deep RL policy and value networks are respectively\nequivariant and invariant to exploit these symmetries is a substantial\nchallenge. Related works try to design networks that are equivariant and\ninvariant by construction, limiting them to a very restricted library of\ncomponents, which in turn hampers the expressiveness of the networks. This\npaper proposes a method to construct equivariant policies and invariant value\nfunctions without specialized neural network components, which we term\nequivariant ensembles. We further add a regularization term for adding\ninductive bias during training. In a map-based path planning case study, we\nshow how equivariant ensembles and regularization benefit sample efficiency and\nperformance.\n","authors":["Mirco Theile","Hongpeng Cao","Marco Caccamo","Alberto L. Sangiovanni-Vincentelli"],"pdf_url":"https://arxiv.org/pdf/2403.12856v2.pdf","comment":"Accepted at IROS 2024. A video can be found here:\n https://youtu.be/L6NOdvU7n7s. The code is available at\n https://github.com/theilem/uavSim"},{"id":"http://arxiv.org/abs/2407.18002v1","updated":"2024-07-25T12:53:21Z","published":"2024-07-25T12:53:21Z","title":"Network Inversion of Convolutional Neural Nets","summary":" Neural networks have emerged as powerful tools across various applications,\nyet their decision-making process often remains opaque, leading to them being\nperceived as \"black boxes.\" This opacity raises concerns about their\ninterpretability and reliability, especially in safety-critical scenarios.\nNetwork inversion techniques offer a solution by allowing us to peek inside\nthese black boxes, revealing the features and patterns learned by the networks\nbehind their decision-making processes and thereby provide valuable insights\ninto how neural networks arrive at their conclusions, making them more\ninterpretable and trustworthy. This paper presents a simple yet effective\napproach to network inversion using a carefully conditioned generator that\nlearns the data distribution in the input space of the trained neural network,\nenabling the reconstruction of inputs that would most likely lead to the\ndesired outputs. 
To capture the diversity in the input space for a given\noutput, instead of simply revealing the conditioning labels to the generator,\nwe covertly encode the conditioning label information into vectors, further\ncomplemented by heavy dropout in the generation process and minimisation of\ncosine similarity between the features corresponding to the generated images.\nThe paper concludes with immediate applications of Network Inversion including\nin interpretability, explainability and generation of adversarial samples.\n","authors":["Pirzada Suhail","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2407.18002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17999v1","updated":"2024-07-25T12:48:56Z","published":"2024-07-25T12:48:56Z","title":"Lightweight Industrial Cohorted Federated Learning for Heterogeneous\n Assets","summary":" Federated Learning (FL) is the most widely adopted collaborative learning\napproach for training decentralized Machine Learning (ML) models by exchanging\nlearning between clients without sharing the data or compromising privacy.\nHowever, since great data similarity or homogeneity is taken for granted in all\nFL tasks, FL is still not specifically designed for the industrial setting.\nThis is rarely the case in industrial data because there are differences in\nmachine type, firmware version, operational conditions, environmental factors,\nand hence, data distribution. Despite its popularity, it has been observed that\nFL performance degrades if the clients have heterogeneous data distributions.\nTherefore, we propose a Lightweight Industrial Cohorted FL (LICFL) algorithm\nthat uses model parameters for cohorting without any additional on-edge\n(client-level) computations and communications compared to standard FL and mitigates\nthe shortcomings from data heterogeneity in industrial applications. Our\napproach enhances client-level model performance by allowing them to\ncollaborate with similar clients and train more specialized or personalized\nmodels. Also, we propose an adaptive aggregation algorithm that extends the\nLICFL to Adaptive LICFL (ALICFL) for further improving the global model\nperformance and speeding up the convergence. Through numerical experiments on\nreal-time data, we demonstrate the efficacy of the proposed algorithms and\ncompare the performance with existing approaches.\n","authors":["Madapu Amarlingam","Abhishek Wani","Adarsh NL"],"pdf_url":"https://arxiv.org/pdf/2407.17999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17998v1","updated":"2024-07-25T12:48:41Z","published":"2024-07-25T12:48:41Z","title":"iNNspector: Visual, Interactive Deep Model Debugging","summary":" Deep learning model design, development, and debugging is a process driven by\nbest practices, guidelines, trial-and-error, and the personal experiences of\nmodel developers. At multiple stages of this process, performance and internal\nmodel data can be logged and made available. However, due to the sheer\ncomplexity and scale of this data and process, model developers often resort to\nevaluating their model performance based on abstract metrics like accuracy and\nloss. We argue that a structured analysis of data along the model's\narchitecture and at multiple abstraction levels can considerably streamline the\ndebugging process. Such a systematic analysis can further connect the\ndeveloper's design choices to their impacts on the model behavior, facilitating\nthe understanding, diagnosis, and refinement of deep learning models. 
Hence, in\nthis paper, we (1) contribute a conceptual framework structuring the data space\nof deep learning experiments. Our framework, grounded in literature analysis\nand requirements interviews, captures design dimensions and proposes mechanisms\nto make this data explorable and tractable. To operationalize our framework in\na ready-to-use application, we (2) present the iNNspector system. iNNspector\nenables tracking of deep learning experiments and provides interactive\nvisualizations of the data on all levels of abstraction from multiple models to\nindividual neurons. Finally, we (3) evaluate our approach with three real-world\nuse-cases and a user study with deep learning developers and data analysts,\nproving its effectiveness and usability.\n","authors":["Thilo Spinner","Daniel Fürst","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2407.17998v1.pdf","comment":"41 pages paper, 4 pages references, 3 pages appendix, 19 figures, 2\n tables"},{"id":"http://arxiv.org/abs/2407.17997v1","updated":"2024-07-25T12:44:45Z","published":"2024-07-25T12:44:45Z","title":"On the Effect of Purely Synthetic Training Data for Different Automatic\n Speech Recognition Architectures","summary":" In this work we evaluate the utility of synthetic data for training automatic\nspeech recognition (ASR). We use the ASR training data to train a\ntext-to-speech (TTS) system similar to FastSpeech-2. With this TTS we reproduce\nthe original training data, training ASR systems solely on synthetic data. For\nASR, we use three different architectures, attention-based encoder-decoder,\nhybrid deep neural network hidden Markov model and a Gaussian mixture hidden\nMarkov model, showing the different sensitivity of the models to synthetic data\ngeneration. In order to extend previous work, we present a number of ablation\nstudies on the effectiveness of synthetic vs. real training data for ASR. In\nparticular we focus on how the gap between training on synthetic and real data\nchanges by varying the speaker embedding or by scaling the model size. For the\nlatter we show that the TTS models generalize well, even when training scores\nindicate overfitting.\n","authors":["Nick Rossenbach","Benedikt Hilmes","Ralf Schlüter"],"pdf_url":"https://arxiv.org/pdf/2407.17997v1.pdf","comment":"Accepted at the SynData4GenAI 2024 workshop"},{"id":"http://arxiv.org/abs/2407.17992v1","updated":"2024-07-25T12:38:08Z","published":"2024-07-25T12:38:08Z","title":"Amortized Active Learning for Nonparametric Functions","summary":" Active learning (AL) is a sequential learning scheme aiming to select the\nmost informative data. AL reduces data consumption and avoids the cost of\nlabeling large amounts of data. However, AL trains the model and solves an\nacquisition optimization for each selection. It becomes expensive when the\nmodel training or acquisition optimization is challenging. In this paper, we\nfocus on active nonparametric function learning, where the gold standard\nGaussian process (GP) approaches suffer from cubic time complexity. We propose\nan amortized AL method, where new data are suggested by a neural network which\nis trained up-front without any real data (Figure 1). Our method avoids\nrepeated model training and requires no acquisition optimization during the AL\ndeployment. 
We (i) utilize GPs as function priors to construct an AL simulator,\n(ii) train an AL policy that can zero-shot generalize from simulation to real\nlearning problems of nonparametric functions and (iii) achieve real-time data\nselection and comparable learning performances to time-consuming baseline\nmethods.\n","authors":["Cen-You Li","Marc Toussaint","Barbara Rakitsch","Christoph Zimmer"],"pdf_url":"https://arxiv.org/pdf/2407.17992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08210v2","updated":"2024-07-25T12:23:26Z","published":"2024-06-12T13:41:07Z","title":"Expressivity and Generalization: Fragment-Biases for Molecular GNNs","summary":" Although recent advances in higher-order Graph Neural Networks (GNNs) improve\nthe theoretical expressiveness and molecular property predictive performance,\nthey often fall short of the empirical performance of models that explicitly\nuse fragment information as inductive bias. However, for these approaches,\nthere exists no theoretic expressivity study. In this work, we propose the\nFragment-WL test, an extension to the well-known Weisfeiler & Leman (WL) test,\nwhich enables the theoretic analysis of these fragment-biased GNNs. Building on\nthe insights gained from the Fragment-WL test, we develop a new GNN\narchitecture and a fragmentation with infinite vocabulary that significantly\nboosts expressiveness. We show the effectiveness of our model on synthetic and\nreal-world data where we outperform all GNNs on Peptides and have 12% lower\nerror than all GNNs on ZINC and 34% lower error than other fragment-biased\nmodels. Furthermore, we show that our model exhibits superior generalization\ncapabilities compared to the latest transformer-based architectures,\npositioning it as a robust solution for a range of molecular modeling tasks.\n","authors":["Tom Wollschläger","Niklas Kemper","Leon Hetzel","Johanna Sommer","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2406.08210v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17436v3","updated":"2024-07-25T11:51:04Z","published":"2024-03-26T07:05:06Z","title":"Particle identification with machine learning from incomplete data in\n the ALICE experiment","summary":" The ALICE experiment at the LHC measures properties of the strongly\ninteracting matter formed in ultrarelativistic heavy-ion collisions. Such\nstudies require accurate particle identification (PID). ALICE provides PID\ninformation via several detectors for particles with momentum from about 100\nMeV/c up to 20 GeV/c. Traditionally, particles are selected with rectangular\ncuts. A much better performance can be achieved with machine learning (ML)\nmethods. Our solution uses multiple neural networks (NN) serving as binary\nclassifiers. Moreover, we extended our particle classifier with Feature Set\nEmbedding and attention in order to train on data with incomplete samples. 
We\nalso present the integration of the ML project with the ALICE analysis\nsoftware, and we discuss domain adaptation, the ML technique needed to transfer\nthe knowledge between simulated and real experimental data.\n","authors":["Maja Karwowska","Łukasz Graczykowski","Kamil Deja","Miłosz Kasak","Małgorzata Janik"],"pdf_url":"https://arxiv.org/pdf/2403.17436v3.pdf","comment":"Proceedings of 3rd Artificial Intelligence for the Electron-Ion\n Collider workshop -- AI4EIC2023, 28.11-1.12.2023"},{"id":"http://arxiv.org/abs/2308.12112v4","updated":"2024-07-25T11:49:54Z","published":"2023-08-23T13:02:52Z","title":"Category Adaptation Meets Projected Distillation in Generalized\n Continual Category Discovery","summary":" Generalized Continual Category Discovery (GCCD) tackles learning from\nsequentially arriving, partially labeled datasets while uncovering new\ncategories. Traditional methods depend on feature distillation to prevent\nforgetting the old knowledge. However, this strategy restricts the model's\nability to adapt and effectively distinguish new categories. To address this,\nwe introduce a novel technique integrating a learnable projector with feature\ndistillation, thus enhancing model adaptability without sacrificing past\nknowledge. The resulting distribution shift of the previously learned\ncategories is mitigated with the auxiliary category adaptation network. We\ndemonstrate that while each component offers modest benefits individually,\ntheir combination - dubbed CAMP (Category Adaptation Meets Projected\ndistillation) - significantly improves the balance between learning new\ninformation and retaining old. CAMP exhibits superior performance across\nseveral GCCD and Class Incremental Learning scenarios. The code is available at\nhttps://github.com/grypesc/CAMP.\n","authors":["Grzegorz Rypeść","Daniel Marczak","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2308.12112v4.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2404.00725v2","updated":"2024-07-25T11:37:54Z","published":"2024-03-31T15:55:49Z","title":"The Larger the Better? Improved LLM Code-Generation via Budget\n Reallocation","summary":" It is a common belief that large language models (LLMs) are better than\nsmaller-sized ones. However, larger models also require significantly more time\nand compute during inference. This begs the question: what happens when both\nmodels operate under the same budget? (e.g., compute, run-time). To address\nthis question, we analyze code generation LLMs of various sizes and make\ncomparisons such as running a 70B model once vs. generating five outputs from a\n13B model. We consider a standard unit-test setup, which can be used to select\nthe correct output from the smaller model. Our findings reveal that the\nrepeated use of smaller models can yield consistent improvements, with gains of\nup to 15% across five tasks. On the other hand, in scenarios where unit-tests\nare unavailable, a ranking-based selection of candidates from the smaller model\nfalls short of the performance of a single output from larger ones. 
Our results\nhighlight the potential of using smaller models instead of larger ones, and the\nimportance of studying approaches for ranking LLM outputs.\n","authors":["Michael Hassid","Tal Remez","Jonas Gehring","Roy Schwartz","Yossi Adi"],"pdf_url":"https://arxiv.org/pdf/2404.00725v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2407.17963v1","updated":"2024-07-25T11:35:22Z","published":"2024-07-25T11:35:22Z","title":"Relating the Seemingly Unrelated: Principled Understanding of\n Generalization for Generative Models in Arithmetic Reasoning Tasks","summary":" Large language models (LLMs) have demonstrated impressive versatility across\nnumerous tasks, yet their generalization capabilities remain poorly understood.\nTo investigate these behaviors, arithmetic tasks serve as important venues. In\nprevious studies, seemingly unrelated mysteries still exist -- (1) models with\nappropriate positional embeddings can correctly perform longer unseen\narithmetic operations such as addition, but their effectiveness varies in more\ncomplex tasks like multiplication; (2) models perform well for longer unseen\ncases in modular addition under specific moduli (e.g., modulo 100) but struggle\nunder very close moduli (e.g., modulo 101), regardless of the positional\nencoding used. We believe previous studies have been treating the symptoms\nrather than addressing the root cause -- they have paid excessive attention to\nimproving model components, while overlooking the differences in task\nproperties that may be the real drivers. This is confirmed by our unified\ntheoretical framework for different arithmetic scenarios. For example, unlike\nmultiplication, the digital addition task has the property of translation\ninvariance which naturally aligns with the relative positional encoding, and\nthis combination leads to successful generalization of addition to unseen\nlonger domains. The discrepancy in operations modulo 100 and 101 arises from\nthe base. Modulo 100, unlike 101, is compatible with the decimal system (base\n10), such that unseen information in digits beyond the units digit and the tens\ndigit is actually not needed for the task. Extensive experiments with GPT-like\nmodels validate our theoretical predictions. These findings deepen our\nunderstanding of the generalization mechanisms, and facilitate more\ndata-efficient model training and objective-oriented AI alignment.\n","authors":["Xingcheng Xu","Zibo Zhao","Haipeng Zhang","Yanqing Yang"],"pdf_url":"https://arxiv.org/pdf/2407.17963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17957v1","updated":"2024-07-25T11:24:44Z","published":"2024-07-25T11:24:44Z","title":"Neural Networks for Generating Better Local Optima in Topology\n Optimization","summary":" Neural networks have recently been employed as material discretizations\nwithin adjoint optimization frameworks for inverse problems and topology\noptimization. While advantageous regularization effects and better optima have\nbeen found for some inverse problems, the benefit for topology optimization has\nbeen limited -- where the focus of investigations has been the compliance\nproblem. We demonstrate how neural network material discretizations can, under\ncertain conditions, find better local optima in more challenging optimization\nproblems, where we here specifically consider acoustic topology optimization.\nThe chances of identifying a better optimum can significantly be improved by\nrunning multiple partial optimizations with different neural network\ninitializations. 
Furthermore, we show that the neural network material\ndiscretization's advantage comes from the interplay with the Adam optimizer and\nemphasize its current limitations when competing with constrained and\nhigher-order optimization techniques. At the moment, this discretization has\nonly been shown to be beneficial for unconstrained first-order optimization.\n","authors":["Leon Herrmann","Ole Sigmund","Viola Muning Li","Christian Vogl","Stefan Kollmannsberger"],"pdf_url":"https://arxiv.org/pdf/2407.17957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16639v2","updated":"2024-07-25T11:21:50Z","published":"2024-05-26T17:30:44Z","title":"A unified law of robustness for Bregman divergence losses","summary":" In contemporary deep learning practice, models are often trained to near zero\nloss i.e. to nearly interpolate the training data. However, the number of\nparameters in the model is usually far more than the number of data points $n$,\nthe theoretical minimum needed for interpolation: a phenomenon referred to as\noverparameterization. In an interesting piece of work that contributes to the\nconsiderable research that has been devoted to understanding overparameterization,\nBubeck and Sellke showed that for a broad class of covariate distributions\n(specifically those satisfying a natural notion of concentration of measure),\noverparameterization is necessary for robust interpolation i.e. if the\ninterpolating function is required to be Lipschitz. However, their robustness\nresults were proved only in the setting of regression with square loss. In\npractice, however, many other kinds of losses are used, e.g. cross entropy loss\nfor classification. In this work, we generalize Bubeck and Sellke's result to\nBregman divergence losses, which form a common generalization of square loss\nand cross-entropy loss. Our generalization relies on identifying a\nbias-variance type decomposition that lies at the heart of the proof of Bubeck\nand Sellke.\n","authors":["Santanu Das","Jatin Batra","Piyush Srivastava"],"pdf_url":"https://arxiv.org/pdf/2405.16639v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2407.17954v1","updated":"2024-07-25T11:19:55Z","published":"2024-07-25T11:19:55Z","title":"Scaling Training Data with Lossy Image Compression","summary":" Empirically-determined scaling laws have been broadly successful in\npredicting the evolution of large machine learning models with training data\nand number of parameters. As a consequence, they have been useful for\noptimizing the allocation of limited resources, most notably compute time.\n In certain applications, storage space is an important constraint, and data\nformat needs to be chosen carefully as a consequence. Computer vision is a\nprominent example: images are inherently analog, but are always stored in a\ndigital format using a finite number of bits. Given a dataset of digital\nimages, the number of bits $L$ to store each of them can be further reduced\nusing lossy data compression. This, however, can degrade the quality of the\nmodel trained on such images, since each example has lower resolution.\n In order to capture this trade-off and optimize storage of training data, we\npropose a `storage scaling law' that describes the joint evolution of test\nerror with sample size and number of bits per image. We prove that this law\nholds within a stylized model for image compression, and verify it empirically\non two computer vision tasks, extracting the relevant parameters. 
We then show\nthat this law can be used to optimize the lossy compression level. At given\nstorage, models trained on optimally compressed images present a significantly\nsmaller test error with respect to models trained on the original data.\nFinally, we investigate the potential benefits of randomizing the compression\nlevel.\n","authors":["Katherine L. Mentzer","Andrea Montanari"],"pdf_url":"https://arxiv.org/pdf/2407.17954v1.pdf","comment":"21 pages, 27 figures"},{"id":"http://arxiv.org/abs/2407.17950v1","updated":"2024-07-25T11:11:05Z","published":"2024-07-25T11:11:05Z","title":"Real Time American Sign Language Detection Using Yolo-v9","summary":" This paper focuses on real-time American Sign Language Detection. YOLO is a\nconvolutional neural network (CNN) based model, which was first released in\n2015. In recent years, it gained popularity for its real-time detection\ncapabilities. Our study specifically targets the YOLO-v9 model, released in 2024.\nAs the model is newly introduced, not much work has been done on it, especially\nnot in Sign Language Detection. Our paper provides deep insight into how YOLO-v9\nworks and how it performs better than previous models.\n","authors":["Amna Imran","Meghana Shashishekhara Hulikal","Hamza A. A. Gardi"],"pdf_url":"https://arxiv.org/pdf/2407.17950v1.pdf","comment":"11 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2407.17949v1","updated":"2024-07-25T11:08:53Z","published":"2024-07-25T11:08:53Z","title":"Fast convergence of the Expectation Maximization algorithm under a\n logarithmic Sobolev inequality","summary":" By utilizing recently developed tools for constructing gradient flows on\nWasserstein spaces, we extend an analysis technique commonly employed to\nunderstand alternating minimization algorithms on Euclidean space to the\nExpectation Maximization (EM) algorithm via its representation as\ncoordinate-wise minimization on the product of a Euclidean space and a space of\nprobability distributions due to Neal and Hinton (1998). In so doing we obtain\nfinite sample error bounds and exponential convergence of the EM algorithm\nunder a natural generalisation of a log-Sobolev inequality. We further\ndemonstrate that the analysis technique is sufficiently flexible to allow also\nthe analysis of several variants of the EM algorithm.\n","authors":["Rocco Caprio","Adam M Johansen"],"pdf_url":"https://arxiv.org/pdf/2407.17949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08061v2","updated":"2024-07-25T10:52:26Z","published":"2024-04-11T18:03:59Z","title":"Physics-Enhanced Graph Neural Networks For Soft Sensing in Industrial\n Internet of Things","summary":" The Industrial Internet of Things (IIoT) is reshaping manufacturing,\nindustrial processes, and infrastructure management. By fostering new levels of\nautomation, efficiency, and predictive maintenance, IIoT is transforming\ntraditional industries into intelligent, seamlessly interconnected ecosystems.\nHowever, achieving highly reliable IIoT can be hindered by factors such as the\ncost of installing large numbers of sensors, limitations in retrofitting\nexisting systems with sensors, or harsh environmental conditions that may make\nsensor installation impractical. Soft (virtual) sensing leverages mathematical\nmodels to estimate variables from physical sensor data, offering a solution to\nthese challenges. Data-driven and physics-based modeling are the two main\nmethodologies widely used for soft sensing. 
The choice between these strategies\ndepends on the complexity of the underlying system, with the data-driven\napproach often being preferred when the physics-based inference models are\nintricate and present challenges for state estimation. However, conventional\ndeep learning models are typically hindered by their inability to explicitly\nrepresent the complex interactions among various sensors. To address this\nlimitation, we adopt Graph Neural Networks (GNNs), renowned for their ability\nto effectively capture the complex relationships between sensor measurements.\nIn this research, we propose physics-enhanced GNNs, which integrate principles\nof physics into graph-based methodologies. This is achieved by augmenting\nadditional nodes in the input graph derived from the underlying characteristics\nof the physical processes. Our evaluation of the proposed methodology on the\ncase study of district heating networks reveals significant improvements over\npurely data-driven GNNs, even in the presence of noise and parameter\ninaccuracies.\n","authors":["Keivan Faghih Niresi","Hugo Bissig","Henri Baumann","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2404.08061v2.pdf","comment":"14 pages, 10 figures. Accepted to IEEE Internet of Things Journal"},{"id":"http://arxiv.org/abs/2407.17930v1","updated":"2024-07-25T10:39:50Z","published":"2024-07-25T10:39:50Z","title":"Comparison of different Artificial Neural Networks for Bitcoin price\n forecasting","summary":" This study investigates the impact of varying sequence lengths on the\naccuracy of predicting cryptocurrency returns using Artificial Neural Networks\n(ANNs). Utilizing the Mean Absolute Error (MAE) as a threshold criterion, we\naim to enhance prediction accuracy by excluding returns that are smaller than\nthis threshold, thus mitigating errors associated with minor returns. The\nsubsequent evaluation focuses on the accuracy of predicted returns that exceed\nthis threshold. We compare four sequence lengths 168 hours (7 days), 72 hours\n(3 days), 24 hours, and 12 hours each with a return prediction interval of 2\nhours. Our findings reveal the influence of sequence length on prediction\naccuracy and underscore the potential for optimized sequence configurations in\nfinancial forecasting models.\n","authors":["Silas Baumann","Karl A. Busch","Hamza A. A. Gardi"],"pdf_url":"https://arxiv.org/pdf/2407.17930v1.pdf","comment":"9 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.15900v2","updated":"2024-07-25T10:39:15Z","published":"2024-07-22T11:07:52Z","title":"Improving probabilistic forecasts of extreme wind speeds by training\n statistical post-processing models with weighted scoring rules","summary":" Accurate forecasts of extreme wind speeds are of high importance for many\napplications. Such forecasts are usually generated by ensembles of numerical\nweather prediction (NWP) models, which however can be biased and have errors in\ndispersion, thus necessitating the application of statistical post-processing\ntechniques. In this work we aim to improve statistical post-processing models\nfor probabilistic predictions of extreme wind speeds. We do this by adjusting\nthe training procedure used to fit ensemble model output statistics (EMOS)\nmodels - a commonly applied post-processing technique - and propose estimating\nparameters using the so-called threshold-weighted continuous ranked probability\nscore (twCRPS), a proper scoring rule that places special emphasis on\npredictions over a threshold. 
We show that training using the twCRPS leads to\nimproved extreme event performance of post-processing models for a variety of\nthresholds. We find a distribution body-tail trade-off where improved\nperformance for probabilistic predictions of extreme events comes with worse\nperformance for predictions of the distribution body. However, we introduce\nstrategies to mitigate this trade-off based on weighted training and linear\npooling. Finally, we consider some synthetic experiments to explain the\ntraining impact of the twCRPS and derive closed-form expressions of the twCRPS\nfor a number of distributions, giving the first such collection in the\nliterature. The results will enable researchers and practitioners alike to\nimprove the performance of probabilistic forecasting models for extremes and\nother events of interest.\n","authors":["Jakob Benjamin Wessel","Christopher A. T. Ferro","Gavin R. Evans","Frank Kwasniok"],"pdf_url":"https://arxiv.org/pdf/2407.15900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17929v1","updated":"2024-07-25T10:38:32Z","published":"2024-07-25T10:38:32Z","title":"Guided Latent Slot Diffusion for Object-Centric Learning","summary":" Slot attention aims to decompose an input image into a set of meaningful\nobject files (slots). These latent object representations enable various\ndownstream tasks. Yet, these slots often bind to object parts, not objects\nthemselves, especially for real-world datasets. To address this, we introduce\nGuided Latent Slot Diffusion - GLASS, an object-centric model that uses\ngenerated captions as a guiding signal to better align slots with objects. Our\nkey insight is to learn the slot-attention module in the space of generated\nimages. This allows us to repurpose the pre-trained diffusion decoder model,\nwhich reconstructs the images from the slots, as a semantic mask generator\nbased on the generated captions. GLASS learns an object-level representation\nsuitable for multiple tasks simultaneously, e.g., segmentation, image\ngeneration, and property prediction, outperforming previous methods. For object\ndiscovery, GLASS achieves approx. a +35% and +10% relative improvement for mIoU\nover the previous state-of-the-art (SOTA) method on the VOC and COCO datasets,\nrespectively, and establishes a new SOTA FID score for conditional image\ngeneration amongst slot-attention-based methods. For the segmentation task,\nGLASS surpasses SOTA weakly-supervised and language-based segmentation models,\nwhich were specifically designed for the task.\n","authors":["Krishnakant Singh","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2407.17929v1.pdf","comment":"Project Page: https://guided-sa.github.io"},{"id":"http://arxiv.org/abs/2401.13429v3","updated":"2024-07-25T10:15:51Z","published":"2024-01-24T12:58:08Z","title":"Detection of Correlated Random Vectors","summary":" In this paper, we investigate the problem of deciding whether two standard\nnormal random vectors $\\mathsf{X}\\in\\mathbb{R}^{n}$ and\n$\\mathsf{Y}\\in\\mathbb{R}^{n}$ are correlated or not. This is formulated as a\nhypothesis testing problem, where under the null hypothesis, these vectors are\nstatistically independent, while under the alternative, $\\mathsf{X}$ and a\nrandomly and uniformly permuted version of $\\mathsf{Y}$, are correlated with\ncorrelation $\\rho$. We analyze the thresholds at which optimal testing is\ninformation-theoretically impossible and possible, as a function of $n$ and\n$\\rho$. 
To derive our information-theoretic lower bounds, we develop a novel\ntechnique for evaluating the second moment of the likelihood ratio using an\northogonal polynomials expansion, which, among other things, reveals a\nsurprising connection to integer partition functions. We also study a\nmulti-dimensional generalization of the above setting, where rather than two\nvectors we observe two databases/matrices, and furthermore allow for partial\ncorrelations between these two.\n","authors":["Dor Elimelech","Wasim Huleihel"],"pdf_url":"https://arxiv.org/pdf/2401.13429v3.pdf","comment":"42 pages"},{"id":"http://arxiv.org/abs/2212.03117v2","updated":"2024-07-25T10:11:29Z","published":"2022-12-06T16:29:47Z","title":"Q-Pensieve: Boosting Sample Efficiency of Multi-Objective RL Through\n Memory Sharing of Q-Snapshots","summary":" Many real-world continuous control problems involve the dilemma of weighing\nthe pros and cons; multi-objective reinforcement learning (MORL) serves as a\ngeneric framework of learning control policies for different preferences over\nobjectives. However, the existing MORL methods either rely on multiple passes\nof explicit search for finding the Pareto front and therefore are not\nsample-efficient, or utilize a shared policy network for coarse knowledge\nsharing among policies. To boost the sample efficiency of MORL, we propose\nQ-Pensieve, a policy improvement scheme that stores a collection of Q-snapshots\nto jointly determine the policy update direction and thereby enables data\nsharing at the policy level. We show that Q-Pensieve can be naturally\nintegrated with soft policy iteration with a convergence guarantee. To\nsubstantiate this concept, we propose the technique of Q replay buffer, which\nstores the learned Q-networks from the past iterations, and arrive at a\npractical actor-critic implementation. Through extensive experiments and an\nablation study, we demonstrate that with much fewer samples, the proposed\nalgorithm can outperform the benchmark MORL methods on a variety of MORL\nbenchmark tasks.\n","authors":["Wei Hung","Bo-Kai Huang","Ping-Chun Hsieh","Xi Liu"],"pdf_url":"https://arxiv.org/pdf/2212.03117v2.pdf","comment":"20 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.17910v1","updated":"2024-07-25T10:02:11Z","published":"2024-07-25T10:02:11Z","title":"Causal Deepsets for Off-policy Evaluation under Spatial or\n Spatio-temporal Interferences","summary":" Off-policy evaluation (OPE) is widely applied in sectors such as\npharmaceuticals and e-commerce to evaluate the efficacy of novel products or\npolicies from offline datasets. This paper introduces a causal deepset\nframework that relaxes several key structural assumptions, primarily the\nmean-field assumption, prevalent in existing OPE methodologies that handle\nspatio-temporal interference. These traditional assumptions frequently prove\ninadequate in real-world settings, thereby restricting the capability of\ncurrent OPE methods to effectively address complex interference effects. In\nresponse, we advocate for the implementation of the permutation invariance (PI)\nassumption. This innovative approach enables the data-driven, adaptive learning\nof the mean-field function, offering a more flexible estimation method beyond\nconventional averaging. Furthermore, we present novel algorithms that\nincorporate the PI assumption into OPE and thoroughly examine their theoretical\nfoundations. 
Our numerical analyses demonstrate that this novel approach yields\nsignificantly more precise estimations than existing baseline algorithms,\nthereby substantially improving the practical applicability and effectiveness\nof OPE methodologies. A Python implementation of our proposed method is\navailable at https://github.com/BIG-S2/Causal-Deepsets.\n","authors":["Runpeng Dai","Jianing Wang","Fan Zhou","Shikai Luo","Zhiwei Qin","Chengchun Shi","Hongtu Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.17910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17909v1","updated":"2024-07-25T10:00:21Z","published":"2024-07-25T10:00:21Z","title":"Separating Novel Features for Logical Anomaly Detection: A\n Straightforward yet Effective Approach","summary":" Vision-based inspection algorithms have significantly contributed to quality\ncontrol in industrial settings, particularly in addressing structural defects\nlike dent and contamination which are prevalent in mass production. Extensive\nresearch efforts have led to the development of related benchmarks such as\nMVTec AD (Bergmann et al., 2019). However, in industrial settings, there can be\ninstances of logical defects, where acceptable items are found in unsuitable\nlocations or product pairs do not match as expected. Recent methods tackling\nlogical defects effectively employ knowledge distillation to generate\ndifference maps. Knowledge distillation (KD) is used to learn normal data\ndistribution in unsupervised manner. Despite their effectiveness, these methods\noften overlook the potential false negatives. Excessive similarity between the\nteacher network and student network can hinder the generation of a suitable\ndifference map for logical anomaly detection. This technical report provides\ninsights on handling potential false negatives by utilizing a simple constraint\nin KD-based logical anomaly detection methods. We select EfficientAD as a\nstate-of-the-art baseline and apply a margin-based constraint to its\nunsupervised learning scheme. Applying this constraint, we can improve the\nAUROC for MVTec LOCO AD by 1.3 %.\n","authors":["Kangil Lee","Geonuk Kim"],"pdf_url":"https://arxiv.org/pdf/2407.17909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17907v1","updated":"2024-07-25T09:53:12Z","published":"2024-07-25T09:53:12Z","title":"Amortized Posterior Sampling with Diffusion Prior Distillation","summary":" We propose a variational inference approach to sample from the posterior\ndistribution for solving inverse problems. From a pre-trained diffusion model,\nour approach trains a conditional flow model to minimize the divergence between\nthe proposal variational distribution and the posterior distribution implicitly\ndefined through the diffusion model. Once trained, the flow model is capable of\nsampling from the posterior distribution with a single NFE, amortized with\nrespect to the measurement. The proposed method paves a new path for distilling\na diffusion prior for efficient posterior sampling. 
We show that our method is\napplicable to standard signals in Euclidean space, as well as signals on\nmanifold.\n","authors":["Abbas Mammadov","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2407.17907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17900v1","updated":"2024-07-25T09:42:24Z","published":"2024-07-25T09:42:24Z","title":"The Power of Combining Data and Knowledge: GPT-4o is an Effective\n Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of\n Lung Cancer","summary":" Lymph node metastasis (LNM) is a crucial factor in determining the initial\ntreatment for patients with lung cancer, yet accurate preoperative diagnosis of\nLNM remains challenging. Recently, large language models (LLMs) have garnered\nsignificant attention due to their remarkable text generation capabilities.\nLeveraging the extensive medical knowledge learned from vast corpora, LLMs can\nestimate probabilities for clinical problems, though their performance has\nhistorically been inferior to data-driven machine learning models. In this\npaper, we propose a novel ensemble method that combines the medical knowledge\nacquired by LLMs with the latent patterns identified by machine learning models\nto enhance LNM prediction performance. Initially, we developed machine learning\nmodels using patient data. We then designed a prompt template to integrate the\npatient data with the predicted probability from the machine learning model.\nSubsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,\nto estimate the likelihood of LNM based on patient data and then adjust the\nestimate using the machine learning output. Finally, we collected three outputs\nfrom the GPT-4o using the same prompt and ensembled these results as the final\nprediction. Using the proposed method, our models achieved an AUC value of\n0.765 and an AP value of 0.415 for LNM prediction, significantly improving\npredictive performance compared to baseline machine learning models. The\nexperimental results indicate that GPT-4o can effectively leverage its medical\nknowledge and the probabilities predicted by machine learning models to achieve\nmore accurate LNM predictions. These findings demonstrate that LLMs can perform\nwell in clinical risk prediction tasks, offering a new paradigm for integrating\nmedical knowledge and patient data in clinical predictions.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07987v5","updated":"2024-07-25T09:33:50Z","published":"2024-05-13T17:58:30Z","title":"The Platonic Representation Hypothesis","summary":" We argue that representations in AI models, particularly deep networks, are\nconverging. First, we survey many examples of convergence in the literature:\nover time and across multiple domains, the ways by which different neural\nnetworks represent data are becoming more aligned. Next, we demonstrate\nconvergence across data modalities: as vision models and language models get\nlarger, they measure distance between datapoints in a more and more alike way.\nWe hypothesize that this convergence is driving toward a shared statistical\nmodel of reality, akin to Plato's concept of an ideal reality. We term such a\nrepresentation the platonic representation and discuss several possible\nselective pressures toward it. 
Finally, we discuss the implications of these\ntrends, their limitations, and counterexamples to our analysis.\n","authors":["Minyoung Huh","Brian Cheung","Tongzhou Wang","Phillip Isola"],"pdf_url":"https://arxiv.org/pdf/2405.07987v5.pdf","comment":"Equal contributions. Project: https://phillipi.github.io/prh/ Code:\n https://github.com/minyoungg/platonic-rep"},{"id":"http://arxiv.org/abs/2407.17892v1","updated":"2024-07-25T09:26:07Z","published":"2024-07-25T09:26:07Z","title":"An Iterative Approach to Topic Modelling","summary":" Topic modelling has become increasingly popular for summarizing text data,\nsuch as social media posts and articles. However, topic modelling is usually\ncompleted in one shot. Assessing the quality of resulting topics is\nchallenging. No effective methods or measures have been developed for assessing\nthe results or for making further enhancements to the topics. In this research,\nwe propose to use an iterative process to perform topic modelling\nthat gives rise to a sense of completeness of the resulting topics when the\nprocess is complete. Using the BERTopic package, a popular method in topic\nmodelling, we demonstrate how the modelling process can be applied iteratively\nto arrive at a set of topics that could not be further improved upon using one\nof the three selected measures for clustering comparison as the decision\ncriteria. This demonstration is conducted using a subset of the COVIDSenti-A\ndataset. The early success leads us to believe that further research using\nthis approach in conjunction with other topic modelling algorithms could\nbe viable.\n","authors":["Albert Wong","Florence Wing Yau Cheng","Ashley Keung","Yamileth Hercules","Mary Alexandra Garcia","Yew-Wei Lim","Lien Pham"],"pdf_url":"https://arxiv.org/pdf/2407.17892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02737v2","updated":"2024-07-25T09:18:24Z","published":"2024-03-05T07:45:29Z","title":"Neural Fractional Differential Equations","summary":" Fractional Differential Equations (FDEs) are essential tools for modelling\ncomplex systems in science and engineering. They extend the traditional\nconcepts of differentiation and integration to non-integer orders, enabling a\nmore precise representation of processes characterised by non-local and\nmemory-dependent behaviours.\n This property is useful in systems where variables do not respond to changes\ninstantaneously, but instead exhibit a strong memory of past interactions.\n Having this in mind, and drawing inspiration from Neural Ordinary\nDifferential Equations (Neural ODEs), we propose the Neural FDE, a novel deep\nneural network architecture that adjusts an FDE to the dynamics of data.\n This work provides a comprehensive overview of the numerical method employed\nin Neural FDEs and the Neural FDE architecture. The numerical outcomes suggest\nthat, despite being more computationally demanding, the Neural FDE may\noutperform the Neural ODE in modelling systems with memory or dependencies on\npast states, and it can effectively be applied to learn more intricate\ndynamical systems.\n","authors":["C. Coelho","M. Fernanda P. Costa","L. L. 
Ferrás"],"pdf_url":"https://arxiv.org/pdf/2403.02737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02750v2","updated":"2024-07-25T09:16:05Z","published":"2024-02-05T06:06:47Z","title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache","summary":" Efficiently serving large language models (LLMs) requires batching of many\nrequests to reduce the cost per request. Yet, with larger batch sizes and\nlonger context lengths, the key-value (KV) cache, which stores attention keys\nand values to avoid re-computations, significantly increases memory demands and\nbecomes the new bottleneck in speed and memory usage. Additionally, the loading\nof the KV cache causes the computational core to be idle, which limits the\ninference speed. A straightforward and effective solution to reduce KV cache\nsize is quantization, which decreases the total bytes taken by KV cache.\nHowever, there is a lack of in-depth studies that explore the element\ndistribution of KV cache to understand the hardness and limitation of KV cache\nquantization. To fill the gap, we conducted a comprehensive study on the\nelement distribution in KV cache of popular LLMs. Our findings indicate that\nthe key cache should be quantized per-channel, i.e., group elements along the\nchannel dimension and quantize them together. In contrast, the value cache\nshould be quantized per-token. From this analysis, we developed a tuning-free\n2bit KV cache quantization algorithm named KIVI. With hardware-friendly\nimplementation, KIVI can enable Llama, Falcon, and Mistral models to maintain\nalmost the same quality while using $\\mathbf{2.6\\times}$ less peak memory\n(including model weight). This reduction in memory usage enables up to\n$\\mathbf{4\\times}$ larger batch size, bringing $\\mathbf{2.35\\times \\sim\n3.47\\times}$ throughput on real LLM inference workload. The source code is\navailable at https://github.com/jy-yuan/KIVI.\n","authors":["Zirui Liu","Jiayi Yuan","Hongye Jin","Shaochen Zhong","Zhaozhuo Xu","Vladimir Braverman","Beidi Chen","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2402.02750v2.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2405.09597v2","updated":"2024-07-25T08:59:36Z","published":"2024-05-15T13:50:23Z","title":"When AI Eats Itself: On the Caveats of Data Pollution in the Era of\n Generative AI","summary":" Generative artificial intelligence (AI) technologies and large models are\nproducing realistic outputs across various domains, such as images, text,\nspeech, and music. Creating these advanced generative models requires\nsignificant resources, particularly large and high-quality datasets. To\nminimize training expenses, many algorithm developers use data created by the\nmodels themselves as a cost-effective training solution. However, not all\nsynthetic data effectively improve model performance, necessitating a strategic\nbalance in the use of real versus synthetic data to optimize outcomes.\n Currently, the previously well-controlled integration of real and synthetic\ndata is becoming uncontrollable. The widespread and unregulated dissemination\nof synthetic data online leads to the contamination of datasets traditionally\ncompiled through web scraping, now mixed with unlabeled synthetic data. This\ntrend portends a future where generative AI systems may increasingly rely\nblindly on consuming self-generated data, raising concerns about model\nperformance and ethical issues. What will happen if generative AI continuously\nconsumes itself without discernment? 
What measures can we take to mitigate the\npotential adverse effects?\n There is a significant gap in the scientific literature regarding the impact\nof synthetic data use in generative AI, particularly in terms of the fusion of\nmultimodal information. To address this research gap, this review investigates\nthe consequences of integrating synthetic data blindly on training generative\nAI on both image and text modalities and explores strategies to mitigate these\neffects. The goal is to offer a comprehensive view of synthetic data's role,\nadvocating for a balanced approach to its use and exploring practices that\npromote the sustainable development of generative AI technologies in the era of\nlarge models.\n","authors":["Xiaodan Xing","Fadong Shi","Jiahao Huang","Yinzhe Wu","Yang Nan","Sheng Zhang","Yingying Fang","Mike Roberts","Carola-Bibiane Schönlieb","Javier Del Ser","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2405.09597v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17880v1","updated":"2024-07-25T08:48:07Z","published":"2024-07-25T08:48:07Z","title":"DAM: Towards A Foundation Model for Time Series Forecasting","summary":" It is challenging to scale time series forecasting models such that they\nforecast accurately for multiple distinct domains and datasets, all with\npotentially different underlying collection procedures (e.g., sample\nresolution), patterns (e.g., periodicity), and prediction requirements (e.g.,\nreconstruction vs. forecasting). We call this general task universal\nforecasting. Existing methods usually assume that input data is regularly\nsampled, and they forecast to pre-determined horizons, resulting in failure to\ngeneralise outside of the scope of their training. We propose the DAM - a\nneural model that takes randomly sampled histories and outputs an adjustable\nbasis composition as a continuous function of time for forecasting to non-fixed\nhorizons. It involves three key components: (1) a flexible approach for using\nrandomly sampled histories from a long-tail distribution, that enables an\nefficient global perspective of the underlying temporal dynamics while\nretaining focus on the recent history; (2) a transformer backbone that is\ntrained on these actively sampled histories to produce, as representational\noutput, (3) the basis coefficients of a continuous function of time. We show\nthat a single univariate DAM, trained on 25 time series datasets, either\noutperformed or closely matched existing SoTA models at multivariate long-term\nforecasting across 18 datasets, including 8 held-out for zero-shot transfer,\neven though these models were trained to specialise for each dataset-horizon\ncombination. 
This single DAM excels at zero-shot transfer and very-long-term\nforecasting, performs well at imputation, is interpretable via basis function\ncomposition and attention, can be tuned for different inference-cost\nrequirements, and is robust to missing and irregularly sampled data by design.\n","authors":["Luke Darlow","Qiwen Deng","Ahmed Hassan","Martin Asenov","Rajkarn Singh","Artjom Joosen","Adam Barker","Amos Storkey"],"pdf_url":"https://arxiv.org/pdf/2407.17880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17876v1","updated":"2024-07-25T08:46:49Z","published":"2024-07-25T08:46:49Z","title":"A Large-Scale Sensitivity Analysis on Latent Embeddings and\n Dimensionality Reductions for Text Spatializations","summary":" The semantic similarity between documents of a text corpus can be visualized\nusing map-like metaphors based on two-dimensional scatterplot layouts. These\nlayouts result from a dimensionality reduction on the document-term matrix or a\nrepresentation within a latent embedding, including topic models. Thereby, the\nresulting layout depends on the input data and hyperparameters of the\ndimensionality reduction and is therefore affected by changes in them.\nHowever, such changes to the\nlayout require additional cognitive efforts from the user. In this work, we\npresent a sensitivity study that analyzes the stability of these layouts\nconcerning (1) changes in the text corpora, (2) changes in the hyperparameters,\nand (3) randomness in the initialization. Our approach has two stages: data\nmeasurement and data analysis. First, we derived layouts for the combination of\nthree text corpora and six text embeddings and a grid-search-inspired\nhyperparameter selection of the dimensionality reductions. Afterward, we\nquantified the similarity of the layouts through ten metrics, concerning local\nand global structures and class separation. Second, we analyzed the resulting\n42817 tabular data points in a descriptive statistical analysis. From this, we\nderived guidelines for informed decisions on the layout algorithm and highlight\nspecific hyperparameter settings. We provide our implementation as a Git\nrepository at\nhttps://github.com/hpicgs/Topic-Models-and-Dimensionality-Reduction-Sensitivity-Study\nand results as a Zenodo archive at https://doi.org/10.5281/zenodo.12772898.\n","authors":["Daniel Atzberger","Tim Cech","Willy Scheibel","Jürgen Döllner","Michael Behrisch","Tobias Schreck"],"pdf_url":"https://arxiv.org/pdf/2407.17876v1.pdf","comment":"To be published at IEEE VIS 2024 conference"},{"id":"http://arxiv.org/abs/2407.17869v1","updated":"2024-07-25T08:42:23Z","published":"2024-07-25T08:42:23Z","title":"EllipBench: A Large-scale Benchmark for Machine-learning based\n Ellipsometry Modeling","summary":" Ellipsometry is used to indirectly measure the optical properties and\nthickness of thin films. However, solving the inverse problem of ellipsometry\nis time-consuming since it involves human expertise to apply the data fitting\ntechniques. Many studies use traditional machine learning-based methods to\nmodel the complex mathematical fitting process. In our work, we approach this\nproblem from a deep learning perspective. First, we introduce a large-scale\nbenchmark dataset to facilitate deep learning methods. 
The proposed dataset\nencompasses 98 types of thin film materials and 4 types of substrate materials,\nincluding metals, alloys, compounds, and polymers, among others. Additionally,\nwe propose a deep learning framework that leverages residual connections and\nself-attention mechanisms to learn from the massive data points. We also introduce a\nreconstruction loss to address the common challenge of multiple solutions in\nthin film thickness prediction. Compared to traditional machine learning\nmethods, our framework achieves state-of-the-art (SOTA) performance on our\nproposed dataset. The dataset and code will be available upon acceptance.\n","authors":["Yiming Ma","Xinjie Li","Xin Sun","Zhiyong Wang","Lionel Z. Wang"],"pdf_url":"https://arxiv.org/pdf/2407.17869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14729v2","updated":"2024-07-25T08:34:58Z","published":"2022-12-30T14:15:54Z","title":"Batchless Normalization: How to Normalize Activations Across Instances\n with Minimal Memory Requirements","summary":" In training neural networks, batch normalization has many benefits, not all\nof them entirely understood. But it also has some drawbacks. Foremost is\narguably memory consumption, as computing the batch statistics requires all\ninstances within the batch to be processed simultaneously, whereas without\nbatch normalization it would be possible to process them one by one while\naccumulating the weight gradients. Another drawback is that the distribution\nparameters (mean and standard deviation) are unlike all other model parameters\nin that they are not trained using gradient descent but require special\ntreatment, complicating implementation. In this paper, I show a simple and\nstraightforward way to address these issues. The idea, in short, is to add\nterms to the loss that, for each activation, cause the minimization of the\nnegative log likelihood of a Gaussian distribution that is used to normalize\nthe activation. Among other benefits, this will hopefully contribute to the\ndemocratization of AI research by means of lowering the hardware requirements\nfor training larger models.\n","authors":["Benjamin Berger","Victor Uc Cetina"],"pdf_url":"https://arxiv.org/pdf/2212.14729v2.pdf","comment":"17 pages (12 without appendices), 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.11055v3","updated":"2024-07-25T08:26:35Z","published":"2024-07-09T22:04:23Z","title":"Knowledge boosting during low-latency inference","summary":" Models for low-latency, streaming applications could benefit from the\nknowledge capacity of larger models, but edge devices cannot run these models\ndue to resource constraints. A possible solution is to transfer hints during\ninference from a large model running remotely to a small model running\non-device. However, this incurs a communication delay that breaks real-time\nrequirements and does not guarantee that both models will operate on the same\ndata at the same time. We propose knowledge boosting, a novel technique that\nallows a large model to operate on time-delayed input during inference, while\nstill boosting small model performance. Using a streaming neural network that\nprocesses 8 ms chunks, we evaluate different speech separation and enhancement\ntasks with communication delays of up to six chunks or 48 ms. Our results show\nlarger gains where the performance gap between the small and large models is\nwide, demonstrating a promising method for large-small model collaboration for\nlow-latency applications. 
Code, dataset, and audio samples available at\nhttps://knowledgeboosting.cs.washington.edu/.\n","authors":["Vidya Srinivas","Malek Itani","Tuochao Chen","Sefik Emre Eskimez","Takuya Yoshioka","Shyamnath Gollakota"],"pdf_url":"https://arxiv.org/pdf/2407.11055v3.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.17856v1","updated":"2024-07-25T08:21:46Z","published":"2024-07-25T08:21:46Z","title":"MDS-ED: Multimodal Decision Support in the Emergency Department -- a\n Benchmark Dataset for Diagnoses and Deterioration Prediction in Emergency\n Medicine","summary":" Background: Benchmarking medical decision support algorithms often struggles\ndue to limited access to datasets, narrow prediction tasks, and restricted\ninput modalities. These limitations affect their clinical relevance and\nperformance in high-stakes areas like emergency care, complicating replication,\nvalidation, and improvement of benchmarks.\n Methods: We introduce a dataset based on MIMIC-IV, benchmarking protocol, and\ninitial results for evaluating multimodal decision support in the emergency\ndepartment (ED). We use diverse data modalities from the first 1.5 hours of\npatient arrival, including demographics, biometrics, vital signs, lab values,\nand electrocardiogram waveforms. We analyze 1443 clinical labels across two\ncontexts: predicting diagnoses with ICD-10 codes and forecasting patient\ndeterioration.\n Results: Our multimodal diagnostic model achieves an AUROC score over 0.8 in\na statistically significant manner for 357 out of 1428 conditions, including\ncardiac issues like myocardial infarction and non-cardiac conditions such as\nrenal disease and diabetes. The deterioration model scores above 0.8 in a\nstatistically significant manner for 13 out of 15 targets, including critical\nevents like cardiac arrest and mechanical ventilation, ICU admission as well as\nshort- and long-term mortality. Incorporating raw waveform data significantly\nimproves model performance, which represents one of the first robust\ndemonstrations of this effect.\n Conclusions: This study highlights the uniqueness of our dataset, which\nencompasses a wide range of clinical tasks and utilizes a comprehensive set of\nfeatures collected early during the emergency after arriving at the ED. The\nstrong performance, as evidenced by high AUROC scores across diagnostic and\ndeterioration targets, underscores the potential of our approach to\nrevolutionize decision-making in acute and emergency medicine.\n","authors":["Juan Miguel Lopez Alcaraz","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2407.17856v1.pdf","comment":"14 pages, 1 figure, code available under\n https://github.com/AI4HealthUOL/MDS-ED"},{"id":"http://arxiv.org/abs/2401.00773v3","updated":"2024-07-25T08:13:27Z","published":"2024-01-01T14:34:11Z","title":"Unsupervised Outlier Detection using Random Subspace and Subsampling\n Ensembles of Dirichlet Process Mixtures","summary":" Probabilistic mixture models are recognized as effective tools for\nunsupervised outlier detection owing to their interpretability and global\ncharacteristics. Among these, Dirichlet process mixture models stand out as a\nstrong alternative to conventional finite mixture models for both clustering\nand outlier detection tasks. Unlike finite mixture models, Dirichlet process\nmixtures are infinite mixture models that automatically determine the number of\nmixture components based on the data. 
Despite their advantages, the adoption of\nDirichlet process mixture models for unsupervised outlier detection has been\nlimited by challenges related to computational inefficiency and sensitivity to\noutliers in the construction of outlier detectors. Additionally, Dirichlet\nprocess Gaussian mixtures struggle to effectively model non-Gaussian data with\ndiscrete or binary features. To address these challenges, we propose a novel\noutlier detection method that utilizes ensembles of Dirichlet process Gaussian\nmixtures. This unsupervised algorithm employs random subspace and subsampling\nensembles to ensure efficient computation and improve the robustness of the\noutlier detector. The ensemble approach further improves the suitability of the\nproposed method for detecting outliers in non-Gaussian data. Furthermore, our\nmethod uses variational inference for Dirichlet process mixtures, which ensures\nboth efficient and rapid computation. Empirical analyses using benchmark\ndatasets demonstrate that our method outperforms existing approaches in\nunsupervised outlier detection.\n","authors":["Dongwook Kim","Juyeon Park","Hee Cheol Chung","Seonghyun Jeong"],"pdf_url":"https://arxiv.org/pdf/2401.00773v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12832v2","updated":"2024-07-25T08:09:12Z","published":"2024-04-19T12:09:49Z","title":"COIN: Counterfactual inpainting for weakly supervised semantic\n segmentation for medical images","summary":" Deep learning is dramatically transforming the field of medical imaging and\nradiology, enabling the identification of pathologies in medical images,\nincluding computed tomography (CT) and X-ray scans. However, the performance of\ndeep learning models, particularly in segmentation tasks, is often limited by\nthe need for extensive annotated datasets. To address this challenge, the\ncapabilities of weakly supervised semantic segmentation are explored through\nthe lens of Explainable AI and the generation of counterfactual explanations.\nThe scope of this research is development of a novel counterfactual inpainting\napproach (COIN) that flips the predicted classification label from abnormal to\nnormal by using a generative model. For instance, if the classifier deems an\ninput medical image X as abnormal, indicating the presence of a pathology, the\ngenerative model aims to inpaint the abnormal region, thus reversing the\nclassifier's original prediction label. The approach enables us to produce\nprecise segmentations for pathologies without depending on pre-existing\nsegmentation masks. Crucially, image-level labels are utilized, which are\nsubstantially easier to acquire than creating detailed segmentation masks. The\neffectiveness of the method is demonstrated by segmenting synthetic targets and\nactual kidney tumors from CT images acquired from Tartu University Hospital in\nEstonia. The findings indicate that COIN greatly surpasses established\nattribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an\nalternative counterfactual explanation method introduced by Singla et al. 
This\nevidence suggests that COIN is a promising approach for semantic segmentation\nof tumors in CT images, and presents a step forward in making deep learning\napplications more accessible and effective in healthcare, where annotated data\nis scarce.\n","authors":["Dmytro Shvetsov","Joonas Ariva","Marharyta Domnich","Raul Vicente","Dmytro Fishman"],"pdf_url":"https://arxiv.org/pdf/2404.12832v2.pdf","comment":"This work has been accepted to be presented to The 2nd World\n Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19,\n 2024 - Valletta, Malta"},{"id":"http://arxiv.org/abs/2401.14351v2","updated":"2024-07-25T08:08:11Z","published":"2024-01-25T17:55:07Z","title":"ServerlessLLM: Low-Latency Serverless Inference for Large Language\n Models","summary":" This paper presents ServerlessLLM, a distributed system designed to support\nlow-latency serverless inference for Large Language Models (LLMs). By\nharnessing the substantial near-GPU storage and memory capacities of inference\nservers, ServerlessLLM achieves effective local checkpoint storage, minimizing\nthe need for remote checkpoint downloads and ensuring efficient checkpoint\nloading. The design of ServerlessLLM features three core contributions: (i)\n\\emph{fast multi-tier checkpoint loading}, featuring a new loading-optimized\ncheckpoint format and a multi-tier loading system, fully utilizing the\nbandwidth of complex storage hierarchies on GPU servers; (ii) \\emph{efficient\nlive migration of LLM inference}, which enables newly initiated inferences to\ncapitalize on local checkpoint storage while ensuring minimal user\ninterruption; and (iii) \\emph{startup-time-optimized model scheduling}, which\nassesses the locality statuses of checkpoints on each server and schedules the\nmodel onto servers that minimize the time to start the inference. Comprehensive\nevaluations, including microbenchmarks and real-world scenarios, demonstrate\nthat ServerlessLLM dramatically outperforms state-of-the-art serverless\nsystems, reducing latency by 10 - 200X across various LLM inference workloads.\n","authors":["Yao Fu","Leyang Xue","Yeqi Huang","Andrei-Octavian Brabete","Dmitrii Ustiugov","Yuvraj Patel","Luo Mai"],"pdf_url":"https://arxiv.org/pdf/2401.14351v2.pdf","comment":"18th USENIX Symposium on Operating Systems Design and Implementation"},{"id":"http://arxiv.org/abs/2406.08401v2","updated":"2024-07-25T08:01:32Z","published":"2024-06-12T16:50:12Z","title":"Nyström Kernel Stein Discrepancy","summary":" Kernel methods underpin many of the most successful approaches in data\nscience and statistics, and they allow representing probability measures as\nelements of a reproducing kernel Hilbert space without loss of information.\nRecently, the kernel Stein discrepancy (KSD), which combines Stein's method\nwith kernel techniques, gained considerable attention. Through the Stein\noperator, KSD allows the construction of powerful goodness-of-fit tests where\nit is sufficient to know the target distribution up to a multiplicative\nconstant. However, the typical U- and V-statistic-based KSD estimators suffer\nfrom a quadratic runtime complexity, which hinders their application in\nlarge-scale settings. 
In this work, we propose a Nystr\\\"om-based KSD\nacceleration -- with runtime $\\mathcal O\\!\\left(mn+m^3\\right)$ for $n$ samples\nand $m\\ll n$ Nystr\\\"om points -- , show its $\\sqrt{n}$-consistency under the\nnull with a classical sub-Gaussian assumption, and demonstrate its\napplicability for goodness-of-fit testing on a suite of benchmarks.\n","authors":["Florian Kalinke","Zoltan Szabo","Bharath K. Sriperumbudur"],"pdf_url":"https://arxiv.org/pdf/2406.08401v2.pdf","comment":"Update proof of Lemma B.3, milder Assumption 1, more experiments"},{"id":"http://arxiv.org/abs/2404.12810v2","updated":"2024-07-25T08:00:44Z","published":"2024-04-19T11:47:17Z","title":"Enhancing Counterfactual Explanation Search with Diffusion Distance and\n Directional Coherence","summary":" A pressing issue in the adoption of AI models is the increasing demand for\nmore human-centric explanations of their predictions. To advance towards more\nhuman-centric explanations, understanding how humans produce and select\nexplanations has been beneficial. In this work, inspired by insights of human\ncognition we propose and test the incorporation of two novel biases to enhance\nthe search for effective counterfactual explanations. Central to our\nmethodology is the application of diffusion distance, which emphasizes data\nconnectivity and actionability in the search for feasible counterfactual\nexplanations. In particular, diffusion distance effectively weights more those\npoints that are more interconnected by numerous short-length paths. This\napproach brings closely connected points nearer to each other, identifying a\nfeasible path between them. We also introduce a directional coherence term that\nallows the expression of a preference for the alignment between the joint and\nmarginal directional changes in feature space to reach a counterfactual. This\nterm enables the generation of counterfactual explanations that align with a\nset of marginal predictions based on expectations of how the outcome of the\nmodel varies by changing one feature at a time. We evaluate our method, named\nCoherent Directional Counterfactual Explainer (CoDiCE), and the impact of the\ntwo novel biases against existing methods such as DiCE, FACE, Prototypes, and\nGrowing Spheres. Through a series of ablation experiments on both synthetic and\nreal datasets with continuous and mixed-type features, we demonstrate the\neffectiveness of our method.\n","authors":["Marharyta Domnich","Raul Vicente"],"pdf_url":"https://arxiv.org/pdf/2404.12810v2.pdf","comment":"This work has been accepted to be presented to The 2nd World\n Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19,\n 2024 - Valletta, Malta"},{"id":"http://arxiv.org/abs/2407.17844v1","updated":"2024-07-25T07:58:19Z","published":"2024-07-25T07:58:19Z","title":"Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease\n Classification: A Systematic Review","summary":" Parkinson's disease (PD), the second most prevalent neurodegenerative\ndisorder worldwide, frequently presents with early-stage speech impairments.\nRecent advancements in Artificial Intelligence (AI), particularly deep learning\n(DL), have significantly enhanced PD diagnosis through the analysis of speech\ndata. Nevertheless, the progress of research is restricted by the limited\navailability of publicly accessible speech-based PD datasets, primarily due to\nprivacy and ethical concerns. 
This review covers the latest DL-based AI\napproaches for speech-based PD classification, focusing on performance,\navailable resources and associated challenges of 33 scientific works published\nbetween 2020 and March 2024. These DL approaches are categorized into\nend-to-end (E2E) learning, transfer learning (TL) and deep acoustic features\n(DAF) extraction. Among E2E approaches, Convolutional Neural Networks (CNNs)\nare prevalent, though Transformers are increasingly popular. E2E approaches\nface challenges such as limited data and computational resources, especially\nwith Transformers. TL addresses these issues by providing more robust PD\ndiagnosis and better generalizability across languages. DAF extraction aims to\nimprove the explainability and interpretability of results by examining the\nspecific effects of deep features on both other DL approaches and more\ntraditional machine learning (ML) methods. However, it often underperforms\ncompared to E2E and TL approaches. This review also discusses unresolved issues\nrelated to bias, explainability and privacy, highlighting the need for future\nresearch.\n","authors":["Lisanne van Gelderen","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2407.17844v1.pdf","comment":"Submitted in Applied Sciences - peer reviewed Open Access journal.\n This research was funded by the NWO research programme AiNed Fellowship\n Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant\n number NGF.1607.22.013"},{"id":"http://arxiv.org/abs/2407.17842v1","updated":"2024-07-25T07:57:34Z","published":"2024-07-25T07:57:34Z","title":"On the Opportunities of (Re)-Exploring Atmospheric Science by Foundation\n Models: A Case Study","summary":" Most state-of-the-art AI applications in atmospheric science are based on\nclassic deep learning approaches. However, such approaches cannot automatically\nintegrate multiple complicated procedures to construct an intelligent agent,\nsince each functionality is enabled by a separate model learned from\nindependent climate datasets. The emergence of foundation models, especially\nmultimodal foundation models, with their ability to process heterogeneous input\ndata and execute complex tasks, offers a substantial opportunity to overcome\nthis challenge. In this report, we want to explore a central question - how the\nstate-of-the-art foundation model, i.e., GPT-4o, performs various atmospheric\nscientific tasks. Toward this end, we conduct a case study by categorizing the\ntasks into four main classes, including climate data processing, physical\ndiagnosis, forecast and prediction, and adaptation and mitigation. For each\ntask, we comprehensively evaluate the GPT-4o's performance along with a\nconcrete discussion. We hope that this report may shed new light on future AI\napplications and research in atmospheric science.\n","authors":["Lujia Zhang","Hanzhe Cui","Yurong Song","Chenyue Li","Binhang Yuan","Mengqian Lu"],"pdf_url":"https://arxiv.org/pdf/2407.17842v1.pdf","comment":"28 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.17839v1","updated":"2024-07-25T07:54:07Z","published":"2024-07-25T07:54:07Z","title":"Long-term Fairness in Ride-Hailing Platform","summary":" Matching in two-sided markets such as ride-hailing has recently received\nsignificant attention. However, existing studies on ride-hailing mainly focus\non optimising efficiency, and fairness issues in ride-hailing have been\nneglected. 
Fairness issues in ride-hailing, including significant earning\ndifferences between drivers and variance of passenger waiting times among\ndifferent locations, have potential impacts on economic and ethical aspects.\nThe recent studies that focus on fairness in ride-hailing exploit traditional\noptimisation methods and the Markov Decision Process to balance efficiency and\nfairness. However, there are several issues in these existing studies, such as\nmyopic short-term decision-making from traditional optimisation and instability\nof fairness in a comparably longer horizon from both traditional optimisation\nand Markov Decision Process-based methods. To address these issues, we propose\na dynamic Markov Decision Process model to alleviate fairness issues currently\nfaced by ride-hailing, and seek a balance between efficiency and fairness, with\ntwo distinct characteristics: (i) a prediction module to predict the number of\nrequests that will be raised in the future from different locations to allow\nthe proposed method to consider long-term fairness based on the whole timeline\ninstead of consider fairness only based on historical and current data\npatterns; (ii) a customised scalarisation function for multi-objective\nmulti-agent Q Learning that aims to balance efficiency and fairness. Extensive\nexperiments on a publicly available real-world dataset demonstrate that our\nproposed method outperforms existing state-of-the-art methods.\n","authors":["Yufan Kang","Jeffrey Chan","Wei Shao","Flora D. Salim","Christopher Leckie"],"pdf_url":"https://arxiv.org/pdf/2407.17839v1.pdf","comment":"Accepted by ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2406.16087v3","updated":"2024-07-25T07:50:58Z","published":"2024-06-23T12:02:17Z","title":"Imperative Learning: A Self-supervised Neural-Symbolic Learning\n Framework for Robot Autonomy","summary":" Data-driven methods such as reinforcement and imitation learning have\nachieved remarkable success in robot autonomy. However, their data-centric\nnature still hinders them from generalizing well to ever-changing environments.\nMoreover, collecting large datasets for robotic tasks is often impractical and\nexpensive. To overcome these challenges, we introduce a new self-supervised\nneural-symbolic (NeSy) computational framework, imperative learning (IL), for\nrobot autonomy, leveraging the generalization abilities of symbolic reasoning.\nThe framework of IL consists of three primary components: a neural module, a\nreasoning engine, and a memory system. We formulate IL as a special bilevel\noptimization (BLO), which enables reciprocal learning over the three modules.\nThis overcomes the label-intensive obstacles associated with data-driven\napproaches and takes advantage of symbolic reasoning concerning logical\nreasoning, physical principles, geometric analysis, etc. We discuss several\noptimization techniques for IL and verify their effectiveness in five distinct\nrobot autonomy tasks including path planning, rule induction, optimal control,\nvisual odometry, and multi-robot routing. 
Through various experiments, we show\nthat IL can significantly enhance robot autonomy capabilities and we anticipate\nthat it will catalyze further research across diverse domains.\n","authors":["Chen Wang","Kaiyi Ji","Junyi Geng","Zhongqiang Ren","Taimeng Fu","Fan Yang","Yifan Guo","Haonan He","Xiangyu Chen","Zitong Zhan","Qiwei Du","Shaoshu Su","Bowen Li","Yuheng Qiu","Yi Du","Qihang Li","Yifan Yang","Xiao Lin","Zhipeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.16087v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17835v1","updated":"2024-07-25T07:46:30Z","published":"2024-07-25T07:46:30Z","title":"IsUMap: Manifold Learning and Data Visualization leveraging\n Vietoris-Rips filtrations","summary":" This work introduces IsUMap, a novel manifold learning technique that\nenhances data representation by integrating aspects of UMAP and Isomap with\nVietoris-Rips filtrations. We present a systematic and detailed construction of\na metric representation for locally distorted metric spaces that captures\ncomplex data structures more accurately than the previous schemes. Our approach\naddresses limitations in existing methods by accommodating non-uniform data\ndistributions and intricate local geometries. We validate its performance\nthrough extensive experiments on examples of various geometric objects and\nbenchmark real-world datasets, demonstrating significant improvements in\nrepresentation quality.\n","authors":["Lukas Silvester Barth"," Fatemeh"," Fahimi","Parvaneh Joharinad","Jürgen Jost","Janis Keck"],"pdf_url":"https://arxiv.org/pdf/2407.17835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v7","updated":"2024-07-25T07:42:15Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. 
Code is released at\nhttps://github.com/Ivan-Tang-3D/Point-PEFT.\n","authors":["Yiwen Tang","Ray Zhang","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v7.pdf","comment":"The specialized PEFT framework for 3D pre-trained models, which\n achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Ivan-Tang-3D/Point-PEFT"},{"id":"http://arxiv.org/abs/2407.17125v2","updated":"2024-07-25T07:39:44Z","published":"2024-07-24T09:48:48Z","title":"Behavioral Testing: Can Large Language Models Implicitly Resolve\n Ambiguous Entities?","summary":" One of the major aspects contributing to the striking performance of large\nlanguage models (LLMs) is the vast amount of factual knowledge accumulated\nduring pre-training. Yet, many LLMs suffer from self-inconsistency, which\nraises doubts about their trustworthiness and reliability. In this paper, we\nfocus on entity type ambiguity and analyze current state-of-the-art LLMs for\ntheir proficiency and consistency in applying their factual knowledge when\nprompted for entities under ambiguity. To do so, we propose an evaluation\nprotocol that disentangles knowing from applying knowledge, and test\nstate-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform\npoorly with ambiguous prompts, achieving only 80% accuracy. Our results further\ndemonstrate systematic discrepancies in LLM behavior and their failure to\nconsistently apply information, indicating that the models can exhibit\nknowledge without being able to utilize it, significant biases for preferred\nreadings, as well as self inconsistencies. Our study highlights the importance\nof handling entity ambiguity in future for more trustworthy LLMs\n","authors":["Anastasiia Sedova","Robert Litschko","Diego Frassinelli","Benjamin Roth","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2407.17125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17829v1","updated":"2024-07-25T07:38:27Z","published":"2024-07-25T07:38:27Z","title":"Image Segmentation via Divisive Normalization: dealing with\n environmental diversity","summary":" Autonomous driving is a challenging scenario for image segmentation due to\nthe presence of uncontrolled environmental conditions and the eventually\ncatastrophic consequences of failures. Previous work suggested that a\nbiologically motivated computation, the so-called Divisive Normalization, could\nbe useful to deal with image variability, but its effects have not been\nsystematically studied over different data sources and environmental factors.\nHere we put segmentation U-nets augmented with Divisive Normalization to work\nfar from training conditions to find where this adaptation is more critical. We\ncategorize the scenes according to their radiance level and dynamic range\n(day/night), and according to their achromatic/chromatic contrasts. We also\nconsider video game (synthetic) images to broaden the range of environments. We\ncheck the performance in the extreme percentiles of such categorization. Then,\nwe push the limits further by artificially modifying the images in\nperceptually/environmentally relevant dimensions: luminance, contrasts and\nspectral radiance. Results show that neural networks with Divisive\nNormalization get better results in all the scenarios and their performance\nremains more stable with regard to the considered environmental factors and\nnature of the source. 
Finally, we explain the improvements in segmentation\nperformance in two ways: (1) by quantifying the invariance of the responses\nthat incorporate Divisive Normalization, and (2) by illustrating the adaptive\nnonlinearity of the different layers that depends on the local activity.\n","authors":["Pablo Hernández-Cámara","Jorge Vila-Tomás","Paula Dauden-Oliver","Nuria Alabau-Bosque","Valero Laparra","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2407.17829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17827v1","updated":"2024-07-25T07:35:27Z","published":"2024-07-25T07:35:27Z","title":"Unified Lexical Representation for Interpretable Visual-Language\n Alignment","summary":" Visual-Language Alignment (VLA) has gained a lot of attention since CLIP's\ngroundbreaking work. Although CLIP performs well, the typical direct latent\nfeature alignment lacks clarity in its representation and similarity scores. On\nthe other hand, lexical representation, a vector whose element represents the\nsimilarity between the sample and a word from the vocabulary, is a natural\nsparse representation and interpretable, providing exact matches for individual\nwords. However, lexical representations is difficult to learn due to no\nground-truth supervision and false-discovery issues, and thus requires complex\ndesign to train effectively. In this paper, we introduce LexVLA, a more\ninterpretable VLA framework by learning a unified lexical representation for\nboth modalities without complex design. We use DINOv2 as our visual model for\nits local-inclined features and Llama 2, a generative language model, to\nleverage its in-context lexical prediction ability. To avoid the false\ndiscovery, we propose an overuse penalty to refrain the lexical representation\nfrom falsely frequently activating meaningless words. We demonstrate that these\ntwo pre-trained uni-modal models can be well-aligned by fine-tuning on modest\nmulti-modal dataset and avoid intricate training configurations. On cross-modal\nretrieval benchmarks, LexVLA, trained on the CC-12M multi-modal dataset,\noutperforms baselines fine-tuned on larger datasets (e.g., YFCC15M) and those\ntrained from scratch on even bigger datasets (e.g., 1.1B data, including\nCC-12M). We conduct extensive experiments to analyze LexVLA.\n","authors":["Yifan Li","Yikai Wang","Yanwei Fu","Dongyu Ru","Zheng Zhang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2407.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11869v3","updated":"2024-07-25T07:29:02Z","published":"2024-04-18T03:03:37Z","title":"Node-like as a Whole: Structure-aware Searching and Coarsening for Graph\n Classification","summary":" Graph Transformers (GTs) have made remarkable achievements in graph-level\ntasks. However, most existing works regard graph structures as a form of\nguidance or bias for enhancing node representations, which focuses on\nnode-central perspectives and lacks explicit representations of edges and\nstructures. One natural question is, can we treat graph structures node-like as\na whole to learn high-level features? Through experimental analysis, we explore\nthe feasibility of this assumption. Based on our findings, we propose a novel\nmulti-view graph representation learning model via structure-aware searching\nand coarsening (GRLsc) on GT architecture for graph classification.\nSpecifically, we build three unique views, original, coarsening, and\nconversion, to learn a thorough structural representation. 
We compress loops\nand cliques via hierarchical heuristic graph coarsening and restrict them with\nwell-designed constraints, which builds the coarsening view to learn high-level\ninteractions between structures. We also introduce line graphs for edge\nembeddings and switch to edge-central perspective to construct the conversion\nview. Experiments on eight real-world datasets demonstrate the improvements of\nGRLsc over 28 baselines from various architectures.\n","authors":["Xiaorui Qi","Qijie Bai","Yanlong Wen","Haiwei Zhang","Xiaojie Yuan"],"pdf_url":"https://arxiv.org/pdf/2404.11869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17823v1","updated":"2024-07-25T07:25:06Z","published":"2024-07-25T07:25:06Z","title":"Optimal Hessian/Jacobian-Free Nonconvex-PL Bilevel Optimization","summary":" Bilevel optimization is widely applied in many machine learning tasks such as\nhyper-parameter learning, meta learning and reinforcement learning. Although\nmany algorithms recently have been developed to solve the bilevel optimization\nproblems, they generally rely on the (strongly) convex lower-level problems.\nMore recently, some methods have been proposed to solve the nonconvex-PL\nbilevel optimization problems, where their upper-level problems are possibly\nnonconvex, and their lower-level problems are also possibly nonconvex while\nsatisfying Polyak-{\\L}ojasiewicz (PL) condition. However, these methods still\nhave a high convergence complexity or a high computation complexity such as\nrequiring compute expensive Hessian/Jacobian matrices and its inverses. In the\npaper, thus, we propose an efficient Hessian/Jacobian-free method (i.e.,\nHJFBiO) with the optimal convergence complexity to solve the nonconvex-PL\nbilevel problems. Theoretically, under some mild conditions, we prove that our\nHJFBiO method obtains an optimal convergence rate of $O(\\frac{1}{T})$, where\n$T$ denotes the number of iterations, and has an optimal gradient complexity of\n$O(\\epsilon^{-1})$ in finding an $\\epsilon$-stationary solution. We conduct\nsome numerical experiments on the bilevel PL game and hyper-representation\nlearning task to demonstrate efficiency of our proposed method.\n","authors":["Feihu Huang"],"pdf_url":"https://arxiv.org/pdf/2407.17823v1.pdf","comment":"ICML 2024 (Oral). arXiv admin note: text overlap with\n arXiv:2311.04520"},{"id":"http://arxiv.org/abs/2407.17822v1","updated":"2024-07-25T07:24:41Z","published":"2024-07-25T07:24:41Z","title":"Advanced deep-reinforcement-learning methods for flow control:\n group-invariant and positional-encoding networks improve learning speed and\n quality","summary":" Flow control is key to maximize energy efficiency in a wide range of\napplications. However, traditional flow-control methods face significant\nchallenges in addressing non-linear systems and high-dimensional data, limiting\ntheir application in realistic energy systems. This study advances\ndeep-reinforcement-learning (DRL) methods for flow control, particularly\nfocusing on integrating group-invariant networks and positional encoding into\nDRL architectures. Our methods leverage multi-agent reinforcement learning\n(MARL) to exploit policy invariance in space, in combination with\ngroup-invariant networks to ensure local symmetry invariance. Additionally, a\npositional encoding inspired by the transformer architecture is incorporated to\nprovide location information to the agents, mitigating action constraints from\nstrict invariance. 
The proposed methods are verified using a case study of\nRayleigh-B\\'enard convection, where the goal is to minimize the Nusselt number\nNu. The group-invariant neural networks (GI-NNs) show faster convergence\ncompared to the base MARL, achieving better average policy performance. The\nGI-NNs not only cut DRL training time in half but also notably enhance learning\nreproducibility. Positional encoding further enhances these results,\neffectively reducing the minimum Nu and stabilizing convergence. Interestingly,\ngroup invariant networks specialize in improving learning speed and positional\nencoding specializes in improving learning quality. These results demonstrate\nthat choosing a suitable feature-representation method according to the purpose\nas well as the characteristics of each control problem is essential. We believe\nthat the results of this study will not only inspire novel DRL methods with\ninvariant and unique representations, but also provide useful insights for\nindustrial applications.\n","authors":["Joogoo Jeon","Jean Rabault","Joel Vasanth","Francisco Alcántara-Ávila","Shilaj Baral","Ricardo Vinuesa"],"pdf_url":"https://arxiv.org/pdf/2407.17822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15899v3","updated":"2024-07-25T07:18:05Z","published":"2024-07-22T10:20:34Z","title":"Spatial-Temporal Cross-View Contrastive Pre-training for Check-in\n Sequence Representation Learning","summary":" The rapid growth of location-based services (LBS) has yielded massive amounts\nof data on human mobility. Effectively extracting meaningful representations\nfor user-generated check-in sequences is pivotal for facilitating various\ndownstream services. However, the user-generated check-in data are\nsimultaneously influenced by the surrounding objective circumstances and the\nuser's subjective intention. Specifically, the temporal uncertainty and spatial\ndiversity exhibited in check-in data make it difficult to capture the\nmacroscopic spatial-temporal patterns of users and to understand the semantics\nof user mobility activities. Furthermore, the distinct characteristics of the\ntemporal and spatial information in check-in sequences call for an effective\nfusion method to incorporate these two types of information. In this paper, we\npropose a novel Spatial-Temporal Cross-view Contrastive Representation (STCCR)\nframework for check-in sequence representation learning. Specifically, STCCR\naddresses the above challenges by employing self-supervision from \"spatial\ntopic\" and \"temporal intention\" views, facilitating effective fusion of spatial\nand temporal information at the semantic level. Besides, STCCR leverages\ncontrastive clustering to uncover users' shared spatial topics from diverse\nmobility activities, while employing angular momentum contrast to mitigate the\nimpact of temporal uncertainty and noise. 
We extensively evaluate STCCR on\nthree real-world datasets and demonstrate its superior performance across three\ndownstream tasks.\n","authors":["Letian Gong","Huaiyu Wan","Shengnan Guo","Xiucheng Li","Yan Lin","Erwen Zheng","Tianyi Wang","Zeyu Zhou","Youfang Lin"],"pdf_url":"https://arxiv.org/pdf/2407.15899v3.pdf","comment":"This paper has been accepted as a regular paper at IEEE TKDE"},{"id":"http://arxiv.org/abs/2407.17817v1","updated":"2024-07-25T07:10:31Z","published":"2024-07-25T07:10:31Z","title":"Demystifying Verbatim Memorization in Large Language Models","summary":" Large Language Models (LLMs) frequently memorize long sequences verbatim,\noften with serious legal and privacy implications. Much prior work has studied\nsuch verbatim memorization using observational data. To complement such work,\nwe develop a framework to study verbatim memorization in a controlled setting\nby continuing pre-training from Pythia checkpoints with injected sequences. We\nfind that (1) non-trivial amounts of repetition are necessary for verbatim\nmemorization to happen; (2) later (and presumably better) checkpoints are more\nlikely to verbatim memorize sequences, even for out-of-distribution sequences;\n(3) the generation of memorized sequences is triggered by distributed model\nstates that encode high-level features and makes important use of general\nlanguage modeling capabilities. Guided by these insights, we develop stress\ntests to evaluate unlearning methods and find they often fail to remove the\nverbatim memorized information, while also degrading the LM. Overall, these\nfindings challenge the hypothesis that verbatim memorization stems from\nspecific model weights or mechanisms. Rather, verbatim memorization is\nintertwined with the LM's general capabilities and thus will be very difficult\nto isolate and suppress without degrading model quality.\n","authors":["Jing Huang","Diyi Yang","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2407.17817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17816v1","updated":"2024-07-25T07:10:08Z","published":"2024-07-25T07:10:08Z","title":"NC-NCD: Novel Class Discovery for Node Classification","summary":" Novel Class Discovery (NCD) involves identifying new categories within\nunlabeled data by utilizing knowledge acquired from previously established\ncategories. However, existing NCD methods often struggle to maintain a balance\nbetween the performance of old and new categories. Discovering unlabeled new\ncategories in a class-incremental way is more practical but also more\nchallenging, as it is frequently hindered by either catastrophic forgetting of\nold categories or an inability to learn new ones. Furthermore, the\nimplementation of NCD on continuously scalable graph-structured data remains an\nunder-explored area. In response to these challenges, we introduce for the\nfirst time a more practical NCD scenario for node classification (i.e.,\nNC-NCD), and propose a novel self-training framework with prototype replay and\ndistillation called SWORD, adopted to our NC-NCD setting. Our approach enables\nthe model to cluster unlabeled new category nodes after learning labeled nodes\nwhile preserving performance on old categories without reliance on old category\nnodes. SWORD achieves this by employing a self-training strategy to learn new\ncategories and preventing the forgetting of old categories through the joint\nuse of feature prototypes and knowledge distillation. 
Extensive experiments on\nfour common benchmarks demonstrate the superiority of SWORD over other\nstate-of-the-art methods.\n","authors":["Yue Hou","Xueyuan Chen","He Zhu","Romei Liu","Bowen Shi","Jiaheng Liu","Junran Wu","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2407.17816v1.pdf","comment":"Accepted by CIKM'24"},{"id":"http://arxiv.org/abs/2407.17815v1","updated":"2024-07-25T07:09:53Z","published":"2024-07-25T07:09:53Z","title":"Nested replicator dynamics, nested logit choice, and similarity-based\n learning","summary":" We consider a model of learning and evolution in games whose action sets are\nendowed with a partition-based similarity structure intended to capture\nexogenous similarities between strategies. In this model, revising agents have\na higher probability of comparing their current strategy with other strategies\nthat they deem similar, and they switch to the observed strategy with\nprobability proportional to its payoff excess. Because of this implicit bias\ntoward similar strategies, the resulting dynamics - which we call the nested\nreplicator dynamics - do not satisfy any of the standard monotonicity\npostulates for imitative game dynamics; nonetheless, we show that they retain\nthe main long-run rationality properties of the replicator dynamics, albeit at\nquantitatively different rates. We also show that the induced dynamics can be\nviewed as a stimulus-response model in the spirit of Erev & Roth (1998), with\nchoice probabilities given by the nested logit choice rule of Ben-Akiva (1973)\nand McFadden (1978). This result generalizes an existing relation between the\nreplicator dynamics and the exponential weights algorithm in online learning,\nand provides an additional layer of interpretation to our analysis and results.\n","authors":["Panayotis Mertikopoulos","William H. Sandholm"],"pdf_url":"https://arxiv.org/pdf/2407.17815v1.pdf","comment":"37 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.11652v2","updated":"2024-07-25T07:04:32Z","published":"2024-07-16T12:18:20Z","title":"CCVA-FL: Cross-Client Variations Adaptive Federated Learning for Medical\n Imaging","summary":" Federated Learning (FL) offers a privacy-preserving approach to train models\non decentralized data. Its potential in healthcare is significant, but\nchallenges arise due to cross-client variations in medical image data,\nexacerbated by limited annotations. This paper introduces Cross-Client\nVariations Adaptive Federated Learning (CCVA-FL) to address these issues.\nCCVA-FL aims to minimize cross-client variations by transforming images into a\ncommon feature space. It involves expert annotation of a subset of images from\neach client, followed by the selection of a client with the least data\ncomplexity as the target. Synthetic medical images are then generated using\nScalable Diffusion Models with Transformers (DiT) based on the target client's\nannotated images. These synthetic images, capturing diversity and representing\nthe original data, are shared with other clients. Each client then translates\nits local images into the target image space using image-to-image translation.\nThe translated images are subsequently used in a federated learning setting to\ndevelop a server model. 
Our results demonstrate that CCVA-FL outperforms\nVanilla Federated Averaging by effectively addressing data distribution\ndifferences across clients without compromising privacy.\n","authors":["Sunny Gupta","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2407.11652v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2206.02617v7","updated":"2024-07-25T06:33:58Z","published":"2022-06-06T13:49:37Z","title":"Individual Privacy Accounting for Differentially Private Stochastic\n Gradient Descent","summary":" Differentially private stochastic gradient descent (DP-SGD) is the workhorse\nalgorithm for recent advances in private deep learning. It provides a single\nprivacy guarantee to all datapoints in the dataset. We propose output-specific\n$(\\varepsilon,\\delta)$-DP to characterize privacy guarantees for individual\nexamples when releasing models trained by DP-SGD. We also design an efficient\nalgorithm to investigate individual privacy across a number of datasets. We\nfind that most examples enjoy stronger privacy guarantees than the worst-case\nbound. We further discover that the training loss and the privacy parameter of\nan example are well-correlated. This implies groups that are underserved in\nterms of model utility simultaneously experience weaker privacy guarantees. For\nexample, on CIFAR-10, the average $\\varepsilon$ of the class with the lowest\ntest accuracy is 44.2\\% higher than that of the class with the highest\naccuracy.\n","authors":["Da Yu","Gautam Kamath","Janardhan Kulkarni","Tie-Yan Liu","Jian Yin","Huishuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2206.02617v7.pdf","comment":"Add clarification about the applicability of Definition 4"},{"id":"http://arxiv.org/abs/2402.12656v4","updated":"2024-07-25T06:28:01Z","published":"2024-02-20T02:09:55Z","title":"HyperMoE: Towards Better Mixture of Experts via Transferring Among\n Experts","summary":" The Mixture of Experts (MoE) for language models has been proven effective in\naugmenting the capacity of models by dynamically routing each input token to a\nspecific subset of experts for processing. Despite the success, most existing\nmethods face a challenge for balance between sparsity and the availability of\nexpert knowledge: enhancing performance through increased use of expert\nknowledge often results in diminishing sparsity during expert selection. To\nmitigate this contradiction, we propose HyperMoE, a novel MoE framework built\nupon Hypernetworks. This framework integrates the computational processes of\nMoE with the concept of knowledge transferring in multi-task learning. Specific\nmodules generated based on the information of unselected experts serve as\nsupplementary information, which allows the knowledge of experts not selected\nto be used while maintaining selection sparsity. Our comprehensive empirical\nevaluations across multiple datasets and backbones establish that HyperMoE\nsignificantly outperforms existing MoE methods under identical conditions\nconcerning the number of experts.\n","authors":["Hao Zhao","Zihan Qiu","Huijia Wu","Zili Wang","Zhaofeng He","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2402.12656v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17803v1","updated":"2024-07-25T06:22:25Z","published":"2024-07-25T06:22:25Z","title":"Automatic Data Labeling for Software Vulnerability Prediction Models:\n How Far Are We?","summary":" Background: Software Vulnerability (SV) prediction needs large-sized and\nhigh-quality data to perform well. 
Current SV datasets mostly require expensive\nlabeling efforts by experts (human-labeled) and thus are limited in size.\nMeanwhile, there are growing efforts in automatic SV labeling at scale.\nHowever, the fitness of auto-labeled data for SV prediction is still largely\nunknown. Aims: We quantitatively and qualitatively study the quality and use of\nthe state-of-the-art auto-labeled SV data, D2A, for SV prediction. Method:\nUsing multiple sources and manual validation, we curate clean SV data from\nhuman-labeled SV-fixing commits in two well-known projects for investigating\nthe auto-labeled counterparts. Results: We discover that 50+% of the\nauto-labeled SVs are noisy (incorrectly labeled), and they hardly overlap with\nthe publicly reported ones. Yet, SV prediction models utilizing the noisy\nauto-labeled SVs can perform up to 22% and 90% better in Matthews Correlation\nCoefficient and Recall, respectively, than the original models. We also reveal\nthe promises and difficulties of applying noise-reduction methods for\nautomatically addressing the noise in auto-labeled SV data to maximize the data\nutilization for SV prediction. Conclusions: Our study informs the benefits and\nchallenges of using auto-labeled SVs, paving the way for large-scale SV\nprediction.\n","authors":["Triet H. M. Le","M. Ali Babar"],"pdf_url":"https://arxiv.org/pdf/2407.17803v1.pdf","comment":"Accepted as a full paper in the technical track at The International\n Symposium on Empirical Software Engineering and Measurement (ESEM) 2024"},{"id":"http://arxiv.org/abs/2407.17236v2","updated":"2024-07-25T06:21:01Z","published":"2024-07-24T12:45:02Z","title":"Statistical Batch-Based Bearing Fault Detection","summary":" In the domain of rotating machinery, bearings are vulnerable to different\nmechanical faults, including ball, inner, and outer race faults. Various\ntechniques can be used in condition-based monitoring, from classical signal\nanalysis to deep learning methods. Based on the complex working conditions of\nrotary machines, multivariate statistical process control charts such as\nHotelling's $T^2$ and Squared Prediction Error are useful for providing early\nwarnings. However, these methods are rarely applied to condition monitoring of\nrotating machinery due to the univariate nature of the datasets. In the present\npaper, we propose a multivariate statistical process control-based fault\ndetection method that utilizes multivariate data composed of Fourier transform\nfeatures extracted for fixed-time batches. Our approach makes use of the\nmultidimensional nature of Fourier transform characteristics, which record more\ndetailed information about the machine's status, in an effort to enhance early\ndefect detection and diagnosis. Experiments with varying vibration measurement\nlocations (Fan End, Drive End), fault types (ball, inner, and outer race\nfaults), and motor loads (0-3 horsepower) are used to validate the suggested\napproach. 
The outcomes illustrate our method's effectiveness in fault detection\nand point to possible broader uses in industrial maintenance.\n","authors":["Victoria Jorry","Zina-Sabrina Duma","Tuomas Sihvonen","Satu-Pia Reinikainen","Lassi Roininen"],"pdf_url":"https://arxiv.org/pdf/2407.17236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17801v1","updated":"2024-07-25T06:20:03Z","published":"2024-07-25T06:20:03Z","title":"EEG-SSM: Leveraging State-Space Model for Dementia Detection","summary":" State-space models (SSMs) have garnered attention for effectively processing\nlong data sequences, reducing the need to segment time series into shorter\nintervals for model training and inference. Traditionally, SSMs capture only\nthe temporal dynamics of time series data, omitting the equally critical\nspectral features. This study introduces EEG-SSM, a novel state-space\nmodel-based approach for dementia classification using EEG data. Our model\nfeatures two primary innovations: EEG-SSM temporal and EEG-SSM spectral\ncomponents. The temporal component is designed to efficiently process EEG\nsequences of varying lengths, while the spectral component enhances the model\nby integrating frequency-domain information from EEG signals. The synergy of\nthese components allows EEG-SSM to adeptly manage the complexities of\nmultivariate EEG data, significantly improving accuracy and stability across\ndifferent temporal resolutions. Demonstrating a remarkable 91.0 percent\naccuracy in classifying Healthy Control (HC), Frontotemporal Dementia (FTD),\nand Alzheimer's Disease (AD) groups, EEG-SSM outperforms existing models on the\nsame dataset. The development of EEG-SSM represents an improvement in the use\nof state-space models for screening dementia, offering more precise and\ncost-effective tools for clinical neuroscience.\n","authors":["Xuan-The Tran","Linh Le","Quoc Toan Nguyen","Thomas Do","Chin-Teng Lin"],"pdf_url":"https://arxiv.org/pdf/2407.17801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17795v1","updated":"2024-07-25T06:09:44Z","published":"2024-07-25T06:09:44Z","title":"Enhancing Diversity in Multi-objective Feature Selection","summary":" Feature selection plays a pivotal role in the data preprocessing and\nmodel-building pipeline, significantly enhancing model performance,\ninterpretability, and resource efficiency across diverse domains. In\npopulation-based optimization methods, the generation of diverse individuals\nholds utmost importance for adequately exploring the problem landscape,\nparticularly in highly multi-modal multi-objective optimization problems. Our\nstudy reveals that, in line with findings from several prior research papers,\ncommonly employed crossover and mutation operations lack the capability to\ngenerate high-quality diverse individuals and tend to become confined to\nlimited areas around various local optima. This paper introduces an\naugmentation to the diversity of the population in the well-established\nmulti-objective scheme of the genetic algorithm, NSGA-II. This enhancement is\nachieved through two key components: the genuine initialization method and the\nsubstitution of the worst individuals with new randomly generated individuals\nas a re-initialization approach in each generation. The proposed\nmulti-objective feature selection method undergoes testing on twelve real-world\nclassification problems, with the number of features ranging from 2,400 to\nnearly 50,000. 
The results demonstrate that replacing the last front of the\npopulation with an equivalent number of new random individuals generated using\nthe genuine initialization method and featuring a limited number of features\nsubstantially improves the population's quality and, consequently, enhances the\nperformance of the multi-objective algorithm.\n","authors":["Sevil Zanjani Miyandoab","Shahryar Rahnamayan","Azam Asilian Bidgoli","Sevda Ebrahimi","Masoud Makrehchi"],"pdf_url":"https://arxiv.org/pdf/2407.17795v1.pdf","comment":"8 pages, 3 figures, accepted to be published in IEEE WCCI 2024\n conference"},{"id":"http://arxiv.org/abs/2404.09302v2","updated":"2024-07-25T06:05:54Z","published":"2024-04-14T16:57:41Z","title":"High Significant Fault Detection in Azure Core Workload Insights","summary":" Azure Core workload insights have time-series data with different metric\nunits. Faults or Anomalies are observed in these time-series data owing to\nfaults observed with respect to metric name, resources region, dimensions, and\nits dimension value associated with the data. For Azure Core, an important task\nis to highlight faults or anomalies to the user on a dashboard that they can\nperceive easily. The number of anomalies reported should be highly significant\nand in a limited number, e.g., 5-20 anomalies reported per hour. The reported\nanomalies will have significant user perception and high reconstruction error\nin any time-series forecasting model. Hence, our task is to automatically\nidentify 'high significant anomalies' and their associated information for user\nperception.\n","authors":["Pranay Lohia","Laurent Boue","Sharath Rangappa","Vijay Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2404.09302v2.pdf","comment":"Published in IAAI 2024, which is the Industrial track of AAAI 2024"},{"id":"http://arxiv.org/abs/2305.14275v3","updated":"2024-07-25T05:53:46Z","published":"2023-05-23T17:24:04Z","title":"Variational Inference with Coverage Guarantees in Simulation-Based\n Inference","summary":" Amortized variational inference is an often employed framework in\nsimulation-based inference that produces a posterior approximation that can be\nrapidly computed given any new observation. Unfortunately, there are few\nguarantees about the quality of these approximate posteriors. We propose\nConformalized Amortized Neural Variational Inference (CANVI), a procedure that\nis scalable, easily implemented, and provides guaranteed marginal coverage.\nGiven a collection of candidate amortized posterior approximators, CANVI\nconstructs conformalized predictors based on each candidate, compares the\npredictors using a metric known as predictive efficiency, and returns the most\nefficient predictor. CANVI ensures that the resulting predictor constructs\nregions that contain the truth with a user-specified level of probability.\nCANVI is agnostic to design decisions in formulating the candidate\napproximators and only requires access to samples from the forward model,\npermitting its use in likelihood-free settings. We prove lower bounds on the\npredictive efficiency of the regions produced by CANVI and explore how the\nquality of a posterior approximation relates to the predictive efficiency of\nprediction regions based on that approximation. 
Finally, we demonstrate the\naccurate calibration and high predictive efficiency of CANVI on a suite of\nsimulation-based inference benchmark tasks and an important scientific task:\nanalyzing galaxy emission spectra.\n","authors":["Yash Patel","Declan McNamara","Jackson Loper","Jeffrey Regier","Ambuj Tewari"],"pdf_url":"https://arxiv.org/pdf/2305.14275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17790v1","updated":"2024-07-25T05:52:48Z","published":"2024-07-25T05:52:48Z","title":"Exploring the Limitations of Kolmogorov-Arnold Networks in\n Classification: Insights to Software Training and Hardware Implementation","summary":" Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have\nrecently gained popularity and attention due to the ability to substitute\nmulti-layer perceptions (MLPs) in artificial intelligence (AI) with higher\naccuracy and interoperability. However, KAN assessment is still limited and\ncannot provide an in-depth analysis of a specific domain. Furthermore, no study\nhas been conducted on the implementation of KANs in hardware design, which\nwould directly demonstrate whether KANs are truly superior to MLPs in practical\napplications. As a result, in this paper, we focus on verifying KANs for\nclassification issues, which are a common but significant topic in AI using\nfour different types of datasets. Furthermore, the corresponding hardware\nimplementation is considered using the Vitis high-level synthesis (HLS) tool.\nTo the best of our knowledge, this is the first article to implement hardware\nfor KAN. The results indicate that KANs cannot achieve more accuracy than MLPs\nin high complex datasets while utilizing substantially higher hardware\nresources. Therefore, MLP remains an effective approach for achieving accuracy\nand efficiency in software and hardware implementation.\n","authors":["an Duy Tran","Tran Xuan Hieu Le","Thi Diem Tran","Hoai Luan Pham","Vu Trung Duong Le","Tuan Hai Vu","Van Tinh Nguyen","Yasuhiko Nakashima"],"pdf_url":"https://arxiv.org/pdf/2407.17790v1.pdf","comment":"6 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.17781v1","updated":"2024-07-25T05:22:08Z","published":"2024-07-25T05:22:08Z","title":"Integrating Ensemble Kalman Filter with AI-based Weather Prediction\n Model ClimaX","summary":" Artificial intelligence (AI)-based weather prediction research is growing\nrapidly and has shown to be competitive with the advanced dynamic numerical\nweather prediction models. However, research combining AI-based weather\nprediction models with data assimilation remains limited partially because\nlong-term sequential data assimilation cycles are required to evaluate data\nassimilation systems. This study explores integrating the local ensemble\ntransform Kalman filter (LETKF) with an AI-based weather prediction model\nClimaX. Our experiments demonstrated that the ensemble data assimilation cycled\nstably for the AI-based weather prediction model using covariance inflation and\nlocalization techniques inside the LETKF. While ClimaX showed some limitations\nin capturing flow-dependent error covariance compared to dynamical models, the\nAI-based ensemble forecasts provided reasonable and beneficial error covariance\nin sparsely observed regions. 
These findings highlight the potential of AI\nmodels in weather forecasting and the importance of physical consistency and\naccurate error growth representation in improving ensemble data assimilation.\n","authors":["Shunji Kotsuki","Kenta Shiraishi","Atsushi Okazaki"],"pdf_url":"https://arxiv.org/pdf/2407.17781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17773v1","updated":"2024-07-25T05:02:39Z","published":"2024-07-25T05:02:39Z","title":"KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models","summary":" This paper investigates visual analogical reasoning in large multimodal\nmodels (LMMs) compared to human adults and children. A \"visual analogy\" is an\nabstract rule inferred from one image and applied to another. While benchmarks\nexist for testing visual reasoning in LMMs, they require advanced skills and\nomit basic visual analogies that even young children can make. Inspired by\ndevelopmental psychology, we propose a new benchmark of 1,400 visual\ntransformations of everyday objects to test LMMs on visual analogical reasoning\nand compare them to children and adults. We structure the evaluation into three\nstages: identifying what changed (e.g., color, number, etc.), how it changed\n(e.g., added one object), and applying the rule to new scenarios. Our findings\nshow that while models like GPT-4V, LLaVA-1.5, and MANTIS identify the \"what\"\neffectively, they struggle with quantifying the \"how\" and extrapolating this\nrule to new objects. In contrast, children and adults exhibit much stronger\nanalogical reasoning at all three stages. Additionally, the strongest tested\nmodel, GPT-4V, performs better in tasks involving simple visual attributes like\ncolor and size, correlating with quicker human adult response times.\nConversely, more complex tasks such as number, rotation, and reflection, which\nnecessitate extensive cognitive processing and understanding of the 3D physical\nworld, present more significant challenges. Altogether, these findings\nhighlight the limitations of training models on data that primarily consists of\n2D images and text.\n","authors":["Eunice Yiu","Maan Qraitem","Charlie Wong","Anisa Noor Majhi","Yutong Bai","Shiry Ginosar","Alison Gopnik","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2407.17773v1.pdf","comment":"9 pages. For the KiVA benchmark, see https://github.com/ey242/KiVA"},{"id":"http://arxiv.org/abs/2407.17767v1","updated":"2024-07-25T04:48:56Z","published":"2024-07-25T04:48:56Z","title":"Online Learning for Autonomous Management of Intent-based 6G Networks","summary":" The growing complexity of networks and the variety of future scenarios with\ndiverse and often stringent performance requirements call for a higher level of\nautomation. Intent-based management emerges as a solution to attain high level\nof automation, enabling human operators to solely communicate with the network\nthrough high-level intents. The intents consist of the targets in the form of\nexpectations (i.e., latency expectation) from a service and based on the\nexpectations the required network configurations should be done accordingly. It\nis almost inevitable that when a network action is taken to fulfill one intent,\nit can cause negative impacts on the performance of another intent, which\nresults in a conflict. In this paper, we aim to address the conflict issue and\nautonomous management of intent-based networking, and propose an online\nlearning method based on the hierarchical multi-armed bandits approach for an\neffective management. 
Thanks to this hierarchical structure, it performs an\nefficient exploration and exploitation of network configurations with respect\nto the dynamic network conditions. We show that our algorithm is an effective\napproach regarding resource allocation and satisfaction of intent expectations.\n","authors":["Erciyes Karakaya","Ozgur Ercetin","Huseyin Ozkan","Mehmet Karaca","Elham Dehghan Biyar","Alexandros Palaios"],"pdf_url":"https://arxiv.org/pdf/2407.17767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02290v2","updated":"2024-07-25T04:43:32Z","published":"2024-02-03T23:04:32Z","title":"Goodness-of-Fit and Clustering of Spherical Data: the QuadratiK package\n in R and Python","summary":" We introduce the QuadratiK package that incorporates innovative data analysis\nmethodologies. The presented software, implemented in both R and Python, offers\na comprehensive set of goodness-of-fit tests and clustering techniques using\nkernel-based quadratic distances, thereby bridging the gap between the\nstatistical and machine learning literatures. Our software implements one, two\nand k-sample tests for goodness of fit, providing an efficient and\nmathematically sound way to assess the fit of probability distributions.\nExpanded capabilities of our software include supporting tests for uniformity\non the d-dimensional Sphere based on Poisson kernel densities. Particularly\nnoteworthy is the incorporation of a unique clustering algorithm specifically\ntailored for spherical data that leverages a mixture of Poisson kernel-based\ndensities on the sphere. Alongside this, our software includes additional\ngraphical functions, aiding the users in validating, as well as visualizing and\nrepresenting clustering results. This enhances interpretability and usability\nof the analysis. In summary, our R and Python packages serve as a powerful\nsuite of tools, offering researchers and practitioners the means to delve\ndeeper into their data, draw robust inference, and conduct potentially\nimpactful analyses and inference across a wide array of disciplines.\n","authors":["Giovanni Saraceno","Marianthi Markatou","Raktim Mukhopadhyay","Mojgan Golzy"],"pdf_url":"https://arxiv.org/pdf/2402.02290v2.pdf","comment":"36 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.16884v2","updated":"2024-07-25T04:30:15Z","published":"2023-08-31T17:43:08Z","title":"The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122\n Language Variants","summary":" We present Belebele, a multiple-choice machine reading comprehension (MRC)\ndataset spanning 122 language variants. Significantly expanding the language\ncoverage of natural language understanding (NLU) benchmarks, this dataset\nenables the evaluation of text models in high-, medium-, and low-resource\nlanguages. Each question is based on a short passage from the Flores-200\ndataset and has four multiple-choice answers. The questions were carefully\ncurated to discriminate between models with different levels of general\nlanguage comprehension. The English dataset on its own proves difficult enough\nto challenge state-of-the-art language models. Being fully parallel, this\ndataset enables direct comparison of model performance across all languages. We\nuse this dataset to evaluate the capabilities of multilingual masked language\nmodels (MLMs) and large language models (LLMs). 
We present extensive results\nand find that despite significant cross-lingual transfer in English-centric\nLLMs, much smaller MLMs pretrained on balanced multilingual data still\nunderstand far more languages. We also observe that larger vocabulary size and\nconscious vocabulary construction correlate with better performance on\nlow-resource languages. Overall, Belebele opens up new avenues for evaluating\nand analyzing the multilingual capabilities of NLP systems.\n","authors":["Lucas Bandarkar","Davis Liang","Benjamin Muller","Mikel Artetxe","Satya Narayan Shukla","Donald Husa","Naman Goyal","Abhinandan Krishnan","Luke Zettlemoyer","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2308.16884v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.11358v2","updated":"2024-07-25T04:20:12Z","published":"2024-07-16T03:46:57Z","title":"SES: Bridging the Gap Between Explainability and Prediction of Graph\n Neural Networks","summary":" Despite the Graph Neural Networks' (GNNs) proficiency in analyzing graph\ndata, achieving high-accuracy and interpretable predictions remains\nchallenging. Existing GNN interpreters typically provide post-hoc explanations\ndisjointed from GNNs' predictions, resulting in misrepresentations.\nSelf-explainable GNNs offer built-in explanations during the training process.\nHowever, they cannot exploit the explanatory outcomes to augment prediction\nperformance, and they fail to provide high-quality explanations of node\nfeatures and require additional processes to generate explainable subgraphs,\nwhich is costly. To address the aforementioned limitations, we propose a\nself-explained and self-supervised graph neural network (SES) to bridge the gap\nbetween explainability and prediction. SES comprises two processes: explainable\ntraining and enhanced predictive learning. During explainable training, SES\nemploys a global mask generator co-trained with a graph encoder and directly\nproduces crucial structure and feature masks, reducing time consumption and\nproviding node feature and subgraph explanations. In the enhanced predictive\nlearning phase, mask-based positive-negative pairs are constructed utilizing\nthe explanations to compute a triplet loss and enhance the node representations\nby contrastive learning.\n","authors":["Zhenhua Huang","Kunhao Li","Shaojie Wang","Zhaohong Jia","Wentao Zhu","Sharad Mehrotra"],"pdf_url":"https://arxiv.org/pdf/2407.11358v2.pdf","comment":"Accepted as a conference paper at ICDE 2024"},{"id":"http://arxiv.org/abs/2403.08757v3","updated":"2024-07-25T04:12:17Z","published":"2024-03-13T17:55:34Z","title":"Efficient Combinatorial Optimization via Heat Diffusion","summary":" Combinatorial optimization problems are widespread but inherently challenging\ndue to their discrete nature. The primary limitation of existing methods is\nthat they can only access a small fraction of the solution space at each\niteration, resulting in limited efficiency for searching the global optimal.To\novercome this challenge, diverging from conventional efforts of expanding the\nsolver's search scope, we focus on enabling information to actively propagate\nto the solver through heat diffusion. By transforming the target function while\npreserving its optima, heat diffusion facilitates information flow from distant\nregions to the solver, providing more efficient navigation. 
Utilizing heat\ndiffusion, we propose a framework for solving general combinatorial\noptimization problems.The proposed methodology demonstrates superior\nperformance across a range of the most challenging and widely encountered\ncombinatorial optimizations. Echoing recent advancements in harnessing\nthermodynamics for generative artificial intelligence, our study further\nreveals its significant potential in advancing combinatorial optimization.\n","authors":["Hengyuan Ma","Wenlian Lu","Jianfeng Feng"],"pdf_url":"https://arxiv.org/pdf/2403.08757v3.pdf","comment":"Code is available in https://github.com/AwakerMhy/HeO"},{"id":"http://arxiv.org/abs/2407.17754v1","updated":"2024-07-25T04:09:12Z","published":"2024-07-25T04:09:12Z","title":"DualFed: Enjoying both Generalization and Personalization in Federated\n Learning via Hierachical Representations","summary":" In personalized federated learning (PFL), it is widely recognized that\nachieving both high model generalization and effective personalization poses a\nsignificant challenge due to their conflicting nature. As a result, existing\nPFL methods can only manage a trade-off between these two objectives. This\nraises an interesting question: Is it feasible to develop a model capable of\nachieving both objectives simultaneously? Our paper presents an affirmative\nanswer, and the key lies in the observation that deep models inherently exhibit\nhierarchical architectures, which produce representations with various levels\nof generalization and personalization at different stages. A straightforward\napproach stemming from this observation is to select multiple representations\nfrom these layers and combine them to concurrently achieve generalization and\npersonalization. However, the number of candidate representations is commonly\nhuge, which makes this method infeasible due to high computational costs.To\naddress this problem, we propose DualFed, a new method that can directly yield\ndual representations correspond to generalization and personalization\nrespectively, thereby simplifying the optimization task. Specifically, DualFed\ninserts a personalized projection network between the encoder and classifier.\nThe pre-projection representations are able to capture generalized information\nshareable across clients, and the post-projection representations are effective\nto capture task-specific information on local clients. This design minimizes\nthe mutual interference between generalization and personalization, thereby\nachieving a win-win situation. Extensive experiments show that DualFed can\noutperform other FL methods. Code is available at\nhttps://github.com/GuogangZhu/DualFed.\n","authors":["Guogang Zhu","Xuefeng Liu","Jianwei Niu","Shaojie Tang","Xinghao Wu","Jiayuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.17754v1.pdf","comment":"Accepted by ACM MutltiMedia 2024"},{"id":"http://arxiv.org/abs/2404.01039v3","updated":"2024-07-25T03:35:48Z","published":"2024-04-01T10:50:34Z","title":"A Survey on Hypergraph Neural Networks: An In-Depth and Step-By-Step\n Guide","summary":" Higher-order interactions (HOIs) are ubiquitous in real-world complex systems\nand applications. Investigation of deep learning for HOIs, thus, has become a\nvaluable agenda for the data mining and machine learning communities. As\nnetworks of HOIs are expressed mathematically as hypergraphs, hypergraph neural\nnetworks (HNNs) have emerged as a powerful tool for representation learning on\nhypergraphs. 
Given the emerging trend, we present the first survey dedicated to\nHNNs, with an in-depth and step-by-step guide. Broadly, the present survey\noverviews HNN architectures, training strategies, and applications. First, we\nbreak existing HNNs down into four design components: (i) input features, (ii)\ninput structures, (iii) message-passing schemes, and (iv) training strategies.\nSecond, we examine how HNNs address and learn HOIs with each of their\ncomponents. Third, we overview the recent applications of HNNs in\nrecommendation, bioinformatics and medical science, time series analysis, and\ncomputer vision. Lastly, we conclude with a discussion on limitations and\nfuture directions.\n","authors":["Sunwoo Kim","Soo Yong Lee","Yue Gao","Alessia Antelmi","Mirko Polato","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2404.01039v3.pdf","comment":"To appear in KDD 2024 (survey paper). The typo in Equation (5) has\n been fixed"},{"id":"http://arxiv.org/abs/2311.07052v3","updated":"2024-07-25T03:20:15Z","published":"2023-11-13T03:36:18Z","title":"Towards the Law of Capacity Gap in Distilling Language Models","summary":" Language model (LM) distillation is a trending area that aims to distil the\nknowledge residing in a large teacher LM to a small student one. While various\nmethods have been proposed to maximize the effectiveness of the distillation,\nsignificant challenges persist, particularly when there is a substantial\ncapacity gap between the teacher and student LMs. This issue, often referred to\nas the \\textit{curse} of capacity gap, suggests that a larger teacher does not\nnecessarily result in a superior student compared to one distilled from a\nsmaller teacher. In other words, there is likely an optimal teacher yielding\nthe best student along the scaling course of the teacher. However, the curse of\ncapacity gap can not be tackled without notable compute overhead, as indicated\nin previous studies. In the context of large LMs (LLMs), previously viable\napproaches become much less meaningful, as it is an impossible triangle to\ndistill an expected student from an optimal teacher student with small compute\noverhead. Fortunately, the impossible triangle can fortunately be possible\nprovided an inducted \\textit{law} of capacity gap. In this paper, we take the\nspirits of scaling law and reveal that the optimal teacher scale almost\nconsistently follows a linear scaling with the student scale across different\nmodel architectures and data scales. The law later guides us to distil a 3B\nstudent LM (termed \\textsc{MiniMA}) from LLaMA2-7B. \\textsc{MiniMA} is\ndemonstrated to outperform a wide range of 3B competitors and could even\ncompete with several 7B models.\n","authors":["Chen Zhang","Dawei Song","Zheyu Ye","Yan Gao"],"pdf_url":"https://arxiv.org/pdf/2311.07052v3.pdf","comment":"32 pages, 10 figures, 15 tables, work in progress. Code and\n checkpoints are available at https://github.com/GeneZC/MiniMA"},{"id":"http://arxiv.org/abs/2402.18729v2","updated":"2024-07-25T03:06:54Z","published":"2024-02-28T22:19:55Z","title":"A Priori Uncertainty Quantification of Reacting Turbulence Closure\n Models using Bayesian Neural Networks","summary":" While many physics-based closure model forms have been posited for the\nsub-filter scale (SFS) in large eddy simulation (LES), vast amounts of data\navailable from direct numerical simulation (DNS) create opportunities to\nleverage data-driven modeling techniques. 
Albeit flexible, data-driven models\nstill depend on the dataset and the functional form of the model chosen.\nIncreased adoption of such models requires reliable uncertainty estimates both\nin the data-informed and out-of-distribution regimes. In this work, we employ\nBayesian neural networks (BNNs) to capture both epistemic and aleatoric\nuncertainties in a reacting flow model. In particular, we model the filtered\nprogress variable scalar dissipation rate which plays a key role in the\ndynamics of turbulent premixed flames. We demonstrate that BNN models can\nprovide unique insights about the structure of uncertainty of the data-driven\nclosure models. We also propose a method for the incorporation of\nout-of-distribution information in a BNN. The efficacy of the model is\ndemonstrated by a priori evaluation on a dataset consisting of a variety of\nflame conditions and fuels.\n","authors":["Graham Pash","Malik Hassanaly","Shashank Yellapantula"],"pdf_url":"https://arxiv.org/pdf/2402.18729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17731v1","updated":"2024-07-25T03:03:20Z","published":"2024-07-25T03:03:20Z","title":"Optimal Trade and Industrial Policies in the Global Economy: A Deep\n Learning Framework","summary":" We propose a deep learning framework, DL-opt, designed to efficiently solve\nfor optimal policies in quantifiable general equilibrium trade models. DL-opt\nintegrates (i) a nested fixed point (NFXP) formulation of the optimization\nproblem, (ii) automatic implicit differentiation to enhance gradient descent\nfor solving unilateral optimal policies, and (iii) a best-response dynamics\napproach for finding Nash equilibria. Utilizing DL-opt, we solve for\nnon-cooperative tariffs and industrial subsidies across 7 economies and 44\nsectors, incorporating sectoral external economies of scale. Our quantitative\nanalysis reveals significant sectoral heterogeneity in Nash policies: Nash\nindustrial subsidies increase with scale elasticities, whereas Nash tariffs\ndecrease with trade elasticities. Moreover, we show that global dual\ncompetition, involving both tariffs and industrial subsidies, results in lower\ntariffs and higher welfare outcomes compared to a global tariff war. These\nfindings highlight the importance of considering sectoral heterogeneity and\npolicy combinations in understanding global economic competition.\n","authors":["Zi Wang","Xingcheng Xu","Yanqing Yang","Xiaodong Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.17731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17726v1","updated":"2024-07-25T02:55:39Z","published":"2024-07-25T02:55:39Z","title":"Multi-modal Data Binding for Survival Analysis Modeling with Incomplete\n Data and Annotations","summary":" Survival analysis stands as a pivotal process in cancer treatment research,\ncrucial for predicting patient survival rates accurately. Recent advancements\nin data collection techniques have paved the way for enhancing survival\npredictions by integrating information from multiple modalities. However,\nreal-world scenarios often present challenges with incomplete data,\nparticularly when dealing with censored survival labels. Prior works have\naddressed missing modalities but have overlooked incomplete labels, which can\nintroduce bias and limit model efficacy. To bridge this gap, we introduce a\nnovel framework that simultaneously handles incomplete data across modalities\nand censored survival labels. 
Our approach employs advanced foundation models\nto encode individual modalities and align them into a universal representation\nspace for seamless fusion. By generating pseudo labels and incorporating\nuncertainty, we significantly enhance predictive accuracy. The proposed method\ndemonstrates outstanding prediction accuracy in two survival analysis tasks on\nboth employed datasets. This innovative approach overcomes limitations\nassociated with disparate modalities and improves the feasibility of\ncomprehensive survival analysis using multiple large foundation models.\n","authors":["Linhao Qu","Dan Huang","Shaoting Zhang","Xiaosong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.17726v1.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.17723v1","updated":"2024-07-25T02:53:11Z","published":"2024-07-25T02:53:11Z","title":"Your Graph Recommender is Provably a Single-view Graph Contrastive\n Learning","summary":" Graph recommender (GR) is a type of graph neural network (GNNs) encoder that\nis customized for extracting information from the user-item interaction graph.\nDue to its strong performance on the recommendation task, GR has gained\nsignificant attention recently. Graph contrastive learning (GCL) is also a\npopular research direction that aims to learn, often unsupervised, GNNs with\ncertain contrastive objectives. As a general graph representation learning\nmethod, GCLs have been widely adopted with the supervised recommendation loss\nfor joint training of GRs. Despite the intersection of GR and GCL research,\ntheoretical understanding of the relationship between the two fields is\nsurprisingly sparse. This vacancy inevitably leads to inefficient scientific\nresearch.\n In this paper, we aim to bridge the gap between the field of GR and GCL from\nthe perspective of encoders and loss functions. With mild assumptions, we\ntheoretically show an astonishing fact that graph recommender is equivalent to\na commonly-used single-view graph contrastive model. Specifically, we find that\n(1) the classic encoder in GR is essentially a linear graph convolutional\nnetwork with one-hot inputs, and (2) the loss function in GR is well bounded by\na single-view GCL loss with certain hyperparameters. The first observation\nenables us to explain crucial designs of GR models, e.g., the removal of\nself-loop and nonlinearity. And the second finding can easily prompt many\ncross-field research directions. We empirically show a remarkable result that\nthe recommendation loss and the GCL loss can be used interchangeably. The fact\nthat we can train GR models solely with the GCL loss is particularly\ninsightful, since before this work, GCLs were typically viewed as unsupervised\nmethods that need fine-tuning. We also discuss some potential future works\ninspired by our theory.\n","authors":["Wenjie Yang","Shengzhong Zhang","Jiaxing Guo","Zengfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2407.17723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17722v1","updated":"2024-07-25T02:48:56Z","published":"2024-07-25T02:48:56Z","title":"Text-Driven Neural Collaborative Filtering Model for Paper Source\n Tracing","summary":" Identifying significant references within the complex interrelations of a\ncitation knowledge graph is challenging, which encompasses connections through\ncitations, authorship, keywords, and other relational attributes. 
The Paper\nSource Tracing (PST) task seeks to automate the identification of pivotal\nreferences for given scholarly articles utilizing advanced data mining\ntechniques. In the KDD CUP 2024, we design a recommendation-based framework\ntailored for the PST task. This framework employs the Neural Collaborative\nFiltering (NCF) model to generate final predictions. To process the textual\nattributes of the papers and extract input features for the model, we utilize\nSciBERT, a pre-trained language model. According to the experimental results,\nour method achieved a score of 0.37814 on the Mean Average Precision (MAP)\nmetric, outperforming baseline models and ranking 11th among all participating\nteams. The source code is publicly available at\nhttps://github.com/MyLove-XAB/KDDCupFinal.\n","authors":["Aobo Xu","Bingyu Chang","Qingpeng Liu","Ling Jian"],"pdf_url":"https://arxiv.org/pdf/2407.17722v1.pdf","comment":"KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report\n of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain"},{"id":"http://arxiv.org/abs/2407.17721v1","updated":"2024-07-25T02:48:22Z","published":"2024-07-25T02:48:22Z","title":"A Two-Stage Imaging Framework Combining CNN and Physics-Informed Neural\n Networks for Full-Inverse Tomography: A Case Study in Electrical Impedance\n Tomography (EIT)","summary":" Physics-Informed Neural Networks (PINNs) are a machine learning technique for\nsolving partial differential equations (PDEs) by incorporating PDEs as loss\nterms in neural networks and minimizing the loss function during training.\nTomographic imaging, a method to reconstruct internal properties from external\nmeasurement data, is highly complex and ill-posed, making it an inverse\nproblem. Recently, PINNs have shown significant potential in computational\nfluid dynamics (CFD) and have advantages in solving inverse problems. However,\nexisting research has primarily focused on semi-inverse Electrical Impedance\nTomography (EIT), where internal electric potentials are accessible. The\npractical full inverse EIT problem, where only boundary voltage measurements\nare available, remains challenging. To address this, we propose a two-stage\nhybrid learning framework combining Convolutional Neural Networks (CNNs) and\nPINNs to solve the full inverse EIT problem. This framework integrates\ndata-driven and model-driven approaches, combines supervised and unsupervised\nlearning, and decouples the forward and inverse problems within the PINN\nframework in EIT. Stage I: a U-Net constructs an end-to-end mapping from\nboundary voltage measurements to the internal potential distribution using\nsupervised learning. Stage II: a Multilayer Perceptron (MLP)-based PINN takes\nthe predicted internal potentials as input to solve for the conductivity\ndistribution through unsupervised learning.\n","authors":["Xuanxuan Yang","Yangming Zhang","Haofeng Chen","Gang Ma","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2407.17721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17716v1","updated":"2024-07-25T02:30:40Z","published":"2024-07-25T02:30:40Z","title":"Describe Where You Are: Improving Noise-Robustness for Speech Emotion\n Recognition with Text Description of the Environment","summary":" Speech emotion recognition (SER) systems often struggle in real-world\nenvironments, where ambient noise severely degrades their performance. 
This\npaper explores a novel approach that exploits prior knowledge of testing\nenvironments to maximize SER performance under noisy conditions. To address\nthis task, we propose a text-guided, environment-aware training where an SER\nmodel is trained with contaminated speech samples and their paired noise\ndescription. We use a pre-trained text encoder to extract the text-based\nenvironment embedding and then fuse it to a transformer-based SER model during\ntraining and inference. We demonstrate the effectiveness of our approach\nthrough our experiment with the MSP-Podcast corpus and real-world additive\nnoise samples collected from the Freesound repository. Our experiment indicates\nthat the text-based environment descriptions processed by a large language\nmodel (LLM) produce representations that improve the noise-robustness of the\nSER system. In addition, our proposed approach with an LLM yields better\nperformance than our environment-agnostic baselines, especially in low\nsignal-to-noise ratio (SNR) conditions. When testing at -5dB SNR level, our\nproposed method shows better performance than our best baseline model by 31.8 %\n(arousal), 23.5% (dominance), and 9.5% (valence).\n","authors":["Seong-Gyun Leem","Daniel Fulford","Jukka-Pekka Onnela","David Gard","Carlos Busso"],"pdf_url":"https://arxiv.org/pdf/2407.17716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06679v3","updated":"2024-07-25T02:30:32Z","published":"2023-09-13T02:34:21Z","title":"Robust experimental data assimilation for the Spalart-Allmaras\n turbulence model","summary":" This study presents a methodology focusing on the use of computational model\nand experimental data fusion to improve the Spalart-Allmaras (SA) closure model\nfor Reynolds-averaged Navier-Stokes solutions. In particular, our goal is to\ndevelop a technique that not only assimilates sparse experimental data to\nimprove turbulence model performance, but also preserves generalization for\nunseen cases by recovering classical SA behavior. We achieve our goals using\ndata assimilation, namely the Ensemble Kalman filtering approach (EnKF), to\ncalibrate the coefficients of the SA model for separated flows. A holistic\ncalibration strategy is implemented via the parameterization of the production,\ndiffusion, and destruction terms. This calibration relies on the assimilation\nof experimental data collected in the form of velocity profiles, skin friction,\nand pressure coefficients. Despite using observational data from a single flow\ncondition around a backward-facing step (BFS), the recalibrated SA model\ndemonstrates generalization to other separated flows, including cases such as\nthe 2D NASA wall mounted hump (2D-WMH) and modified BFS. 
Significant\nimprovement is observed in the quantities of interest, i.e., skin friction\ncoefficient ($C_f$) and pressure coefficient ($C_p$) for each flow tested.\nFinally, it is also demonstrated that the newly proposed model recovers SA\nproficiency for flows, such as a NACA-0012 airfoil and axisymmetric jet (ASJ),\nand that the individually calibrated terms in the SA model target specific\nflow-physics wherein the calibrated production term improves the re-circulation\nzone while destruction improves the recovery zone.\n","authors":["Deepinder Jot Singh Aulakh","Xiang Yang","Romit Maulik"],"pdf_url":"https://arxiv.org/pdf/2309.06679v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01599v2","updated":"2024-07-25T02:25:11Z","published":"2024-06-26T02:20:23Z","title":"JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large\n Language and Vision-Language Models","summary":" The rapid evolution of artificial intelligence (AI) through developments in\nLarge Language Models (LLMs) and Vision-Language Models (VLMs) has brought\nsignificant advancements across various technological domains. While these\nmodels enhance capabilities in natural language processing and visual\ninteractive tasks, their growing adoption raises critical concerns regarding\nsecurity and ethical alignment. This survey provides an extensive review of the\nemerging field of jailbreaking--deliberately circumventing the ethical and\noperational boundaries of LLMs and VLMs--and the consequent development of\ndefense mechanisms. Our study categorizes jailbreaks into seven distinct types\nand elaborates on defense strategies that address these vulnerabilities.\nThrough this comprehensive examination, we identify research gaps and propose\ndirections for future studies to enhance the security frameworks of LLMs and\nVLMs. Our findings underscore the necessity for a unified perspective that\nintegrates both jailbreak strategies and defensive solutions to foster a\nrobust, secure, and reliable environment for the next generation of language\nmodels. More details can be found on our website:\n\\url{https://chonghan-chen.com/llm-jailbreak-zoo-survey/}.\n","authors":["Haibo Jin","Leyang Hu","Xinuo Li","Peiyan Zhang","Chonghan Chen","Jun Zhuang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.01599v2.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2407.17712v1","updated":"2024-07-25T02:17:53Z","published":"2024-07-25T02:17:53Z","title":"Improving Online Algorithms via ML Predictions","summary":" In this work we study the problem of using machine-learned predictions to\nimprove the performance of online algorithms. We consider two classical\nproblems, ski rental and non-clairvoyant job scheduling, and obtain new online\nalgorithms that use predictions to make their decisions. These algorithms are\noblivious to the performance of the predictor, improve with better predictions,\nbut do not degrade much if the predictions are poor.\n","authors":["Ravi Kumar","Manish Purohit","Zoya Svitkina"],"pdf_url":"https://arxiv.org/pdf/2407.17712v1.pdf","comment":"Conference version appeared in Neurips 2018"},{"id":"http://arxiv.org/abs/1810.13431v3","updated":"2024-07-25T10:21:32Z","published":"2018-10-31T17:44:20Z","title":"Targeted stochastic gradient Markov chain Monte Carlo for hidden Markov\n models with rare latent states","summary":" Markov chain Monte Carlo (MCMC) algorithms for hidden Markov models often\nrely on the forward-backward sampler. 
This makes them computationally slow as\nthe length of the time series increases, motivating the development of\nsub-sampling-based approaches. These approximate the full posterior by using\nsmall random subsequences of the data at each MCMC iteration within stochastic\ngradient MCMC. In the presence of imbalanced data resulting from rare latent\nstates, subsequences often exclude rare latent state data, leading to\ninaccurate inference and prediction/detection of rare events. We propose a\ntargeted sub-sampling (TASS) approach that over-samples observations\ncorresponding to rare latent states when calculating the stochastic gradient of\nparameters associated with them. TASS uses an initial clustering of the data to\nconstruct subsequence weights that reduce the variance in gradient estimation.\nThis leads to improved sampling efficiency, in particular in settings where the\nrare latent states correspond to extreme observations. We demonstrate\nsubstantial gains in predictive and inferential accuracy on real and synthetic\nexamples.\n","authors":["Rihui Ou","Deborshee Sen","Alexander L Young","David B Dunson"],"pdf_url":"https://arxiv.org/pdf/1810.13431v3.pdf","comment":null},{"id":"http://arxiv.org/abs/1906.08619v2","updated":"2024-07-25T08:29:47Z","published":"2019-06-20T13:51:07Z","title":"Bayesian Modelling in Practice: Using Uncertainty to Improve\n Trustworthiness in Medical Applications","summary":" The Intensive Care Unit (ICU) is a hospital department where machine learning\nhas the potential to provide valuable assistance in clinical decision making.\nClassical machine learning models usually only provide point-estimates and no\nuncertainty of predictions. In practice, uncertain predictions should be\npresented to doctors with extra care in order to prevent potentially\ncatastrophic treatment decisions. In this work we show how Bayesian modelling\nand the predictive uncertainty that it provides can be used to mitigate risk of\nmisguided prediction and to detect out-of-domain examples in a medical setting.\nWe derive analytically a bound on the prediction loss with respect to\npredictive uncertainty. The bound shows that uncertainty can mitigate loss.\nFurthermore, we apply a Bayesian Neural Network to the MIMIC-III dataset,\npredicting risk of mortality of ICU patients. Our empirical results show that\nuncertainty can indeed prevent potential errors and reliably identifies\nout-of-domain patients. These results suggest that Bayesian predictive\nuncertainty can greatly improve trustworthiness of machine learning models in\nhigh-risk settings such as the ICU.\n","authors":["David Ruhe","Giovanni Cinà","Michele Tonutti","Daan de Bruin","Paul Elbers"],"pdf_url":"https://arxiv.org/pdf/1906.08619v2.pdf","comment":"Presented at AISG @ ICML2019:\n https://aiforsocialgood.github.io/icml2019/index.htm"},{"id":"http://arxiv.org/abs/2402.04417v2","updated":"2024-07-25T23:31:56Z","published":"2024-02-06T21:33:34Z","title":"Decentralized Blockchain-based Robust Multi-agent Multi-armed Bandit","summary":" We study a robust, i.e. in presence of malicious participants, multi-agent\nmulti-armed bandit problem where multiple participants are distributed on a\nfully decentralized blockchain, with the possibility of some being malicious.\nThe rewards of arms are homogeneous among the honest participants, following\ntime-invariant stochastic distributions, which are revealed to the participants\nonly when certain conditions are met to ensure that the coordination mechanism\nis secure enough. 
The coordination mechanism's objective is to efficiently\nensure the cumulative rewards gained by the honest participants are maximized.\nTo this end, we are the first to incorporate advanced techniques from\nblockchains, as well as novel mechanisms, into such a cooperative decision\nmaking framework to design optimal strategies for honest participants. This\nframework allows various malicious behaviors and the maintenance of security\nand participant privacy. More specifically, we select a pool of validators who\ncommunicate to all participants, design a new consensus mechanism based on\ndigital signatures for these validators, invent a UCB-based strategy that\nrequires less information from participants through secure multi-party\ncomputation, and design the chain-participant interaction and an incentive\nmechanism to encourage participants' participation. Notably, we are the first\nto prove the theoretical regret of the proposed algorithm and claim its\noptimality. Unlike existing work that integrates blockchains with learning\nproblems such as federated learning which mainly focuses on optimality via\ncomputational experiments, we demonstrate that the regret of honest\nparticipants is upper bounded by $\\log{T}$ under certain assumptions. The\nregret bound is consistent with the multi-agent multi-armed bandit problem,\nboth without malicious participants and with purely Byzantine attacks which do\nnot affect the entire system.\n","authors":["Mengfan Xu","Diego Klabjan"],"pdf_url":"https://arxiv.org/pdf/2402.04417v2.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2407.18428v1","updated":"2024-07-25T23:27:10Z","published":"2024-07-25T23:27:10Z","title":"Weighted Risk Invariance: Domain Generalization under Invariant Feature\n Shift","summary":" Learning models whose predictions are invariant under multiple environments\nis a promising approach for out-of-distribution generalization. Such models are\ntrained to extract features $X_{\\text{inv}}$ where the conditional distribution\n$Y \\mid X_{\\text{inv}}$ of the label given the extracted features does not\nchange across environments. Invariant models are also supposed to generalize to\nshifts in the marginal distribution $p(X_{\\text{inv}})$ of the extracted\nfeatures $X_{\\text{inv}}$, a type of shift we call an $\\textit{invariant\ncovariate shift}$. However, we show that proposed methods for learning\ninvariant models underperform under invariant covariate shift, either failing\nto learn invariant models$\\unicode{x2014}$even for data generated from simple\nand well-studied linear-Gaussian models$\\unicode{x2014}$or having poor\nfinite-sample performance. To alleviate these problems, we propose\n$\\textit{weighted risk invariance}$ (WRI). Our framework is based on imposing\ninvariance of the loss across environments subject to appropriate reweightings\nof the training examples. We show that WRI provably learns invariant models,\ni.e. discards spurious correlations, in linear-Gaussian settings. 
We propose a\npractical algorithm to implement WRI by learning the density\n$p(X_{\\text{inv}})$ and the model parameters simultaneously, and we demonstrate\nempirically that WRI outperforms previous invariant learning methods under\ninvariant covariate shift.\n","authors":["Gina Wong","Joshua Gleason","Rama Chellappa","Yoav Wald","Anqi Liu"],"pdf_url":"https://arxiv.org/pdf/2407.18428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18426v1","updated":"2024-07-25T23:04:37Z","published":"2024-07-25T23:04:37Z","title":"Diffusion-based subsurface multiphysics monitoring and forecasting","summary":" Carbon capture and storage (CCS) plays a crucial role in mitigating\ngreenhouse gas emissions, particularly from industrial outputs. Using seismic\nmonitoring can aid in an accurate and robust monitoring system to ensure the\neffectiveness of CCS and mitigate associated risks. However, conventional\nseismic wave equation-based approaches are computationally demanding, which\nhinders real-time applications. In addition to efficiency, forecasting and\nuncertainty analysis are not easy to handle using such\nnumerical-simulation-based approaches. To this end, we propose a novel\nsubsurface multiphysics monitoring and forecasting framework utilizing video\ndiffusion models. This approach can generate high-quality representations of\nCO$2$ evolution and associated changes in subsurface elastic properties. With\nreconstruction guidance, forecasting and inversion can be achieved conditioned\non historical frames and/or observational data. Meanwhile, due to the\ngenerative nature of the approach, we can quantify uncertainty in the\nprediction. Tests based on the Compass model show that the proposed method\nsuccessfully captured the inherently complex physical phenomena associated with\nCO$_2$ monitoring, and it can predict and invert the subsurface elastic\nproperties and CO$_2$ saturation with consistency in their evolution.\n","authors":["Xinquan Huang","Fu Wang","Tariq Alkhalifah"],"pdf_url":"https://arxiv.org/pdf/2407.18426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18424v1","updated":"2024-07-25T22:56:21Z","published":"2024-07-25T22:56:21Z","title":"Model-driven Heart Rate Estimation and Heart Murmur Detection based on\n Phonocardiogram","summary":" Acoustic signals are crucial for health monitoring, particularly heart sounds\nwhich provide essential data like heart rate and detect cardiac anomalies such\nas murmurs. This study utilizes a publicly available phonocardiogram (PCG)\ndataset to estimate heart rate using model-driven methods and extends the\nbest-performing model to a multi-task learning (MTL) framework for simultaneous\nheart rate estimation and murmur detection. Heart rate estimates are derived\nusing a sliding window technique on heart sound snippets, analyzed with a\ncombination of acoustic features (Mel spectrogram, cepstral coefficients, power\nspectral density, root mean square energy). Our findings indicate that a 2D\nconvolutional neural network (\\textbf{\\texttt{2dCNN}}) is most effective for\nheart rate estimation, achieving a mean absolute error (MAE) of 1.312 bpm. We\nsystematically investigate the impact of different feature combinations and\nfind that utilizing all four features yields the best results. 
The MTL model\n(\\textbf{\\texttt{2dCNN-MTL}}) achieves accuracy over 95% in murmur detection,\nsurpassing existing models, while maintaining an MAE of 1.636 bpm in heart rate\nestimation, satisfying the requirements stated by Association for the\nAdvancement of Medical Instrumentation (AAMI).\n","authors":["Jingping Nie","Ran Liu","Behrooz Mahasseni","Erdrin Azemi","Vikramjit Mitra"],"pdf_url":"https://arxiv.org/pdf/2407.18424v1.pdf","comment":"6 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.18423v1","updated":"2024-07-25T22:48:08Z","published":"2024-07-25T22:48:08Z","title":"HDL-GPT: High-Quality HDL is All You Need","summary":" This paper presents Hardware Description Language Generative Pre-trained\nTransformers (HDL-GPT), a novel approach that leverages the vast repository of\nopen-source High Definition Language (HDL) codes to train superior quality\nlarge code models. The core premise of this paper is the hypothesis that\nhigh-quality HDL is all you need to create models with exceptional performance\nand broad zero-shot generalization abilities. The paper elucidates the methods\nemployed for the curation and augmentation of large corpora from open-source\nHDL code, transforming highly variable quality data into high-quality data\nthrough careful prompting and context maintenance. We demonstrate that the\ncareful selection, filtering, and augmentation of data across HDLs can yield\npowerful models that surpass current state-of-the-art models. We also explore\nthe impact of different fine-tuning methods on the quality of results. We\ndescribe experimental results across a range of fine-tuned SOTA LLMs,\nsubstantiating our claims. We demonstrate improvements of 50% to 200% over SOTA\nHDL models on current benchmarks in tasks ranging from HDL circuit\nexplanations, code generation, formal and simulation testbench creation,\ntriaging bugs, and fixing them. HDL-GPT opens new avenues for the development\nof advanced model training techniques for circuit design tasks.\n","authors":["Bhuvnesh Kumar","Saurav Nanda","Ganapathy Parthasarathy","Pawan Patil","Austin Tsai","Parivesh Choudhary"],"pdf_url":"https://arxiv.org/pdf/2407.18423v1.pdf","comment":"DAC 2024 Invited Paper"},{"id":"http://arxiv.org/abs/2407.18421v1","updated":"2024-07-25T22:42:36Z","published":"2024-07-25T22:42:36Z","title":"Self-Directed Synthetic Dialogues and Revisions Technical Report","summary":" Synthetic data has become an important tool in the fine-tuning of language\nmodels to follow instructions and solve complex problems. Nevertheless, the\nmajority of open data to date is often lacking multi-turn data and collected on\nclosed models, limiting progress on advancing open fine-tuning methods. We\nintroduce Self Directed Synthetic Dialogues (SDSD), an experimental dataset\nconsisting of guided conversations of language models talking to themselves.\nThe dataset consists of multi-turn conversations generated with DBRX, Llama 2\n70B, and Mistral Large, all instructed to follow a conversation plan generated\nprior to the conversation. We also explore including principles from\nConstitutional AI and other related works to create synthetic preference data\nvia revisions to the final conversation turn. 
We hope this work encourages\nfurther exploration in multi-turn data and the use of open models for expanding\nthe impact of synthetic data.\n","authors":["Nathan Lambert","Hailey Schoelkopf","Aaron Gokaslan","Luca Soldaini","Valentina Pyatkin","Louis Castricato"],"pdf_url":"https://arxiv.org/pdf/2407.18421v1.pdf","comment":"25 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2301.02819v8","updated":"2024-07-25T22:27:56Z","published":"2023-01-07T09:42:03Z","title":"ExcelFormer: A neural network surpassing GBDTs on tabular data","summary":" Data organized in tabular format is ubiquitous in real-world applications,\nand users often craft tables with biased feature definitions and flexibly set\nprediction targets of their interests. Thus, a rapid development of a robust,\neffective, dataset-versatile, user-friendly tabular prediction approach is\nhighly desired. While Gradient Boosting Decision Trees (GBDTs) and existing\ndeep neural networks (DNNs) have been extensively utilized by professional\nusers, they present several challenges for casual users, particularly: (i) the\ndilemma of model selection due to their different dataset preferences, and (ii)\nthe need for heavy hyperparameter searching, failing which their performances\nare deemed inadequate. In this paper, we delve into this question: Can we\ndevelop a deep learning model that serves as a \"sure bet\" solution for a wide\nrange of tabular prediction tasks, while also being user-friendly for casual\nusers? We delve into three key drawbacks of deep tabular models, encompassing:\n(P1) lack of rotational variance property, (P2) large data demand, and (P3)\nover-smooth solution. We propose ExcelFormer, addressing these challenges\nthrough a semi-permeable attention module that effectively constrains the\ninfluence of less informative features to break the DNNs' rotational invariance\nproperty (for P1), data augmentation approaches tailored for tabular data (for\nP2), and attentive feedforward network to boost the model fitting capability\n(for P3). These designs collectively make ExcelFormer a \"sure bet\" solution for\ndiverse tabular datasets. Extensive and stratified experiments conducted on\nreal-world datasets demonstrate that our model outperforms previous approaches\nacross diverse tabular data prediction tasks, and this framework can be\nfriendly to casual users, offering ease of use without the heavy hyperparameter\ntuning.\n","authors":["Jintai Chen","Jiahuan Yan","Qiyuan Chen","Danny Ziyi Chen","Jian Wu","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2301.02819v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12844v4","updated":"2024-07-25T22:27:38Z","published":"2024-03-19T15:51:21Z","title":"MELTing point: Mobile Evaluation of Language Transformers","summary":" Transformers have revolutionized the machine learning landscape, gradually\nmaking their way into everyday tasks and equipping our computers with \"sparks\nof intelligence\". However, their runtime requirements have prevented them from\nbeing broadly deployed on mobile. As personal devices become increasingly\npowerful and prompt privacy becomes an ever more pressing issue, we explore the\ncurrent state of mobile execution of Large Language Models (LLMs). To achieve\nthis, we have created our own automation infrastructure, MELT, which supports\nthe headless execution and benchmarking of LLMs on device, supporting different\nmodels, devices and frameworks, including Android, iOS and Nvidia Jetson\ndevices. 
We evaluate popular instruction fine-tuned LLMs and leverage different\nframeworks to measure their end-to-end and granular performance, tracing their\nmemory and energy requirements along the way. Our analysis is the first\nsystematic study of on-device LLM execution, quantifying performance, energy\nefficiency and accuracy across various state-of-the-art models and showcases\nthe state of on-device intelligence in the era of hyperscale models. Results\nhighlight the performance heterogeneity across targets and corroborates that\nLLM inference is largely memory-bound. Quantization drastically reduces memory\nrequirements and renders execution viable, but at a non-negligible accuracy\ncost. Drawing from its energy footprint and thermal behavior, the continuous\nexecution of LLMs remains elusive, as both factors negatively affect user\nexperience. Last, our experience shows that the ecosystem is still in its\ninfancy, and algorithmic as well as hardware breakthroughs can significantly\nshift the execution cost. We expect NPU acceleration, and framework-hardware\nco-design to be the biggest bet towards efficient standalone execution, with\nthe alternative of offloading tailored towards edge deployments.\n","authors":["Stefanos Laskaridis","Kleomenis Katevas","Lorenzo Minto","Hamed Haddadi"],"pdf_url":"https://arxiv.org/pdf/2403.12844v4.pdf","comment":"Accepted at the 30th Annual International Conference On Mobile\n Computing And Networking (MobiCom 2024)"},{"id":"http://arxiv.org/abs/2405.19995v2","updated":"2024-07-25T22:27:27Z","published":"2024-05-30T12:32:18Z","title":"Symmetries in Overparametrized Neural Networks: A Mean-Field View","summary":" We develop a Mean-Field (MF) view of the learning dynamics of\noverparametrized Artificial Neural Networks (NN) under data symmetric in law\nwrt the action of a general compact group $G$. We consider for this a class of\ngeneralized shallow NNs given by an ensemble of $N$ multi-layer units, jointly\ntrained using stochastic gradient descent (SGD) and possibly\nsymmetry-leveraging (SL) techniques, such as Data Augmentation (DA), Feature\nAveraging (FA) or Equivariant Architectures (EA). We introduce the notions of\nweakly and strongly invariant laws (WI and SI) on the parameter space of each\nsingle unit, corresponding, respectively, to $G$-invariant distributions, and\nto distributions supported on parameters fixed by the group action (which\nencode EA). This allows us to define symmetric models compatible with taking\n$N\\to\\infty$ and give an interpretation of the asymptotic dynamics of DA, FA\nand EA in terms of Wasserstein Gradient Flows describing their MF limits. When\nactivations respect the group action, we show that, for symmetric data, DA, FA\nand freely-trained models obey the exact same MF dynamic, which stays in the\nspace of WI laws and minimizes therein the population risk. We also give a\ncounterexample to the general attainability of an optimum over SI laws. Despite\nthis, quite remarkably, we show that the set of SI laws is also preserved by\nthe MF dynamics even when freely trained. This sharply contrasts the finite-$N$\nsetting, in which EAs are generally not preserved by unconstrained SGD. We\nillustrate the validity of our findings as $N$ gets larger in a teacher-student\nexperimental setting, training a student NN to learn from a WI, SI or arbitrary\nteacher model through various SL schemes. 
We last deduce a data-driven\nheuristic to discover the largest subspace of parameters supporting SI\ndistributions for a problem, that could be used for designing EA with minimal\ngeneralization error.\n","authors":["Javier Maass","Joaquin Fontbona"],"pdf_url":"https://arxiv.org/pdf/2405.19995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18416v1","updated":"2024-07-25T22:24:45Z","published":"2024-07-25T22:24:45Z","title":"PersonaGym: Evaluating Persona Agents and LLMs","summary":" Persona agents, which are LLM agents that act according to an assigned\npersona, have demonstrated impressive contextual response capabilities across\nvarious applications. These persona agents offer significant enhancements\nacross diverse sectors, such as education, healthcare, and entertainment, where\nmodel developers can align agent responses to different user requirements\nthereby broadening the scope of agent applications. However, evaluating persona\nagent performance is incredibly challenging due to the complexity of assessing\npersona adherence in free-form interactions across various environments that\nare relevant to each persona agent. We introduce PersonaGym, the first dynamic\nevaluation framework for assessing persona agents, and PersonaScore, the first\nautomated human-aligned metric grounded in decision theory for comprehensive\nlarge-scale evaluation of persona agents. Our evaluation of 6 open and\nclosed-source LLMs, using a benchmark encompassing 200 personas and 10,000\nquestions, reveals significant opportunities for advancement in persona agent\ncapabilities across state-of-the-art models. For example, Claude 3.5 Sonnet\nonly has a 2.97% relative improvement in PersonaScore than GPT 3.5 despite\nbeing a much more advanced model. Importantly, we find that increased model\nsize and complexity do not necessarily imply enhanced persona agent\ncapabilities thereby highlighting the pressing need for algorithmic and\narchitectural invention towards faithful and performant persona agents.\n","authors":["Vinay Samuel","Henry Peng Zou","Yue Zhou","Shreyas Chaudhari","Ashwin Kalyan","Tanmay Rajpurohit","Ameet Deshpande","Karthik Narasimhan","Vishvak Murahari"],"pdf_url":"https://arxiv.org/pdf/2407.18416v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.18414v1","updated":"2024-07-25T22:12:47Z","published":"2024-07-25T22:12:47Z","title":"Adversarial Robust Decision Transformer: Enhancing Robustness of RvS via\n Minimax Returns-to-go","summary":" Decision Transformer (DT), as one of the representative Reinforcement\nLearning via Supervised Learning (RvS) methods, has achieved strong performance\nin offline learning tasks by leveraging the powerful Transformer architecture\nfor sequential decision-making. However, in adversarial environments, these\nmethods can be non-robust, since the return is dependent on the strategies of\nboth the decision-maker and adversary. Training a probabilistic model\nconditioned on observed return to predict action can fail to generalize, as the\ntrajectories that achieve a return in the dataset might have done so due to a\nweak and suboptimal behavior adversary. To address this, we propose a\nworst-case-aware RvS algorithm, the Adversarial Robust Decision Transformer\n(ARDT), which learns and conditions the policy on in-sample minimax\nreturns-to-go. ARDT aligns the target return with the worst-case return learned\nthrough minimax expectile regression, thereby enhancing robustness against\npowerful test-time adversaries. 
In experiments conducted on sequential games\nwith full data coverage, ARDT can generate a maximin (Nash Equilibrium)\nstrategy, the solution with the largest adversarial robustness. In large-scale\nsequential games and continuous adversarial RL environments with partial data\ncoverage, ARDT demonstrates significantly superior robustness to powerful\ntest-time adversaries and attains higher worst-case returns compared to\ncontemporary DT methods.\n","authors":["Xiaohang Tang","Afonso Marques","Parameswaran Kamalaruban","Ilija Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2407.18414v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.18413v1","updated":"2024-07-25T22:11:30Z","published":"2024-07-25T22:11:30Z","title":"Simulation of Neural Responses to Classical Music Using Organoid\n Intelligence Methods","summary":" Music is a complex auditory stimulus capable of eliciting significant changes\nin brain activity, influencing cognitive processes such as memory, attention,\nand emotional regulation. However, the underlying mechanisms of music-induced\ncognitive processes remain largely unknown. Organoid intelligence and deep\nlearning models show promise for simulating and analyzing these neural\nresponses to classical music, an area significantly unexplored in computational\nneuroscience. Hence, we present the PyOrganoid library, an innovative tool that\nfacilitates the simulation of organoid learning models, integrating\nsophisticated machine learning techniques with biologically inspired organoid\nsimulations. Our study features the development of the Pianoid model, a \"deep\norganoid learning\" model that utilizes a Bidirectional LSTM network to predict\nEEG responses based on audio features from classical music recordings. This\nmodel demonstrates the feasibility of using computational methods to replicate\ncomplex neural processes, providing valuable insights into music perception and\ncognition. Likewise, our findings emphasize the utility of synthetic models in\nneuroscience research and highlight the PyOrganoid library's potential as a\nversatile tool for advancing studies in neuroscience and artificial\nintelligence.\n","authors":["Daniel Szelogowski"],"pdf_url":"https://arxiv.org/pdf/2407.18413v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.18407v1","updated":"2024-07-25T21:42:10Z","published":"2024-07-25T21:42:10Z","title":"Large Language Model Integrated Healthcare Cyber-Physical Systems\n Architecture","summary":" Cyber-physical systems have become an essential part of the modern healthcare\nindustry. The healthcare cyber-physical systems (HCPS) combine physical and\ncyber components to improve the healthcare industry. While HCPS has many\nadvantages, it also has some drawbacks, such as a lengthy data entry process, a\nlack of real-time processing, and limited real-time patient visualization. To\novercome these issues, this paper represents an innovative approach to\nintegrating large language model (LLM) to enhance the efficiency of the\nhealthcare system. By incorporating LLM at various layers, HCPS can leverage\nadvanced AI capabilities to improve patient outcomes, advance data processing,\nand enhance decision-making.\n","authors":["Malithi Wanniarachchi Kankanamge","Syed Mhamudul Hasan","Abdur R. 
Shahid","Ning Yang"],"pdf_url":"https://arxiv.org/pdf/2407.18407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18402v1","updated":"2024-07-25T21:33:54Z","published":"2024-07-25T21:33:54Z","title":"The seismic purifier: An unsupervised approach to seismic signal\n detection via representation learning","summary":" In this paper, we develop an unsupervised learning approach to earthquake\ndetection. We train a specific class of deep auto-encoders that learn to\nreproduce the input waveforms after a data-compressive bottleneck, and then use\na simple triggering algorithm at the bottleneck to label waveforms as noise or\nsignal.\n Our approach is motivated by the intuition that efficient compression of data\nshould represent signals differently from noise, and is facilitated by a\ntime-axis-preserving approach to auto-encoding and intuitively-motivated\nchoices on the architecture and triggering.\n We demonstrate that the detection performance of the unsupervised approach is\ncomparable to, and in some cases better than, some of the state-of-the-art\nsupervised methods. Moreover, it has strong \\emph{cross-dataset\ngeneralization}. By experimenting with various modifications, we demonstrate\nthat the detection performance is insensitive to various technical choices made\nin the algorithm.\n Our approach has the potential to be useful for other signal detection\nproblems with time series data.\n","authors":["Onur Efe","Arkadas Ozakin"],"pdf_url":"https://arxiv.org/pdf/2407.18402v1.pdf","comment":"Submitted to IEEE-TGRS"},{"id":"http://arxiv.org/abs/2407.01603v2","updated":"2024-07-25T21:23:15Z","published":"2024-06-26T17:33:21Z","title":"A Review of Large Language Models and Autonomous Agents in Chemistry","summary":" Large language models (LLMs) have emerged as powerful tools in chemistry,\nsignificantly impacting molecule design, property prediction, and synthesis\noptimization. This review highlights LLM capabilities in these domains and\ntheir potential to accelerate scientific discovery through automation. We also\nreview LLM-based autonomous agents: LLMs with a broader set of tools to\ninteract with their surrounding environment. These agents perform diverse tasks\nsuch as paper scraping, interfacing with automated laboratories, and synthesis\nplanning. As agents are an emerging topic, we extend the scope of our review of\nagents beyond chemistry and discuss across any scientific domains. This review\ncovers the recent history, current capabilities, and design of LLMs and\nautonomous agents, addressing specific challenges, opportunities, and future\ndirections in chemistry. Key challenges include data quality and integration,\nmodel interpretability, and the need for standard benchmarks, while future\ndirections point towards more sophisticated multi-modal agents and enhanced\ncollaboration between agents and experimental methods. Due to the quick pace of\nthis field, a repository has been built to keep track of the latest studies:\nhttps://github.com/ur-whitelab/LLMs-in-science.\n","authors":["Mayk Caldas Ramos","Christopher J. Collison","Andrew D. White"],"pdf_url":"https://arxiv.org/pdf/2407.01603v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18397v1","updated":"2024-07-25T21:09:20Z","published":"2024-07-25T21:09:20Z","title":"Gaussian Process Kolmogorov-Arnold Networks","summary":" In this paper, we introduce a probabilistic extension to Kolmogorov Arnold\nNetworks (KANs) by incorporating Gaussian Process (GP) as non-linear neurons,\nwhich we refer to as GP-KAN. 
A fully analytical approach to handling the output\ndistribution of one GP as an input to another GP is achieved by considering the\nfunction inner product of a GP function sample with the input distribution.\nThese GP neurons exhibit robust non-linear modelling capabilities while using\nfew parameters and can be easily and fully integrated in a feed-forward network\nstructure. They provide inherent uncertainty estimates to the model prediction\nand can be trained directly on the log-likelihood objective function, without\nneeding variational lower bounds or approximations. In the context of MNIST\nclassification, a model based on GP-KAN of 80 thousand parameters achieved\n98.5% prediction accuracy, compared to current state-of-the-art models with 1.5\nmillion parameters.\n","authors":["Andrew Siyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18397v1.pdf","comment":"related code: https://github.com/siyuan0/gp-kan"},{"id":"http://arxiv.org/abs/2407.18387v1","updated":"2024-07-25T20:42:16Z","published":"2024-07-25T20:42:16Z","title":"SCALE: Self-regulated Clustered federAted LEarning in a Homogeneous\n Environment","summary":" Federated Learning (FL) has emerged as a transformative approach for enabling\ndistributed machine learning while preserving user privacy, yet it faces\nchallenges like communication inefficiencies and reliance on centralized\ninfrastructures, leading to increased latency and costs. This paper presents a\nnovel FL methodology that overcomes these limitations by eliminating the\ndependency on edge servers, employing a server-assisted Proximity Evaluation\nfor dynamic cluster formation based on data similarity, performance indices,\nand geographical proximity. Our integrated approach enhances operational\nefficiency and scalability through a Hybrid Decentralized Aggregation Protocol,\nwhich merges local model training with peer-to-peer weight exchange and a\ncentralized final aggregation managed by a dynamically elected driver node,\nsignificantly curtailing global communication overhead. Additionally, the\nmethodology includes Decentralized Driver Selection, Check-pointing to reduce\nnetwork traffic, and a Health Status Verification Mechanism for system\nrobustness. Validated using the breast cancer dataset, our architecture not\nonly demonstrates a nearly tenfold reduction in communication overhead but also\nshows remarkable improvements in reducing training latency and energy\nconsumption while maintaining high learning performance, offering a scalable,\nefficient, and privacy-preserving solution for the future of federated learning\necosystems.\n","authors":["Sai Puppala","Ismail Hossain","Md Jahangir Alam","Sajedul Talukder","Zahidur Talukder","Syed Bahauddin"],"pdf_url":"https://arxiv.org/pdf/2407.18387v1.pdf","comment":"This research article got accepted in COMPSAC conference and going to\n be published to IEEE"},{"id":"http://arxiv.org/abs/2407.18384v1","updated":"2024-07-25T20:37:12Z","published":"2024-07-25T20:37:12Z","title":"Mathematical theory of deep learning","summary":" This book provides an introduction to the mathematical analysis of deep\nlearning. It covers fundamental results in approximation theory, optimization\ntheory, and statistical learning theory, which are the three main pillars of\ndeep neural network theory. Serving as a guide for students and researchers in\nmathematics and related fields, the book aims to equip readers with\nfoundational knowledge on the topic. 
It prioritizes simplicity over generality,\nand presents rigorous yet accessible results to help build an understanding of\nthe essential mathematical concepts underpinning deep learning.\n","authors":["Philipp Petersen","Jakob Zech"],"pdf_url":"https://arxiv.org/pdf/2407.18384v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.14242v2","updated":"2024-07-25T13:30:33Z","published":"2024-07-19T12:22:32Z","title":"Continual Panoptic Perception: Towards Multi-modal Incremental\n Interpretation of Remote Sensing Images","summary":" Continual learning (CL) breaks off the one-way training manner and enables a\nmodel to adapt to new data, semantics and tasks continuously. However, current\nCL methods mainly focus on single tasks. Besides, CL models are plagued by\ncatastrophic forgetting and semantic drift since the lack of old data, which\noften occurs in remote-sensing interpretation due to the intricate fine-grained\nsemantics. In this paper, we propose Continual Panoptic Perception (CPP), a\nunified continual learning model that leverages multi-task joint learning\ncovering pixel-level classification, instance-level segmentation and\nimage-level perception for universal interpretation in remote sensing images.\nConcretely, we propose a collaborative cross-modal encoder (CCE) to extract the\ninput image features, which supports pixel classification and caption\ngeneration synchronously. To inherit the knowledge from the old model without\nexemplar memory, we propose a task-interactive knowledge distillation (TKD)\nmethod, which leverages cross-modal optimization and task-asymmetric\npseudo-labeling (TPL) to alleviate catastrophic forgetting. Furthermore, we\nalso propose a joint optimization mechanism to achieve end-to-end multi-modal\npanoptic perception. Experimental results on the fine-grained panoptic\nperception dataset validate the effectiveness of the proposed model, and also\nprove that joint optimization can boost sub-task CL efficiency with over 13\\%\nrelative improvement on panoptic quality.\n","authors":["Bo Yuan","Danpei Zhao","Zhuoran Liu","Wentao Li","Tian Li"],"pdf_url":"https://arxiv.org/pdf/2407.14242v2.pdf","comment":"Accepted in ACMMM 2024"},{"id":"http://arxiv.org/abs/2407.17911v1","updated":"2024-07-25T10:06:26Z","published":"2024-07-25T10:06:26Z","title":"ReCorD: Reasoning and Correcting Diffusion for HOI Generation","summary":" Diffusion models revolutionize image generation by leveraging natural\nlanguage to guide the creation of multimedia content. Despite significant\nadvancements in such generative models, challenges persist in depicting\ndetailed human-object interactions, especially regarding pose and object\nplacement accuracy. We introduce a training-free method named Reasoning and\nCorrecting Diffusion (ReCorD) to address these challenges. Our model couples\nLatent Diffusion Models with Visual Language Models to refine the generation\nprocess, ensuring precise depictions of HOIs. We propose an interaction-aware\nreasoning module to improve the interpretation of the interaction, along with\nan interaction correcting module to refine the output image for more precise\nHOI generation delicately. Through a meticulous process of pose selection and\nobject positioning, ReCorD achieves superior fidelity in generated images while\nefficiently reducing computational requirements. 
We conduct comprehensive\nexperiments on three benchmarks to demonstrate the significant progress in\nsolving text-to-image generation tasks, showcasing ReCorD's ability to render\ncomplex interactions accurately by outperforming existing methods in HOI\nclassification score, as well as FID and Verb CLIP-Score. Project website is\navailable at https://alberthkyhky.github.io/ReCorD/ .\n","authors":["Jian-Yu Jiang-Lin","Kang-Yang Huang","Ling Lo","Yi-Ning Huang","Terence Lin","Jhih-Ciang Wu","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.17911v1.pdf","comment":"Accepted by ACM MM 2024. Project website:\n https://alberthkyhky.github.io/ReCorD/"},{"id":"http://arxiv.org/abs/2407.17854v1","updated":"2024-07-25T08:15:43Z","published":"2024-07-25T08:15:43Z","title":"Shapley Value-based Contrastive Alignment for Multimodal Information\n Extraction","summary":" The rise of social media and the exponential growth of multimodal\ncommunication necessitates advanced techniques for Multimodal Information\nExtraction (MIE). However, existing methodologies primarily rely on direct\nImage-Text interactions, a paradigm that often faces significant challenges due\nto semantic and modality gaps between images and text. In this paper, we\nintroduce a new paradigm of Image-Context-Text interaction, where large\nmultimodal models (LMMs) are utilized to generate descriptive textual context\nto bridge these gaps. In line with this paradigm, we propose a novel Shapley\nValue-based Contrastive Alignment (Shap-CA) method, which aligns both\ncontext-text and context-image pairs. Shap-CA initially applies the Shapley\nvalue concept from cooperative game theory to assess the individual\ncontribution of each element in the set of contexts, texts and images towards\ntotal semantic and modality overlaps. Following this quantitative evaluation, a\ncontrastive learning strategy is employed to enhance the interactive\ncontribution within context-text/image pairs, while minimizing the influence\nacross these pairs. Furthermore, we design an adaptive fusion module for\nselective cross-modal fusion. Extensive experiments across four MIE datasets\ndemonstrate that our method significantly outperforms existing state-of-the-art\nmethods.\n","authors":["Wen Luo","Yu Xia","Shen Tianshu","Sujian Li"],"pdf_url":"https://arxiv.org/pdf/2407.17854v1.pdf","comment":"Accepted at ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.17028v2","updated":"2024-07-25T05:23:24Z","published":"2024-07-24T06:15:28Z","title":"Enhancing Environmental Monitoring through Multispectral Imaging: The\n WasteMS Dataset for Semantic Segmentation of Lakeside Waste","summary":" Environmental monitoring of lakeside green areas is crucial for environmental\nprotection. Compared to manual inspections, computer vision technologies offer\na more efficient solution when deployed on-site. Multispectral imaging provides\ndiverse information about objects under different spectrums, aiding in the\ndifferentiation between waste and lakeside lawn environments. This study\nintroduces WasteMS, the first multispectral dataset established for the\nsemantic segmentation of lakeside waste. WasteMS includes a diverse range of\nwaste types in lawn environments, captured under various lighting conditions.\nWe implemented a rigorous annotation process to label waste in images.\nRepresentative semantic segmentation frameworks were used to evaluate\nsegmentation accuracy using WasteMS. 
Challenges encountered when using WasteMS\nfor segmenting waste on lakeside lawns were discussed. The WasteMS dataset is\navailable at https://github.com/zhuqinfeng1999/WasteMS.\n","authors":["Qinfeng Zhu","Ningxin Weng","Lei Fan","Yuanzhi Cai"],"pdf_url":"https://arxiv.org/pdf/2407.17028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18376v1","updated":"2024-07-25T20:19:29Z","published":"2024-07-25T20:19:29Z","title":"Exploring Bengali Religious Dialect Biases in Large Language Models with\n Evaluation Perspectives","summary":" While Large Language Models (LLM) have created a massive technological impact\nin the past decade, allowing for human-enabled applications, they can produce\noutput that contains stereotypes and biases, especially when using low-resource\nlanguages. This can be of great ethical concern when dealing with sensitive\ntopics such as religion. As a means toward making LLMS more fair, we explore\nbias from a religious perspective in Bengali, focusing specifically on two main\nreligious dialects: Hindu and Muslim-majority dialects. Here, we perform\ndifferent experiments and audit showing the comparative analysis of different\nsentences using three commonly used LLMs: ChatGPT, Gemini, and Microsoft\nCopilot, pertaining to the Hindu and Muslim dialects of specific words and\nshowcasing which ones catch the social biases and which do not. Furthermore, we\nanalyze our findings and relate them to potential reasons and evaluation\nperspectives, considering their global impact with over 300 million speakers\nworldwide. With this work, we hope to establish the rigor for creating more\nfairness in LLMs, as these are widely used as creative writing agents.\n","authors":["Azmine Toushik Wasi","Raima Islam","Mst Rafia Islam","Taki Hasan Rafi","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2407.18376v1.pdf","comment":"10 Pages, 4 Figures. Accepted to the 1st Human-centered Evaluation\n and Auditing of Language Models Workshop at CHI 2024 (Workshop website:\n https://heal-workshop.github.io/#:~:text=Exploring%20Bengali%20Religious%20Dialect%20Biases%20in%20Large%20Language%20Models%20with%20Evaluation%20Perspectives)"}]},"2024-07-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.18908v1","updated":"2024-07-26T17:59:09Z","published":"2024-07-26T17:59:09Z","title":"Wolf: Captioning Everything with a World Summarization Framework","summary":" We propose Wolf, a WOrLd summarization Framework for accurate video\ncaptioning. Wolf is an automated captioning framework that adopts a\nmixture-of-experts approach, leveraging complementary strengths of Vision\nLanguage Models (VLMs). By utilizing both image and video models, our framework\ncaptures different levels of information and summarizes them efficiently. Our\napproach can be applied to enhance video understanding, auto-labeling, and\ncaptioning. To evaluate caption quality, we introduce CapScore, an LLM-based\nmetric to assess the similarity and quality of generated captions compared to\nthe ground truth captions. We further build four human-annotated datasets in\nthree domains: autonomous driving, general scenes, and robotics, to facilitate\ncomprehensive comparisons. We show that Wolf achieves superior captioning\nperformance compared to state-of-the-art approaches from the research community\n(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). 
For\ninstance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise\nby 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally,\nwe establish a benchmark for video captioning and introduce a leaderboard,\naiming to accelerate advancements in video understanding, captioning, and data\nalignment. Leaderboard: https://wolfv0.github.io/leaderboard.html.\n","authors":["Boyi Li","Ligeng Zhu","Ran Tian","Shuhan Tan","Yuxiao Chen","Yao Lu","Yin Cui","Sushant Veer","Max Ehrlich","Jonah Philion","Xinshuo Weng","Fuzhao Xue","Andrew Tao","Ming-Yu Liu","Sanja Fidler","Boris Ivanovic","Trevor Darrell","Jitendra Malik","Song Han","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2407.18908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18901v1","updated":"2024-07-26T17:55:45Z","published":"2024-07-26T17:55:45Z","title":"AppWorld: A Controllable World of Apps and People for Benchmarking\n Interactive Coding Agents","summary":" Autonomous agents that address day-to-day digital tasks (e.g., ordering\ngroceries for a household), must not only operate multiple apps (e.g., notes,\nmessaging, shopping app) via APIs, but also generate rich code with complex\ncontrol flow in an iterative manner based on their interaction with the\nenvironment. However, existing benchmarks for tool use are inadequate, as they\nonly cover tasks that require a simple sequence of API calls.\n To remedy this gap, we built $\\textbf{AppWorld Engine}$, a high-quality\nexecution environment (60K lines of code) of 9 day-to-day apps operable via 457\nAPIs and populated with realistic digital activities simulating the lives of\n~100 fictitious users. We then created $\\textbf{AppWorld Benchmark}$ (40K lines\nof code), a suite of 750 natural, diverse, and challenging autonomous agent\ntasks requiring rich and interactive code generation. It supports robust\nprogrammatic evaluation with state-based unit tests, allowing for different\nways of completing a task while also checking for unexpected changes, i.e.,\ncollateral damage. The state-of-the-art LLM, GPT-4o, solves only ~49% of our\n'normal' tasks and ~30% of 'challenge' tasks, while other models solve at least\n16% fewer. This highlights the benchmark's difficulty and AppWorld's potential\nto push the frontiers of interactive coding agents. The project website is\navailable at https://appworld.dev/.\n","authors":["Harsh Trivedi","Tushar Khot","Mareike Hartmann","Ruskin Manku","Vinty Dong","Edward Li","Shashank Gupta","Ashish Sabharwal","Niranjan Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2407.18901v1.pdf","comment":"ACL'24 Camera Ready"},{"id":"http://arxiv.org/abs/2407.18219v2","updated":"2024-07-26T17:50:27Z","published":"2024-07-25T17:35:59Z","title":"Recursive Introspection: Teaching Language Model Agents How to\n Self-Improve","summary":" A central piece in enabling intelligent agentic behavior in foundation models\nis to make them capable of introspecting upon their behavior, reasoning, and\ncorrecting their mistakes as more computation or interaction is available. Even\nthe strongest proprietary large language models (LLMs) do not quite exhibit the\nability of continually improving their responses sequentially, even in\nscenarios where they are explicitly told that they are making a mistake. In\nthis paper, we develop RISE: Recursive IntroSpEction, an approach for\nfine-tuning LLMs to introduce this capability, despite prior work hypothesizing\nthat this capability may not be possible to attain. 
Our approach prescribes an\niterative fine-tuning procedure, which attempts to teach the model how to alter\nits response after having executed previously unsuccessful attempts to solve a\nhard test-time problem, with optionally additional environment feedback. RISE\nposes fine-tuning for a single-turn prompt as solving a multi-turn Markov\ndecision process (MDP), where the initial state is the prompt. Inspired by\nprinciples in online imitation learning and reinforcement learning, we propose\nstrategies for multi-turn data collection and training so as to imbue an LLM\nwith the capability to recursively detect and correct its previous mistakes in\nsubsequent iterations. Our experiments show that RISE enables Llama2, Llama3,\nand Mistral models to improve themselves with more turns on math reasoning\ntasks, outperforming several single-turn strategies given an equal amount of\ninference-time computation. We also find that RISE scales well, often attaining\nlarger benefits with more capable models. Our analysis shows that RISE makes\nmeaningful improvements to responses to arrive at the correct solution for\nchallenging prompts, without disrupting one-turn abilities as a result of\nexpressing more complex distributions.\n","authors":["Yuxiao Qu","Tianjun Zhang","Naman Garg","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.18219v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18887v1","updated":"2024-07-26T17:36:40Z","published":"2024-07-26T17:36:40Z","title":"Embedding And Clustering Your Data Can Improve Contrastive Pretraining","summary":" Recent studies of large-scale contrastive pretraining in the text embedding\ndomain show that using single-source minibatches, rather than mixed-source\nminibatches, can substantially improve overall model accuracy. In this work, we\nexplore extending training data stratification beyond source granularity by\nleveraging a pretrained text embedding model and the classic k-means clustering\nalgorithm to further split training data apart by the semantic clusters within\neach source. Experimentally, we observe a notable increase in NDCG@10 when\npretraining a BERT-based text embedding model on query-passage pairs from the\nMSMARCO passage retrieval dataset. Additionally, we conceptually connect our\nclustering approach to both the Topic Aware Sampling (TAS) aspect of the TAS-B\nmethodology and the nearest-neighbor-based hard-negative mining aspect of the\nANCE methodology and discuss how this unified view motivates future lines of\nresearch on the organization of contrastive pretraining data.\n","authors":["Luke Merrick"],"pdf_url":"https://arxiv.org/pdf/2407.18887v1.pdf","comment":"16 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.08112v2","updated":"2024-07-26T17:31:51Z","published":"2024-07-11T01:08:39Z","title":"How Well Can a Long Sequence Model Model Long Sequences? Comparing\n Architechtural Inductive Biases on Long-Context Abilities","summary":" Long sequences occur in abundance within real-world scenarios, hence properly\nmodelling them opens numerous down-stream use-cases. Deep neural networks,\nhowever, have often struggled with these for a variety of reasons. Recent\nadvances, both in system engineering as well as model design, have enabled the\nscaling up of model that are purported to support extended context length. In\nparticular, the state-space and linear recurrent neural network families of\nmodels hypothetically can entend to infinite sequence lenth. However, is this\ntoo good to be true? 
We conduct an evaluation to show that while such claims\nmay be sound theoretically, there remain large practical gaps that are\nempirically observed. In particular, recurrent models still suffer in the same\nsettings as long-context LLMs with attention. We further show that different\ninductive biases have inconsistent extrapolation capabilities, highlighting the\nneed to further study such paradigms and investigate why long-context models\nseemingly fail to behave as one might expect.\n","authors":["Jerry Huang"],"pdf_url":"https://arxiv.org/pdf/2407.08112v2.pdf","comment":"Work In Progress. 9 pages"},{"id":"http://arxiv.org/abs/2209.00568v3","updated":"2024-07-26T17:04:53Z","published":"2022-09-01T16:19:22Z","title":"Distilling Multi-Scale Knowledge for Event Temporal Relation Extraction","summary":" Event Temporal Relation Extraction (ETRE) is paramount but challenging.\nWithin a discourse, event pairs are situated at different distances or the\nso-called proximity bands. The temporal ordering communicated about event pairs\nwhere at more remote (i.e., ``long'') or less remote (i.e., ``short'')\nproximity bands are encoded differently. SOTA models have tended to perform\nwell on events situated at either short or long proximity bands, but not both.\nNonetheless, real-world, natural texts contain all types of temporal\nevent-pairs. In this paper, we present MulCo: Distilling Multi-Scale Knowledge\nvia Contrastive Learning, a knowledge co-distillation approach that shares\nknowledge across multiple event pair proximity bands to improve performance on\nall types of temporal datasets. Our experimental results show that MulCo\nsuccessfully integrates linguistic cues pertaining to temporal reasoning across\nboth short and long proximity bands and achieves new state-of-the-art results\non several ETRE benchmark datasets.\n","authors":["Hao-Ren Yao","Luke Breitfeller","Aakanksha Naik","Chunxiao Zhou","Carolyn Rose"],"pdf_url":"https://arxiv.org/pdf/2209.00568v3.pdf","comment":"Accepted to CIKM 2024 Full Research Track, camera ready version"},{"id":"http://arxiv.org/abs/2407.18129v2","updated":"2024-07-26T15:34:12Z","published":"2024-07-25T15:36:48Z","title":"Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic","summary":" Recent advancements have significantly enhanced the capabilities of\nMultimodal Large Language Models (MLLMs) in generating and understanding\nimage-to-text content. Despite these successes, progress is predominantly\nlimited to English due to the scarcity of high quality multimodal resources in\nother languages. This limitation impedes the development of competitive models\nin languages such as Arabic. To alleviate this situation, we introduce an\nefficient Arabic multimodal assistant, dubbed Dallah, that utilizes an advanced\nlanguage model based on LLaMA-2 to facilitate multimodal interactions. Dallah\ndemonstrates state-of-the-art performance in Arabic MLLMs. Through fine-tuning\nsix Arabic dialects, Dallah showcases its capability to handle complex\ndialectal interactions incorporating both textual and visual elements. The\nmodel excels in two benchmark tests: one evaluating its performance on Modern\nStandard Arabic (MSA) and another specifically designed to assess dialectal\nresponses. 
Beyond its robust performance in multimodal interaction tasks,\nDallah has the potential to pave the way for further development of\ndialect-aware Arabic MLLMs.\n","authors":["Fakhraddin Alwajih","Gagan Bhatia","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2407.18129v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00019v2","updated":"2024-07-26T15:13:08Z","published":"2024-05-23T07:14:21Z","title":"EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively\n Exploring Electronic Health Records","summary":" In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL\ndataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to\naddress critical yet underexplored aspects in text-to-SQL parsing:\ninteractivity, compositionality, and efficiency. To the best of our knowledge,\nEHR-SeqSQL is not only the largest but also the first medical text-to-SQL\ndataset benchmark to include sequential and contextual questions. We provide a\ndata split and the new test set designed to assess compositional generalization\nability. Our experiments demonstrate the superiority of a multi-turn approach\nover a single-turn approach in learning compositionality. Additionally, our\ndataset integrates specially crafted tokens into SQL queries to improve\nexecution efficiency. With EHR-SeqSQL, we aim to bridge the gap between\npractical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is\navailable \\href{https://github.com/seonhee99/EHR-SeqSQL}{at this https URL}.\n","authors":["Jaehee Ryu","Seonhee Cho","Gyubok Lee","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2406.00019v2.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2310.05140v4","updated":"2024-07-26T15:07:01Z","published":"2023-10-08T12:21:24Z","title":"Harnessing the Power of Large Language Models for Empathetic Response\n Generation: Empirical Investigations and Improvements","summary":" Empathetic dialogue is an indispensable part of building harmonious social\nrelationships and contributes to the development of a helpful AI. Previous\napproaches are mainly based on fine small-scale language models. With the\nadvent of ChatGPT, the application effect of large language models (LLMs) in\nthis field has attracted great attention. This work empirically investigates\nthe performance of LLMs in generating empathetic responses and proposes three\nimprovement methods of semantically similar in-context learning, two-stage\ninteractive generation, and combination with the knowledge base. Extensive\nexperiments show that LLMs can significantly benefit from our proposed methods\nand is able to achieve state-of-the-art performance in both automatic and human\nevaluations. Additionally, we explore the possibility of GPT-4 simulating human\nevaluators.\n","authors":["Yushan Qian","Wei-Nan Zhang","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05140v4.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2407.18789v1","updated":"2024-07-26T14:52:37Z","published":"2024-07-26T14:52:37Z","title":"Granularity is crucial when applying differential privacy to text: An\n investigation for neural machine translation","summary":" Applying differential privacy (DP) by means of the DP-SGD algorithm to\nprotect individual data points during training is becoming increasingly popular\nin NLP. However, the choice of granularity at which DP is applied is often\nneglected. For example, neural machine translation (NMT) typically operates on\nthe sentence-level granularity. 
From the perspective of DP, this setup assumes\nthat each sentence belongs to a single person and any two sentences in the\ntraining dataset are independent. This assumption is however violated in many\nreal-world NMT datasets, e.g. those including dialogues. For proper application\nof DP we thus must shift from sentences to entire documents. In this paper, we\ninvestigate NMT at both the sentence and document levels, analyzing the\nprivacy/utility trade-off for both scenarios, and evaluating the risks of not\nusing the appropriate privacy granularity in terms of leaking personally\nidentifiable information (PII). Our findings indicate that the document-level\nNMT system is more resistant to membership inference attacks, emphasizing the\nsignificance of using the appropriate granularity when working with DP.\n","authors":["Doan Nam Long Vu","Timour Igamberdiev","Ivan Habernal"],"pdf_url":"https://arxiv.org/pdf/2407.18789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18786v1","updated":"2024-07-26T14:47:31Z","published":"2024-07-26T14:47:31Z","title":"The power of Prompts: Evaluating and Mitigating Gender Bias in MT with\n LLMs","summary":" This paper studies gender bias in machine translation through the lens of\nLarge Language Models (LLMs). Four widely-used test sets are employed to\nbenchmark various base LLMs, comparing their translation quality and gender\nbias against state-of-the-art Neural Machine Translation (NMT) models for\nEnglish to Catalan (En $\\rightarrow$ Ca) and English to Spanish (En\n$\\rightarrow$ Es) translation directions. Our findings reveal pervasive gender\nbias across all models, with base LLMs exhibiting a higher degree of bias\ncompared to NMT models. To combat this bias, we explore prompting engineering\ntechniques applied to an instruction-tuned LLM. We identify a prompt structure\nthat significantly reduces gender bias by up to 12% on the WinoMT evaluation\ndataset compared to more straightforward prompts. These results significantly\nreduce the gender bias accuracy gap between LLMs and traditional NMT systems.\n","authors":["Aleix Sant","Carlos Escolano","Audrey Mash","Francesca De Luca Fornaciari","Maite Melero"],"pdf_url":"https://arxiv.org/pdf/2407.18786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06468v2","updated":"2024-07-26T14:18:48Z","published":"2024-05-10T13:27:32Z","title":"Pseudo-Prompt Generating in Pre-trained Vision-Language Models for\n Multi-Label Medical Image Classification","summary":" The task of medical image recognition is notably complicated by the presence\nof varied and multiple pathological indications, presenting a unique challenge\nin multi-label classification with unseen labels. This complexity underlines\nthe need for computer-aided diagnosis methods employing multi-label zero-shot\nlearning. Recent advancements in pre-trained vision-language models (VLMs) have\nshowcased notable zero-shot classification abilities on medical images.\nHowever, these methods have limitations on leveraging extensive pre-trained\nknowledge from broader image datasets, and often depend on manual prompt\nconstruction by expert radiologists. By automating the process of prompt\ntuning, prompt learning techniques have emerged as an efficient way to adapt\nVLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in\nperforming class-specific prompts on unseen categories, limiting\ngeneralizability in fine-grained scenarios. 
To overcome these constraints, we\nintroduce a novel prompt generation approach inspirited by text generation in\nnatural language processing (NLP). Our method, named Pseudo-Prompt Generating\n(PsPG), capitalizes on the priori knowledge of multi-modal features. Featuring\na RNN-based decoder, PsPG autoregressively generates class-tailored embedding\nvectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label\nchest radiograph datasets affirm the superiority of our approach against\nleading medical vision-language and multi-label prompt learning methods. The\nsource code is available at https://github.com/fallingnight/PsPG\n","authors":["Yaoqin Ye","Junjie Zhang","Hongwei Shi"],"pdf_url":"https://arxiv.org/pdf/2405.06468v2.pdf","comment":"Accepted by PRCV 2024"},{"id":"http://arxiv.org/abs/2402.05121v2","updated":"2024-07-26T14:12:33Z","published":"2024-02-04T00:47:53Z","title":"Large Language Model for Table Processing: A Survey","summary":" Tables, typically two-dimensional and structured to store large amounts of\ndata, are essential in daily activities like database queries, spreadsheet\nmanipulations, web table question answering, and image table information\nextraction. Automating these table-centric tasks with Large Language Models\n(LLMs) or Visual Language Models (VLMs) offers significant public benefits,\ngarnering interest from academia and industry. This survey provides a\ncomprehensive overview of table-related tasks, examining both user scenarios\nand technical aspects. It covers traditional tasks like table question\nanswering as well as emerging fields such as spreadsheet manipulation and table\ndata analysis. We summarize the training techniques for LLMs and VLMs tailored\nfor table processing. Additionally, we discuss prompt engineering, particularly\nthe use of LLM-powered agents, for various table-related tasks. Finally, we\nhighlight several challenges, including processing implicit user intentions and\nextracting information from various table sources.\n","authors":["Weizheng Lu","Jing Zhang","Ju Fan","Zihao Fu","Yueguo Chen","Xiaoyong Du"],"pdf_url":"https://arxiv.org/pdf/2402.05121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18752v1","updated":"2024-07-26T14:07:00Z","published":"2024-07-26T14:07:00Z","title":"Knowledge Graph Structure as Prompt: Improving Small Language Models\n Capabilities for Knowledge-based Causal Discovery","summary":" Causal discovery aims to estimate causal structures among variables based on\nobservational data. Large Language Models (LLMs) offer a fresh perspective to\ntackle the causal discovery problem by reasoning on the metadata associated\nwith variables rather than their actual data values, an approach referred to as\nknowledge-based causal discovery. In this paper, we investigate the\ncapabilities of Small Language Models (SLMs, defined as LLMs with fewer than 1\nbillion parameters) with prompt-based learning for knowledge-based causal\ndiscovery. Specifically, we present KG Structure as Prompt, a novel approach\nfor integrating structural information from a knowledge graph, such as common\nneighbor nodes and metapaths, into prompt-based learning to enhance the\ncapabilities of SLMs. Experimental results on three types of biomedical and\nopen-domain datasets under few-shot settings demonstrate the effectiveness of\nour approach, surpassing most baselines and even conventional fine-tuning\napproaches trained on full datasets. 
Our findings further highlight the strong\ncapabilities of SLMs: in combination with knowledge graphs and prompt-based\nlearning, SLMs demonstrate the potential to surpass LLMs with larger number of\nparameters. Our code and datasets are available on GitHub.\n","authors":["Yuni Susanti","Michael Färber"],"pdf_url":"https://arxiv.org/pdf/2407.18752v1.pdf","comment":"accepted at ISWC'24"},{"id":"http://arxiv.org/abs/2407.18743v1","updated":"2024-07-26T13:55:21Z","published":"2024-07-26T13:55:21Z","title":"Towards Effective and Efficient Continual Pre-training of Large Language\n Models","summary":" Continual pre-training (CPT) has been an important approach for adapting\nlanguage models to specific domains or tasks. To make the CPT approach more\ntraceable, this paper presents a technical report for continually pre-training\nLlama-3 (8B), which significantly enhances the Chinese language ability and\nscientific reasoning ability of the backbone model. To enhance the new\nabilities while retaining the original abilities, we design specific data\nmixture and curriculum strategies by utilizing existing datasets and\nsynthesizing high-quality datasets. Specifically, we synthesize\nmultidisciplinary scientific question and answer (QA) pairs based on related\nweb pages, and subsequently incorporate these synthetic data to improve the\nscientific reasoning ability of Llama-3. We refer to the model after CPT as\nLlama-3-SynE (Synthetic data Enhanced Llama-3). We also present the tuning\nexperiments with a relatively small model -- TinyLlama, and employ the derived\nfindings to train the backbone model. Extensive experiments on a number of\nevaluation benchmarks show that our approach can largely improve the\nperformance of the backbone models, including both the general abilities (+8.81\non C-Eval and +6.31 on CMMLU) and the scientific reasoning abilities (+12.00 on\nMATH and +4.13 on SciEval), without hurting the original capacities. Our model,\ndata, and codes are available at https://github.com/RUC-GSAI/Llama-3-SynE.\n","authors":["Jie Chen","Zhipeng Chen","Jiapeng Wang","Kun Zhou","Yutao Zhu","Jinhao Jiang","Yingqian Min","Wayne Xin Zhao","Zhicheng Dou","Jiaxin Mao","Yankai Lin","Ruihua Song","Jun Xu","Xu Chen","Rui Yan","Zhewei Wei","Di Hu","Wenbing Huang","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2407.18743v1.pdf","comment":"16 pages, 10 figures, 16 tables"},{"id":"http://arxiv.org/abs/2407.18738v1","updated":"2024-07-26T13:50:22Z","published":"2024-07-26T13:50:22Z","title":"Towards Generalized Offensive Language Identification","summary":" The prevalence of offensive content on the internet, encompassing hate speech\nand cyberbullying, is a pervasive issue worldwide. Consequently, it has\ngarnered significant attention from the machine learning (ML) and natural\nlanguage processing (NLP) communities. As a result, numerous systems have been\ndeveloped to automatically identify potentially harmful content and mitigate\nits impact. These systems can follow two approaches; (1) Use publicly available\nmodels and application endpoints, including prompting large language models\n(LLMs) (2) Annotate datasets and train ML models on them. However, both\napproaches lack an understanding of how generalizable they are. Furthermore,\nthe applicability of these systems is often questioned in off-domain and\npractical environments. This paper empirically evaluates the generalizability\nof offensive language detection models and datasets across a novel generalized\nbenchmark. 
We answer three research questions on generalizability. Our findings\nwill be useful in creating robust real-world offensive language detection\nsystems.\n","authors":["Alphaeus Dmonte","Tejas Arya","Tharindu Ranasinghe","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2407.18738v1.pdf","comment":"Accepted to ASONAM 2024"},{"id":"http://arxiv.org/abs/2407.18730v1","updated":"2024-07-26T13:30:24Z","published":"2024-07-26T13:30:24Z","title":"Creating an Aligned Corpus of Sound and Text: The Multimodal Corpus of\n Shakespeare and Milton","summary":" In this work we present a corpus of poems by William Shakespeare and John\nMilton that have been enriched with readings from the public domain. We have\naligned all the lines with their respective audio segments, at the line, word,\nsyllable and phone level, and we have included their scansion. We make a basic\nvisualization platform for these poems and we conclude by conjecturing possible\nfuture directions.\n","authors":["Manex Agirrezabal"],"pdf_url":"https://arxiv.org/pdf/2407.18730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18716v1","updated":"2024-07-26T13:05:24Z","published":"2024-07-26T13:05:24Z","title":"ChatSchema: A pipeline of extracting structured information with Large\n Multimodal Models based on schema","summary":" Objective: This study introduces ChatSchema, an effective method for\nextracting and structuring information from unstructured data in medical paper\nreports using a combination of Large Multimodal Models (LMMs) and Optical\nCharacter Recognition (OCR) based on the schema. By integrating predefined\nschema, we intend to enable LMMs to directly extract and standardize\ninformation according to the schema specifications, facilitating further data\nentry. Method: Our approach involves a two-stage process, including\nclassification and extraction for categorizing report scenarios and structuring\ninformation. We established and annotated a dataset to verify the effectiveness\nof ChatSchema, and evaluated key extraction using precision, recall, F1-score,\nand accuracy metrics. Based on key extraction, we further assessed value\nextraction. We conducted ablation studies on two LMMs to illustrate the\nimprovement of structured information extraction with different input modals\nand methods. Result: We analyzed 100 medical reports from Peking University\nFirst Hospital and established a ground truth dataset with 2,945 key-value\npairs. We evaluated ChatSchema using GPT-4o and Gemini 1.5 Pro and found a\nhigher overall performance of GPT-4o. The results are as follows: For the\nresult of key extraction, key-precision was 98.6%, key-recall was 98.5%,\nkey-F1-score was 98.6%. For the result of value extraction based on correct key\nextraction, the overall accuracy was 97.2%, precision was 95.8%, recall was\n95.8%, and F1-score was 95.8%. 
An ablation study demonstrated that ChatSchema\nachieved significantly higher overall accuracy and overall F1-score of\nkey-value extraction, compared to the Baseline, with increases of 26.9% overall\naccuracy and 27.4% overall F1-score, respectively.\n","authors":["Fei Wang","Yuewen Zheng","Qin Li","Jingyi Wu","Pengfei Li","Luxia Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13623v2","updated":"2024-07-26T12:59:47Z","published":"2024-07-18T15:58:54Z","title":"Scaling Laws with Vocabulary: Larger Models Deserve Larger Vocabularies","summary":" Research on scaling large language models (LLMs) has primarily focused on\nmodel parameters and training data size, overlooking the role of vocabulary\nsize. We investigate how vocabulary size impacts LLM scaling laws by training\nmodels ranging from 33M to 3B parameters on up to 500B characters with various\nvocabulary configurations. We propose three complementary approaches for\npredicting the compute-optimal vocabulary size: IsoFLOPs analysis, derivative\nestimation, and parametric fit of the loss function. Our approaches converge on\nthe same result that the optimal vocabulary size depends on the available\ncompute budget and that larger models deserve larger vocabularies. However,\nmost LLMs use too small vocabulary sizes. For example, we predict that the\noptimal vocabulary size of Llama2-70B should have been at least 216K, 7 times\nlarger than its vocabulary of 32K. We validate our predictions empirically by\ntraining models with 3B parameters across different FLOPs budgets. Adopting our\npredicted optimal vocabulary size consistently improves downstream performance\nover commonly used vocabulary sizes. By increasing the vocabulary size from the\nconventional 32K to 43K, we improve performance on ARC-Challenge from 29.1 to\n32.0 with the same 2.3e21 FLOPs. Our work emphasizes the necessity of jointly\nconsidering model parameters and vocabulary size for efficient scaling.\n","authors":["Chaofan Tao","Qian Liu","Longxu Dou","Niklas Muennighoff","Zhongwei Wan","Ping Luo","Min Lin","Ngai Wong"],"pdf_url":"https://arxiv.org/pdf/2407.13623v2.pdf","comment":"26 pages, 12 figures. Add more related work"},{"id":"http://arxiv.org/abs/2407.18712v1","updated":"2024-07-26T12:57:54Z","published":"2024-07-26T12:57:54Z","title":"Cluster-norm for Unsupervised Probing of Knowledge","summary":" The deployment of language models brings challenges in generating reliable\ninformation, especially when these models are fine-tuned using human\npreferences. To extract encoded knowledge without (potentially) biased human\nlabels, unsupervised probing techniques like Contrast-Consistent Search (CCS)\nhave been developed (Burns et al., 2022). However, salient but unrelated\nfeatures in a given dataset can mislead these probes (Farquhar et al., 2023).\nAddressing this, we propose a cluster normalization method to minimize the\nimpact of such features by clustering and normalizing activations of contrast\npairs before applying unsupervised probing techniques. 
While this approach does\nnot address the issue of differentiating between knowledge in general and\nsimulated knowledge - a major issue in the literature of latent knowledge\nelicitation (Christiano et al., 2021) - it significantly improves the ability\nof unsupervised probes to identify the intended knowledge amidst distractions.\n","authors":["Walter Laurito","Sharan Maiya","Grégoire Dhimoïla"," Owen"," Yeung","Kaarel Hänni"],"pdf_url":"https://arxiv.org/pdf/2407.18712v1.pdf","comment":"34 pages, 35 figures"},{"id":"http://arxiv.org/abs/2407.17688v2","updated":"2024-07-26T12:47:13Z","published":"2024-07-25T01:11:38Z","title":"Examining the Influence of Political Bias on Large Language Model\n Performance in Stance Classification","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nexecuting tasks based on natural language queries. However, these models,\ntrained on curated datasets, inherently embody biases ranging from racial to\nnational and gender biases. It remains uncertain whether these biases impact\nthe performance of LLMs for certain tasks. In this study, we investigate the\npolitical biases of LLMs within the stance classification task, specifically\nexamining whether these models exhibit a tendency to more accurately classify\npolitically-charged stances. Utilizing three datasets, seven LLMs, and four\ndistinct prompting schemes, we analyze the performance of LLMs on politically\noriented statements and targets. Our findings reveal a statistically\nsignificant difference in the performance of LLMs across various politically\noriented stance classification tasks. Furthermore, we observe that this\ndifference primarily manifests at the dataset level, with models and prompting\nschemes showing statistically similar performances across different stance\nclassification datasets. Lastly, we observe that when there is greater\nambiguity in the target the statement is directed towards, LLMs have poorer\nstance classification accuracy.\n Code & Dataset: http://doi.org/10.5281/zenodo.12938478\n","authors":["Lynnette Hui Xian Ng","Iain Cruickshank","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2407.17688v2.pdf","comment":"Accepted at ICWSM 2025"},{"id":"http://arxiv.org/abs/2407.12126v2","updated":"2024-07-26T12:37:58Z","published":"2024-07-16T19:32:23Z","title":"LLMs-in-the-loop Part-1: Expert Small AI Models for Bio-Medical Text\n Translation","summary":" Machine translation is indispensable in healthcare for enabling the global\ndissemination of medical knowledge across languages. However, complex medical\nterminology poses unique challenges to achieving adequate translation quality\nand accuracy. This study introduces a novel \"LLMs-in-the-loop\" approach to\ndevelop supervised neural machine translation models optimized specifically for\nmedical texts. While large language models (LLMs) have demonstrated powerful\ncapabilities, this research shows that small, specialized models trained on\nhigh-quality in-domain (mostly synthetic) data can outperform even vastly\nlarger LLMs.\n Custom parallel corpora in six languages were compiled from scientific\narticles, synthetically generated clinical documents, and medical texts. Our\nLLMs-in-the-loop methodology employs synthetic data generation, rigorous\nevaluation, and agent orchestration to enhance performance. We developed small\nmedical translation models using the MarianMT base model. 
We introduce a new\nmedical translation test dataset to standardize evaluation in this domain.\nAssessed using BLEU, METEOR, ROUGE, and BERT scores on this test set, our\nMarianMT-based models outperform Google Translate, DeepL, and GPT-4-Turbo.\n Results demonstrate that our LLMs-in-the-loop approach, combined with\nfine-tuning high-quality, domain-specific data, enables specialized models to\noutperform general-purpose and some larger systems. This research, part of a\nbroader series on expert small models, paves the way for future\nhealthcare-related AI developments, including deidentification and bio-medical\nentity extraction models. Our study underscores the potential of tailored\nneural translation models and the LLMs-in-the-loop methodology to advance the\nfield through improved data generation, evaluation, agent, and modeling\ntechniques.\n","authors":["Bunyamin Keles","Murat Gunay","Serdar I. Caglar"],"pdf_url":"https://arxiv.org/pdf/2407.12126v2.pdf","comment":"14 pages, 2 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.18698v1","updated":"2024-07-26T12:23:54Z","published":"2024-07-26T12:23:54Z","title":"Adaptive Contrastive Search: Uncertainty-Guided Decoding for Open-Ended\n Text Generation","summary":" Decoding from the output distributions of large language models to produce\nhigh-quality text is a complex challenge in language modeling. Various\napproaches, such as beam search, sampling with temperature, $k-$sampling,\nnucleus $p-$sampling, typical decoding, contrastive decoding, and contrastive\nsearch, have been proposed to address this problem, aiming to improve\ncoherence, diversity, as well as resemblance to human-generated text. In this\nstudy, we introduce adaptive contrastive search, a novel decoding strategy\nextending contrastive search by incorporating an adaptive degeneration penalty,\nguided by the estimated uncertainty of the model at each generation step. This\nstrategy is designed to enhance both the creativity and diversity of the\nlanguage modeling process while at the same time producing coherent and\nhigh-quality generated text output. Our findings indicate performance\nenhancement in both aspects, across different model architectures and datasets,\nunderscoring the effectiveness of our method in text generation tasks. Our code\nbase, datasets, and models are publicly available.\n","authors":["Esteban Garces Arias","Julian Rodemann","Meimingwei Li","Christian Heumann","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2407.18698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18689v1","updated":"2024-07-26T12:13:45Z","published":"2024-07-26T12:13:45Z","title":"The BIAS Detection Framework: Bias Detection in Word Embeddings and\n Language Models for European Languages","summary":" The project BIAS: Mitigating Diversity Biases of AI in the Labor Market is a\nfour-year project funded by the European commission and supported by the Swiss\nState Secretariat for Education, Research and Innovation (SERI). As part of the\nproject, novel bias detection methods to identify societal bias in language\nmodels and word embeddings in European languages are developed, with particular\nattention to linguistic and geographic particularities. This technical report\ndescribes the overall architecture and components of the BIAS Detection\nFramework. 
The code described in this technical report is available and will be\nupdated and expanded continuously with upcoming results from the BIAS project.\nThe details about the datasets for the different languages are described in\ncorresponding papers at scientific venues.\n","authors":["Alexandre Puttick","Leander Rankwiler","Catherine Ikae","Mascha Kurpicz-Briki"],"pdf_url":"https://arxiv.org/pdf/2407.18689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18213v2","updated":"2024-07-26T11:51:58Z","published":"2024-07-25T17:26:41Z","title":"Exploring Scaling Trends in LLM Robustness","summary":" Language model capabilities predictably improve from scaling a model's size\nand training data. Motivated by this, increasingly large language models have\nbeen trained, yielding an array of impressive capabilities. Yet these models\nare vulnerable to adversarial prompts, such as \"jailbreaks\" that hijack models\nto perform undesired behaviors, posing a significant risk of misuse. Prior work\nindicates that computer vision models become more robust with model and data\nscaling, raising the question: does language model robustness also improve with\nscale? We study this question empirically, finding that larger models respond\nsubstantially better to adversarial training, but there is little to no benefit\nfrom model scale in the absence of explicit defenses.\n","authors":["Nikolaus Howe","Michał Zajac","Ian McKenzie","Oskar Hollinsworth","Tom Tseng","Pierre-Luc Bacon","Adam Gleave"],"pdf_url":"https://arxiv.org/pdf/2407.18213v2.pdf","comment":"31 pages; edit fixed metadata typo (author name)"},{"id":"http://arxiv.org/abs/2402.12750v2","updated":"2024-07-26T10:15:38Z","published":"2024-02-20T06:38:10Z","title":"Model Composition for Multimodal Large Language Models","summary":" Recent developments in Multimodal Large Language Models (MLLMs) have shown\nrapid progress, moving towards the goal of creating versatile MLLMs that\nunderstand inputs from various modalities. However, existing methods typically\nrely on joint training with paired multimodal instruction data, which is\nresource-intensive and challenging to extend to new modalities. In this paper,\nwe propose a new paradigm through the model composition of existing MLLMs to\ncreate a new model that retains the modal understanding capabilities of each\noriginal model. Our basic implementation, NaiveMC, demonstrates the\neffectiveness of this paradigm by reusing modality encoders and merging LLM\nparameters. Furthermore, we introduce DAMC to address parameter interference\nand mismatch issues during the merging process, thereby enhancing the model\nperformance. 
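A minimal sketch of naive weight merging in the spirit of the NaiveMC/DAMC discussion above: two models sharing one architecture are combined by interpolating their parameters. The checkpoint names are hypothetical, and DAMC's handling of parameter interference is not reproduced.

```python
# Naive parameter merging sketch: interpolate matching weights of two same-architecture models.
import torch

def merge_state_dicts(sd_a, sd_b, weight_a=0.5):
    merged = {}
    for name, tensor_a in sd_a.items():
        tensor_b = sd_b[name]  # assumes identical keys and shapes
        merged[name] = weight_a * tensor_a + (1.0 - weight_a) * tensor_b
    return merged

# Usage (hypothetical checkpoints):
# model.load_state_dict(merge_state_dicts(torch.load("llm_a.pt"), torch.load("llm_b.pt")))
```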
To facilitate research in this area, we propose MCUB, a benchmark\nfor assessing ability of MLLMs to understand inputs from diverse modalities.\nExperiments on this benchmark and four other multimodal understanding tasks\nshow significant improvements over baselines, proving that model composition\ncan create a versatile model capable of processing inputs from multiple\nmodalities.\n","authors":["Chi Chen","Yiyang Du","Zheng Fang","Ziyue Wang","Fuwen Luo","Peng Li","Ming Yan","Ji Zhang","Fei Huang","Maosong Sun","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12750v2.pdf","comment":"ACL2024 Main Conference; Code is available at\n https://github.com/THUNLP-MT/ModelCompose"},{"id":"http://arxiv.org/abs/2404.08368v2","updated":"2024-07-26T09:48:05Z","published":"2024-04-12T10:12:38Z","title":"Automatic Speech Recognition Advancements for Indigenous Languages of\n the Americas","summary":" Indigenous languages are a fundamental legacy in the development of human\ncommunication, embodying the unique identity and culture of local communities\nin America. The Second AmericasNLP (Americas Natural Language Processing)\nCompetition Track 1 of NeurIPS (Neural Information Processing Systems) 2022\nproposed the task of training automatic speech recognition (ASR) systems for\nfive Indigenous languages: Quechua, Guarani, Bribri, Kotiria, and Wa'ikhana. In\nthis paper, we describe the fine-tuning of a state-of-the-art ASR model for\neach target language, using approximately 36.65 h of transcribed speech data\nfrom diverse sources enriched with data augmentation methods. We systematically\ninvestigate, using a Bayesian search, the impact of the different\nhyperparameters on the Wav2vec2.0 XLS-R (Cross-Lingual Speech Representations)\nvariants of 300 M and 1 B parameters. Our findings indicate that data and\ndetailed hyperparameter tuning significantly affect ASR accuracy, but language\ncomplexity determines the final result. The Quechua model achieved the lowest\ncharacter error rate (CER) (12.14), while the Kotiria model, despite having the\nmost extensive dataset during the fine-tuning phase, showed the highest CER\n(36.59). Conversely, with the smallest dataset, the Guarani model achieved a\nCER of 15.59, while Bribri and Wa'ikhana obtained, respectively, CERs of 34.70\nand 35.23. Additionally, Sobol' sensitivity analysis highlighted the crucial\nroles of freeze fine-tuning updates and dropout rates. We release our best\nmodels for each language, marking the first open ASR models for Wa'ikhana and\nKotiria. This work opens avenues for future research to advance ASR techniques\nin preserving minority Indigenous languages\n","authors":["Monica Romero","Sandra Gomez","Ivan G. Torre"],"pdf_url":"https://arxiv.org/pdf/2404.08368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18626v1","updated":"2024-07-26T09:35:36Z","published":"2024-07-26T09:35:36Z","title":"Every Part Matters: Integrity Verification of Scientific Figures Based\n on Multimodal Large Language Models","summary":" This paper tackles a key issue in the interpretation of scientific figures:\nthe fine-grained alignment of text and figures. It advances beyond prior\nresearch that primarily dealt with straightforward, data-driven visualizations\nsuch as bar and pie charts and only offered a basic understanding of diagrams\nthrough captioning and classification. 
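For reference, the character error rate (CER) reported in the ASR entry above is a character-level edit distance normalized by the reference length; a minimal self-contained sketch (multiply by 100 for the percentage-style figures quoted in that abstract):

```python
# Character error rate: Levenshtein distance over characters divided by reference length.
def cer(reference: str, hypothesis: str) -> float:
    r, h = list(reference), list(hypothesis)
    dp = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        dp[i][0] = i
    for j in range(len(h) + 1):
        dp[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)
    return dp[len(r)][len(h)] / max(len(r), 1)

print(cer("wa'ikhana", "waikhana"))  # one deletion over nine reference characters, ~0.11
```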
We introduce a novel task, Figure\nIntegrity Verification, designed to evaluate the precision of technologies in\naligning textual knowledge with visual elements in scientific figures. To\nsupport this, we develop a semi-automated method for constructing a large-scale\ndataset, Figure-seg, specifically designed for this task. Additionally, we\npropose an innovative framework, Every Part Matters (EPM), which leverages\nMultimodal Large Language Models (MLLMs) to not only incrementally improve the\nalignment and verification of text-figure integrity but also enhance integrity\nthrough analogical reasoning. Our comprehensive experiments show that these\ninnovations substantially improve upon existing methods, allowing for more\nprecise and thorough analysis of complex scientific figures. This progress not\nonly enhances our understanding of multimodal technologies but also stimulates\nfurther research and practical applications across fields requiring the\naccurate interpretation of complex visual data.\n","authors":["Xiang Shi","Jiawei Liu","Yinpeng Liu","Qikai Cheng","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2407.18626v1.pdf","comment":"28 pages, 11 figures, under review"},{"id":"http://arxiv.org/abs/2310.16340v2","updated":"2024-07-26T08:44:59Z","published":"2023-10-25T03:53:31Z","title":"RCAgent: Cloud Root Cause Analysis by Autonomous Agents with\n Tool-Augmented Large Language Models","summary":" Large language model (LLM) applications in cloud root cause analysis (RCA)\nhave been actively explored recently. However, current methods are still\nreliant on manual workflow settings and do not unleash LLMs' decision-making\nand environment interaction capabilities. We present RCAgent, a tool-augmented\nLLM autonomous agent framework for practical and privacy-aware industrial RCA\nusage. Running on an internally deployed model rather than GPT families,\nRCAgent is capable of free-form data collection and comprehensive analysis with\ntools. Our framework combines a variety of enhancements, including a unique\nSelf-Consistency for action trajectories, and a suite of methods for context\nmanagement, stabilization, and importing domain knowledge. Our experiments show\nRCAgent's evident and consistent superiority over ReAct across all aspects of\nRCA -- predicting root causes, solutions, evidence, and responsibilities -- and\ntasks covered or uncovered by current rules, as validated by both automated\nmetrics and human evaluations. Furthermore, RCAgent has already been integrated\ninto the diagnosis and issue discovery workflow of the Real-time Compute\nPlatform for Apache Flink of Alibaba Cloud.\n","authors":["Zefan Wang","Zichuan Liu","Yingying Zhang","Aoxiao Zhong","Jihong Wang","Fengbin Yin","Lunting Fan","Lingfei Wu","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2310.16340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15353v2","updated":"2024-07-26T08:36:25Z","published":"2024-07-22T03:44:27Z","title":"Customized Retrieval Augmented Generation and Benchmarking for EDA Tool\n Documentation QA","summary":" Retrieval augmented generation (RAG) enhances the accuracy and reliability of\ngenerative AI models by sourcing factual information from external databases,\nwhich is extensively employed in document-grounded question-answering (QA)\ntasks. Off-the-shelf RAG flows are well pretrained on general-purpose\ndocuments, yet they encounter significant challenges when being applied to\nknowledge-intensive vertical domains, such as electronic design automation\n(EDA). 
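For context, an "off-the-shelf" dense-retrieval RAG flow of the kind referred to above can be as simple as the sketch below; the embedding model and the `call_llm` backend are placeholder assumptions, and none of the domain-specific customizations proposed in the paper are included.

```python
# Generic dense-retrieval RAG sketch for document-grounded QA (placeholders throughout).
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed general-purpose retriever

def answer(question: str, docs: list[str], call_llm, top_k: int = 3) -> str:
    doc_emb = embedder.encode(docs, convert_to_tensor=True)
    q_emb = embedder.encode(question, convert_to_tensor=True)
    hits = util.semantic_search(q_emb, doc_emb, top_k=top_k)[0]
    context = "\n\n".join(docs[h["corpus_id"]] for h in hits)
    return call_llm(f"Answer using only the context below.\n{context}\n\nQ: {question}\nA:")
```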
This paper addresses such issue by proposing a customized RAG framework\nalong with three domain-specific techniques for EDA tool documentation QA,\nincluding a contrastive learning scheme for text embedding model fine-tuning, a\nreranker distilled from proprietary LLM, and a generative LLM fine-tuned with\nhigh-quality domain corpus. Furthermore, we have developed and released a\ndocumentation QA evaluation benchmark, ORD-QA, for OpenROAD, an advanced\nRTL-to-GDSII design platform. Experimental results demonstrate that our\nproposed RAG flow and techniques have achieved superior performance on ORD-QA\nas well as on a commercial tool, compared with state-of-the-arts. The ORD-QA\nbenchmark and the training dataset for our customized RAG flow are open-source\nat https://github.com/lesliepy99/RAG-EDA.\n","authors":["Yuan Pu","Zhuolun He","Tairu Qiu","Haoyuan Wu","Bei Yu"],"pdf_url":"https://arxiv.org/pdf/2407.15353v2.pdf","comment":"Accepted by ICCAD 2024"},{"id":"http://arxiv.org/abs/2311.10777v5","updated":"2024-07-26T08:22:07Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis: Domains,\n Methods, and Trends","summary":" Aspect-based Sentiment Analysis (ABSA) is a fine-grained type of sentiment\nanalysis that identifies aspects and their associated opinions from a given\ntext. With the surge of digital opinionated text data, ABSA gained increasing\npopularity for its ability to mine more detailed and targeted insights. Many\nreview papers on ABSA subtasks and solution methodologies exist, however, few\nfocus on trends over time or systemic issues relating to research application\ndomains, datasets, and solution approaches. To fill the gap, this paper\npresents a Systematic Literature Review (SLR) of ABSA studies with a focus on\ntrends and high-level relationships among these fundamental components. This\nreview is one of the largest SLRs on ABSA. To our knowledge, it is also the\nfirst to systematically examine the interrelations among ABSA research and data\ndistribution across domains, as well as trends in solution paradigms and\napproaches. Our sample includes 727 primary studies screened from 8550 search\nresults without time constraints via an innovative automatic filtering process.\nOur quantitative analysis not only identifies trends in nearly two decades of\nABSA research development but also unveils a systemic lack of dataset and\ndomain diversity as well as domain mismatch that may hinder the development of\nfuture ABSA research. We discuss these findings and their implications and\npropose suggestions for future research.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10515v2","updated":"2024-07-26T08:03:32Z","published":"2024-06-15T05:52:32Z","title":"Reactor Mk.1 performances: MMLU, HumanEval and BBH test results","summary":" The paper presents the performance results of Reactor Mk.1, ARCs flagship\nlarge language model, through a benchmarking process analysis. The model\nutilizes the Lychee AI engine and possesses less than 100 billion parameters,\nresulting in a combination of efficiency and potency. The Reactor Mk.1\noutperformed models such as GPT-4o, Claude Opus, and Llama 3, with achieved\nscores of 92% on the MMLU dataset, 91% on HumanEval dataset, and 88% on BBH\ndataset. 
It excels in both managing difficult jobs and reasoning, establishing itself\nas a prominent AI solution within the current landscape of cutting-edge AI\ntechnology.\n","authors":["TJ Dunham","Henry Syahputra"],"pdf_url":"https://arxiv.org/pdf/2406.10515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18581v1","updated":"2024-07-26T08:03:07Z","published":"2024-07-26T08:03:07Z","title":"Dynamic Language Group-Based MoE: Enhancing Efficiency and Flexibility\n for Code-Switching Speech Recognition","summary":" The Mixture of Experts (MoE) approach is ideally suited for tackling\nmultilingual and code-switching (CS) challenges due to its multi-expert\narchitecture. This work introduces the DLG-MoE, which is optimized for\nbilingual and CS scenarios. Our novel Dynamic Language Group-based MoE layer\nfeatures a language router with shared weights for explicit language modeling,\nwhile independent unsupervised routers within the language group handle\nattributes beyond language. This structure not only enhances expert extension\ncapabilities but also supports dynamic top-k training, allowing for flexible\ninference across various top-k values and improving overall performance. The\nmodel requires no pre-training and supports streaming recognition, achieving\nstate-of-the-art (SOTA) results with unmatched flexibility compared to other\nmethods. The code will be released.\n","authors":["Hukai Huang","Shenghui Lu","Yahui Shan","He Qu","Wenhao Guan","Qingyang Hong","Lin Li"],"pdf_url":"https://arxiv.org/pdf/2407.18581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18562v1","updated":"2024-07-26T07:30:41Z","published":"2024-07-26T07:30:41Z","title":"Learning Robust Named Entity Recognizers From Noisy Data With Retrieval\n Augmentation","summary":" Named entity recognition (NER) models often struggle with noisy inputs, such\nas those with spelling mistakes or errors generated by Optical Character\nRecognition processes, and learning a robust NER model is challenging. Existing\nrobust NER models utilize both noisy text and its corresponding gold text for\ntraining, which is infeasible in many real-world applications in which gold\ntext is not available. In this paper, we consider a more realistic setting in\nwhich only noisy text and its NER labels are available. We propose to retrieve\nrelevant text of the noisy text from a knowledge corpus and use it to enhance\nthe representation of the original noisy input. We design three retrieval\nmethods: sparse retrieval based on lexicon similarity, dense retrieval based on\nsemantic similarity, and self-retrieval based on task-specific text. After\nretrieving relevant text, we concatenate the retrieved text with the original\nnoisy text and encode them with a transformer network, utilizing self-attention\nto enhance the contextual token representations of the noisy text using the\nretrieved text. We further employ a multi-view training framework that improves\nrobust NER without retrieving text during inference. 
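A minimal sketch of the input construction just described: the noisy sentence and its retrieved text are encoded jointly as a sentence pair so self-attention can mix them, and only the noisy-segment token states are kept for tagging. The checkpoint is a placeholder, and the tagging head and multi-view training are omitted.

```python
# Sketch of retrieval-augmented encoding for NER: pair-encode noisy text with retrieved text,
# keep only the first-segment token representations for the (omitted) tagger.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # placeholder encoder
encoder = AutoModel.from_pretrained("bert-base-cased")

def encode_with_retrieval(noisy_text: str, retrieved_text: str) -> torch.Tensor:
    enc = tokenizer(noisy_text, retrieved_text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = encoder(**enc).last_hidden_state[0]      # [seq_len, hidden]
    noisy_mask = enc["token_type_ids"][0] == 0            # first segment = noisy sentence
    return hidden[noisy_mask]                             # states that would feed the tagger

states = encode_with_retrieval("Barack 0bama visited Par1s.",
                               "Barack Obama visited Paris in 2009.")
```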
Experiments show that our\nretrieval-augmented model achieves significant improvements in various noisy\nNER settings.\n","authors":["Chaoyi Ai","Yong Jiang","Shen Huang","Pengjun Xie","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2407.18562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18552v1","updated":"2024-07-26T07:05:04Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Vigya Sharma","Santhosh Malarvannan"],"pdf_url":"https://arxiv.org/pdf/2407.18552v1.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2402.17983v3","updated":"2024-07-26T06:46:19Z","published":"2024-02-28T01:56:00Z","title":"3MVRD: Multimodal Multi-task Multi-teacher Visually-Rich Form Document\n Understanding","summary":" This paper presents a groundbreaking multimodal, multi-task, multi-teacher\njoint-grained knowledge distillation model for visually-rich form document\nunderstanding. The model is designed to leverage insights from both\nfine-grained and coarse-grained levels by facilitating a nuanced correlation\nbetween token and entity representations, addressing the complexities inherent\nin form documents. Additionally, we introduce new inter-grained and\ncross-grained loss functions to further refine diverse multi-teacher knowledge\ndistillation transfer process, presenting distribution gaps and a harmonised\nunderstanding of form documents. 
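As background for the multi-teacher distillation transfer mentioned above, a generic temperature-scaled distillation loss is sketched below; the paper's inter-grained and cross-grained losses are not reproduced here.

```python
# Generic knowledge-distillation loss: temperature-scaled KL divergence between
# teacher and student output distributions.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, T: float = 2.0) -> torch.Tensor:
    p_teacher = F.softmax(teacher_logits / T, dim=-1)
    log_p_student = F.log_softmax(student_logits / T, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)

loss = distillation_loss(torch.randn(8, 10), torch.randn(8, 10))
```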
Through a comprehensive evaluation across\npublicly available form document understanding datasets, our proposed model\nconsistently outperforms existing baselines, showcasing its efficacy in\nhandling the intricate structures and content of visually complex form\ndocuments.\n","authors":["Yihao Ding","Lorenzo Vaiani","Caren Han","Jean Lee","Paolo Garza","Josiah Poon","Luca Cagliero"],"pdf_url":"https://arxiv.org/pdf/2402.17983v3.pdf","comment":"Accepted at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2407.18540v1","updated":"2024-07-26T06:39:35Z","published":"2024-07-26T06:39:35Z","title":"A Universal Prompting Strategy for Extracting Process Model Information\n from Natural Language Text using Large Language Models","summary":" Over the past decade, extensive research efforts have been dedicated to the\nextraction of information from textual process descriptions. Despite the\nremarkable progress witnessed in natural language processing (NLP), information\nextraction within the Business Process Management domain remains predominantly\nreliant on rule-based systems and machine learning methodologies. Data scarcity\nhas so far prevented the successful application of deep learning techniques.\nHowever, the rapid progress in generative large language models (LLMs) makes it\npossible to solve many NLP tasks with very high quality without the need for\nextensive data. Therefore, we systematically investigate the potential of LLMs\nfor extracting information from textual process descriptions, targeting the\ndetection of process elements such as activities and actors, and relations\nbetween them. Using a heuristic algorithm, we demonstrate the suitability of\nthe extracted information for process model generation. Based on a novel\nprompting strategy, we show that LLMs are able to outperform state-of-the-art\nmachine learning approaches with absolute performance improvements of up to 8\\%\n$F_1$ score across three different datasets. We evaluate our prompting strategy\non eight different LLMs, showing it is universally applicable, while also\nanalyzing the impact of certain prompt parts on extraction quality. The number\nof example texts, the specificity of definitions, and the rigour of format\ninstructions are identified as key for improving the accuracy of extracted\ninformation. Our code, prompts, and data are publicly available.\n","authors":["Julian Neuberger","Lars Ackermann","Han van der Aa","Stefan Jablonski"],"pdf_url":"https://arxiv.org/pdf/2407.18540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18538v1","updated":"2024-07-26T06:34:55Z","published":"2024-07-26T06:34:55Z","title":"Towards a Multidimensional Evaluation Framework for Empathetic\n Conversational Systems","summary":" Empathetic Conversational Systems (ECS) are built to respond empathetically\nto the user's emotions and sentiments, regardless of the application domain.\nCurrent ECS studies evaluation approaches are restricted to offline evaluation\nexperiments primarily for gold standard comparison & benchmarking, and user\nevaluation studies for collecting human ratings on specific constructs. These\nmethods are inadequate in measuring the actual quality of empathy in\nconversations. 
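Returning to the process-information extraction entry above, a prompting setup of that general shape (definitions, a worked example, strict format instructions) might look like the sketch below; the wording and the `call_llm` hook are illustrative assumptions, not the paper's prompt.

```python
# Sketch of an extraction prompt with definitions, one example, and a strict output format.
import json

PROMPT_TEMPLATE = """You extract process elements from text.
Definitions: an ACTIVITY is a task being performed; an ACTOR is who performs it.
Example: "The clerk checks the invoice." ->
{{"activities": ["checks the invoice"], "actors": ["clerk"]}}
Return only JSON with keys "activities" and "actors".
Text: "{text}"
"""

def extract_elements(text: str, call_llm) -> dict:
    reply = call_llm(PROMPT_TEMPLATE.format(text=text))  # call_llm is a placeholder backend
    return json.loads(reply)
```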
In this paper, we propose a multidimensional empathy evaluation\nframework with three new methods for measuring empathy at (i) structural level\nusing three empathy-related dimensions, (ii) behavioral level using empathy\nbehavioral types, and (iii) overall level using an empathy lexicon, thereby\nfortifying the evaluation process. Experiments were conducted with the\nstate-of-the-art ECS models and large language models (LLMs) to show the\nframework's usefulness.\n","authors":["Aravind Sesagiri Raamkumar","Siyuan Brandon Loh"],"pdf_url":"https://arxiv.org/pdf/2407.18538v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.18695v2","updated":"2024-07-26T06:34:15Z","published":"2024-02-28T20:22:17Z","title":"Grounding Language Models for Visual Entity Recognition","summary":" We introduce AutoVER, an Autoregressive model for Visual Entity Recognition.\nOur model extends an autoregressive Multi-modal Large Language Model by\nemploying retrieval augmented constrained generation. It mitigates low\nperformance on out-of-domain entities while excelling in queries that require\nvisually-situated reasoning. Our method learns to distinguish similar entities\nwithin a vast label space by contrastively training on hard negative pairs in\nparallel with a sequence-to-sequence objective without an external retriever.\nDuring inference, a list of retrieved candidate answers explicitly guides\nlanguage generation by removing invalid decoding paths. The proposed method\nachieves significant improvements across different dataset splits in the\nrecently proposed Oven-Wiki benchmark. Accuracy on the Entity seen split rises\nfrom 32.7% to 61.5%. It also demonstrates superior performance on the unseen\nand query splits by a substantial double-digit margin.\n","authors":["Zilin Xiao","Ming Gong","Paola Cascante-Bonilla","Xingyao Zhang","Jie Wu","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2402.18695v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2402.07729v2","updated":"2024-07-26T06:30:47Z","published":"2024-02-12T15:41:22Z","title":"AIR-Bench: Benchmarking Large Audio-Language Models via Generative\n Comprehension","summary":" Recently, instruction-following audio-language models have received broad\nattention for human-audio interaction. However, the absence of benchmarks\ncapable of evaluating audio-centric interaction capabilities has impeded\nadvancements in this field. Previous models primarily focus on assessing\ndifferent fundamental tasks, such as Automatic Speech Recognition (ASR), and\nlack an assessment of the open-ended generative capabilities centered around\naudio. Thus, it is challenging to track the progression in the Large\nAudio-Language Models (LALMs) domain and to provide guidance for future\nimprovement. In this paper, we introduce AIR-Bench (\\textbf{A}udio\n\\textbf{I}nst\\textbf{R}uction \\textbf{Bench}mark), the first benchmark designed\nto evaluate the ability of LALMs to understand various types of audio signals\n(including human speech, natural sounds, and music), and furthermore, to\ninteract with humans in the textual format. AIR-Bench encompasses two\ndimensions: \\textit{foundation} and \\textit{chat} benchmarks. The former\nconsists of 19 tasks with approximately 19k single-choice questions, intending\nto inspect the basic single-task ability of LALMs. The latter one contains 2k\ninstances of open-ended question-and-answer data, directly assessing the\ncomprehension of the model on complex audio and its capacity to follow\ninstructions. 
Both benchmarks require the model to generate hypotheses\ndirectly. We design a unified framework that leverages advanced language\nmodels, such as GPT-4, to evaluate the scores of generated hypotheses given the\nmeta-information of the audio. Experimental results demonstrate a high level of\nconsistency between GPT-4-based evaluation and human evaluation. By revealing\nthe limitations of existing LALMs through evaluation results, AIR-Bench can\nprovide insights into the direction of future research.\n","authors":["Qian Yang","Jin Xu","Wenrui Liu","Yunfei Chu","Ziyue Jiang","Xiaohuan Zhou","Yichong Leng","Yuanjun Lv","Zhou Zhao","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.07729v2.pdf","comment":"Code and Data: https://github.com/OFA-Sys/AIR-Bench. Accepted by ACL\n 2024"},{"id":"http://arxiv.org/abs/2407.18525v1","updated":"2024-07-26T06:09:10Z","published":"2024-07-26T06:09:10Z","title":"Is larger always better? Evaluating and prompting large language models\n for non-generative medical tasks","summary":" The use of Large Language Models (LLMs) in medicine is growing, but their\nability to handle both structured Electronic Health Record (EHR) data and\nunstructured clinical notes is not well-studied. This study benchmarks various\nmodels, including GPT-based LLMs, BERT-based models, and traditional clinical\npredictive models, for non-generative medical tasks utilizing renowned\ndatasets. We assessed 14 language models (9 GPT-based and 5 BERT-based) and 7\ntraditional predictive models using the MIMIC dataset (ICU patient records) and\nthe TJH dataset (early COVID-19 EHR data), focusing on tasks such as mortality\nand readmission prediction, disease hierarchy reconstruction, and biomedical\nsentence matching, comparing both zero-shot and finetuned performance. Results\nindicated that LLMs exhibited robust zero-shot predictive capabilities on\nstructured EHR data when using well-designed prompting strategies, frequently\nsurpassing traditional models. However, for unstructured medical texts, LLMs\ndid not outperform finetuned BERT models, which excelled in both supervised and\nunsupervised tasks. Consequently, while LLMs are effective for zero-shot\nlearning on structured data, finetuned BERT models are more suitable for\nunstructured texts, underscoring the importance of selecting models based on\nspecific task requirements and data characteristics to optimize the application\nof NLP technology in healthcare.\n","authors":["Yinghao Zhu","Junyi Gao","Zixiang Wang","Weibin Liao","Xiaochen Zheng","Lifang Liang","Yasha Wang","Chengwei Pan","Ewen M. Harrison","Liantao Ma"],"pdf_url":"https://arxiv.org/pdf/2407.18525v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.01713"},{"id":"http://arxiv.org/abs/2310.15469v3","updated":"2024-07-26T04:43:19Z","published":"2023-10-24T02:48:19Z","title":"The Janus Interface: How Fine-Tuning in Large Language Models Amplifies\n the Privacy Risks","summary":" The rapid advancements of large language models (LLMs) have raised public\nconcerns about the privacy leakage of personally identifiable information (PII)\nwithin their extensive training datasets. Recent studies have demonstrated that\nan adversary could extract highly sensitive privacy data from the training data\nof LLMs with carefully designed prompts. However, these attacks suffer from the\nmodel's tendency to hallucinate and catastrophic forgetting (CF) in the\npre-training stage, rendering the veracity of divulged PIIs negligible. 
In our\nresearch, we propose a novel attack, Janus, which exploits the fine-tuning\ninterface to recover forgotten PIIs from the pre-training data in LLMs. We\nformalize the privacy leakage problem in LLMs and explain why forgotten PIIs\ncan be recovered through empirical analysis on open-source language models.\nBased upon these insights, we evaluate the performance of Janus on both\nopen-source language models and two latest LLMs, i.e., GPT-3.5-Turbo and\nLLaMA-2-7b. Our experiment results show that Janus amplifies the privacy risks\nby over 10 times in comparison with the baseline and significantly outperforms\nthe state-of-the-art privacy extraction attacks including prefix attacks and\nin-context learning (ICL). Furthermore, our analysis validates that existing\nfine-tuning APIs provided by OpenAI and Azure AI Studio are susceptible to our\nJanus attack, allowing an adversary to conduct such an attack at a low cost.\n","authors":["Xiaoyi Chen","Siyuan Tang","Rui Zhu","Shijun Yan","Lei Jin","Zihao Wang","Liya Su","Zhikun Zhang","XiaoFeng Wang","Haixu Tang"],"pdf_url":"https://arxiv.org/pdf/2310.15469v3.pdf","comment":"This work has been accepted by CCS 2024"},{"id":"http://arxiv.org/abs/2407.18501v1","updated":"2024-07-26T04:18:36Z","published":"2024-07-26T04:18:36Z","title":"The formation of perceptual space in early phonetic acquisition: a\n cross-linguistic modeling approach","summary":" This study investigates how learners organize perceptual space in early\nphonetic acquisition by advancing previous studies in two key aspects. Firstly,\nit examines the shape of the learned hidden representation as well as its\nability to categorize phonetic categories. Secondly, it explores the impact of\ntraining models on context-free acoustic information, without involving\ncontextual cues, on phonetic acquisition, closely mimicking the early language\nlearning stage. Using a cross-linguistic modeling approach, autoencoder models\nare trained on English and Mandarin and evaluated in both native and non-native\nconditions, following experimental conditions used in infant language\nperception studies. The results demonstrate that unsupervised bottom-up\ntraining on context-free acoustic information leads to comparable learned\nrepresentations of perceptual space between native and non-native conditions\nfor both English and Mandarin, resembling the early stage of universal\nlistening in infants. These findings provide insights into the organization of\nperceptual space during early phonetic acquisition and contribute to our\nunderstanding of the formation and representation of phonetic categories.\n","authors":["Frank Lihui Tan","Youngah Do"],"pdf_url":"https://arxiv.org/pdf/2407.18501v1.pdf","comment":"51 pages"},{"id":"http://arxiv.org/abs/2407.18498v1","updated":"2024-07-26T04:13:43Z","published":"2024-07-26T04:13:43Z","title":"A Reliable Common-Sense Reasoning Socialbot Built Using LLMs and\n Goal-Directed ASP","summary":" The development of large language models (LLMs), such as GPT, has enabled the\nconstruction of several socialbots, like ChatGPT, that are receiving a lot of\nattention for their ability to simulate a human conversation. However, the\nconversation is not guided by a goal and is hard to control. In addition,\nbecause LLMs rely more on pattern recognition than deductive reasoning, they\ncan give confusing answers and have difficulty integrating multiple topics into\na cohesive response. 
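Returning to the phonetic-acquisition entry above, a minimal autoencoder over context-free acoustic frames can be sketched as follows; the 39-dimensional MFCC input and layer sizes are assumptions rather than the study's exact configuration.

```python
# Minimal autoencoder sketch over acoustic feature frames (dimensions are assumptions).
import torch
import torch.nn as nn

class AcousticAutoencoder(nn.Module):
    def __init__(self, input_dim: int = 39, hidden_dim: int = 8):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 64), nn.ReLU(), nn.Linear(64, hidden_dim))
        self.decoder = nn.Sequential(nn.Linear(hidden_dim, 64), nn.ReLU(), nn.Linear(64, input_dim))

    def forward(self, x):
        z = self.encoder(x)              # learned low-dimensional representation
        return self.decoder(z), z

model = AcousticAutoencoder()
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
frames = torch.randn(256, 39)            # stand-in for MFCC frames
optim.zero_grad()
recon, _ = model(frames)
loss = nn.functional.mse_loss(recon, frames)
loss.backward()
optim.step()
```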
These limitations often lead the LLM to deviate from the\nmain topic to keep the conversation interesting. We propose AutoCompanion, a\nsocialbot that uses an LLM model to translate natural language into predicates\n(and vice versa) and employs commonsense reasoning based on Answer Set\nProgramming (ASP) to hold a social conversation with a human. In particular, we\nrely on s(CASP), a goal-directed implementation of ASP as the backend. This\npaper presents the framework design and how an LLM is used to parse user\nmessages and generate a response from the s(CASP) engine output. To validate\nour proposal, we describe (real) conversations in which the chatbot's goal is\nto keep the user entertained by talking about movies and books, and s(CASP)\nensures (i) correctness of answers, (ii) coherence (and precision) during the\nconversation, which it dynamically regulates to achieve its specific purpose,\nand (iii) no deviation from the main topic.\n","authors":["Yankai Zeng","Abhiramon Rajashekharan","Kinjal Basu","Huaduo Wang","Joaquín Arias","Gopal Gupta"],"pdf_url":"https://arxiv.org/pdf/2407.18498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14888v3","updated":"2024-07-26T04:12:16Z","published":"2024-03-21T23:48:21Z","title":"AutoRE: Document-Level Relation Extraction with Large Language Models","summary":" Large Language Models (LLMs) have demonstrated exceptional abilities in\ncomprehending and generating text, motivating numerous researchers to utilize\nthem for Information Extraction (IE) purposes, including Relation Extraction\n(RE). Nonetheless, most existing methods are predominantly designed for\nSentence-level Relation Extraction (SentRE) tasks, which typically encompass a\nrestricted set of relations and triplet facts within a single sentence.\nFurthermore, certain approaches resort to treating relations as candidate\nchoices integrated into prompt templates, leading to inefficient processing and\nsuboptimal performance when tackling Document-Level Relation Extraction (DocRE)\ntasks, which entail handling multiple relations and triplet facts distributed\nacross a given document, posing distinct challenges. To overcome these\nlimitations, we introduce AutoRE, an end-to-end DocRE model that adopts a novel\nRE extraction paradigm named RHF (Relation-Head-Facts). Unlike existing\napproaches, AutoRE does not rely on the assumption of known relation options,\nmaking it more reflective of real-world scenarios. Additionally, we have\ndeveloped an easily extensible RE framework using a Parameters Efficient Fine\nTuning (PEFT) algorithm (QLoRA). Our experiments on the RE-DocRED dataset\nshowcase AutoRE's best performance, achieving state-of-the-art results,\nsurpassing TAG by 10.03\\% and 9.03\\% respectively on the dev and test set. 
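The PEFT (QLoRA) setup mentioned in the entry above typically looks like the Hugging Face transformers + peft sketch below; the base checkpoint and hyperparameters are placeholders, not AutoRE's configuration.

```python
# QLoRA-style sketch: load a 4-bit quantized base model and attach low-rank adapters.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",  # placeholder checkpoint
                                            quantization_config=bnb)

lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                  target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(base, lora)
model.print_trainable_parameters()  # only the low-rank adapters are trained
```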
The\ncode is available at https://github.com/THUDM/AutoRE and the demonstration\nvideo is provided at https://www.youtube.com/watch?v=IhKRsZUAxKk.\n","authors":["Lilong Xue","Dan Zhang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2403.14888v3.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.18496v1","updated":"2024-07-26T04:01:27Z","published":"2024-07-26T04:01:27Z","title":"Towards More Accurate Prediction of Human Empathy and Emotion in Text\n and Multi-turn Conversations by Combining Advanced NLP, Transformers-based\n Networks, and Linguistic Methodologies","summary":" Based on the WASSA 2022 Shared Task on Empathy Detection and Emotion\nClassification, we predict the level of empathic concern and personal distress\ndisplayed in essays. For the first stage of this project we implemented a\nFeed-Forward Neural Network using sentence-level embeddings as features. We\nexperimented with four different embedding models for generating the inputs to\nthe neural network. The subsequent stage builds upon the previous work and we\nhave implemented three types of revisions. The first revision focuses on the\nenhancements to the model architecture and the training approach. The second\nrevision focuses on handling class imbalance using stratified data sampling.\nThe third revision focuses on leveraging lexical resources, where we apply four\ndifferent resources to enrich the features associated with the dataset. During\nthe final stage of this project, we have created the final end-to-end system\nfor the primary task using an ensemble of models to revise primary task\nperformance. Additionally, as part of the final stage, these approaches have\nbeen adapted to the WASSA 2023 Shared Task on Empathy Emotion and Personality\nDetection in Interactions, in which the empathic concern, emotion polarity, and\nemotion intensity in dyadic text conversations are predicted.\n","authors":["Manisha Singh","Divy Sharma","Alonso Ma","Nora Goldfine"],"pdf_url":"https://arxiv.org/pdf/2407.18496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18483v1","updated":"2024-07-26T03:23:31Z","published":"2024-07-26T03:23:31Z","title":"A Role-specific Guided Large Language Model for Ophthalmic Consultation\n Based on Stylistic Differentiation","summary":" Ophthalmology consultations are crucial for diagnosing, treating, and\npreventing eye diseases. However, the growing demand for consultations exceeds\nthe availability of ophthalmologists. By leveraging large pre-trained language\nmodels, we can design effective dialogues for specific scenarios, aiding in\nconsultations. Traditional fine-tuning strategies for question-answering tasks\nare impractical due to increasing model size and often ignoring patient-doctor\nrole function during consultations. In this paper, we propose EyeDoctor, an\nophthalmic medical questioning large language model that enhances accuracy\nthrough doctor-patient role perception guided and an augmented knowledge base\nwith external disease information. Experimental results show EyeDoctor achieves\nhigher question-answering precision in ophthalmology consultations. Notably,\nEyeDoctor demonstrated a 7.25% improvement in Rouge-1 scores and a 10.16%\nimprovement in F1 scores on multi-round datasets compared to second best model\nChatGPT, highlighting the importance of doctor-patient role differentiation and\ndynamic knowledge base expansion for intelligent medical consultations. 
EyeDoc\nalso serves as a freely available web-based service, and the source code is\navailable at https://github.com/sperfu/EyeDoc.\n","authors":["Laiyi Fu","Binbin Fan","Hongkai Du","Yanxiang Feng","Chunhua Li","Huping Song"],"pdf_url":"https://arxiv.org/pdf/2407.18483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18479v1","updated":"2024-07-26T03:13:47Z","published":"2024-07-26T03:13:47Z","title":"Multi-turn Response Selection with Commonsense-enhanced Language Models","summary":" As a branch of advanced artificial intelligence, dialogue systems are\nprospering. Multi-turn response selection is a general research problem in\ndialogue systems. With the assistance of background information and pre-trained\nlanguage models, the performance of state-of-the-art methods on this problem\ngains impressive improvement. However, existing studies neglect the importance\nof external commonsense knowledge. Hence, we design a Siamese network where a\npre-trained Language model merges with a Graph neural network (SinLG). SinLG\ntakes advantage of Pre-trained Language Models (PLMs) to catch the word\ncorrelations in the context and response candidates and utilizes a Graph Neural\nNetwork (GNN) to reason helpful common sense from an external knowledge graph.\nThe GNN aims to assist the PLM in fine-tuning, and arousing its related\nmemories to attain better performance. Specifically, we first extract related\nconcepts as nodes from an external knowledge graph to construct a subgraph with\nthe context response pair as a super node for each sample. Next, we learn two\nrepresentations for the context response pair via both the PLM and GNN. A\nsimilarity loss between the two representations is utilized to transfer the\ncommonsense knowledge from the GNN to the PLM. Then only the PLM is used to\ninfer online so that efficiency can be guaranteed. Finally, we conduct\nextensive experiments on two variants of the PERSONA-CHAT dataset, which proves\nthat our solution can not only improve the performance of the PLM but also\nachieve an efficient inference.\n","authors":["Yuandong Wang","Xuhui Ren","Tong Chen","Yuxiao Dong","Nguyen Quoc Viet Hung","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2407.18479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18471v1","updated":"2024-07-26T02:44:55Z","published":"2024-07-26T02:44:55Z","title":"Constructing the CORD-19 Vaccine Dataset","summary":" We introduce a new dataset, 'CORD-19-Vaccination', to cater to scientists\nspecifically looking into COVID-19 vaccine-related research. This dataset is\nextracted from the CORD-19 dataset [Wang et al., 2020] and augmented with new\ncolumns for language detail, author demography, keywords, and topic per paper.\nFacebook's fastText model is used to identify languages [Joulin et al., 2016].\nTo establish author demography (author affiliation, lab/institution location,\nand lab/institution country columns) we processed the JSON file for each paper\nand then further enhanced using Google's search API to determine country\nvalues. 'Yake' was used to extract keywords from the title, abstract, and body\nof each paper and the LDA (Latent Dirichlet Allocation) algorithm was used to\nadd topic information [Campos et al., 2020, 2018a,b]. To evaluate the dataset,\nwe demonstrate a question-answering task like the one used in the CORD-19\nKaggle challenge [Goldbloom et al., 2022]. For further evaluation, sequential\nsentence classification was performed on each paper's abstract using the model\nfrom Dernoncourt et al. [2016]. 
We partially hand annotated the training\ndataset and used a pre-trained BERT-PubMed layer. 'CORD- 19-Vaccination'\ncontains 30k research papers and can be immensely valuable for NLP research\nsuch as text mining, information extraction, and question answering, specific\nto the domain of COVID-19 vaccine research.\n","authors":["Manisha Singh","Divy Sharma","Alonso Ma","Bridget Tyree","Margaret Mitchell"],"pdf_url":"https://arxiv.org/pdf/2407.18471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12393v3","updated":"2024-07-26T02:34:14Z","published":"2024-07-17T08:13:22Z","title":"PersLLM: A Personified Training Approach for Large Language Models","summary":" Large language models exhibit aspects of human-level intelligence that\ncatalyze their application as human-like agents in domains such as social\nsimulations, human-machine interactions, and collaborative multi-agent systems.\nHowever, the absence of distinct personalities, such as displaying ingratiating\nbehaviors, inconsistent opinions, and uniform response patterns, diminish LLMs\nutility in practical applications. Addressing this, the development of\npersonality traits in LLMs emerges as a crucial area of research to unlock\ntheir latent potential. Existing methods to personify LLMs generally involve\nstrategies like employing stylized training data for instruction tuning or\nusing prompt engineering to simulate different personalities. These methods\nonly capture superficial linguistic styles instead of the core of personalities\nand are therefore not stable. In this study, we propose PersLLM, integrating\npsychology-grounded principles of personality: social practice, consistency,\nand dynamic development, into a comprehensive training methodology. We\nincorporate personality traits directly into the model parameters, enhancing\nthe model's resistance to induction, promoting consistency, and supporting the\ndynamic evolution of personality. Single-agent evaluation validates our\nmethod's superiority, as it produces responses more aligned with reference\npersonalities compared to other approaches. Case studies for multi-agent\ncommunication highlight its benefits in enhancing opinion consistency within\nindividual agents and fostering collaborative creativity among multiple agents\nin dialogue contexts, potentially benefiting human simulation and multi-agent\ncooperation. Additionally, human-agent interaction evaluations indicate that\nour personified models significantly enhance interactive experiences,\nunderscoring the practical implications of our research.\n","authors":["Zheni Zeng","Jiayi Chen","Huimin Chen","Yukun Yan","Yuxuan Chen","Zhenghao Liu","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2407.12393v3.pdf","comment":"10 pages for main text, 5 figures"},{"id":"http://arxiv.org/abs/2407.18461v1","updated":"2024-07-26T02:03:23Z","published":"2024-07-26T02:03:23Z","title":"Enhancing Dysarthric Speech Recognition for Unseen Speakers via\n Prototype-Based Adaptation","summary":" Dysarthric speech recognition (DSR) presents a formidable challenge due to\ninherent inter-speaker variability, leading to severe performance degradation\nwhen applying DSR models to new dysarthric speakers. Traditional speaker\nadaptation methodologies typically involve fine-tuning models for each speaker,\nbut this strategy is cost-prohibitive and inconvenient for disabled users,\nrequiring substantial data collection. 
To address this issue, we introduce a\nprototype-based approach that markedly improves DSR performance for unseen\ndysarthric speakers without additional fine-tuning. Our method employs a\nfeature extractor trained with HuBERT to produce per-word prototypes that\nencapsulate the characteristics of previously unseen speakers. These prototypes\nserve as the basis for classification. Additionally, we incorporate supervised\ncontrastive learning to refine feature extraction. By enhancing representation\nquality, we further improve DSR performance, enabling effective personalized\nDSR. We release our code at https://github.com/NKU-HLT/PB-DSR.\n","authors":["Shiyao Wang","Shiwan Zhao","Jiaming Zhou","Aobo Kong","Yong Qin"],"pdf_url":"https://arxiv.org/pdf/2407.18461v1.pdf","comment":"accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2402.05140v3","updated":"2024-07-26T01:28:16Z","published":"2024-02-06T20:11:54Z","title":"Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains","summary":" Large Language Models (LLMs) have demonstrated remarkable proficiency in\nunderstanding and generating natural language. However, their capabilities wane\nin highly specialized domains underrepresented in the pretraining corpus, such\nas physical and biomedical sciences. This work explores how to repurpose\ngeneral LLMs into effective task solvers for specialized domains. We introduce\na novel, model-agnostic framework for learning custom input tags, which are\nparameterized as continuous vectors appended to the LLM's embedding layer, to\ncondition the LLM. We design two types of input tags: domain tags are used to\ndelimit specialized representations (e.g., chemical formulas) and provide\ndomain-relevant context; function tags are used to represent specific functions\n(e.g., predicting molecular properties) and compress function-solving\ninstructions. We develop a three-stage protocol to learn these tags using\nauxiliary data and domain knowledge. By explicitly disentangling task domains\nfrom task functions, our method enables zero-shot generalization to unseen\nproblems through diverse combinations of the input tags. It also boosts LLM's\nperformance in various specialized domains, such as predicting protein or\nchemical properties and modeling drug-target interactions, outperforming expert\nmodels tailored to these tasks.\n","authors":["Junhong Shen","Neil Tenenholtz","James Brian Hall","David Alvarez-Melis","Nicolo Fusi"],"pdf_url":"https://arxiv.org/pdf/2402.05140v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2407.18454v1","updated":"2024-07-26T01:21:25Z","published":"2024-07-26T01:21:25Z","title":"Fairness Definitions in Language Models Explained","summary":" Language Models (LMs) have demonstrated exceptional performance across\nvarious Natural Language Processing (NLP) tasks. Despite these advancements,\nLMs can inherit and amplify societal biases related to sensitive attributes\nsuch as gender and race, limiting their adoption in real-world applications.\nTherefore, fairness has been extensively explored in LMs, leading to the\nproposal of various fairness notions. However, the lack of clear agreement on\nwhich fairness definition to apply in specific contexts (\\textit{e.g.,}\nmedium-sized LMs versus large-sized LMs) and the complexity of understanding\nthe distinctions between these definitions can create confusion and impede\nfurther progress. To this end, this paper proposes a systematic survey that\nclarifies the definitions of fairness as they apply to LMs. 
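Returning to the prototype-based adaptation entry above, the core idea of classifying by nearest per-word prototype can be sketched as follows; the feature dimensionality and vocabulary are illustrative, and the supervised contrastive training of the feature extractor is omitted.

```python
# Prototype-based classification sketch: per-word prototypes are mean feature vectors,
# and a test feature is assigned to the most cosine-similar prototype.
import torch
import torch.nn.functional as F

def build_prototypes(features_by_word: dict[str, torch.Tensor]) -> tuple[list[str], torch.Tensor]:
    words = sorted(features_by_word)
    protos = torch.stack([features_by_word[w].mean(dim=0) for w in words])
    return words, F.normalize(protos, dim=-1)

def classify(feature: torch.Tensor, words: list[str], protos: torch.Tensor) -> str:
    sims = protos @ F.normalize(feature, dim=-1)
    return words[int(sims.argmax())]

words, protos = build_prototypes({"yes": torch.randn(10, 768), "no": torch.randn(12, 768)})
print(classify(torch.randn(768), words, protos))
```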
Specifically, we\nbegin with a brief introduction to LMs and fairness in LMs, followed by a\ncomprehensive, up-to-date overview of existing fairness notions in LMs and the\nintroduction of a novel taxonomy that categorizes these concepts based on their\nfoundational principles and operational distinctions. We further illustrate\neach definition through experiments, showcasing their practical implications\nand outcomes. Finally, we discuss current research challenges and open\nquestions, aiming to foster innovative ideas and advance the field. The\nimplementation and additional resources are publicly available at\nhttps://github.com/LavinWong/Fairness-in-Large-Language-Models/tree/main/definitions.\n","authors":["Thang Viet Doan","Zhibo Chu","Zichong Wang","Wenbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18442v1","updated":"2024-07-26T00:48:28Z","published":"2024-07-26T00:48:28Z","title":"Guidance-Based Prompt Data Augmentation in Specialized Domains for Named\n Entity Recognition","summary":" While the abundance of rich and vast datasets across numerous fields has\nfacilitated the advancement of natural language processing, sectors in need of\nspecialized data types continue to struggle with the challenge of finding\nquality data. Our study introduces a novel guidance data augmentation technique\nutilizing abstracted context and sentence structures to produce varied\nsentences while maintaining context-entity relationships, addressing data\nscarcity challenges. By fostering a closer relationship between context,\nsentence structure, and role of entities, our method enhances data\naugmentation's effectiveness. Consequently, by showcasing diversification in\nboth entity-related vocabulary and overall sentence structure, and\nsimultaneously improving the training performance of named entity recognition\ntask.\n","authors":["Hyeonseok Kang","Hyein Seo","Jeesu Jung","Sangkeun Jung","Du-Seong Chang","Riwoo Chung"],"pdf_url":"https://arxiv.org/pdf/2407.18442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10254v2","updated":"2024-07-26T23:50:35Z","published":"2023-09-19T02:20:10Z","title":"LLM Platform Security: Applying a Systematic Evaluation Framework to\n OpenAI's ChatGPT Plugins","summary":" Large language model (LLM) platforms, such as ChatGPT, have recently begun\noffering an app ecosystem to interface with third-party services on the\ninternet. While these apps extend the capabilities of LLM platforms, they are\ndeveloped by arbitrary third parties and thus cannot be implicitly trusted.\nApps also interface with LLM platforms and users using natural language, which\ncan have imprecise interpretations. In this paper, we propose a framework that\nlays a foundation for LLM platform designers to analyze and improve the\nsecurity, privacy, and safety of current and future third-party integrated LLM\nplatforms. Our framework is a formulation of an attack taxonomy that is\ndeveloped by iteratively exploring how LLM platform stakeholders could leverage\ntheir capabilities and responsibilities to mount attacks against each other. As\npart of our iterative process, we apply our framework in the context of\nOpenAI's plugin (apps) ecosystem. We uncover plugins that concretely\ndemonstrate the potential for the types of issues that we outline in our attack\ntaxonomy. 
We conclude by discussing novel challenges and by providing\nrecommendations to improve the security, privacy, and safety of present and\nfuture LLM-based computing platforms.\n","authors":["Umar Iqbal","Tadayoshi Kohno","Franziska Roesner"],"pdf_url":"https://arxiv.org/pdf/2309.10254v2.pdf","comment":"To appear in the proceedings of the 7th AAAI / ACM Conference on AI,\n Ethics, and Society (AIES), October 2024"},{"id":"http://arxiv.org/abs/2403.00815v3","updated":"2024-07-26T23:24:39Z","published":"2024-02-25T23:10:20Z","title":"RAM-EHR: Retrieval Augmentation Meets Clinical Predictions on Electronic\n Health Records","summary":" We present RAM-EHR, a Retrieval AugMentation pipeline to improve clinical\npredictions on Electronic Health Records (EHRs). RAM-EHR first collects\nmultiple knowledge sources, converts them into text format, and uses dense\nretrieval to obtain information related to medical concepts. This strategy\naddresses the difficulties associated with complex names for the concepts.\nRAM-EHR then augments the local EHR predictive model co-trained with\nconsistency regularization to capture complementary information from patient\nvisits and summarized knowledge. Experiments on two EHR datasets show the\nefficacy of RAM-EHR over previous knowledge-enhanced baselines (3.4% gain in\nAUROC and 7.2% gain in AUPR), emphasizing the effectiveness of the summarized\nknowledge from RAM-EHR for clinical prediction tasks. The code will be\npublished at \\url{https://github.com/ritaranx/RAM-EHR}.\n","authors":["Ran Xu","Wenqi Shi","Yue Yu","Yuchen Zhuang","Bowen Jin","May D. Wang","Joyce C. Ho","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.00815v3.pdf","comment":"ACL 2024 (Oral)"},{"id":"http://arxiv.org/abs/2407.03190v2","updated":"2024-07-26T21:51:19Z","published":"2024-06-15T02:36:11Z","title":"Cutting through the noise to motivate people: A comprehensive analysis\n of COVID-19 social media posts de/motivating vaccination","summary":" The COVID-19 pandemic exposed significant weaknesses in the healthcare\ninformation system. The overwhelming volume of misinformation on social media\nand other socioeconomic factors created extraordinary challenges to motivate\npeople to take proper precautions and get vaccinated. In this context, our work\nexplored a novel direction by analyzing an extensive dataset collected over two\nyears, identifying the topics de/motivating the public about COVID-19\nvaccination. We analyzed these topics based on time, geographic location, and\npolitical orientation. We noticed that while the motivating topics remain the\nsame over time and geographic location, the demotivating topics change rapidly.\nWe also identified that intrinsic motivation, rather than external mandate, is\nmore advantageous to inspire the public. This study addresses scientific\ncommunication and public motivation in social media. It can help public health\nofficials, policymakers, and social media platforms develop more effective\nmessaging strategies to cut through the noise of misinformation and educate the\npublic about scientific findings.\n","authors":["Ashiqur Rahman","Ehsan Mohammadi","Hamed Alhoori"],"pdf_url":"https://arxiv.org/pdf/2407.03190v2.pdf","comment":"51 pages, 13 figures, 12 tables. 
Accepted at Natural Language\n Processing Journal"},{"id":"http://arxiv.org/abs/2405.09679v2","updated":"2024-07-26T21:23:14Z","published":"2024-05-15T19:44:54Z","title":"Simulating Policy Impacts: Developing a Generative Scenario Writing\n Method to Evaluate the Perceived Effects of Regulation","summary":" The rapid advancement of AI technologies yields numerous future impacts on\nindividuals and society. Policymakers are tasked to react quickly and establish\npolicies that mitigate those impacts. However, anticipating the effectiveness\nof policies is a difficult task, as some impacts might only be observable in\nthe future and respective policies might not be applicable to the future\ndevelopment of AI. In this work we develop a method for using large language\nmodels (LLMs) to evaluate the efficacy of a given piece of policy at mitigating\nspecified negative impacts. We do so by using GPT-4 to generate scenarios both\npre- and post-introduction of policy and translating these vivid stories into\nmetrics based on human perceptions of impacts. We leverage an already\nestablished taxonomy of impacts of generative AI in the media environment to\ngenerate a set of scenario pairs both mitigated and non-mitigated by the\ntransparency policy in Article 50 of the EU AI Act. We then run a user study\n(n=234) to evaluate these scenarios across four risk-assessment dimensions:\nseverity, plausibility, magnitude, and specificity to vulnerable populations.\nWe find that this transparency legislation is perceived to be effective at\nmitigating harms in areas such as labor and well-being, but largely ineffective\nin areas such as social cohesion and security. Through this case study we\ndemonstrate the efficacy of our method as a tool to iterate on the\neffectiveness of policy for mitigating various negative impacts. We expect this\nmethod to be useful to researchers or other stakeholders who want to brainstorm\nthe potential utility of different pieces of policy or other mitigation\nstrategies.\n","authors":["Julia Barnett","Kimon Kieslich","Nicholas Diakopoulos"],"pdf_url":"https://arxiv.org/pdf/2405.09679v2.pdf","comment":"To be published in the proceedings of the Seventh AAAI/ACM Conference\n on AI, Ethics, and Society"},{"id":"http://arxiv.org/abs/2407.19089v1","updated":"2024-07-26T21:10:50Z","published":"2024-07-26T21:10:50Z","title":"Many-Shot In-Context Learning for Molecular Inverse Design","summary":" Large Language Models (LLMs) have demonstrated great performance in few-shot\nIn-Context Learning (ICL) for a variety of generative and discriminative\nchemical design tasks. The newly expanded context windows of LLMs can further\nimprove ICL capabilities for molecular inverse design and lead optimization. To\ntake full advantage of these capabilities we developed a new semi-supervised\nlearning method that overcomes the lack of experimental data available for\nmany-shot ICL. Our approach involves iterative inclusion of LLM generated\nmolecules with high predicted performance, along with experimental data. We\nfurther integrated our method in a multi-modal LLM which allows for the\ninteractive modification of generated molecular structures using text\ninstructions. 
As we show, the new method greatly improves upon existing ICL\nmethods for molecular design while being accessible and easy to use for\nscientists.\n","authors":["Saeed Moayedpour","Alejandro Corrochano-Navarro","Faryad Sahneh","Shahriar Noroozizadeh","Alexander Koetter","Jiri Vymetal","Lorenzo Kogler-Anele","Pablo Mas","Yasser Jangjou","Sizhen Li","Michael Bailey","Marc Bianciotto","Hans Matter","Christoph Grebner","Gerhard Hessler","Ziv Bar-Joseph","Sven Jager"],"pdf_url":"https://arxiv.org/pdf/2407.19089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19056v1","updated":"2024-07-26T19:27:17Z","published":"2024-07-26T19:27:17Z","title":"OfficeBench: Benchmarking Language Agents across Multiple Applications\n for Office Automation","summary":" Office automation significantly enhances human productivity by automatically\nfinishing routine tasks in the workflow. Beyond the basic information\nextraction studied in much of the prior document AI literature, the office\nautomation research should be extended to more realistic office tasks which\nrequire to integrate various information sources in the office system and\nproduce outputs through a series of decision-making processes. We introduce\nOfficeBench, one of the first office automation benchmarks for evaluating\ncurrent LLM agents' capability to address office tasks in realistic office\nworkflows. OfficeBench requires LLM agents to perform feasible long-horizon\nplanning, proficiently switch between applications in a timely manner, and\naccurately ground their actions within a large combined action space, based on\nthe contextual demands of the workflow. Applying our customized evaluation\nmethods on each task, we find that GPT-4 Omni achieves the highest pass rate of\n47.00%, demonstrating a decent performance in handling office tasks. However,\nthis is still far below the human performance and accuracy standards required\nby real-world office workflows. We further observe that most issues are related\nto operation redundancy and hallucinations, as well as limitations in switching\nbetween multiple applications, which may provide valuable insights for\ndeveloping effective agent frameworks for office automation.\n","authors":["Zilong Wang","Yuedong Cui","Li Zhong","Zimin Zhang","Da Yin","Bill Yuchen Lin","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2407.19056v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.19041v1","updated":"2024-07-26T18:46:39Z","published":"2024-07-26T18:46:39Z","title":"Optimizing Numerical Estimation and Operational Efficiency in the Legal\n Domain through Large Language Models","summary":" The legal landscape encompasses a wide array of lawsuit types, presenting\nlawyers with challenges in delivering timely and accurate information to\nclients, particularly concerning critical aspects like potential imprisonment\nduration or financial repercussions. Compounded by the scarcity of legal\nexperts, there's an urgent need to enhance the efficiency of traditional legal\nworkflows. Recent advances in deep learning, especially Large Language Models\n(LLMs), offer promising solutions to this challenge. Leveraging LLMs'\nmathematical reasoning capabilities, we propose a novel approach integrating\nLLM-based methodologies with specially designed prompts to address precision\nrequirements in legal Artificial Intelligence (LegalAI) applications. 
The\nproposed work seeks to bridge the gap between traditional legal practices and\nmodern technological advancements, paving the way for a more accessible,\nefficient, and equitable legal system. To validate this method, we introduce a\ncurated dataset tailored to precision-oriented LegalAI tasks, serving as a\nbenchmark for evaluating LLM-based approaches. Extensive experimentation\nconfirms the efficacy of our methodology in generating accurate numerical\nestimates within the legal domain, emphasizing the role of LLMs in streamlining\nlegal processes and meeting the evolving demands of LegalAI.\n","authors":["Jia-Hong Huang","Chao-Chun Yang","Yixian Shen","Alessio M. Pacces","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2407.19041v1.pdf","comment":"The paper has been accepted by the 33rd ACM International Conference\n on Information and Knowledge Management (CIKM) in 2024"},{"id":"http://arxiv.org/abs/2307.06290v3","updated":"2024-07-26T18:09:11Z","published":"2023-07-12T16:37:31Z","title":"Instruction Mining: Instruction Data Selection for Tuning Large Language\n Models","summary":" Large language models (LLMs) are initially pretrained for broad capabilities\nand then finetuned with instruction-following datasets to improve their\nperformance in interacting with humans. Despite advances in finetuning, a\nstandardized guideline for selecting high-quality datasets to optimize this\nprocess remains elusive. In this paper, we first propose InstructMining, an\ninnovative method designed for automatically selecting premium\ninstruction-following data for finetuning LLMs. Specifically, InstructMining\nutilizes natural language indicators as a measure of data quality, applying\nthem to evaluate unseen datasets. During experimentation, we discover that\ndouble descent phenomenon exists in large language model finetuning. Based on\nthis observation, we further leverage BlendSearch to help find the best subset\namong the entire dataset (i.e., 2,532 out of 100,000). Experiment results show\nthat InstructMining-7B achieves state-of-the-art performance on two of the most\npopular benchmarks: LLM-as-a-judge and Huggingface OpenLLM leaderboard.\n","authors":["Yihan Cao","Yanbin Kang","Chi Wang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2307.06290v3.pdf","comment":"24 pages, 7 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.18914v1","updated":"2024-07-26T17:59:56Z","published":"2024-07-26T17:59:56Z","title":"Floating No More: Object-Ground Reconstruction from a Single Image","summary":" Recent advancements in 3D object reconstruction from single images have\nprimarily focused on improving the accuracy of object shapes. Yet, these\ntechniques often fail to accurately capture the inter-relation between the\nobject, ground, and camera. As a result, the reconstructed objects often appear\nfloating or tilted when placed on flat surfaces. This limitation significantly\naffects 3D-aware image editing applications like shadow rendering and object\npose manipulation. To address this issue, we introduce ORG (Object\nReconstruction with Ground), a novel task aimed at reconstructing 3D object\ngeometry in conjunction with the ground surface. Our method uses two compact\npixel-level representations to depict the relationship between camera, object,\nand ground. 
Experiments show that the proposed ORG model can effectively\nreconstruct object-ground geometry on unseen data, significantly enhancing the\nquality of shadow generation and pose manipulation compared to conventional\nsingle-image 3D reconstruction techniques.\n","authors":["Yunze Man","Yichen Sheng","Jianming Zhang","Liang-Yan Gui","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18914v1.pdf","comment":"Project Page: https://yunzeman.github.io/ORG/"},{"id":"http://arxiv.org/abs/2407.18911v1","updated":"2024-07-26T17:59:52Z","published":"2024-07-26T17:59:52Z","title":"HRP: Human Affordances for Robotic Pre-Training","summary":" In order to *generalize* to various tasks in the wild, robotic agents will\nneed a suitable representation (i.e., vision network) that enables the robot to\npredict optimal actions given high dimensional vision inputs. However, learning\nsuch a representation requires an extreme amount of diverse training data,\nwhich is prohibitively expensive to collect on a real robot. How can we\novercome this problem? Instead of collecting more robot data, this paper\nproposes using internet-scale, human videos to extract \"affordances,\" both at\nthe environment and agent level, and distill them into a pre-trained\nrepresentation. We present a simple framework for pre-training representations\non hand, object, and contact \"affordance labels\" that highlight relevant\nobjects in images and how to interact with them. These affordances are\nautomatically extracted from human video data (with the help of off-the-shelf\ncomputer vision modules) and used to fine-tune existing representations. Our\napproach can efficiently fine-tune *any* existing representation, and results\nin models with stronger downstream robotic performance across the board. We\nexperimentally demonstrate (using 3000+ robot trials) that this affordance\npre-training scheme boosts performance by a minimum of 15% on 5 real-world\ntasks, which consider three diverse robot morphologies (including a dexterous\nhand). Unlike prior works in the space, these representations improve\nperformance across 3 different camera views. Quantitatively, we find that our\napproach leads to higher levels of generalization in out-of-distribution\nsettings. For code, weights, and data check: https://hrp-robot.github.io\n","authors":["Mohan Kumar Srirama","Sudeep Dasari","Shikhar Bahl","Abhinav Gupta"],"pdf_url":"https://arxiv.org/pdf/2407.18911v1.pdf","comment":"Accepted to Robotics Science and Systems 2024"},{"id":"http://arxiv.org/abs/2311.16917v2","updated":"2024-07-26T17:59:14Z","published":"2023-11-28T16:20:33Z","title":"UGG: Unified Generative Grasping","summary":" Dexterous grasping aims to produce diverse grasping postures with a high\ngrasping success rate. Regression-based methods that directly predict grasping\nparameters given the object may achieve a high success rate but often lack\ndiversity. Generation-based methods that generate grasping postures conditioned\non the object can often produce diverse grasping, but they are insufficient for\nhigh grasping success due to lack of discriminative information. To mitigate,\nwe introduce a unified diffusion-based dexterous grasp generation model, dubbed\nthe name UGG, which operates within the object point cloud and hand parameter\nspaces. Our all-transformer architecture unifies the information from the\nobject, the hand, and the contacts, introducing a novel representation of\ncontact points for improved contact modeling. 
The flexibility and quality of\nour model enable the integration of a lightweight discriminator, benefiting\nfrom simulated discriminative data, which pushes for a high success rate while\npreserving high diversity. Beyond grasp generation, our model can also generate\nobjects based on hand information, offering valuable insights into object\ndesign and studying how the generative model perceives objects. Our model\nachieves state-of-the-art dexterous grasping on the large-scale DexGraspNet\ndataset while facilitating human-centric object design, marking a significant\nadvancement in dexterous grasping research. Our project page is\nhttps://jiaxin-lu.github.io/ugg/.\n","authors":["Jiaxin Lu","Hao Kang","Haoxiang Li","Bo Liu","Yiding Yang","Qixing Huang","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2311.16917v2.pdf","comment":"17 pages, 14 figures, ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18908v1","updated":"2024-07-26T17:59:09Z","published":"2024-07-26T17:59:09Z","title":"Wolf: Captioning Everything with a World Summarization Framework","summary":" We propose Wolf, a WOrLd summarization Framework for accurate video\ncaptioning. Wolf is an automated captioning framework that adopts a\nmixture-of-experts approach, leveraging complementary strengths of Vision\nLanguage Models (VLMs). By utilizing both image and video models, our framework\ncaptures different levels of information and summarizes them efficiently. Our\napproach can be applied to enhance video understanding, auto-labeling, and\ncaptioning. To evaluate caption quality, we introduce CapScore, an LLM-based\nmetric to assess the similarity and quality of generated captions compared to\nthe ground truth captions. We further build four human-annotated datasets in\nthree domains: autonomous driving, general scenes, and robotics, to facilitate\ncomprehensive comparisons. We show that Wolf achieves superior captioning\nperformance compared to state-of-the-art approaches from the research community\n(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For\ninstance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise\nby 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally,\nwe establish a benchmark for video captioning and introduce a leaderboard,\naiming to accelerate advancements in video understanding, captioning, and data\nalignment. Leaderboard: https://wolfv0.github.io/leaderboard.html.\n","authors":["Boyi Li","Ligeng Zhu","Ran Tian","Shuhan Tan","Yuxiao Chen","Yao Lu","Yin Cui","Sushant Veer","Max Ehrlich","Jonah Philion","Xinshuo Weng","Fuzhao Xue","Andrew Tao","Ming-Yu Liu","Sanja Fidler","Boris Ivanovic","Trevor Darrell","Jitendra Malik","Song Han","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2407.18908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18907v1","updated":"2024-07-26T17:58:59Z","published":"2024-07-26T17:58:59Z","title":"SHIC: Shape-Image Correspondences with no Keypoint Supervision","summary":" Canonical surface mapping generalizes keypoint detection by assigning each\npixel of an object to a corresponding point in a 3D template. Popularised by\nDensePose for the analysis of humans, authors have since attempted to apply the\nconcept to more categories, but with limited success due to the high cost of\nmanual supervision. In this work, we introduce SHIC, a method to learn\ncanonical maps without manual supervision which achieves better results than\nsupervised methods for most categories. 
Our idea is to leverage foundation\ncomputer vision models such as DINO and Stable Diffusion that are open-ended\nand thus possess excellent priors over natural categories. SHIC reduces the\nproblem of estimating image-to-template correspondences to predicting\nimage-to-image correspondences using features from the foundation models. The\nreduction works by matching images of the object to non-photorealistic renders\nof the template, which emulates the process of collecting manual annotations\nfor this task. These correspondences are then used to supervise high-quality\ncanonical maps for any object of interest. We also show that image generators\ncan further improve the realism of the template views, which provide an\nadditional source of supervision for the model.\n","authors":["Aleksandar Shtedritski","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2407.18907v1.pdf","comment":"ECCV 2024. Project website\n https://www.robots.ox.ac.uk/~vgg/research/shic/"},{"id":"http://arxiv.org/abs/2407.18906v1","updated":"2024-07-26T17:58:57Z","published":"2024-07-26T17:58:57Z","title":"A Scalable Quantum Non-local Neural Network for Image Classification","summary":" Non-local operations play a crucial role in computer vision enabling the\ncapture of long-range dependencies through weighted sums of features across the\ninput, surpassing the constraints of traditional convolution operations that\nfocus solely on local neighborhoods. Non-local operations typically require\ncomputing pairwise relationships between all elements in a set, leading to\nquadratic complexity in terms of time and memory. Due to the high computational\nand memory demands, scaling non-local neural networks to large-scale problems\ncan be challenging. This article introduces a hybrid quantum-classical scalable\nnon-local neural network, referred to as Quantum Non-Local Neural Network\n(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on\ninherent quantum parallelism to allow the simultaneous processing of a large\nnumber of input features enabling more efficient computations in\nquantum-enhanced feature space and involving pairwise relationships through\nquantum entanglement. We benchmark our proposed QNL-Net with other quantum\ncounterparts to binary classification with datasets MNIST and CIFAR-10. The\nsimulation findings showcase our QNL-Net achieves cutting-edge accuracy levels\nin binary image classification among quantum classifiers while utilizing fewer\nqubits.\n","authors":["Sparsh Gupta","Debanjan Konar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2407.18906v1.pdf","comment":"draft, 13 pages (including references and appendix), 5 figures"},{"id":"http://arxiv.org/abs/2407.18899v1","updated":"2024-07-26T17:51:58Z","published":"2024-07-26T17:51:58Z","title":"Learn from the Learnt: Source-Free Active Domain Adaptation via\n Contrastive Sampling and Visual Persistence","summary":" Domain Adaptation (DA) facilitates knowledge transfer from a source domain to\na related target domain. This paper investigates a practical DA paradigm,\nnamely Source data-Free Active Domain Adaptation (SFADA), where source data\nbecomes inaccessible during adaptation, and a minimum amount of annotation\nbudget is available in the target domain. 
Without referencing the source data,\nnew challenges emerge in identifying the most informative target samples for\nlabeling, establishing cross-domain alignment during adaptation, and ensuring\ncontinuous performance improvements through the iterative query-and-adaptation\nprocess. In response, we present learn from the learnt (LFTL), a novel paradigm\nfor SFADA to leverage the learnt knowledge from the source pretrained model and\nactively iterated models without extra overhead. We propose Contrastive Active\nSampling to learn from the hypotheses of the preceding model, thereby querying\ntarget samples that are both informative to the current model and persistently\nchallenging throughout active learning. During adaptation, we learn from\nfeatures of actively selected anchors obtained from previous intermediate\nmodels, so that the Visual Persistence-guided Adaptation can facilitate feature\ndistribution alignment and active sample exploitation. Extensive experiments on\nthree widely-used benchmarks show that our LFTL achieves state-of-the-art\nperformance, superior computational efficiency and continuous improvements as\nthe annotation budget increases. Our code is available at\nhttps://github.com/lyumengyao/lftl.\n","authors":["Mengyao Lyu","Tianxiang Hao","Xinhao Xu","Hui Chen","Zijia Lin","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2407.18899v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18854v1","updated":"2024-07-26T16:30:18Z","published":"2024-07-26T16:30:18Z","title":"Unifying Visual and Semantic Feature Spaces with Diffusion Models for\n Enhanced Cross-Modal Alignment","summary":" Image classification models often demonstrate unstable performance in\nreal-world applications due to variations in image information, driven by\ndiffering visual perspectives of subject objects and lighting discrepancies. To\nmitigate these challenges, existing studies commonly incorporate additional\nmodal information matching the visual data to regularize the model's learning\nprocess, enabling the extraction of high-quality visual features from complex\nimage regions. Specifically, in the realm of multimodal learning, cross-modal\nalignment is recognized as an effective strategy, harmonizing different modal\ninformation by learning a domain-consistent latent feature space for visual and\nsemantic features. However, this approach may face limitations due to the\nheterogeneity between multimodal information, such as differences in feature\ndistribution and structure. To address this issue, we introduce a Multimodal\nAlignment and Reconstruction Network (MARNet), designed to enhance the model's\nresistance to visual noise. Importantly, MARNet includes a cross-modal\ndiffusion reconstruction module for smoothly and stably blending information\nacross different domains. Experiments conducted on two benchmark datasets,\nVireo-Food172 and Ingredient-101, demonstrate that MARNet effectively improves\nthe quality of image information extracted by the model. 
It is a plug-and-play\nframework that can be rapidly integrated into various image classification\nframeworks, boosting model performance.\n","authors":["Yuze Zheng","Zixuan Li","Xiangxian Li","Jinxing Liu","Yuqing Wang","Xiangxu Meng","Lei Meng"],"pdf_url":"https://arxiv.org/pdf/2407.18854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16126v2","updated":"2024-07-26T16:20:18Z","published":"2024-07-23T02:21:11Z","title":"MxT: Mamba x Transformer for Image Inpainting","summary":" Image inpainting, or image completion, is a crucial task in computer vision\nthat aims to restore missing or damaged regions of images with semantically\ncoherent content. This technique requires a precise balance of local texture\nreplication and global contextual understanding to ensure the restored image\nintegrates seamlessly with its surroundings. Traditional methods using\nConvolutional Neural Networks (CNNs) are effective at capturing local patterns\nbut often struggle with broader contextual relationships due to the limited\nreceptive fields. Recent advancements have incorporated transformers,\nleveraging their ability to understand global interactions. However, these\nmethods face computational inefficiencies and struggle to maintain fine-grained\ndetails. To overcome these challenges, we introduce MxT composed of the\nproposed Hybrid Module (HM), which combines Mamba with the transformer in a\nsynergistic manner. Mamba is adept at efficiently processing long sequences\nwith linear computational costs, making it an ideal complement to the\ntransformer for handling long-scale data interactions. Our HM facilitates\ndual-level interaction learning at both pixel and patch levels, greatly\nenhancing the model to reconstruct images with high quality and contextual\naccuracy. We evaluate MxT on the widely-used CelebA-HQ and Places2-standard\ndatasets, where it consistently outperformed existing state-of-the-art methods.\n","authors":["Shuang Chen","Amir Atapour-Abarghouei","Haozheng Zhang","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2407.16126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18839v1","updated":"2024-07-26T16:02:37Z","published":"2024-07-26T16:02:37Z","title":"Scalable Group Choreography via Variational Phase Manifold Learning","summary":" Generating group dance motion from the music is a challenging task with\nseveral industrial applications. Although several methods have been proposed to\ntackle this problem, most of them prioritize optimizing the fidelity in dancing\nmovement, constrained by predetermined dancer counts in datasets. This\nlimitation impedes adaptability to real-world applications. Our study addresses\nthe scalability problem in group choreography while preserving naturalness and\nsynchronization. In particular, we propose a phase-based variational generative\nmodel for group dance generation on learning a generative manifold. Our method\nachieves high-fidelity group dance motion and enables the generation with an\nunlimited number of dancers while consuming only a minimal and constant amount\nof memory. The intensive experiments on two public datasets show that our\nproposed method outperforms recent state-of-the-art approaches by a large\nmargin and is scalable to a great number of dancers beyond the training data.\n","authors":["Nhat Le","Khoa Do","Xuan Bui","Tuong Do","Erman Tjiputra","Quang D. 
Tran","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.18839v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2401.10805v3","updated":"2024-07-26T16:00:07Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" We introduce the novel concept of visually Connecting Actions and Their\nEffects (CATE) in video understanding. CATE can have applications in areas like\ntask planning and learning from demonstration. We identify and explore two\ndifferent aspects of the concept of CATE: Action Selection (AS) and\nEffect-Affinity Assessment (EAA), where video understanding models connect\nactions and effects at semantic and fine-grained levels, respectively. We\ndesign various baseline models for AS and EAA. Despite the intuitive nature of\nthe task, we observe that models struggle, and humans outperform them by a\nlarge margin. Our experiments show that in solving AS and EAA, models learn\nintuitive properties like object tracking and pose encoding without explicit\nsupervision. We demonstrate that CATE can be an effective self-supervised task\nfor learning video representations from unlabeled videos. The study aims to\nshowcase the fundamental nature and versatility of CATE, with the hope of\ninspiring advanced formulations and models.\n","authors":["Paritosh Parmar","Eric Peh","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16977v2","updated":"2024-07-26T15:52:46Z","published":"2024-07-24T03:45:35Z","title":"Selective Vision-Language Subspace Projection for Few-shot CLIP","summary":" Vision-language models such as CLIP are capable of mapping the different\nmodality data into a unified feature space, enabling zero/few-shot inference by\nmeasuring the similarity of given images and texts. However, most existing\nmethods overlook modality gaps in CLIP's encoded features, which is shown as\nthe text and image features lie far apart from each other, resulting in limited\nclassification performance. To tackle this issue, we introduce a method called\nSelective Vision-Language Subspace Projection (SSP), which incorporates local\nimage features and utilizes them as a bridge to enhance the alignment between\nimage-text pairs. Specifically, our SSP framework comprises two parallel\nmodules: a vision projector and a language projector. Both projectors utilize\nlocal image features to span the respective subspaces for image and texts,\nthereby projecting the image and text features into their respective subspaces\nto achieve alignment. Moreover, our approach entails only training-free matrix\ncalculations and can be seamlessly integrated into advanced CLIP-based few-shot\nlearning frameworks. Extensive experiments on 11 datasets have demonstrated\nSSP's superior text-image alignment capabilities, outperforming the\nstate-of-the-art alignment methods. The code is available at\nhttps://github.com/zhuhsingyuu/SSP\n","authors":["Xingyu Zhu","Beier Zhu","Yi Tan","Shuo Wang","Yanbin Hao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16977v2.pdf","comment":"Accepted as an Oral Paper at ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2402.00019v2","updated":"2024-07-26T15:39:03Z","published":"2024-01-01T13:03:35Z","title":"Diffusion MRI with Machine Learning","summary":" Diffusion-weighted magnetic resonance imaging (dMRI) offers unique\ncapabilities including noninvasive probing of brain's tissue microstructure and\nstructural connectivity. 
It is widely used for clinical assessment of brain\npathologies and for neuroscience research. Analyzing the dMRI data to extract\nuseful information for medical and scientific purposes can be challenging. The\ndMRI measurements often suffer from strong noise and artifacts, there is\nusually high inter-session and inter-scanner variability in the data, and\nconsiderable inter-subject heterogeneity in brain structure. Moreover, the\nrelationship between measurements and the phenomena of interest can be highly\ncomplex. Recent years have witnessed increasing use of machine learning methods\nfor dMRI analysis. This manuscript aims to assess these efforts, with a focus\non methods that have addressed data preprocessing and harmonization,\nmicrostructure mapping, tractography, and white matter tract analysis. We study\nthe main findings, strengths, and weaknesses of the existing methods and\nsuggest topics for future research. We find that machine learning may be\nexceptionally suited to tackle some of the difficult tasks in dMRI analysis.\nHowever, for this to happen, several shortcomings of existing methods and\ncritical unresolved issues need to be addressed. These include deficient\nevaluation practices, lack of rich training datasets and validation benchmarks,\nas well as model generalizability, reliability, and explainability concerns.\n","authors":["Davood Karimi"],"pdf_url":"https://arxiv.org/pdf/2402.00019v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18821v1","updated":"2024-07-26T15:31:13Z","published":"2024-07-26T15:31:13Z","title":"Deep Companion Learning: Enhancing Generalization Through Historical\n Consistency","summary":" We propose Deep Companion Learning (DCL), a novel training method for Deep\nNeural Networks (DNNs) that enhances generalization by penalizing inconsistent\nmodel predictions compared to its historical performance. To achieve this, we\ntrain a deep-companion model (DCM), by using previous versions of the model to\nprovide forecasts on new inputs. This companion model deciphers a meaningful\nlatent semantic structure within the data, thereby providing targeted\nsupervision that encourages the primary model to address the scenarios it finds\nmost challenging. We validate our approach through both theoretical analysis\nand extensive experimentation, including ablation studies, on a variety of\nbenchmark datasets (CIFAR-100, Tiny-ImageNet, ImageNet-1K) using diverse\narchitectural models (ShuffleNetV2, ResNet, Vision Transformer, etc.),\ndemonstrating state-of-the-art performance.\n","authors":["Ruizhao Zhu","Venkatesh Saligrama"],"pdf_url":"https://arxiv.org/pdf/2407.18821v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2208.02529v3","updated":"2024-07-26T15:07:08Z","published":"2022-08-04T08:53:15Z","title":"Metadata-enhanced contrastive learning from retinal optical coherence\n tomography images","summary":" Deep learning has potential to automate screening, monitoring and grading of\ndisease in medical images. Pretraining with contrastive learning enables models\nto extract robust and generalisable features from natural image datasets,\nfacilitating label-efficient downstream image analysis. However, the direct\napplication of conventional contrastive methods to medical datasets introduces\ntwo domain-specific issues. Firstly, several image transformations which have\nbeen shown to be crucial for effective contrastive learning do not translate\nfrom the natural image to the medical image domain. 
Secondly, the assumption\nmade by conventional methods, that any two images are dissimilar, is\nsystematically misleading in medical datasets depicting the same anatomy and\ndisease. This is exacerbated in longitudinal image datasets that repeatedly\nimage the same patient cohort to monitor their disease progression over time.\nIn this paper we tackle these issues by extending conventional contrastive\nframeworks with a novel metadata-enhanced strategy. Our approach employs widely\navailable patient metadata to approximate the true set of inter-image\ncontrastive relationships. To this end we employ records for patient identity,\neye position (i.e. left or right) and time series information. In experiments\nusing two large longitudinal datasets containing 170,427 retinal OCT images of\n7,912 patients with age-related macular degeneration (AMD), we evaluate the\nutility of using metadata to incorporate the temporal dynamics of disease\nprogression into pretraining. Our metadata-enhanced approach outperforms both\nstandard contrastive methods and a retinal image foundation model in five out\nof six image-level downstream tasks related to AMD. Due to its modularity, our\nmethod can be quickly and cost-effectively tested to establish the potential\nbenefits of including available metadata in contrastive pretraining.\n","authors":["Robbie Holland","Oliver Leingang","Hrvoje Bogunović","Sophie Riedl","Lars Fritsche","Toby Prevost","Hendrik P. N. Scholl","Ursula Schmidt-Erfurth","Sobha Sivaprasad","Andrew J. Lotery","Daniel Rueckert","Martin J. Menten"],"pdf_url":"https://arxiv.org/pdf/2208.02529v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18792v1","updated":"2024-07-26T14:54:16Z","published":"2024-07-26T14:54:16Z","title":"Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical\n Imaging","summary":" Medical imaging cohorts are often confounded by factors such as acquisition\ndevices, hospital sites, patient backgrounds, and many more. As a result, deep\nlearning models tend to learn spurious correlations instead of causally related\nfeatures, limiting their generalizability to new and unseen data. This problem\ncan be addressed by minimizing dependence measures between intermediate\nrepresentations of task-related and non-task-related variables. These measures\ninclude mutual information, distance correlation, and the performance of\nadversarial classifiers. Here, we benchmark such dependence measures for the\ntask of preventing shortcut learning. We study a simplified setting using\nMorpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our\nresults provide insights into how to mitigate confounding factors in medical\nimaging.\n","authors":["Sarah Müller","Louisa Fay","Lisa M. Koch","Sergios Gatidis","Thomas Küstner","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2407.18792v1.pdf","comment":"Accepted to the 15th International Workshop on Machine Learning in\n Medical Imaging (MLMI 2024)"},{"id":"http://arxiv.org/abs/2402.17372v2","updated":"2024-07-26T14:48:04Z","published":"2024-02-27T10:10:12Z","title":"Coupled Laplacian Eigenmaps for Locally-Aware 3D Rigid Point Cloud\n Matching","summary":" Point cloud matching, a crucial technique in computer vision, medical and\nrobotics fields, is primarily concerned with finding correspondences between\npairs of point clouds or voxels. 
In some practical scenarios, emphasizing local\ndifferences is crucial for accurately identifying a correct match, thereby\nenhancing the overall robustness and reliability of the matching process.\nCommonly used shape descriptors have several limitations and often fail to\nprovide meaningful local insights about the paired geometries. In this work, we\npropose a new technique, based on graph Laplacian eigenmaps, to match point\nclouds by taking into account fine local structures. To deal with the order and\nsign ambiguity of Laplacian eigenmaps, we introduce a new operator, called\nCoupled Laplacian (https://github.com/matteo-bastico/CoupLap), that allows us to\neasily generate aligned eigenspaces for multiple registered geometries. We show\nthat the similarity between those aligned high-dimensional spaces provides a\nlocally meaningful score to match shapes. We first evaluate the performance\nof the proposed technique in a point-wise manner, focusing on the task of\nobject anomaly localization on the MVTec 3D-AD dataset. Additionally, we define\na new medical task, called automatic Bone Side Estimation (BSE), which we\naddress through a global similarity score derived from coupled eigenspaces. In\norder to test it, we propose a benchmark collecting bone surface structures\nfrom various public datasets. Our matching technique, based on Coupled\nLaplacian, outperforms other methods by reaching an impressive accuracy on both\ntasks.\n","authors":["Matteo Bastico","Etienne Decencière","Laurent Corté","Yannick Tillier","David Ryckelynck"],"pdf_url":"https://arxiv.org/pdf/2402.17372v2.pdf","comment":"This paper has been accepted at Computer Vision and Pattern\n Recognition (CVPR) 2024"},{"id":"http://arxiv.org/abs/2405.06468v2","updated":"2024-07-26T14:18:48Z","published":"2024-05-10T13:27:32Z","title":"Pseudo-Prompt Generating in Pre-trained Vision-Language Models for\n Multi-Label Medical Image Classification","summary":" The task of medical image recognition is notably complicated by the presence\nof varied and multiple pathological indications, presenting a unique challenge\nin multi-label classification with unseen labels. This complexity underlines\nthe need for computer-aided diagnosis methods employing multi-label zero-shot\nlearning. Recent advancements in pre-trained vision-language models (VLMs) have\nshowcased notable zero-shot classification abilities on medical images.\nHowever, these methods have limitations in leveraging extensive pre-trained\nknowledge from broader image datasets, and often depend on manual prompt\nconstruction by expert radiologists. By automating the process of prompt\ntuning, prompt learning techniques have emerged as an efficient way to adapt\nVLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in\nperforming class-specific prompts on unseen categories, limiting\ngeneralizability in fine-grained scenarios. To overcome these constraints, we\nintroduce a novel prompt generation approach inspired by text generation in\nnatural language processing (NLP). Our method, named Pseudo-Prompt Generating\n(PsPG), capitalizes on the prior knowledge of multi-modal features. Featuring\nan RNN-based decoder, PsPG autoregressively generates class-tailored embedding\nvectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label\nchest radiograph datasets affirm the superiority of our approach against\nleading medical vision-language and multi-label prompt learning methods. 
The\nsource code is available at https://github.com/fallingnight/PsPG\n","authors":["Yaoqin Ye","Junjie Zhang","Hongwei Shi"],"pdf_url":"https://arxiv.org/pdf/2405.06468v2.pdf","comment":"Accepted by PRCV 2024"},{"id":"http://arxiv.org/abs/2407.17671v2","updated":"2024-07-26T14:09:08Z","published":"2024-07-24T23:23:38Z","title":"Unsqueeze [CLS] Bottleneck to Learn Rich Representations","summary":" Distillation-based self-supervised learning typically leads to more\ncompressed representations due to its radical clustering process and the\nimplementation of a sharper target distribution. To overcome this limitation\nand preserve more information from input, we introduce UDI, conceptualized as\nUnsqueezed Distillation-based self-supervised learning (SSL). UDI enriches the\nlearned representation by encouraging multimodal prediction distilled from a\nconsolidated profile of local predictions that are derived via stratified\nsampling. Our evaluations show that UDI not only promotes semantically\nmeaningful representations at instance level, delivering superior or\ncompetitive results to state-of-the-art SSL methods in image classification,\nbut also effectively preserves the nuisance of input, which yields significant\nimprovement in dense prediction tasks, including object detection and\nsegmentation. Additionally, UDI performs competitively in low-shot image\nclassification, improving the scalability of joint-embedding pipelines. Various\nvisualizations and ablation studies are presented to further elucidate the\nmechanisms behind UDI. Our source code is available at\nhttps://github.com/ISL-CV/udi.\n","authors":["Qing Su","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2407.17671v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.15810v2","updated":"2024-07-26T13:57:32Z","published":"2024-07-22T17:22:04Z","title":"Breaking the Global North Stereotype: A Global South-centric Benchmark\n Dataset for Auditing and Mitigating Biases in Facial Recognition Systems","summary":" Facial Recognition Systems (FRSs) are being developed and deployed globally\nat unprecedented rates. Most platforms are designed in a limited set of\ncountries but deployed worldwide, without adequate checkpoints. This is\nespecially problematic for Global South countries which lack strong legislation\nto safeguard persons facing disparate performance of these systems. A\ncombination of unavailability of datasets, lack of understanding of FRS\nfunctionality and low-resource bias mitigation measures accentuate the problem.\nIn this work, we propose a new face dataset composed of 6,579 unique male and\nfemale sportspersons from eight countries around the world. More than 50% of\nthe dataset comprises individuals from the Global South countries and is\ndemographically diverse. To aid adversarial audits and robust model training,\neach image has four adversarial variants, totaling over 40,000 images. We also\nbenchmark five popular FRSs, both commercial and open-source, for the task of\ngender prediction (and country prediction for one of the open-source models as\nan example of red-teaming). Experiments on industrial FRSs reveal accuracies\nranging from 98.2%--38.1%, with a large disparity between males and females in\nthe Global South (max difference of 38.5%). Biases are also observed in all\nFRSs between females of the Global North and South (max difference of ~50%).\nGrad-CAM analysis identifies the nose, forehead and mouth as the regions of\ninterest on one of the open-source FRSs. 
Utilizing this insight, we design\nsimple, low-resource bias mitigation solutions using few-shot and novel\ncontrastive learning techniques significantly improving the accuracy with\ndisparity between males and females reducing from 50% to 1.5% in one of the\nsettings. In the red-teaming experiment with the open-source Deepface model,\ncontrastive learning proves more effective than simple fine-tuning.\n","authors":["Siddharth D Jaiswal","Animesh Ganai","Abhisek Dash","Saptarshi Ghosh","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2407.15810v2.pdf","comment":"This work has been accepted for publication at AAAI/ACM AIES 2024"},{"id":"http://arxiv.org/abs/2402.15374v3","updated":"2024-07-26T13:47:11Z","published":"2024-02-23T15:19:37Z","title":"Outlier detection by ensembling uncertainty with negative objectness","summary":" Outlier detection is an essential capability in safety-critical applications\nof supervised visual recognition. Most of the existing methods deliver best\nresults by encouraging standard closed-set models to produce low-confidence\npredictions in negative training data. However, that approach conflates\nprediction uncertainty with recognition of the negative class. We therefore\nreconsider direct prediction of K+1 logits that correspond to K groundtruth\nclasses and one outlier class. This setup allows us to formulate a novel\nanomaly score as an ensemble of in-distribution uncertainty and the posterior\nof the outlier class which we term negative objectness. Now outliers can be\nindependently detected due to i) high prediction uncertainty or ii) similarity\nwith negative data. We embed our method into a dense prediction architecture\nwith mask-level recognition over K+2 classes. The training procedure encourages\nthe novel K+2-th class to learn negative objectness at pasted negative\ninstances. Our models outperform the current state-of-the art on standard\nbenchmarks for image-wide and pixel-level outlier detection with and without\ntraining on real negative data.\n","authors":["Anja Delić","Matej Grcić","Siniša Šegvić"],"pdf_url":"https://arxiv.org/pdf/2402.15374v3.pdf","comment":"Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2304.02488v2","updated":"2024-07-26T13:31:21Z","published":"2023-04-05T15:02:30Z","title":"SCB-dataset: A Dataset for Detecting Student Classroom Behavior","summary":" The use of deep learning methods for automatic detection of students'\nclassroom behavior is a promising approach to analyze their class performance\nand enhance teaching effectiveness. However, the lack of publicly available\ndatasets on student behavior poses a challenge for researchers in this field.\nTo address this issue, we propose a Student Classroom Behavior dataset\n(SCB-dataset) that reflects real-life scenarios. Our dataset includes 11,248\nlabels and 4,003 images, with a focus on hand-raising behavior. We evaluated\nthe dataset using the YOLOv7 algorithm, achieving a mean average precision\n(map) of up to 85.3%. 
We believe that our dataset can serve as a robust\nfoundation for future research in the field of student behavior detection and\npromote further advancements in this area. Our SCB-dataset can be downloaded\nfrom: https://github.com/Whiffe/SCB-dataset\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2304.02488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11587v2","updated":"2024-07-26T13:16:28Z","published":"2023-12-18T14:30:13Z","title":"Relightable Neural Actor with Intrinsic Decomposition and Pose Control","summary":" Creating a controllable and relightable digital avatar from multi-view video\nwith fixed illumination is a very challenging problem since humans are highly\narticulated, creating pose-dependent appearance effects, and skin as well as\nclothing require space-varying BRDF modeling. Existing works on creating\nanimatable avatars either do not focus on relighting at all, require controlled\nillumination setups, or try to recover a relightable avatar from very low cost\nsetups, i.e. a single RGB video, at the cost of severely limited result\nquality, e.g. shadows not even being modeled. To address this, we propose\nRelightable Neural Actor, a new video-based method for learning a pose-driven\nneural human model that can be relighted, allows appearance editing, and models\npose-dependent effects such as wrinkles and self-shadows. Importantly, for\ntraining, our method solely requires a multi-view recording of the human under\na known, but static lighting condition. To tackle this challenging problem, we\nleverage an implicit geometry representation of the actor with a drivable\ndensity field that models pose-dependent deformations and derive a dynamic\nmapping between 3D and UV spaces, where normal, visibility, and materials are\neffectively encoded. To evaluate our approach in real-world scenarios, we\ncollect a new dataset with four identities recorded under different light\nconditions, indoors and outdoors, providing the first benchmark of its kind for\nhuman relighting, and demonstrating state-of-the-art relighting results for\nnovel human poses.\n","authors":["Diogo Luvizon","Vladislav Golyanik","Adam Kortylewski","Marc Habermann","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2312.11587v2.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://vcai.mpi-inf.mpg.de/projects/RNA/"},{"id":"http://arxiv.org/abs/2407.12322v2","updated":"2024-07-26T13:04:21Z","published":"2024-07-17T05:47:27Z","title":"Frequency Guidance Matters: Skeletal Action Recognition by\n Frequency-Aware Mixed Transformer","summary":" Recently, transformers have demonstrated great potential for modeling\nlong-term dependencies from skeleton sequences and thereby gained\never-increasing attention in skeleton action recognition. However, the existing\ntransformer-based approaches heavily rely on the naive attention mechanism for\ncapturing the spatiotemporal features, which falls short in learning\ndiscriminative representations that exhibit similar motion patterns. To address\nthis challenge, we introduce the Frequency-aware Mixed Transformer\n(FreqMixFormer), specifically designed for recognizing similar skeletal actions\nwith subtle discriminative motions. First, we introduce a frequency-aware\nattention module to unweave skeleton frequency representations by embedding\njoint features into frequency attention maps, aiming to distinguish the\ndiscriminative movements based on their frequency coefficients. 
Subsequently,\nwe develop a mixed transformer architecture to incorporate spatial features\nwith frequency features to model the comprehensive frequency-spatial patterns.\nAdditionally, a temporal transformer is proposed to extract the global\ncorrelations across frames. Extensive experiments show that FreqMiXFormer\noutperforms SOTA on 3 popular skeleton action recognition datasets, including\nNTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets.\n","authors":["Wenhan Wu","Ce Zheng","Zihao Yang","Chen Chen","Srijan Das","Aidong Lu"],"pdf_url":"https://arxiv.org/pdf/2407.12322v2.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.18715v1","updated":"2024-07-26T13:02:48Z","published":"2024-07-26T13:02:48Z","title":"BCTR: Bidirectional Conditioning Transformer for Scene Graph Generation","summary":" Scene Graph Generation (SGG) remains a challenging task due to its\ncompositional property. Previous approaches improve prediction efficiency by\nlearning in an end-to-end manner. However, these methods exhibit limited\nperformance as they assume unidirectional conditioning between entities and\npredicates, leading to insufficient information interaction. To address this\nlimitation, we propose a novel bidirectional conditioning factorization for\nSGG, introducing efficient interaction between entities and predicates.\nSpecifically, we develop an end-to-end scene graph generation model,\nBidirectional Conditioning Transformer (BCTR), to implement our factorization.\nBCTR consists of two key modules. First, the Bidirectional Conditioning\nGenerator (BCG) facilitates multi-stage interactive feature augmentation\nbetween entities and predicates, enabling mutual benefits between the two\npredictions. Second, Random Feature Alignment (RFA) regularizes the feature\nspace by distilling multi-modal knowledge from pre-trained models, enhancing\nBCTR's ability on tailed categories without relying on statistical priors. We\nconduct a series of experiments on Visual Genome and Open Image V6,\ndemonstrating that BCTR achieves state-of-the-art performance on both\nbenchmarks. The code will be available upon acceptance of the paper.\n","authors":["Peng Hao","Xiaobing Wang","Yingying Jiang","Hanchao Jia","Xiaoshuai Hao"],"pdf_url":"https://arxiv.org/pdf/2407.18715v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.09982v4","updated":"2024-07-26T12:52:19Z","published":"2023-10-15T23:20:54Z","title":"AEP$n$P: A Less-constrained EP$n$P Solver for Pose Estimation with\n Anisotropic Scaling","summary":" Perspective-$n$-Point (P$n$P) stands as a fundamental algorithm for pose\nestimation in various applications. In this paper, we present a new approach to\nthe P$n$P problem with relaxed constraints, eliminating the need for precise 3D\ncoordinates, which is especially suitable for object pose estimation where\ncorresponding object models may not be available in practice. Built upon the\nclassical EP$n$P solver, we refer to it as AEP$n$P due to its ability to handle\nunknown anisotropic scaling factors in addition to the common 6D\ntransformation. Through a few algebraic manipulations and a well-chosen frame\nof reference, this new problem can be boiled down to a simple linear null-space\nproblem followed by point registration-based identification of a similarity\ntransformation. Experimental results on both simulated and real datasets\ndemonstrate the effectiveness of AEP$n$P as a flexible and practical solution\nto object pose estimation. 
Code: https://github.com/goldoak/AEPnP.\n","authors":["Jiaxin Wei","Stefan Leutenegger","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2310.09982v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18695v1","updated":"2024-07-26T12:18:29Z","published":"2024-07-26T12:18:29Z","title":"PIV3CAMS: a multi-camera dataset for multiple computer vision problems\n and its application to novel view-point synthesis","summary":" The modern approaches for computer vision tasks significantly rely on machine\nlearning, which requires a large number of quality images. While there is a\nplethora of image datasets with a single type of images, there is a lack of\ndatasets collected from multiple cameras. In this thesis, we introduce Paired\nImage and Video data from three CAMeraS, namely PIV3CAMS, aimed at multiple\ncomputer vision tasks. The PIV3CAMS dataset consists of 8385 pairs of images\nand 82 pairs of videos taken from three different cameras: Canon D5 Mark IV,\nHuawei P20, and ZED stereo camera. The dataset includes various indoor and\noutdoor scenes from different locations in Zurich (Switzerland) and Cheonan\n(South Korea). Some of the computer vision applications that can benefit from\nthe PIV3CAMS dataset are image/video enhancement, view interpolation, image\nmatching, and much more. We provide a careful explanation of the data\ncollection process and detailed analysis of the data. The second part of this\nthesis studies the usage of depth information in the view synthesizing task. In\naddition to the regeneration of a current state-of-the-art algorithm, we\ninvestigate several proposed alternative models that integrate depth\ninformation geometrically. Through extensive experiments, we show that the\neffect of depth is crucial in small view changes. Finally, we apply our model\nto the introduced PIV3CAMS dataset to synthesize novel target views as an\nexample application of PIV3CAMS.\n","authors":["Sohyeong Kim","Martin Danelljan","Radu Timofte","Luc Van Gool","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2407.18695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18682v1","updated":"2024-07-26T11:56:23Z","published":"2024-07-26T11:56:23Z","title":"Rapid Object Annotation","summary":" In this report we consider the problem of rapidly annotating a video with\nbounding boxes for a novel object. We describe a UI and associated workflow\ndesigned to make this process fast for an arbitrary novel target.\n","authors":["Misha Denil"],"pdf_url":"https://arxiv.org/pdf/2407.18682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09554v3","updated":"2024-07-26T11:54:30Z","published":"2023-12-15T06:16:17Z","title":"Embodied Laser Attack:Leveraging Scene Priors to Achieve Agent-based\n Robust Non-contact Attacks","summary":" As physical adversarial attacks become extensively applied in unearthing the\npotential risk of security-critical scenarios, especially in dynamic scenarios,\ntheir vulnerability to environmental variations has also been brought to light.\nThe non-robust nature of physical adversarial attack methods brings\nless-than-stable performance consequently. Although methods such as EOT have\nenhanced the robustness of traditional contact attacks like adversarial\npatches, they fall short in practicality and concealment within dynamic\nenvironments such as traffic scenarios. Meanwhile, non-contact laser attacks,\nwhile offering enhanced adaptability, face constraints due to a limited\noptimization space for their attributes, rendering EOT less effective. 
This\nlimitation underscores the necessity for developing a new strategy to augment\nthe robustness of such practices. To address these issues, this paper\nintroduces the Embodied Laser Attack (ELA), a novel framework that leverages\nthe embodied intelligence paradigm of Perception-Decision-Control to\ndynamically tailor non-contact laser attacks. For the perception module, given\nthe challenge of simulating the victim's view by full-image transformation, ELA\nhas innovatively developed a local perspective transformation network, based on\nthe intrinsic prior knowledge of traffic scenes and enables effective and\nefficient estimation. For the decision and control module, ELA trains an attack\nagent with data-driven reinforcement learning instead of adopting\ntime-consuming heuristic algorithms, making it capable of instantaneously\ndetermining a valid attack strategy with the perceived information by\nwell-designed rewards, which is then conducted by a controllable laser emitter.\nExperimentally, we apply our framework to diverse traffic scenarios both in the\ndigital and physical world, verifying the effectiveness of our method under\ndynamic successive scenes.\n","authors":["Yitong Sun","Yao Huang","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2312.09554v3.pdf","comment":"9 pages, 7 figures, Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2404.14351v2","updated":"2024-07-26T11:48:27Z","published":"2024-04-22T17:02:33Z","title":"Scene Coordinate Reconstruction: Posing of Image Collections via\n Incremental Learning of a Relocalizer","summary":" We address the task of estimating camera parameters from a set of images\ndepicting a scene. Popular feature-based structure-from-motion (SfM) tools\nsolve this task by incremental reconstruction: they repeat triangulation of\nsparse 3D points and registration of more camera views to the sparse point\ncloud. We re-interpret incremental structure-from-motion as an iterated\napplication and refinement of a visual relocalizer, that is, of a method that\nregisters new views to the current state of the reconstruction. This\nperspective allows us to investigate alternative visual relocalizers that are\nnot rooted in local feature matching. We show that scene coordinate regression,\na learning-based relocalization approach, allows us to build implicit, neural\nscene representations from unposed images. Different from other learning-based\nreconstruction methods, we do not require pose priors nor sequential inputs,\nand we optimize efficiently over thousands of images. In many cases, our\nmethod, ACE0, estimates camera poses with an accuracy close to feature-based\nSfM, as demonstrated by novel view synthesis. 
Project page:\nhttps://nianticlabs.github.io/acezero/\n","authors":["Eric Brachmann","Jamie Wynn","Shuai Chen","Tommaso Cavallari","Áron Monszpart","Daniyar Turmukhambetov","Victor Adrian Prisacariu"],"pdf_url":"https://arxiv.org/pdf/2404.14351v2.pdf","comment":"ECCV 2024, Project page: https://nianticlabs.github.io/acezero/"},{"id":"http://arxiv.org/abs/2407.18673v1","updated":"2024-07-26T11:30:22Z","published":"2024-07-26T11:30:22Z","title":"A Survey on Cell Nuclei Instance Segmentation and Classification:\n Leveraging Context and Attention","summary":" Manually annotating nuclei from the gigapixel Hematoxylin and Eosin\n(H&E)-stained Whole Slide Images (WSIs) is a laborious and costly task, meaning\nautomated algorithms for cell nuclei instance segmentation and classification\ncould alleviate the workload of pathologists and clinical researchers and at\nthe same time facilitate the automatic extraction of clinically interpretable\nfeatures. But due to high intra- and inter-class variability of nuclei\nmorphological and chromatic features, as well as H&E-stains susceptibility to\nartefacts, state-of-the-art algorithms cannot correctly detect and classify\ninstances with the necessary performance. In this work, we hypothesise context\nand attention inductive biases in artificial neural networks (ANNs) could\nincrease the generalization of algorithms for cell nuclei instance segmentation\nand classification. We conduct a thorough survey on context and attention\nmethods for cell nuclei instance segmentation and classification from\nH&E-stained microscopy imaging, while providing a comprehensive discussion of\nthe challenges being tackled with context and attention. Besides, we illustrate\nsome limitations of current approaches and present ideas for future research.\nAs a case study, we extend both a general instance segmentation and\nclassification method (Mask-RCNN) and a tailored cell nuclei instance\nsegmentation and classification model (HoVer-Net) with context- and\nattention-based mechanisms, and do a comparative analysis on a multi-centre\ncolon nuclei identification and counting dataset. Although pathologists rely on\ncontext at multiple levels while paying attention to specific Regions of\nInterest (RoIs) when analysing and annotating WSIs, our findings suggest\ntranslating that domain knowledge into algorithm design is no trivial task, but\nto fully exploit these mechanisms, the scientific understanding of these\nmethods should be addressed.\n","authors":["João D. Nunes","Diana Montezuma","Domingos Oliveira","Tania Pereira","Jaime S. Cardoso"],"pdf_url":"https://arxiv.org/pdf/2407.18673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17596v2","updated":"2024-07-26T11:26:43Z","published":"2024-07-24T19:02:01Z","title":"Quality Assured: Rethinking Annotation Strategies in Imaging AI","summary":" This paper does not describe a novel method. Instead, it studies an essential\nfoundation for reliable benchmarking and ultimately real-world application of\nAI-based image analysis: generating high-quality reference annotations.\nPrevious research has focused on crowdsourcing as a means of outsourcing\nannotations. However, little attention has so far been given to annotation\ncompanies, specifically regarding their internal quality assurance (QA)\nprocesses. Therefore, our aim is to evaluate the influence of QA employed by\nannotation companies on annotation quality and devise methodologies for\nmaximizing data annotation efficacy. 
Based on a total of 57,648 instance\nsegmented images obtained from a total of 924 annotators and 34 QA workers from\nfour annotation companies and Amazon Mechanical Turk (MTurk), we derived the\nfollowing insights: (1) Annotation companies perform better both in terms of\nquantity and quality compared to the widely used platform MTurk. (2) Annotation\ncompanies' internal QA only provides marginal improvements, if any. However,\nimproving labeling instructions instead of investing in QA can substantially\nboost annotation performance. (3) The benefit of internal QA depends on\nspecific image characteristics. Our work could enable researchers to derive\nsubstantially more value from a fixed annotation budget and change the way\nannotation companies conduct internal QA.\n","authors":["Tim Rädsch","Annika Reinke","Vivienn Weru","Minu D. Tizabi","Nicholas Heller","Fabian Isensee","Annette Kopp-Schneider","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2407.17596v2.pdf","comment":"Accepted at ECCV 2024, preprint, Computer Vision, Data Annotation"},{"id":"http://arxiv.org/abs/2309.07986v2","updated":"2024-07-26T11:14:21Z","published":"2023-09-14T18:52:16Z","title":"Viewpoint Textual Inversion: Discovering Scene Representations and 3D\n View Control in 2D Diffusion Models","summary":" Text-to-image diffusion models generate impressive and realistic images, but\ndo they learn to represent the 3D world from only 2D supervision? We\ndemonstrate that yes, certain 3D scene representations are encoded in the text\nembedding space of models like Stable Diffusion. Our approach, Viewpoint Neural\nTextual Inversion (ViewNeTI), is to discover 3D view tokens; these tokens\ncontrol the 3D viewpoint - the rendering pose in a scene - of generated images.\nSpecifically, we train a small neural mapper to take continuous camera\nviewpoint parameters and predict a view token (a word embedding). This token\nconditions diffusion generation via cross-attention to produce images with the\ndesired camera viewpoint. Using ViewNeTI as an evaluation tool, we report two\nfindings: first, the text latent space has a continuous view-control manifold\nfor particular 3D scenes; second, we find evidence for a generalized\nview-control manifold for all scenes. We conclude that since the view token\ncontrols the 3D `rendering' viewpoint, there is likely a scene representation\nembedded in frozen 2D diffusion models. Finally, we exploit the 3D scene\nrepresentations for 3D vision tasks, namely, view-controlled text-to-image\ngeneration, and novel view synthesis from a single image, where our approach\nsets state-of-the-art for LPIPS. Code available at\nhttps://github.com/jmhb0/view_neti\n","authors":["James Burgess","Kuan-Chieh Wang","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2309.07986v2.pdf","comment":"ECCV 2024 (European Conference on Computer Vision). Project page:\n https://jmhb0.github.io/view_neti/"},{"id":"http://arxiv.org/abs/2407.18667v1","updated":"2024-07-26T11:03:18Z","published":"2024-07-26T11:03:18Z","title":"A Labeled Ophthalmic Ultrasound Dataset with Medical Report Generation\n Based on Cross-modal Deep Learning","summary":" Ultrasound imaging reveals eye morphology and aids in diagnosing and treating\neye diseases. However, interpreting diagnostic reports requires specialized\nphysicians. We present a labeled ophthalmic dataset for the precise analysis\nand the automated exploration of medical images along with their associated\nreports. 
It collects data in three modalities, including ultrasound images, blood\nflow information and examination reports from 2,417 patients at an\nophthalmology hospital in Shenyang, China, during the year 2018, in which the\npatient information is de-identified for privacy protection. To the best of our\nknowledge, it is the only ophthalmic dataset that contains all three modalities\nsimultaneously. It incrementally consists of 4,858 images with the\ncorresponding free-text reports, which describe 15 typical imaging findings of\nintraocular diseases and the corresponding anatomical locations. Each image\nshows three kinds of blood flow indices at three specific arteries, i.e., nine\nparameter values to describe the spectral characteristics of blood flow\ndistribution. The reports were written by ophthalmologists during the clinical\ncare. The proposed dataset is applied to generate medical reports based on a\ncross-modal deep learning model. The experimental results demonstrate that our\ndataset is suitable for training supervised models concerning cross-modal\nmedical data.\n","authors":["Jing Wang","Junyan Fan","Meng Zhou","Yanzhu Zhang","Mingyu Shi"],"pdf_url":"https://arxiv.org/pdf/2407.18667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09174v2","updated":"2024-07-26T11:01:21Z","published":"2024-07-12T11:16:44Z","title":"DART: An Automated End-to-End Object Detection Pipeline with Data\n Diversification, Open-Vocabulary Bounding Box Annotation, Pseudo-Label\n Review, and Model Training","summary":" Accurate real-time object detection is vital across numerous industrial\napplications, from safety monitoring to quality control. Traditional\napproaches, however, are hindered by arduous manual annotation and data\ncollection, struggling to adapt to ever-changing environments and novel target\nobjects. To address these limitations, this paper presents DART, an innovative\nautomated end-to-end pipeline that revolutionizes object detection workflows\nfrom data collection to model evaluation. It eliminates the need for laborious\nhuman labeling and extensive data collection while achieving outstanding\naccuracy across diverse scenarios. DART encompasses four key stages: (1) Data\nDiversification using subject-driven image generation (DreamBooth with SDXL),\n(2) Annotation via open-vocabulary object detection (Grounding DINO) to\ngenerate bounding box and class labels, (3) Review of generated images and\npseudo-labels by large multimodal models (InternVL-1.5 and GPT-4o) to guarantee\ncredibility, (4) Training of real-time object detectors (YOLOv8 and YOLOv10)\nusing the verified data as ground truth. We apply DART to a self-collected\ndataset of construction machines named Liebherr Product, which contains over\n15K high-quality images across 23 categories. The current instantiation of DART\nsignificantly increases average precision (AP) from 0.064 to 0.832. Its modular\ndesign ensures easy exchangeability and extensibility, allowing for future\nalgorithm upgrades, seamless integration of new object categories, and\nadaptability to customized environments without manual labeling and additional\ndata collection. 
The code and dataset are released at\nhttps://github.com/chen-xin-94/DART.\n","authors":["Chen Xin","Andreas Hartel","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2407.09174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18665v1","updated":"2024-07-26T10:59:19Z","published":"2024-07-26T10:59:19Z","title":"Local Binary Pattern (LBP) Optimization for Feature Extraction","summary":" The rapid growth of image data has led to the development of advanced image\nprocessing and computer vision techniques, which are crucial in various\napplications such as image classification, image segmentation, and pattern\nrecognition. Texture is an important feature that has been widely used in many\nimage processing tasks. Therefore, analyzing and understanding texture plays a\npivotal role in image analysis and understanding. Local binary pattern (LBP) is\na powerful operator that describes the local texture features of images. This\npaper provides a novel mathematical representation of the LBP by separating the\noperator into three matrices, two of which are always fixed and do not depend\non the input data. These fixed matrices are analyzed in depth, and a new\nalgorithm is proposed to optimize them for improved classification performance.\nThe optimization process is based on the singular value decomposition (SVD)\nalgorithm. As a result, the authors present optimal LBPs that effectively\ndescribe the texture of human face images. Several experimental results presented\nin this paper convincingly verify the efficiency and superiority of the\noptimized LBPs for face detection and facial expression recognition tasks.\n","authors":["Zeinab Sedaghatjoo","Hossein Hosseinzadeh","Bahram Sadeghi Bigham"],"pdf_url":"https://arxiv.org/pdf/2407.18665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18731v2","updated":"2024-07-26T10:50:43Z","published":"2024-04-29T14:17:52Z","title":"Real Time Multi Organ Classification on Computed Tomography Images","summary":" Organ segmentation is a fundamental task in medical imaging since it is\nuseful for many clinical automation pipelines. However, some tasks do not\nrequire full segmentation. Instead, a classifier can identify the selected\norgan without segmenting the entire volume. In this study, we demonstrate a\nclassifier-based method to obtain organ labels in real time by using a large\ncontext size with a sparse data sampling strategy. Although our method operates\nas an independent classifier at query locations, it can generate full\nsegmentations by querying grid locations at any resolution, offering faster\nperformance than segmentation algorithms. We compared our method with existing\nsegmentation techniques, demonstrating its superior runtime potential for\npractical applications in medical imaging.\n","authors":["Halid Ziya Yerebakan","Yoshihisa Shinagawa","Gerardo Hermosillo Valadez"],"pdf_url":"https://arxiv.org/pdf/2404.18731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18658v1","updated":"2024-07-26T10:49:14Z","published":"2024-07-26T10:49:14Z","title":"Adversarial Robustification via Text-to-Image Diffusion Models","summary":" Adversarial robustness has conventionally been believed to be a challenging\nproperty to encode for neural networks, requiring plenty of training data. In\nthe recent paradigm of adopting off-the-shelf models, however, access to their\ntraining data is often infeasible or not practical, while most of such models\nare not originally trained concerning adversarial robustness. 
In this paper, we\ndevelop a scalable and model-agnostic solution to achieve adversarial\nrobustness without using any data. Our intuition is to view recent\ntext-to-image diffusion models as \"adaptable\" denoisers that can be optimized\nto specify target tasks. Based on this, we propose: (a) to initiate a\ndenoise-and-classify pipeline that offers provable guarantees against\nadversarial attacks, and (b) to leverage a few synthetic reference images\ngenerated from the text-to-image model that enables novel adaptation schemes.\nOur experiments show that our data-free scheme applied to the pre-trained CLIP\ncould improve the (provable) adversarial robustness of its diverse zero-shot\nclassification derivatives (while maintaining their accuracy), significantly\nsurpassing prior approaches that utilize the full training data. Not only for\nCLIP, we also demonstrate that our framework is easily applicable for\nrobustifying other visual classifiers efficiently.\n","authors":["Daewon Choi","Jongheon Jeong","Huiwon Jang","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2407.18658v1.pdf","comment":"Code is available at https://github.com/ChoiDae1/robustify-T2I"},{"id":"http://arxiv.org/abs/2407.18656v1","updated":"2024-07-26T10:45:57Z","published":"2024-07-26T10:45:57Z","title":"Auto DragGAN: Editing the Generative Image Manifold in an Autoregressive\n Manner","summary":" Pixel-level fine-grained image editing remains an open challenge. Previous\nworks fail to achieve an ideal trade-off between control granularity and\ninference speed. They either fail to achieve pixel-level fine-grained control,\nor their inference speed requires optimization. To address this, this paper for\nthe first time employs a regression-based network to learn the variation\npatterns of StyleGAN latent codes during the image dragging process. This\nmethod enables pixel-level precision in dragging editing with little time cost.\nUsers can specify handle points and their corresponding target points on any\nGAN-generated images, and our method will move each handle point to its\ncorresponding target point. Through experimental analysis, we discover that a\nshort movement distance from handle points to target points yields a\nhigh-fidelity edited image, as the model only needs to predict the movement of\na small portion of pixels. To achieve this, we decompose the entire movement\nprocess into multiple sub-processes. Specifically, we develop a transformer\nencoder-decoder based network named 'Latent Predictor' to predict the latent\ncode motion trajectories from handle points to target points in an\nautoregressive manner. Moreover, to enhance the prediction stability, we\nintroduce a component named 'Latent Regularizer', aimed at constraining the\nlatent code motion within the distribution of natural images. 
Extensive\nexperiments demonstrate that our method achieves state-of-the-art (SOTA)\ninference speed and image editing performance at the pixel-level granularity.\n","authors":["Pengxiang Cai","Zhiwei Liu","Guibo Zhu","Yunfang Niu","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18656v1.pdf","comment":"This paper has been accepted as a poster paper for ACM Multimedia\n 2024"},{"id":"http://arxiv.org/abs/2402.12750v2","updated":"2024-07-26T10:15:38Z","published":"2024-02-20T06:38:10Z","title":"Model Composition for Multimodal Large Language Models","summary":" Recent developments in Multimodal Large Language Models (MLLMs) have shown\nrapid progress, moving towards the goal of creating versatile MLLMs that\nunderstand inputs from various modalities. However, existing methods typically\nrely on joint training with paired multimodal instruction data, which is\nresource-intensive and challenging to extend to new modalities. In this paper,\nwe propose a new paradigm through the model composition of existing MLLMs to\ncreate a new model that retains the modal understanding capabilities of each\noriginal model. Our basic implementation, NaiveMC, demonstrates the\neffectiveness of this paradigm by reusing modality encoders and merging LLM\nparameters. Furthermore, we introduce DAMC to address parameter interference\nand mismatch issues during the merging process, thereby enhancing the model\nperformance. To facilitate research in this area, we propose MCUB, a benchmark\nfor assessing ability of MLLMs to understand inputs from diverse modalities.\nExperiments on this benchmark and four other multimodal understanding tasks\nshow significant improvements over baselines, proving that model composition\ncan create a versatile model capable of processing inputs from multiple\nmodalities.\n","authors":["Chi Chen","Yiyang Du","Zheng Fang","Ziyue Wang","Fuwen Luo","Peng Li","Ming Yan","Ji Zhang","Fei Huang","Maosong Sun","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12750v2.pdf","comment":"ACL2024 Main Conference; Code is available at\n https://github.com/THUNLP-MT/ModelCompose"},{"id":"http://arxiv.org/abs/2407.18637v1","updated":"2024-07-26T10:08:01Z","published":"2024-07-26T10:08:01Z","title":"DynamicTrack: Advancing Gigapixel Tracking in Crowded Scenes","summary":" Tracking in gigapixel scenarios holds numerous potential applications in\nvideo surveillance and pedestrian analysis. Existing algorithms attempt to\nperform tracking in crowded scenes by utilizing multiple cameras or group\nrelationships. However, their performance significantly degrades when\nconfronted with complex interaction and occlusion inherent in gigapixel images.\nIn this paper, we introduce DynamicTrack, a dynamic tracking framework designed\nto address gigapixel tracking challenges in crowded scenes. In particular, we\npropose a dynamic detector that utilizes contrastive learning to jointly detect\nthe head and body of pedestrians. Building upon this, we design a dynamic\nassociation algorithm that effectively utilizes head and body information for\nmatching purposes. 
Extensive experiments show that our tracker achieves\nstate-of-the-art performance on widely used tracking benchmarks specifically\ndesigned for gigapixel crowded scenes.\n","authors":["Yunqi Zhao","Yuchen Guo","Zheng Cao","Kai Ni","Ruqi Huang","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2407.18637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14897v3","updated":"2024-07-26T09:53:21Z","published":"2024-03-22T01:02:09Z","title":"Geometric Generative Models based on Morphological Equivariant PDEs and\n GANs","summary":" Content and image generation consist in creating or generating data from\nnoisy information by extracting specific features such as texture, edges, and\nother thin image structures. We are interested here in generative models, and\ntwo main problems are addressed. Firstly, the improvements of specific feature\nextraction while accounting at multiscale levels intrinsic geometric features;\nand secondly, the equivariance of the network to reduce its complexity and\nprovide a geometric interpretability. To proceed, we propose a geometric\ngenerative model based on an equivariant partial differential equation (PDE)\nfor group convolution neural networks (G-CNNs), so called PDE-G-CNNs, built on\nmorphology operators and generative adversarial networks (GANs). Equivariant\nmorphological PDE layers are composed of multiscale dilations and erosions\nformulated in Riemannian manifolds, while group symmetries are defined on a Lie\ngroup. We take advantage of the Lie group structure to properly integrate the\nequivariance in layers, and are able to use the Riemannian metric to solve the\nmultiscale morphological operations. Each point of the Lie group is associated\nwith a unique point in the manifold, which helps us derive a metric on the\nRiemannian manifold from a tensor field invariant under the Lie group so that\nthe induced metric has the same symmetries. The proposed geometric\nmorphological GAN (GM-GAN) is obtained by using the proposed morphological\nequivariant convolutions in PDE-G-CNNs to bring nonlinearity in classical CNNs.\nGM-GAN is evaluated on MNIST data and compared with GANs. Preliminary results\nshow that GM-GAN model outperforms classical GAN.\n","authors":["El Hadji S. Diop","Thierno Fall","Alioune Mbengue","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2403.14897v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18626v1","updated":"2024-07-26T09:35:36Z","published":"2024-07-26T09:35:36Z","title":"Every Part Matters: Integrity Verification of Scientific Figures Based\n on Multimodal Large Language Models","summary":" This paper tackles a key issue in the interpretation of scientific figures:\nthe fine-grained alignment of text and figures. It advances beyond prior\nresearch that primarily dealt with straightforward, data-driven visualizations\nsuch as bar and pie charts and only offered a basic understanding of diagrams\nthrough captioning and classification. We introduce a novel task, Figure\nIntegrity Verification, designed to evaluate the precision of technologies in\naligning textual knowledge with visual elements in scientific figures. To\nsupport this, we develop a semi-automated method for constructing a large-scale\ndataset, Figure-seg, specifically designed for this task. 
Additionally, we\npropose an innovative framework, Every Part Matters (EPM), which leverages\nMultimodal Large Language Models (MLLMs) to not only incrementally improve the\nalignment and verification of text-figure integrity but also enhance integrity\nthrough analogical reasoning. Our comprehensive experiments show that these\ninnovations substantially improve upon existing methods, allowing for more\nprecise and thorough analysis of complex scientific figures. This progress not\nonly enhances our understanding of multimodal technologies but also stimulates\nfurther research and practical applications across fields requiring the\naccurate interpretation of complex visual data.\n","authors":["Xiang Shi","Jiawei Liu","Yinpeng Liu","Qikai Cheng","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2407.18626v1.pdf","comment":"28 pages, 11 figures, under review"},{"id":"http://arxiv.org/abs/2407.18616v1","updated":"2024-07-26T09:20:29Z","published":"2024-07-26T09:20:29Z","title":"MOoSE: Multi-Orientation Sharing Experts for Open-set Scene Text\n Recognition","summary":" Open-set text recognition, which aims to address both novel characters and\npreviously seen ones, is one of the rising subtopics in the text recognition\nfield. However, current open-set text recognition solutions focus only on\nhorizontal text and fail to model the real-life challenges posed by the\nvariety of writing directions in real-world scene text. Multi-orientation text\nrecognition, in general, faces challenges from the diverse image aspect ratios,\nsignificant imbalance in data amount, and domain gaps between orientations. In\nthis work, we first propose a Multi-Oriented Open-Set Text Recognition task\n(MOOSTR) to model the challenges of both novel characters and writing direction\nvariety. We then propose a Multi-Orientation Sharing Experts (MOoSE) framework\nas a strong baseline solution. MOoSE uses a mixture-of-experts scheme to\nalleviate the domain gaps between orientations, while exploiting common\nstructural knowledge among experts to alleviate the data scarcity that some\nexperts face. The proposed MOoSE framework is validated by ablative\nexperiments, and also tested for feasibility on the existing open-set\nbenchmark. Code, models, and documents are available at:\nhttps://github.com/lancercat/Moose/\n","authors":["Chang Liu","Simon Corbillé","Elisa H Barney Smith"],"pdf_url":"https://arxiv.org/pdf/2407.18616v1.pdf","comment":"Accepted in ICDAR2024"},{"id":"http://arxiv.org/abs/2407.18614v1","updated":"2024-07-26T09:15:29Z","published":"2024-07-26T09:15:29Z","title":"LookupForensics: A Large-Scale Multi-Task Dataset for Multi-Phase\n Image-Based Fact Verification","summary":" Amid the proliferation of forged images, notably the tsunami of deepfake\ncontent, extensive research has been conducted on using artificial intelligence\n(AI) to identify forged content in the face of continuing advancements in\ncounterfeiting technologies. We have investigated the use of AI to provide the\noriginal authentic image after deepfake detection, which we believe is a\nreliable and persuasive solution. We call this \"image-based automated fact\nverification,\" a name that originated from a text-based fact-checking system\nused by journalists. We have developed a two-phase open framework that\nintegrates detection and retrieval components. Additionally, inspired by a\ndataset proposed by Meta Fundamental AI Research, we further constructed a\nlarge-scale dataset that is specifically designed for this task. 
This dataset\nsimulates real-world conditions and includes both content-preserving and\ncontent-aware manipulations that present a range of difficulty levels and have\npotential for ongoing research. This multi-task dataset is fully annotated,\nenabling it to be utilized for sub-tasks within the forgery identification and\nfact retrieval domains. This paper makes two main contributions: (1) We\nintroduce a new task, \"image-based automated fact verification,\" and present a\nnovel two-phase open framework combining \"forgery identification\" and \"fact\nretrieval.\" (2) We present a large-scale dataset tailored for this new task\nthat features various hand-crafted image edits and machine learning-driven\nmanipulations, with extensive annotations suitable for various sub-tasks.\nExtensive experimental results validate its practicality for fact verification\nresearch and clarify its difficulty levels for various sub-tasks.\n","authors":["Shuhan Cui","Huy H. Nguyen","Trung-Nghia Le","Chun-Shien Lu","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2407.18614v1.pdf","comment":"Pages 1-13 are the main body of the paper, and pages 14-16 are the\n supplementary material"},{"id":"http://arxiv.org/abs/2407.02394v3","updated":"2024-07-26T09:15:06Z","published":"2024-07-02T16:12:37Z","title":"Similarity Distance-Based Label Assignment for Tiny Object Detection","summary":" Tiny object detection is becoming one of the most challenging tasks in\ncomputer vision because of the limited object size and lack of information. The\nlabel assignment strategy is a key factor affecting the accuracy of object\ndetection. Although there are some effective label assignment strategies for\ntiny objects, most of them focus on reducing the sensitivity to the bounding\nboxes to increase the number of positive samples and have some fixed\nhyperparameters that need to be set. However, more positive samples may not necessarily\nlead to better detection results; in fact, excessive positive samples may lead\nto more false positives. In this paper, we introduce a simple but effective\nstrategy named the Similarity Distance (SimD) to evaluate the similarity\nbetween bounding boxes. This proposed strategy not only considers both location\nand shape similarity but also learns hyperparameters adaptively, ensuring that\nit can adapt to different datasets and various object sizes in a dataset. Our\napproach can be simply applied in common anchor-based detectors in place of the\nIoU for label assignment and Non Maximum Suppression (NMS). Extensive\nexperiments on four mainstream tiny object detection datasets demonstrate the\nsuperior performance of our method; in particular, it achieves gains of 1.8 AP points\noverall and 4.1 AP points on very tiny objects over the state-of-the-art\ncompetitors on AI-TOD. Code is\navailable at: \\url{https://github.com/cszzshi/SimD}.\n","authors":["Shuohao Shi","Qiang Fang","Tong Zhao","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02394v3.pdf","comment":"8 pages, 4 figures, this paper has been accepted by IEEE/RSJ\n International Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2407.18613v1","updated":"2024-07-26T09:12:30Z","published":"2024-07-26T09:12:30Z","title":"Dilated Strip Attention Network for Image Restoration","summary":" Image restoration is a long-standing task that seeks to recover the latent\nsharp image from its deteriorated counterpart. 
Due to the robust capacity of\nself-attention to capture long-range dependencies, transformer-based methods or\nsome attention-based convolutional neural networks have demonstrated promising\nresults on many image restoration tasks in recent years. However, existing\nattention modules encounter limited receptive fields or require abundant parameters.\nIn order to integrate contextual information more effectively and efficiently,\nin this paper, we propose a dilated strip attention network (DSAN) for image\nrestoration. Specifically, to gather more contextual information for each pixel\nfrom its neighboring pixels in the same row or column, a dilated strip\nattention (DSA) mechanism is elaborately proposed. By employing the DSA\noperation horizontally and vertically, each location can harvest the contextual\ninformation from a much wider region. In addition, we utilize multi-scale\nreceptive fields across different feature groups in DSA to improve\nrepresentation learning. Extensive experiments show that our DSAN outperforms\nstate-of-the-art algorithms on several image restoration tasks.\n","authors":["Fangwei Hao","Jiesheng Wu","Ji Du","Yinjie Wang","Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2407.18613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18611v1","updated":"2024-07-26T09:11:25Z","published":"2024-07-26T09:11:25Z","title":"IOVS4NeRF:Incremental Optimal View Selection for Large-Scale NeRFs","summary":" Urban-level three-dimensional reconstruction for modern applications demands\nhigh rendering fidelity while minimizing computational costs. The advent of\nNeural Radiance Fields (NeRF) has enhanced 3D reconstruction, yet it exhibits\nartifacts under multiple viewpoints. In this paper, we propose a new NeRF\nframework method to address these issues. Our method uses image content and\npose data to iteratively plan the next best view. A crucial aspect of this\nmethod involves uncertainty estimation, guiding the selection of views with\nmaximum information gain from a candidate set. This iterative process enhances\nrendering quality over time. Simultaneously, we introduce the Voronoi diagram\nand threshold sampling together with a flight classifier to boost the efficiency,\nwhile keeping the original NeRF network intact. It can serve as a plug-in tool to\nassist in better rendering, outperforming baselines and similar prior works.\n","authors":["Jingpeng Xie","Shiyu Tan","Yuanlei Wang","Yizhen Lao"],"pdf_url":"https://arxiv.org/pdf/2407.18611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07741v4","updated":"2024-07-26T08:48:47Z","published":"2024-06-11T21:55:20Z","title":"Back to the Color: Learning Depth to Specific Color Transformation for\n Unsupervised Depth Estimation","summary":" Virtual engines can generate dense depth maps for various synthetic scenes,\nmaking them invaluable for training depth estimation models. However,\ndiscrepancies between synthetic and real-world colors pose significant\nchallenges for depth estimation in real-world scenes, especially in complex and\nuncertain environments encountered in unsupervised monocular depth estimation\ntasks. To address this issue, we propose Back2Color, a framework that predicts\nrealistic colors from depth using a model trained on real-world data, thus\ntransforming synthetic colors into their real-world counterparts. 
Additionally,\nwe introduce the Syn-Real CutMix method for joint training with both real-world\nunsupervised and synthetic supervised depth samples, enhancing monocular depth\nestimation performance in real-world scenes. Furthermore, to mitigate the\nimpact of non-rigid motions on depth estimation, we present an auto-learning\nuncertainty temporal-spatial fusion method (Auto-UTSF), which leverages the\nstrengths of unsupervised learning in both temporal and spatial dimensions. We\nalso designed VADepth, based on the Vision Attention Network, which offers\nlower computational complexity and higher accuracy than transformers. Our\nBack2Color framework achieves state-of-the-art performance on the Kitti\ndataset, as evidenced by improvements in performance metrics and the production\nof fine-grained details. This is particularly evident on more challenging\ndatasets such as Cityscapes for unsupervised depth estimation.\n","authors":["Yufan Zhu","Chongzhi Ran","Mingtao Feng","Fangfang Wu","Le Dong","Weisheng Dong","Antonio M. López","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2406.07741v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12257v2","updated":"2024-07-26T08:46:26Z","published":"2024-07-17T01:59:34Z","title":"Compound Expression Recognition via Multi Model Ensemble for the ABAW7\n Challenge","summary":" Compound Expression Recognition (CER) is vital for effective interpersonal\ninteractions. Human emotional expressions are inherently complex due to the\npresence of compound expressions, requiring the consideration of both local and\nglobal facial cues for accurate judgment. In this paper, we propose an ensemble\nlearning-based solution to address this complexity. Our approach involves\ntraining three distinct expression classification models using convolutional\nnetworks, Vision Transformers, and multiscale local attention networks. By\nemploying late fusion for model ensemble, we combine the outputs of these\nmodels to predict the final results. Our method demonstrates high accuracy on\nthe RAF-DB datasets and is capable of recognizing expressions in certain\nportions of the C-EXPR-DB through zero-shot learning.\n","authors":["Xuxiong Liu","Kang Shen","Jun Yao","Boyan Wang","Minrui Liu","Liuwei An","Zishun Cui","Weijie Feng","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2407.12257v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.12572 by other authors"},{"id":"http://arxiv.org/abs/2407.12258v2","updated":"2024-07-26T08:42:10Z","published":"2024-07-17T02:01:34Z","title":"Facial Affect Recognition based on Multi Architecture Encoder and\n Feature Fusion for the ABAW7 Challenge","summary":" In this paper, we present our approach to addressing the challenges of the\n7th ABAW competition. The competition comprises three sub-challenges: Valence\nArousal (VA) estimation, Expression (Expr) classification, and Action Unit (AU)\ndetection. To tackle these challenges, we employ state-of-the-art models to\nextract powerful visual features. Subsequently, a Transformer Encoder is\nutilized to integrate these features for the VA, Expr, and AU sub-challenges.\nTo mitigate the impact of varying feature dimensions, we introduce an affine\nmodule to align the features to a common dimension. 
Overall, our results\nsignificantly outperform the baselines.\n","authors":["Kang Shen","Xuxiong Liu","Boyan Wang","Jun Yao","Xin Liu","Yujie Guan","Yu Wang","Gengchen Li","Xiao Sun"],"pdf_url":"https://arxiv.org/pdf/2407.12258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17904v2","updated":"2024-07-26T08:33:23Z","published":"2024-07-25T09:49:04Z","title":"Exploring the Effect of Dataset Diversity in Self-Supervised Learning\n for Surgical Computer Vision","summary":" Over the past decade, computer vision applications in minimally invasive\nsurgery have rapidly increased. Despite this growth, the impact of surgical\ncomputer vision remains limited compared to other medical fields like pathology\nand radiology, primarily due to the scarcity of representative annotated data.\nWhereas transfer learning from large annotated datasets such as ImageNet has\nbeen conventionally the norm to achieve high-performing models, recent\nadvancements in self-supervised learning (SSL) have demonstrated superior\nperformance. In medical image analysis, in-domain SSL pretraining has already\nbeen shown to outperform ImageNet-based initialization. Although unlabeled data\nin the field of surgical computer vision is abundant, the diversity within this\ndata is limited. This study investigates the role of dataset diversity in SSL\nfor surgical computer vision, comparing procedure-specific datasets against a\nmore heterogeneous general surgical dataset across three different downstream\nsurgical applications. The obtained results show that using solely\nprocedure-specific data can lead to substantial improvements of 13.8%, 9.5%,\nand 36.8% compared to ImageNet pretraining. However, extending this data with\nmore heterogeneous surgical data further increases performance by an additional\n5.0%, 5.2%, and 2.5%, suggesting that increasing diversity within SSL data is\nbeneficial for model performance. The code and pretrained model weights are\nmade publicly available at https://github.com/TimJaspers0801/SurgeNet.\n","authors":["Tim J. M. Jaspers","Ronald L. P. D. de Jong","Yasmina Al Khalil","Tijn Zeelenberg","Carolus H. J. Kusters","Yiping Li","Romy C. van Jaarsveld","Franciscus H. A. Bakker","Jelle P. Ruurda","Willem M. Brinkman","Peter H. N. De With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2407.17904v2.pdf","comment":"accepted - Data Engineering in Medical Imaging (DEMI) Workshop @\n MICCAI2024"},{"id":"http://arxiv.org/abs/2407.18595v1","updated":"2024-07-26T08:30:06Z","published":"2024-07-26T08:30:06Z","title":"LinguaLinker: Audio-Driven Portraits Animation with Implicit Facial\n Control Enhancement","summary":" This study delves into the intricacies of synchronizing facial dynamics with\nmultilingual audio inputs, focusing on the creation of visually compelling,\ntime-synchronized animations through diffusion-based techniques. Diverging from\ntraditional parametric models for facial animation, our approach, termed\nLinguaLinker, adopts a holistic diffusion-based framework that integrates\naudio-driven visual synthesis to enhance the synergy between auditory stimuli\nand visual responses. We process audio features separately and derive the\ncorresponding control gates, which implicitly govern the movements in the\nmouth, eyes, and head, irrespective of the portrait's origin. 
The advanced\naudio-driven visual synthesis mechanism provides nuanced control but keeps the\ncompatibility of output video and input audio, allowing for a more tailored and\neffective portrayal of distinct personas across different languages. The\nsignificant improvements in the fidelity of animated portraits, the accuracy of\nlip-syncing, and the appropriate motion variations achieved by our method\nrender it a versatile tool for animating any portrait in any language.\n","authors":["Rui Zhang","Yixiao Fang","Zhengnan Lu","Pei Cheng","Zebiao Huang","Bin Fu"],"pdf_url":"https://arxiv.org/pdf/2407.18595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18593v1","updated":"2024-07-26T08:28:53Z","published":"2024-07-26T08:28:53Z","title":"Content-driven Magnitude-Derivative Spectrum Complementary Learning for\n Hyperspectral Image Classification","summary":" Extracting discriminative information from complex spectral details in\nhyperspectral image (HSI) for HSI classification is pivotal. While current\nprevailing methods rely on spectral magnitude features, they could cause\nconfusion in certain classes, resulting in misclassification and decreased\naccuracy. We find that the derivative spectrum proves more adept at capturing\nconcealed information, thereby offering a distinct advantage in separating\nthese confusion classes. Leveraging the complementarity between spectral\nmagnitude and derivative features, we propose a Content-driven Spectrum\nComplementary Network based on Magnitude-Derivative Dual Encoder, employing\nthese two features as combined inputs. To fully utilize their complementary\ninformation, we raise a Content-adaptive Point-wise Fusion Module, enabling\nadaptive fusion of dual-encoder features in a point-wise selective manner,\ncontingent upon feature representation. To preserve a rich source of\ncomplementary information while extracting more distinguishable features, we\nintroduce a Hybrid Disparity-enhancing Loss that enhances the differential\nexpression of the features from the two branches and increases the inter-class\ndistance. As a result, our method achieves state-of-the-art results on the\nextensive WHU-OHS dataset and eight other benchmark datasets.\n","authors":["Huiyan Bai","Tingfa Xu","Huan Chen","Peifu Liu","Jianan Li"],"pdf_url":"https://arxiv.org/pdf/2407.18593v1.pdf","comment":"accepted by TGRS"},{"id":"http://arxiv.org/abs/2407.18590v1","updated":"2024-07-26T08:27:26Z","published":"2024-07-26T08:27:26Z","title":"From 2D to 3D: AISG-SLA Visual Localization Challenge","summary":" Research in 3D mapping is crucial for smart city applications, yet the cost\nof acquiring 3D data often hinders progress. Visual localization, particularly\nmonocular camera position estimation, offers a solution by determining the\ncamera's pose solely through visual cues. However, this task is challenging due\nto limited data from a single camera. To tackle these challenges, we organized\nthe AISG-SLA Visual Localization Challenge (VLC) at IJCAI 2023 to explore how\nAI can accurately extract camera pose data from 2D images in 3D space. The\nchallenge attracted over 300 participants worldwide, forming 50+ teams. Winning\nteams achieved high accuracy in pose estimation using images from a car-mounted\ncamera with low frame rates. 
The VLC dataset is available for research purposes\nupon request via vlc-dataset@aisingapore.org.\n","authors":["Jialin Gao","Bill Ong","Darld Lwi","Zhen Hao Ng","Xun Wei Yee","Mun-Thye Mak","Wee Siong Ng","See-Kiong Ng","Hui Ying Teo","Victor Khoo","Georg Bökman","Johan Edstedt","Kirill Brodt","Clémentin Boittiaux","Maxime Ferrera","Stepan Konev"],"pdf_url":"https://arxiv.org/pdf/2407.18590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18589v1","updated":"2024-07-26T08:24:30Z","published":"2024-07-26T08:24:30Z","title":"HICEScore: A Hierarchical Metric for Image Captioning Evaluation","summary":" Image captioning evaluation metrics can be divided into two categories,\nreference-based metrics and reference-free metrics. However, reference-based\napproaches may struggle to evaluate descriptive captions with abundant visual\ndetails produced by advanced multimodal large language models, due to their\nheavy reliance on limited human-annotated references. In contrast, previous\nreference-free metrics have been proven effective via CLIP cross-modality\nsimilarity. Nonetheless, CLIP-based metrics, constrained by their solution of\nglobal image-text compatibility, often have a deficiency in detecting local\ntextual hallucinations and are insensitive to small visual objects. Besides,\ntheir single-scale designs are unable to provide an interpretable evaluation\nprocess such as pinpointing the position of caption mistakes and identifying\nvisual regions that have not been described. To move forward, we propose a\nnovel reference-free metric for image captioning evaluation, dubbed\nHierarchical Image Captioning Evaluation Score (HICE-S). By detecting local\nvisual regions and textual phrases, HICE-S builds an interpretable hierarchical\nscoring mechanism, breaking through the barriers of the single-scale structure\nof existing reference-free metrics. Comprehensive experiments indicate that our\nproposed metric achieves the SOTA performance on several benchmarks,\noutperforming existing reference-free metrics like CLIP-S and PAC-S, and\nreference-based metrics like METEOR and CIDEr. Moreover, several case studies\nreveal that the assessment process of HICE-S on detailed captions closely\nresembles interpretable human judgments. Our code is available at\nhttps://github.com/joeyz0z/HICE.\n","authors":["Zequn Zeng","Jianqiao Sun","Hao Zhang","Tiansheng Wen","Yudi Su","Yan Xie","Zhengjue Wang","Bo Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18589v1.pdf","comment":"Accepted by ACM MM2024"},{"id":"http://arxiv.org/abs/2407.16684v2","updated":"2024-07-26T07:58:24Z","published":"2024-07-23T17:50:00Z","title":"AutoRG-Brain: Grounded Report Generation for Brain MRI","summary":" Radiologists are tasked with interpreting a large number of images on a daily\nbasis, with the responsibility of generating corresponding reports. This\ndemanding workload elevates the risk of human error, potentially leading to\ntreatment delays, increased healthcare costs, revenue loss, and operational\ninefficiencies. To address these challenges, we initiate a series of work on\ngrounded Automatic Report Generation (AutoRG), starting from the brain MRI\ninterpretation system, which supports the delineation of brain structures, the\nlocalization of anomalies, and the generation of well-organized findings. 
We\nmake contributions from the following aspects, first, on dataset construction,\nwe release a comprehensive dataset encompassing segmentation masks of anomaly\nregions and manually authored reports, termed as RadGenome-Brain MRI. This data\nresource is intended to catalyze ongoing research and development in the field\nof AI-assisted report generation systems. Second, on system design, we propose\nAutoRG-Brain, the first brain MRI report generation system with pixel-level\ngrounded visual clues. Third, for evaluation, we conduct quantitative\nassessments and human evaluations of brain structure segmentation, anomaly\nlocalization, and report generation tasks to provide evidence of its\nreliability and accuracy. This system has been integrated into real clinical\nscenarios, where radiologists were instructed to write reports based on our\ngenerated findings and anomaly segmentation masks. The results demonstrate that\nour system enhances the report-writing skills of junior doctors, aligning their\nperformance more closely with senior doctors, thereby boosting overall\nproductivity.\n","authors":["Jiayu Lei","Xiaoman Zhang","Chaoyi Wu","Lisong Dai","Ya Zhang","Yanyong Zhang","Yanfeng Wang","Weidi Xie","Yuehua Li"],"pdf_url":"https://arxiv.org/pdf/2407.16684v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00465v2","updated":"2024-07-26T07:57:18Z","published":"2024-06-29T15:21:20Z","title":"Characterizing Continual Learning Scenarios and Strategies for Audio\n Analysis","summary":" Audio analysis is useful in many application scenarios. The state-of-the-art\naudio analysis approaches assume the data distribution at training and\ndeployment time will be the same. However, due to various real-life challenges,\nthe data may encounter drift in its distribution or can encounter new classes\nin the late future. Thus, a one-time trained model might not perform\nadequately. Continual learning (CL) approaches are devised to handle such\nchanges in data distribution. There have been a few attempts to use CL\napproaches for audio analysis. Yet, there is a lack of a systematic evaluation\nframework. In this paper, we create a comprehensive CL dataset and characterize\nCL approaches for audio-based monitoring tasks. We have investigated the\nfollowing CL and non-CL approaches: EWC, LwF, SI, GEM, A-GEM, GDumb, Replay,\nNaive, Cumulative, and Joint training. The study is very beneficial for\nresearchers and practitioners working in the area of audio analysis for\ndeveloping adaptive models. We observed that Replay achieved better results\nthan other methods in the DCASE challenge data. It achieved an accuracy of\n70.12% for the domain incremental scenario and an accuracy of 96.98% for the\nclass incremental scenario.\n","authors":["Ruchi Bhatt","Pratibha Kumari","Dwarikanath Mahapatra","Abdulmotaleb El Saddik","Mukesh Saini"],"pdf_url":"https://arxiv.org/pdf/2407.00465v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18574v1","updated":"2024-07-26T07:57:07Z","published":"2024-07-26T07:57:07Z","title":"Learning to Enhance Aperture Phasor Field for Non-Line-of-Sight Imaging","summary":" This paper aims to facilitate more practical NLOS imaging by reducing the\nnumber of samplings and scan areas. To this end, we introduce a phasor-based\nenhancement network that is capable of predicting clean and full measurements\nfrom noisy partial observations. We leverage a denoising autoencoder scheme to\nacquire rich and noise-robust representations in the measurement space. 
Through\nthis pipeline, our enhancement network is trained to accurately reconstruct\ncomplete measurements from their corrupted and partial counterparts. However,\nwe observe that the naive application of denoising often yields degraded and\nover-smoothed results, caused by unnecessary and spurious frequency signals\npresent in measurements. To address this issue, we introduce a phasor-based\npipeline designed to limit the spectrum of our network to the frequency range\nof interest, where the majority of informative signals are detected. The\nphasor wavefronts at the aperture, which are band-limited signals, are employed\nas inputs and outputs of the network, guiding our network to learn from the\nfrequency range of interest and discard unnecessary information. The\nexperimental results in more practical acquisition scenarios demonstrate that\nwe can look around the corners with $16\\times$ or $64\\times$ fewer samplings\nand $4\\times$ smaller apertures. Our code is available at\n\\url{https://github.com/join16/LEAP}.\n","authors":["In Cho","Hyunbo Shim","Seon Joo Kim"],"pdf_url":"https://arxiv.org/pdf/2407.18574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18568v1","updated":"2024-07-26T07:50:48Z","published":"2024-07-26T07:50:48Z","title":"Learning Spectral-Decomposed Tokens for Domain Generalized Semantic\n Segmentation","summary":" The rapid development of Vision Foundation Model (VFM) brings inherent\nout-domain generalization for a variety of down-stream tasks. Among them,\ndomain generalized semantic segmentation (DGSS) holds unique challenges as the\ncross-domain images share common pixel-wise content information but vary\ngreatly in terms of the style. In this paper, we present a novel\nSpectral-dEcomposed Token (SET) learning framework to advance the frontier.\nDelving further than the existing fine-tuning token & frozen backbone\nparadigm, the proposed SET especially focuses on how to learn\nstyle-invariant features from these learnable tokens. Particularly, the frozen\nVFM features are first decomposed into the phase and amplitude components in\nthe frequency space, which mainly contain the information of content and style,\nrespectively, and then separately processed by learnable tokens for\ntask-specific information extraction. After the decomposition, style variation\nprimarily impacts the token-based feature enhancement within the amplitude\nbranch. To address this issue, we further develop an attention optimization\nmethod to bridge the gap between style-affected representation and static\ntokens during inference. Extensive cross-domain experiments show its\nstate-of-the-art performance.\n","authors":["Jingjun Yi","Qi Bi","Hao Zheng","Haolan Zhan","Wei Ji","Yawen Huang","Yuexiang Li","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2407.18568v1.pdf","comment":"accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2403.10519v2","updated":"2024-07-26T07:45:35Z","published":"2024-03-15T17:59:40Z","title":"Frozen Feature Augmentation for Few-Shot Image Classification","summary":" Training a linear classifier or lightweight model on top of pretrained vision\nmodel outputs, so-called 'frozen features', leads to impressive performance on\na number of downstream few-shot tasks. Currently, frozen features are not\nmodified during training. On the other hand, when networks are trained directly\non images, data augmentation is a standard recipe that improves performance\nwith no substantial overhead. 
In this paper, we conduct an extensive pilot\nstudy on few-shot image classification that explores applying data\naugmentations in the frozen feature space, dubbed 'frozen feature augmentation\n(FroFA)', covering twenty augmentations in total. Our study demonstrates that\nadopting a deceptively simple pointwise FroFA, such as brightness, can improve\nfew-shot performance consistently across three network architectures, three\nlarge pretraining datasets, and eight transfer datasets.\n","authors":["Andreas Bär","Neil Houlsby","Mostafa Dehghani","Manoj Kumar"],"pdf_url":"https://arxiv.org/pdf/2403.10519v2.pdf","comment":"CVPR 2024 (18 pages, main paper + supplementary material)"},{"id":"http://arxiv.org/abs/2407.17272v2","updated":"2024-07-26T07:40:47Z","published":"2024-07-24T13:39:07Z","title":"DenseTrack: Drone-based Crowd Tracking via Density-aware\n Motion-appearance Synergy","summary":" Drone-based crowd tracking faces difficulties in accurately identifying and\nmonitoring objects from an aerial perspective, largely due to their small size\nand close proximity to each other, which complicates both localization and\ntracking. To address these challenges, we present the Density-aware Tracking\n(DenseTrack) framework. DenseTrack capitalizes on crowd counting to precisely\ndetermine object locations, blending visual and motion cues to improve the\ntracking of small-scale objects. It specifically addresses the problem of\ncross-frame motion to enhance tracking accuracy and dependability. DenseTrack\nemploys crowd density estimates as anchors for exact object localization within\nvideo frames. These estimates are merged with motion and position information\nfrom the tracking network, with motion offsets serving as key tracking cues.\nMoreover, DenseTrack enhances the ability to distinguish small-scale objects\nusing insights from the visual-language model, integrating appearance with\nmotion cues. The framework utilizes the Hungarian algorithm to ensure the\naccurate matching of individuals across frames. Demonstrated on the DroneCrowd\ndataset, our approach exhibits superior performance, confirming its\neffectiveness in scenarios captured by drones.\n","authors":["Yi Lei","Huilin Zhu","Jingling Yuan","Guangli Xiang","Xian Zhong","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2407.17272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18559v1","updated":"2024-07-26T07:16:52Z","published":"2024-07-26T07:16:52Z","title":"VSSD: Vision Mamba with Non-Causal State Space Duality","summary":" Vision transformers have significantly advanced the field of computer vision,\noffering robust modeling capabilities and global receptive field. However,\ntheir high computational demands limit their applicability in processing long\nsequences. To tackle this issue, State Space Models (SSMs) have gained\nprominence in vision tasks as they offer linear computational complexity.\nRecently, State Space Duality (SSD), an improved variant of SSMs, was\nintroduced in Mamba2 to enhance model performance and efficiency. However, the\ninherent causal nature of SSD/SSMs restricts their applications in non-causal\nvision tasks. To address this limitation, we introduce the Visual State Space\nDuality (VSSD) model, which has a non-causal format of SSD. Specifically, we\npropose to discard the magnitude of interactions between the hidden state and\ntokens while preserving their relative weights, which relieves the dependencies\nof token contribution on previous tokens. 
Together with the involvement of\nmulti-scan strategies, we show that the scanning results can be integrated to\nachieve non-causality, which not only improves the performance of SSD in vision\ntasks but also enhances its efficiency. We conduct extensive experiments on\nvarious benchmarks including image classification, detection, and segmentation,\nwhere VSSD surpasses existing state-of-the-art SSM-based models. Code and\nweights are available at \\url{https://github.com/YuHengsss/VSSD}.\n","authors":["Yuheng Shi","Minjing Dong","Mingjia Li","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2407.18559v1.pdf","comment":"16 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2407.18555v1","updated":"2024-07-26T07:08:05Z","published":"2024-07-26T07:08:05Z","title":"How To Segment in 3D Using 2D Models: Automated 3D Segmentation of\n Prostate Cancer Metastatic Lesions on PET Volumes Using Multi-Angle Maximum\n Intensity Projections and Diffusion Models","summary":" Prostate specific membrane antigen (PSMA) positron emission\ntomography/computed tomography (PET/CT) imaging provides a tremendously\nexciting frontier in visualization of prostate cancer (PCa) metastatic lesions.\nHowever, accurate segmentation of metastatic lesions is challenging due to low\nsignal-to-noise ratios and variable sizes, shapes, and locations of the\nlesions. This study proposes a novel approach for automated segmentation of\nmetastatic lesions in PSMA PET/CT 3D volumetric images using 2D denoising\ndiffusion probabilistic models (DDPMs). Instead of 2D trans-axial slices or 3D\nvolumes, the proposed approach segments the lesions on generated multi-angle\nmaximum intensity projections (MA-MIPs) of the PSMA PET images, then obtains\nthe final 3D segmentation masks from 3D ordered subset expectation maximization\n(OSEM) reconstruction of 2D MA-MIPs segmentations. Our proposed method achieved\nsuperior performance compared to state-of-the-art 3D segmentation approaches in\nterms of accuracy and robustness in detecting and segmenting small metastatic\nPCa lesions. The proposed method has significant potential as a tool for\nquantitative analysis of metastatic burden in PCa patients.\n","authors":["Amirhosein Toosi","Sara Harsini","François Bénard","Carlos Uribe","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2407.18555v1.pdf","comment":"11 pages, 2 figures, accepted in the DGM4MICCAI workshop, MICCAI,\n 2024"},{"id":"http://arxiv.org/abs/2407.18554v1","updated":"2024-07-26T07:06:42Z","published":"2024-07-26T07:06:42Z","title":"Skin Cancer Detection utilizing Deep Learning: Classification of Skin\n Lesion Images using a Vision Transformer","summary":" Skin cancer detection still represents a major challenge in healthcare.\nCommon detection methods can be lengthy and require human assistance which\nfalls short in many countries. Previous research demonstrates how convolutional\nneural networks (CNNs) can help effectively through both automation and an\naccuracy that is comparable to the human level. However, despite the progress\nin previous decades, the precision is still limited, leading to substantial\nmisclassifications that have a serious impact on people's health. Hence, we\nemploy a Vision Transformer (ViT) that has been developed in recent years based\non the idea of a self-attention mechanism, specifically two configurations of a\npre-trained ViT. 
We generally find superior metrics for classifying skin\nlesions after comparing them to base models such as decision tree classifier\nand k-nearest neighbor (KNN) classifier, as well as to CNNs and less complex\nViTs. In particular, we attach greater importance to the performance of\nmelanoma, which is the most lethal type of skin cancer. The ViT-L32 model\nachieves an accuracy of 91.57% and a melanoma recall of 58.54%, while ViT-L16\nachieves an accuracy of 92.79% and a melanoma recall of 56.10%. This offers a\npotential tool for faster and more accurate diagnoses and an overall\nimprovement for the healthcare sector.\n","authors":["Carolin Flosdorf","Justin Engelker","Igor Keller","Nicolas Mohr"],"pdf_url":"https://arxiv.org/pdf/2407.18554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18552v1","updated":"2024-07-26T07:05:04Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Vigya Sharma","Santhosh Malarvannan"],"pdf_url":"https://arxiv.org/pdf/2407.18552v1.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2407.08265v3","updated":"2024-07-26T06:59:00Z","published":"2024-07-11T08:06:31Z","title":"Coordinate-Aware Thermal Infrared Tracking Via Natural Language Modeling","summary":" Thermal infrared (TIR) tracking is pivotal in computer vision tasks due to\nits all-weather imaging capability. Traditional tracking methods predominantly\nrely on hand-crafted features, and while deep learning has introduced\ncorrelation filtering techniques, these are often constrained by rudimentary\ncorrelation operations. Furthermore, transformer-based approaches tend to\noverlook temporal and coordinate information, which is critical for TIR\ntracking that lacks texture and color information. In this paper, to address\nthese issues, we apply natural language modeling to TIR tracking and propose a\ncoordinate-aware thermal infrared tracking model called NLMTrack, which\nenhances the utilization of coordinate and temporal information. 
NLMTrack\napplies an encoder that unifies feature extraction and feature fusion, which\nsimplifies the TIR tracking pipeline. To address the challenge of low detail\nand low contrast in TIR images, on the one hand, we design a multi-level\nprogressive fusion module that enhances the semantic representation and\nincorporates multi-scale features. On the other hand, the decoder combines the\nTIR features and the coordinate sequence features using a causal transformer to\ngenerate the target sequence step by step. Moreover, we explore an adaptive\nloss aimed at elevating tracking accuracy and a simple template update strategy\nto accommodate the target's appearance variations. Experiments show that\nNLMTrack achieves state-of-the-art performance on multiple benchmarks. The Code\nis publicly available at \\url{https://github.com/ELOESZHANG/NLMTrack}.\n","authors":["Miao Yan","Ping Zhang","Haofei Zhang","Ruqian Hao","Juanxiu Liu","Xiaoyang Wang","Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.08265v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17983v3","updated":"2024-07-26T06:46:19Z","published":"2024-02-28T01:56:00Z","title":"3MVRD: Multimodal Multi-task Multi-teacher Visually-Rich Form Document\n Understanding","summary":" This paper presents a groundbreaking multimodal, multi-task, multi-teacher\njoint-grained knowledge distillation model for visually-rich form document\nunderstanding. The model is designed to leverage insights from both\nfine-grained and coarse-grained levels by facilitating a nuanced correlation\nbetween token and entity representations, addressing the complexities inherent\nin form documents. Additionally, we introduce new inter-grained and\ncross-grained loss functions to further refine diverse multi-teacher knowledge\ndistillation transfer process, presenting distribution gaps and a harmonised\nunderstanding of form documents. Through a comprehensive evaluation across\npublicly available form document understanding datasets, our proposed model\nconsistently outperforms existing baselines, showcasing its efficacy in\nhandling the intricate structures and content of visually complex form\ndocuments.\n","authors":["Yihao Ding","Lorenzo Vaiani","Caren Han","Jean Lee","Paolo Garza","Josiah Poon","Luca Cagliero"],"pdf_url":"https://arxiv.org/pdf/2402.17983v3.pdf","comment":"Accepted at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2402.18695v2","updated":"2024-07-26T06:34:15Z","published":"2024-02-28T20:22:17Z","title":"Grounding Language Models for Visual Entity Recognition","summary":" We introduce AutoVER, an Autoregressive model for Visual Entity Recognition.\nOur model extends an autoregressive Multi-modal Large Language Model by\nemploying retrieval augmented constrained generation. It mitigates low\nperformance on out-of-domain entities while excelling in queries that require\nvisually-situated reasoning. Our method learns to distinguish similar entities\nwithin a vast label space by contrastively training on hard negative pairs in\nparallel with a sequence-to-sequence objective without an external retriever.\nDuring inference, a list of retrieved candidate answers explicitly guides\nlanguage generation by removing invalid decoding paths. The proposed method\nachieves significant improvements across different dataset splits in the\nrecently proposed Oven-Wiki benchmark. Accuracy on the Entity seen split rises\nfrom 32.7% to 61.5%. 
It also demonstrates superior performance on the unseen\nand query splits by a substantial double-digit margin.\n","authors":["Zilin Xiao","Ming Gong","Paola Cascante-Bonilla","Xingyao Zhang","Jie Wu","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2402.18695v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18534v1","updated":"2024-07-26T06:29:09Z","published":"2024-07-26T06:29:09Z","title":"Boosting Cross-Domain Point Classification via Distilling Relational\n Priors from 2D Transformers","summary":" Semantic pattern of an object point cloud is determined by its topological\nconfiguration of local geometries. Learning discriminative representations can\nbe challenging due to large shape variations of point sets in local regions and\nincomplete surface in a global perspective, which can be made even more severe\nin the context of unsupervised domain adaptation (UDA). In specific,\ntraditional 3D networks mainly focus on local geometric details and ignore the\ntopological structure between local geometries, which greatly limits their\ncross-domain generalization. Recently, the transformer-based models have\nachieved impressive performance gain in a range of image-based tasks,\nbenefiting from its strong generalization capability and scalability stemming\nfrom capturing long range correlation across local patches. Inspired by such\nsuccesses of visual transformers, we propose a novel Relational Priors\nDistillation (RPD) method to extract relational priors from the well-trained\ntransformers on massive images, which can significantly empower cross-domain\nrepresentations with consistent topological priors of objects. To this end, we\nestablish a parameter-frozen pre-trained transformer module shared between 2D\nteacher and 3D student models, complemented by an online knowledge distillation\nstrategy for semantically regularizing the 3D student model. Furthermore, we\nintroduce a novel self-supervised task centered on reconstructing masked point\ncloud patches using corresponding masked multi-view image features, thereby\nempowering the model with incorporating 3D geometric information. Experiments\non the PointDA-10 and the Sim-to-Real datasets verify that the proposed method\nconsistently achieves the state-of-the-art performance of UDA for point cloud\nclassification. The source code of this work is available at\nhttps://github.com/zou-longkun/RPD.git.\n","authors":["Longkun Zou","Wanru Zhu","Ke Chen","Lihua Guo","Kailing Guo","Kui Jia","Yaowei Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03425v6","updated":"2024-07-26T06:25:48Z","published":"2024-04-04T13:06:25Z","title":"ChangeMamba: Remote Sensing Change Detection With Spatiotemporal State\n Space Model","summary":" Convolutional neural networks (CNN) and Transformers have made impressive\nprogress in the field of remote sensing change detection (CD). However, both\narchitectures have inherent shortcomings: CNN are constrained by a limited\nreceptive field that may hinder their ability to capture broader spatial\ncontexts, while Transformers are computationally intensive, making them costly\nto train and deploy on large datasets. Recently, the Mamba architecture, based\non state space models, has shown remarkable performance in a series of natural\nlanguage processing tasks, which can effectively compensate for the\nshortcomings of the above two architectures. 
In this paper, we explore for the\nfirst time the potential of the Mamba architecture for remote sensing CD tasks.\nWe tailor the corresponding frameworks, called MambaBCD, MambaSCD, and\nMambaBDA, for binary change detection (BCD), semantic change detection (SCD),\nand building damage assessment (BDA), respectively. All three frameworks adopt\nthe cutting-edge Visual Mamba architecture as the encoder, which allows full\nlearning of global spatial contextual information from the input images. For\nthe change decoder, which is available in all three architectures, we propose\nthree spatio-temporal relationship modeling mechanisms, which can be naturally\ncombined with the Mamba architecture and fully utilize its attribute to achieve\nspatio-temporal interaction of multi-temporal features, thereby obtaining\naccurate change information. On five benchmark datasets, our proposed\nframeworks outperform current CNN- and Transformer-based approaches without\nusing any complex training strategies or tricks, fully demonstrating the\npotential of the Mamba architecture in CD tasks. Further experiments show that\nour architecture is quite robust to degraded data. The source code will be\navailable in https://github.com/ChenHongruixuan/MambaCD\n","authors":["Hongruixuan Chen","Jian Song","Chengxi Han","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2404.03425v6.pdf","comment":"Accepted by IEEE TGRS: https://ieeexplore.ieee.org/document/10565926"},{"id":"http://arxiv.org/abs/2401.09786v2","updated":"2024-07-26T06:17:59Z","published":"2024-01-18T08:10:34Z","title":"Adaptive Self-training Framework for Fine-grained Scene Graph Generation","summary":" Scene graph generation (SGG) models have suffered from inherent problems\nregarding the benchmark datasets such as the long-tailed predicate distribution\nand missing annotation problems. In this work, we aim to alleviate the\nlong-tailed problem of SGG by utilizing unannotated triplets. To this end, we\nintroduce a Self-Training framework for SGG (ST-SGG) that assigns pseudo-labels\nfor unannotated triplets based on which the SGG models are trained. While there\nhas been significant progress in self-training for image recognition, designing\na self-training framework for the SGG task is more challenging due to its\ninherent nature such as the semantic ambiguity and the long-tailed distribution\nof predicate classes. Hence, we propose a novel pseudo-labeling technique for\nSGG, called Class-specific Adaptive Thresholding with Momentum (CATM), which is\na model-agnostic framework that can be applied to any existing SGG models.\nFurthermore, we devise a graph structure learner (GSL) that is beneficial when\nadopting our proposed self-training framework to the state-of-the-art\nmessage-passing neural network (MPNN)-based SGG models. Our extensive\nexperiments verify the effectiveness of ST-SGG on various SGG models,\nparticularly in enhancing the performance on fine-grained predicate classes.\n","authors":["Kibum Kim","Kanghoon Yoon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2401.09786v2.pdf","comment":"9 pages; ICLR 2024"},{"id":"http://arxiv.org/abs/2407.16277v2","updated":"2024-07-26T06:00:08Z","published":"2024-07-23T08:29:49Z","title":"When, Where, and What? 
A Novel Benchmark for Accident Anticipation and\n Localization with Large Language Models","summary":" As autonomous driving systems increasingly become part of daily\ntransportation, the ability to accurately anticipate and mitigate potential\ntraffic accidents is paramount. Traditional accident anticipation models\nprimarily utilizing dashcam videos are adept at predicting when an accident may\noccur but fall short in localizing the incident and identifying involved\nentities. Addressing this gap, this study introduces a novel framework that\nintegrates Large Language Models (LLMs) to enhance predictive capabilities\nacross multiple dimensions--what, when, and where accidents might occur. We\ndevelop an innovative chain-based attention mechanism that dynamically adjusts\nto prioritize high-risk elements within complex driving scenes. This mechanism\nis complemented by a three-stage model that processes outputs from smaller\nmodels into detailed multimodal inputs for LLMs, thus enabling a more nuanced\nunderstanding of traffic dynamics. Empirical validation on the DAD, CCD, and\nA3D datasets demonstrates superior performance in Average Precision (AP) and\nMean Time-To-Accident (mTTA), establishing new benchmarks for accident\nprediction technology. Our approach not only advances the technological\nframework for autonomous driving safety but also enhances human-AI interaction,\nmaking predictive insights generated by autonomous systems more intuitive and\nactionable.\n","authors":["Haicheng Liao","Yongkang Li","Chengyue Wang","Yanchen Guan","KaHou Tam","Chunlin Tian","Li Li","Chengzhong Xu","Zhenning Li"],"pdf_url":"https://arxiv.org/pdf/2407.16277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18524v1","updated":"2024-07-26T05:56:18Z","published":"2024-07-26T05:56:18Z","title":"She Works, He Works: A Curious Exploration of Gender Bias in\n AI-Generated Imagery","summary":" This paper examines gender bias in AI-generated imagery of construction\nworkers, highlighting discrepancies in the portrayal of male and female\nfigures. Grounded in Griselda Pollock's theories on visual culture and gender,\nthe analysis reveals that AI models tend to sexualize female figures while\nportraying male figures as more authoritative and competent. These findings\nunderscore AI's potential to mirror and perpetuate societal biases, emphasizing\nthe need for critical engagement with AI-generated content. The project\ncontributes to discussions on the ethical implications of AI in creative\npractices and its broader impact on cultural perceptions of gender.\n","authors":["Amalia Foka"],"pdf_url":"https://arxiv.org/pdf/2407.18524v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.18520v1","updated":"2024-07-26T05:29:24Z","published":"2024-07-26T05:29:24Z","title":"Text-Region Matching for Multi-Label Image Recognition with Missing\n Labels","summary":" Recently, large-scale visual language pre-trained (VLP) models have\ndemonstrated impressive performance across various downstream tasks. Motivated\nby these advancements, pioneering efforts have emerged in multi-label image\nrecognition with missing labels, leveraging VLP prompt-tuning technology.\nHowever, they usually cannot match text and vision features well, due to\ncomplicated semantics gaps and missing labels in a multi-label image. 
To tackle\nthis challenge, we propose \textbf{T}ext-\textbf{R}egion \textbf{M}atching for\noptimizing \textbf{M}ulti-\textbf{L}abel prompt tuning, namely TRM-ML, a novel\nmethod for enhancing meaningful cross-modal matching. Compared to existing\nmethods, we advocate exploring the information of category-aware regions rather\nthan the entire image or pixels, which contributes to bridging the semantic gap\nbetween textual and visual representations in a one-to-one matching manner.\nConcurrently, we further introduce multimodal contrastive learning to narrow\nthe semantic gap between textual and visual modalities and establish\nintra-class and inter-class relationships. Additionally, to deal with missing\nlabels, we propose a multimodal category prototype that leverages intra- and\ninter-category semantic relationships to estimate unknown labels, facilitating\npseudo-label generation. Extensive experiments on the MS-COCO, PASCAL VOC,\nVisual Genome, NUS-WIDE, and CUB-200-2011 benchmark datasets demonstrate that\nour proposed framework outperforms the state-of-the-art methods by a\nsignificant margin. Our code is available\nhere\href{https://github.com/yu-gi-oh-leilei/TRM-ML}{\raisebox{-1pt}{\faGithub}}.\n","authors":["Leilei Ma","Hongxing Xie","Lei Wang","Yanping Fu","Dengdi Sun","Haifeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.18520v1.pdf","comment":"Accepted to ACM International Conference on Multimedia (ACM MM) 2024"},{"id":"http://arxiv.org/abs/2407.18500v1","updated":"2024-07-26T04:18:10Z","published":"2024-07-26T04:18:10Z","title":"Revisit Event Generation Model: Self-Supervised Learning of\n Event-to-Video Reconstruction with Implicit Neural Representations","summary":" Reconstructing intensity frames from event data while maintaining high\ntemporal resolution and dynamic range is crucial for bridging the gap between\nevent-based and frame-based computer vision. Previous approaches have depended\non supervised learning on synthetic data, which lacks interpretability and risks\nover-fitting to the setting of the event simulator. Recently, self-supervised\nlearning (SSL) based methods, which primarily utilize per-frame optical flow to\nestimate intensity via photometric constancy, have been actively investigated.\nHowever, they are vulnerable to errors in the case of inaccurate optical flow.\nThis paper proposes a novel SSL event-to-video reconstruction approach, dubbed\nEvINR, which eliminates the need for labeled data or optical flow estimation.\nOur core idea is to reconstruct intensity frames by directly addressing the\nevent generation model, essentially a partial differential equation (PDE) that\ndescribes how events are generated based on the time-varying brightness\nsignals. Specifically, we utilize an implicit neural representation (INR),\nwhich takes in spatiotemporal coordinate $(x, y, t)$ and predicts intensity\nvalues, to represent the solution of the event generation equation. The INR,\nparameterized as a fully-connected Multi-layer Perceptron (MLP), can be\noptimized with its temporal derivatives supervised by events. To make EvINR\nfeasible for online requisites, we propose several acceleration techniques that\nsubstantially expedite the training process. Comprehensive experiments\ndemonstrate that our EvINR surpasses previous SSL methods by 38% w.r.t. 
Mean\nSquared Error (MSE) and is comparable or superior to SoTA supervised methods.\nProject page: https://vlislab22.github.io/EvINR/.\n","authors":["Zipeng Wang","Yunfan Lu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18497v1","updated":"2024-07-26T04:02:46Z","published":"2024-07-26T04:02:46Z","title":"Answerability Fields: Answerable Location Estimation via Diffusion\n Models","summary":" In an era characterized by advancements in artificial intelligence and\nrobotics, enabling machines to interact with and understand their environment\nis a critical research endeavor. In this paper, we propose Answerability\nFields, a novel approach to predicting answerability within complex indoor\nenvironments. Leveraging a 3D question answering dataset, we construct a\ncomprehensive Answerability Fields dataset, encompassing diverse scenes and\nquestions from ScanNet. Using a diffusion model, we successfully infer and\nevaluate these Answerability Fields, demonstrating the importance of objects\nand their locations in answering questions within a scene. Our results showcase\nthe efficacy of Answerability Fields in guiding scene-understanding tasks,\nlaying the foundation for their application in enhancing interactions between\nintelligent agents and their environments.\n","authors":["Daichi Azuma","Taiki Miyanishi","Shuhei Kurita","Koya Sakamoto","Motoaki Kawanabe"],"pdf_url":"https://arxiv.org/pdf/2407.18497v1.pdf","comment":"IROS2024"},{"id":"http://arxiv.org/abs/2407.18492v1","updated":"2024-07-26T03:52:08Z","published":"2024-07-26T03:52:08Z","title":"Neural Modulation Alteration to Positive and Negative Emotions in\n Depressed Patients: Insights from fMRI Using Positive/Negative Emotion Atlas","summary":" Background: Although it has been noticed that depressed patients show\ndifferences in processing emotions, the precise neural modulation mechanisms of\npositive and negative emotions remain elusive. FMRI is a cutting-edge medical\nimaging technology renowned for its high spatial resolution and dynamic\ntemporal information, making it particularly suitable for the neural dynamics\nof depression research. Methods: To address this gap, our study firstly\nleveraged fMRI to delineate activated regions associated with positive and\nnegative emotions in healthy individuals, resulting in the creation of positive\nemotion atlas (PEA) and negative emotion atlas (NEA). Subsequently, we examined\nneuroimaging changes in depression patients using these atlases and evaluated\ntheir diagnostic performance based on machine learning. Results: Our findings\ndemonstrate that the classification accuracy of depressed patients based on PEA\nand NEA exceeded 0.70, a notable improvement compared to the whole-brain\natlases. Furthermore, ALFF analysis unveiled significant differences between\ndepressed patients and healthy controls in eight functional clusters during the\nNEA, focusing on the left cuneus, cingulate gyrus, and superior parietal\nlobule. In contrast, the PEA revealed more pronounced differences across\nfifteen clusters, involving the right fusiform gyrus, parahippocampal gyrus,\nand inferior parietal lobule. Limitations: Due to the limited sample size and\nsubtypes of depressed patients, the efficacy may need further validation in\nfuture. Conclusions: These findings emphasize the complex interplay between\nemotion modulation and depression, showcasing significant alterations in both\nPEA and NEA among depression patients. 
This research enhances our understanding\nof emotion modulation in depression, with implications for diagnosis and\ntreatment evaluation.\n","authors":["Yu Feng","Weiming Zeng","Yifan Xie","Hongyu Chen","Lei Wang","Yingying Wang","Hongjie Yan","Kaile Zhang","Ran Tao","Wai Ting Siok","Nizhuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18487v1","updated":"2024-07-26T03:38:29Z","published":"2024-07-26T03:38:29Z","title":"SMPISD-MTPNet: Scene Semantic Prior-Assisted Infrared Ship Detection\n Using Multi-Task Perception Networks","summary":" Infrared ship detection (IRSD) has received increasing attention in recent\nyears due to the robustness of infrared images to adverse weather. However, a\nlarge number of false alarms may occur in complex scenes. To address these\nchallenges, we propose the Scene Semantic Prior-Assisted Multi-Task Perception\nNetwork (SMPISD-MTPNet), which includes three stages: scene semantic\nextraction, deep feature extraction, and prediction. In the scene semantic\nextraction stage, we employ a Scene Semantic Extractor (SSE) to guide the\nnetwork by the features extracted based on expert knowledge. In the deep\nfeature extraction stage, a backbone network is employed to extract deep\nfeatures. These features are subsequently integrated by a fusion network,\nenhancing the detection capabilities across targets of varying sizes. In the\nprediction stage, we utilize the Multi-Task Perception Module, which includes\nthe Gradient-based Module and the Scene Segmentation Module, enabling precise\ndetection of small and dim targets within complex scenes. For the training\nprocess, we introduce the Soft Fine-tuning training strategy to suppress the\ndistortion caused by data augmentation. Besides, due to the lack of a publicly\navailable dataset labelled for scenes, we introduce the Infrared Ship Dataset\nwith Scene Segmentation (IRSDSS). Finally, we evaluate the network and compare\nit with state-of-the-art (SOTA) methods, indicating that SMPISD-MTPNet\noutperforms existing approaches. The source code and dataset for this research\ncan be accessed at https://github.com/greekinRoma/KMNDNet.\n","authors":["Chen Hu","Xiaogang Dong","Yian Huang","Lele Wang","Liang Xu","Tian Pu","Zhenming Peng"],"pdf_url":"https://arxiv.org/pdf/2407.18487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06581v5","updated":"2024-07-26T03:27:58Z","published":"2024-07-09T06:20:17Z","title":"Vision language models are blind","summary":" While large language models with vision capabilities (VLMs), e.g., GPT-4o and\nGemini 1.5 Pro, are powering various image-text applications and scoring high\non many vision-understanding benchmarks, we find that they are surprisingly\nstill struggling with low-level vision tasks that are easy for humans.\nSpecifically, on BlindTest, our suite of 7 very simple tasks such as\nidentifying (a) whether two circles overlap; (b) whether two lines intersect;\n(c) which letter is being circled in a word; and (d) counting circles in an\nOlympic-like logo, four state-of-the-art VLMs are only 58.57% accurate on\naverage. Claude 3.5 Sonnet performs the best at 74.94% accuracy, but this is\nstill far from the human expected accuracy of 100%. Across different image\nresolutions and line widths, VLMs consistently struggle with tasks that require\nprecise spatial information and recognizing geometric primitives that overlap\nor are close together. 
Code and data are available at:\nhttps://vlmsareblind.github.io\n","authors":["Pooyan Rahmanzadehgervi","Logan Bolton","Mohammad Reza Taesiri","Anh Totti Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.06581v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18849v2","updated":"2024-07-26T03:18:35Z","published":"2024-06-27T02:40:35Z","title":"Dysca: A Dynamic and Scalable Benchmark for Evaluating Perception\n Ability of LVLMs","summary":" Currently many benchmarks have been proposed to evaluate the perception\nability of the Large Vision-Language Models (LVLMs). However, most benchmarks\nconduct questions by selecting images from existing datasets, resulting in the\npotential data leakage. Besides, these benchmarks merely focus on evaluating\nLVLMs on the realistic style images and clean scenarios, leaving the\nmulti-stylized images and noisy scenarios unexplored. In response to these\nchallenges, we propose a dynamic and scalable benchmark named Dysca for\nevaluating LVLMs by leveraging synthesis images. Specifically, we leverage\nStable Diffusion and design a rule-based method to dynamically generate novel\nimages, questions and the corresponding answers. We consider 51 kinds of image\nstyles and evaluate the perception capability in 20 subtasks. Moreover, we\nconduct evaluations under 4 scenarios (i.e., Clean, Corruption, Print Attacking\nand Adversarial Attacking) and 3 question types (i.e., Multi-choices,\nTrue-or-false and Free-form). Thanks to the generative paradigm, Dysca serves\nas a scalable benchmark for easily adding new subtasks and scenarios. A total\nof 8 advanced open-source LVLMs with 10 checkpoints are evaluated on Dysca,\nrevealing the drawbacks of current LVLMs. The benchmark is released in\n\\url{https://github.com/Benchmark-Dysca/Dysca}.\n","authors":["Jie Zhang","Zhongqi Wang","Mengqi Lei","Zheng Yuan","Bei Yan","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2406.18849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05231v2","updated":"2024-07-26T03:15:41Z","published":"2024-03-08T11:41:48Z","title":"Tracking Meets LoRA: Faster Training, Larger Model, Stronger Performance","summary":" Motivated by the Parameter-Efficient Fine-Tuning (PEFT) in large language\nmodels, we propose LoRAT, a method that unveils the power of large ViT model\nfor tracking within laboratory-level resources. The essence of our work lies in\nadapting LoRA, a technique that fine-tunes a small subset of model parameters\nwithout adding inference latency, to the domain of visual tracking. However,\nunique challenges and potential domain gaps make this transfer not as easy as\nthe first intuition. Firstly, a transformer-based tracker constructs unshared\nposition embedding for template and search image. This poses a challenge for\nthe transfer of LoRA, usually requiring consistency in the design when applied\nto the pre-trained backbone, to downstream tasks. Secondly, the inductive bias\ninherent in convolutional heads diminishes the effectiveness of\nparameter-efficient fine-tuning in tracking models. To overcome these\nlimitations, we first decouple the position embeddings in transformer-based\ntrackers into shared spatial ones and independent type ones. The shared\nembeddings, which describe the absolute coordinates of multi-resolution images\n(namely, the template and search images), are inherited from the pre-trained\nbackbones. In contrast, the independent embeddings indicate the sources of each\ntoken and are learned from scratch. 
Furthermore, we design an anchor-free head\nsolely based on MLP to adapt PETR, enabling better performance with less\ncomputational overhead. With our design, 1) it becomes practical to train\ntrackers with the ViT-g backbone on GPUs with only 25.8GB of memory (batch size\nof 16); 2) we reduce the training time of the L-224 variant from 35.0 to 10.8\nGPU hours; 3) we improve the LaSOT SUC score from 0.703 to 0.742 with the L-224\nvariant; 4) we increase the inference speed of the L-224 variant from 52 to 119\nFPS. Code and models are available at https://github.com/LitingLin/LoRAT.\n","authors":["Liting Lin","Heng Fan","Zhipeng Zhang","Yaowei Wang","Yong Xu","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2403.05231v2.pdf","comment":"Accepted by ECCV 2024. All experiment results are updated"},{"id":"http://arxiv.org/abs/2403.05021v3","updated":"2024-07-26T02:58:28Z","published":"2024-03-08T03:54:22Z","title":"Beyond MOT: Semantic Multi-Object Tracking","summary":" Current multi-object tracking (MOT) aims to predict trajectories of targets\n(i.e.,\"where\") in videos. Yet, knowing merely \"where\" is insufficient in many\ncrucial applications. In comparison, semantic understanding such as\nfine-grained behaviors, interactions, and overall summarized captions (i.e.,\n\"what\") from videos, associated with \"where\", is highly-desired for\ncomprehensive video analysis. Thus motivated, we introduce Semantic\nMulti-Object Tracking (SMOT), which aims to estimate object trajectories and\nmeanwhile understand semantic details of associated trajectories including\ninstance captions, instance interactions, and overall video captions,\nintegrating \"where\" and \"what\" for tracking. In order to foster the exploration\nof SMOT, we propose BenSMOT, a large-scale Benchmark for Semantic MOT.\nSpecifically, BenSMOT comprises 3,292 videos with 151K frames, covering various\nscenarios for semantic tracking of humans. BenSMOT provides annotations for the\ntrajectories of targets, along with associated instance captions in natural\nlanguage, instance interactions, and overall caption for each video sequence.\nTo the best of our knowledge, BenSMOT is the first publicly available benchmark for\nSMOT. Besides, to encourage future research, we present a novel tracker named\nSMOTer, which is specially designed and end-to-end trained for SMOT, showing\npromising performance. By releasing BenSMOT, we expect to go beyond\nconventional MOT by predicting \"where\" and \"what\" for SMOT, opening up a new\ndirection in tracking for video understanding. Our BenSMOT and SMOTer will be\nreleased.\n","authors":["Yunhao Li","Qin Li","Hao Wang","Xue Ma","Jiali Yao","Shaohua Dong","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05021v3.pdf","comment":"Accepted to ECCV2024"},{"id":"http://arxiv.org/abs/2211.06841v4","updated":"2024-07-26T02:51:08Z","published":"2022-11-13T08:02:03Z","title":"Point-DAE: Denoising Autoencoders for Self-supervised Point Cloud\n Learning","summary":" Masked autoencoder has demonstrated its effectiveness in self-supervised\npoint cloud learning. Considering that masking is a kind of corruption, in this\nwork we explore a more general denoising autoencoder for point cloud learning\n(Point-DAE) by investigating more types of corruptions beyond masking.\nSpecifically, we degrade the point cloud with certain corruptions as input, and\nlearn an encoder-decoder model to reconstruct the original point cloud from its\ncorrupted version. 
Three corruption families (\\ie, density/masking, noise, and\naffine transformation) and a total of fourteen corruption types are\ninvestigated with traditional non-Transformer encoders. Besides the popular\nmasking corruption, we identify another effective corruption family, \\ie,\naffine transformation. The affine transformation disturbs all points globally,\nwhich is complementary to the masking corruption where some local regions are\ndropped. We also validate the effectiveness of affine transformation corruption\nwith the Transformer backbones, where we decompose the reconstruction of the\ncomplete point cloud into the reconstructions of detailed local patches and\nrough global shape, alleviating the position leakage problem in the\nreconstruction. Extensive experiments on tasks of object classification,\nfew-shot learning, robustness testing, part segmentation, and 3D object\ndetection validate the effectiveness of the proposed method. The codes are\navailable at \\url{https://github.com/YBZh/Point-DAE}.\n","authors":["Yabin Zhang","Jiehong Lin","Ruihuang Li","Kui Jia","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.06841v4.pdf","comment":"Journal revision; Codes are available at\n \\url{https://github.com/YBZh/Point-DAE}"},{"id":"http://arxiv.org/abs/2404.13671v2","updated":"2024-07-26T02:42:21Z","published":"2024-04-21T14:22:04Z","title":"FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and\n High-Quality Localization","summary":" Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies\ndirectly without access to any known normal or abnormal samples within the\ntarget item categories. Existing approaches typically rely on the robust\ngeneralization capabilities of multimodal pretrained models, computing\nsimilarities between manually crafted textual features representing \"normal\" or\n\"abnormal\" semantics and image features to detect anomalies and localize\nanomalous patches. However, the generic descriptions of \"abnormal\" often fail\nto precisely match diverse types of anomalies across different object\ncategories. Additionally, computing feature similarities for single patches\nstruggles to pinpoint specific locations of anomalies with various sizes and\nscales. To address these issues, we propose a novel ZSAD method called FiLo,\ncomprising two components: adaptively learned Fine-Grained Description (FG-Des)\nand position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces\nfine-grained anomaly descriptions for each category using Large Language Models\n(LLMs) and employs adaptively learned textual templates to enhance the accuracy\nand interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for\npreliminary localization, position-enhanced text prompts, and Multi-scale\nMulti-shape Cross-modal Interaction (MMCI) module, facilitates more accurate\nlocalization of anomalies of different sizes and shapes. Experimental results\non datasets like MVTec and VisA demonstrate that FiLo significantly improves\nthe performance of ZSAD in both detection and localization, achieving\nstate-of-the-art performance with an image-level AUC of 83.9% and a pixel-level\nAUC of 95.9% on the VisA dataset. 
Code is available at\nhttps://github.com/CASIA-IVA-Lab/FiLo.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Hao Li","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13671v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.18466v1","updated":"2024-07-26T02:25:45Z","published":"2024-07-26T02:25:45Z","title":"A Progressive Single-Modality to Multi-Modality Classification Framework\n for Alzheimer's Disease Sub-type Diagnosis","summary":" The current clinical diagnosis framework of Alzheimer's disease (AD) involves\nmultiple modalities acquired from multiple diagnosis stages, each with distinct\nusage and cost. Previous AD diagnosis research has predominantly focused on how\nto directly fuse multiple modalities for an end-to-end one-stage diagnosis,\nwhich practically requires a high cost in data acquisition. Moreover, a\nsignificant part of these methods diagnose AD without considering clinical\nguideline and cannot offer accurate sub-type diagnosis. In this paper, by\nexploring inter-correlation among multiple modalities, we propose a novel\nprogressive AD sub-type diagnosis framework, aiming to give diagnosis results\nbased on easier-to-access modalities in earlier low-cost stages, instead of\nmodalities from all stages. Specifically, first, we design 1) a text\ndisentanglement network for better processing tabular data collected in the\ninitial stage, and 2) a modality fusion module for fusing multi-modality\nfeatures separately. Second, we align features from modalities acquired in\nearlier low-cost stage(s) with later high-cost stage(s) to give accurate\ndiagnosis without actual modality acquisition in later-stage(s) for saving\ncost. Furthermore, we follow the clinical guideline to align features at each\nstage for achieving sub-type diagnosis. Third, we leverage a progressive\nclassifier that can progressively include additional acquired modalities (if\nneeded) for diagnosis, to achieve the balance between diagnosis cost and\ndiagnosis performance. We evaluate our proposed framework on large diverse\npublic and in-home datasets (8280 in total) and achieve superior performance\nover state-of-the-art methods. Our codes will be released after the acceptance.\n","authors":["Yuxiao Liu","Mianxin Liu","Yuanwang Zhang","Kaicong Sun","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2407.18466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16945v2","updated":"2024-07-26T02:24:11Z","published":"2024-07-24T02:24:21Z","title":"Affective Behaviour Analysis via Progressive Learning","summary":" Affective Behavior Analysis aims to develop emotionally intelligent\ntechnology that can recognize and respond to human emotions. To advance this,\nthe 7th Affective Behavior Analysis in-the-wild (ABAW) competition establishes\ntwo tracks: i.e., the Multi-task Learning (MTL) Challenge and the Compound\nExpression (CE) challenge based on Aff-Wild2 and C-EXPR-DB datasets. In this\npaper, we present our methods and experimental results for the two competition\ntracks. Specifically, it can be summarized in the following four aspects: 1) To\nattain high-quality facial features, we train a Masked-Auto Encoder in a\nself-supervised manner. 2) We devise a temporal convergence module to capture\nthe temporal information between video frames and explore the impact of window\nsize and sequence length on each sub-task. 
3) To facilitate the joint\noptimization of various sub-tasks, we explore the impact of sub-task joint\ntraining and feature fusion from individual tasks on each task performance\nimprovement. 4) We utilize curriculum learning to transition the model from\nrecognizing single expressions to recognizing compound expressions, thereby\nimproving the accuracy of compound expression recognition. Extensive\nexperiments demonstrate the superiority of our designs.\n","authors":["Chen Liu","Wei Zhang","Feng Qiu","Lincheng Li","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16945v2.pdf","comment":"Techical Report for 7th ABAW Competition"},{"id":"http://arxiv.org/abs/2405.20584v2","updated":"2024-07-26T02:10:04Z","published":"2024-05-31T02:45:31Z","title":"Disrupting Diffusion: Token-Level Attention Erasure Attack against\n Diffusion-based Customization","summary":" With the development of diffusion-based customization methods like\nDreamBooth, individuals now have access to train the models that can generate\ntheir personalized images. Despite the convenience, malicious users have\nmisused these techniques to create fake images, thereby triggering a privacy\nsecurity crisis. In light of this, proactive adversarial attacks are proposed\nto protect users against customization. The adversarial examples are trained to\ndistort the customization model's outputs and thus block the misuse. In this\npaper, we propose DisDiff (Disrupting Diffusion), a novel adversarial attack\nmethod to disrupt the diffusion model outputs. We first delve into the\nintrinsic image-text relationships, well-known as cross-attention, and\nempirically find that the subject-identifier token plays an important role in\nguiding image generation. Thus, we propose the Cross-Attention Erasure module\nto explicitly \"erase\" the indicated attention maps and disrupt the text\nguidance. Besides,we analyze the influence of the sampling process of the\ndiffusion model on Projected Gradient Descent (PGD) attack and introduce a\nnovel Merit Sampling Scheduler to adaptively modulate the perturbation updating\namplitude in a step-aware manner. Our DisDiff outperforms the state-of-the-art\nmethods by 12.75% of FDFR scores and 7.25% of ISM scores across two facial\nbenchmarks and two commonly used prompts on average.\n","authors":["Yisu Liu","Jinyang An","Wanqian Zhang","Dayan Wu","Jingzi Gu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20584v2.pdf","comment":"Accepted by ACM MM2024"},{"id":"http://arxiv.org/abs/2407.18456v1","updated":"2024-07-26T01:42:31Z","published":"2024-07-26T01:42:31Z","title":"Lensless fiber endomicroscopic phase imaging with speckle-conditioned\n diffusion model","summary":" Lensless fiber endomicroscope is an emerging tool for in-vivo microscopic\nimaging, where quantitative phase imaging (QPI) can be utilized as a label-free\nmethod to enhance image contrast. However, existing single-shot phase\nreconstruction methods through lensless fiber endomicroscope typically perform\nwell on simple images but struggle with complex microscopic structures. Here,\nwe propose a speckle-conditioned diffusion model (SpecDiffusion), which\nreconstructs phase images directly from speckles captured at the detection side\nof a multi-core fiber (MCF). Unlike conventional neural networks, SpecDiffusion\nemploys iterative phase denoising steps for speckle-driven phase\nreconstruction. 
The iteration scheme allows SpecDiffusion to break down the\nphase reconstruction process into multiple steps, gradually building up to the\nfinal phase image. This attribute alleviates the computation challenge at each\nstep and enables the reconstruction of rich details in complex microscopic\nimages. To validate its efficacy, we build an optical system to capture\nspeckles from MCF and construct a dataset consisting of 100,000 paired images.\nSpecDiffusion provides high-fidelity phase reconstruction results and shows\npowerful generalization capacity for unseen objects, such as test charts and\nbiological tissues, reducing the average mean absolute error of the\nreconstructed tissue images by 7 times. Furthermore, the reconstructed tissue\nimages using SpecDiffusion shows higher accuracy in zero-shot cell segmentation\ntasks compared to the conventional method, demonstrating the potential for\nfurther cell morphology analysis through the learning-based lensless fiber\nendomicroscope. SpecDiffusion offers a precise and generalized method to phase\nreconstruction through scattering media, including MCFs, opening new\nperspective in lensless fiber endomicroscopic imaging.\n","authors":["Zhaoqing Chen","Jiawei Sun","Xinyi Ye","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2407.18456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03613v5","updated":"2024-07-26T01:24:23Z","published":"2024-04-04T17:34:41Z","title":"Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian\n Splatting","summary":" As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view\nsynthesis, it is a natural extension to deform a canonical 3DGS to multiple\nframes for representing a dynamic scene. However, previous works fail to\naccurately reconstruct complex dynamic scenes. We attribute the failure to the\ndesign of the deformation field, which is built as a coordinate-based function.\nThis approach is problematic because 3DGS is a mixture of multiple fields\ncentered at the Gaussians, not just a single coordinate-based framework. To\nresolve this problem, we define the deformation as a function of per-Gaussian\nembeddings and temporal embeddings. Moreover, we decompose deformations as\ncoarse and fine deformations to model slow and fast movements, respectively.\nAlso, we introduce a local smoothness regularization for per-Gaussian embedding\nto improve the details in dynamic regions. Project page:\nhttps://jeongminb.github.io/e-d3dgs/\n","authors":["Jeongmin Bae","Seoha Kim","Youngsik Yun","Hahyun Lee","Gun Bang","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2404.03613v5.pdf","comment":"ECCV 2024. Project page: https://jeongminb.github.io/e-d3dgs/"},{"id":"http://arxiv.org/abs/2407.14651v2","updated":"2024-07-26T01:19:27Z","published":"2024-07-19T20:05:10Z","title":"Improving Representation of High-frequency Components for Medical\n Foundation Models","summary":" Foundation models have recently attracted significant attention for their\nimpressive generalizability across diverse downstream tasks. However, these\nmodels are demonstrated to exhibit great limitations in representing\nhigh-frequency components and fine-grained details. In many medical imaging\ntasks, the precise representation of such information is crucial due to the\ninherently intricate anatomical structures, sub-visual features, and complex\nboundaries involved. 
Consequently, the limited representation of prevalent\nfoundation models can result in significant performance degradation or even\nfailure in these tasks. To address these challenges, we propose a novel\npretraining strategy, named Frequency-advanced Representation Autoencoder\n(Frepa). Through high-frequency masking and low-frequency perturbation combined\nwith adversarial learning, Frepa encourages the encoder to effectively\nrepresent and preserve high-frequency components in the image embeddings.\nAdditionally, we introduce an innovative histogram-equalized image masking\nstrategy, extending the Masked Autoencoder approach beyond ViT to other\narchitectures such as Swin Transformer and convolutional networks. We develop\nFrepa across nine medical modalities and validate it on 32 downstream tasks for\nboth 2D images and 3D volume data. Without fine-tuning, Frepa can outperform\nother self-supervised pretraining methods and, in some cases, even surpasses\ntask-specific trained models. This improvement is particularly significant for\ntasks involving fine-grained details, such as achieving up to a +15% increase\nin DSC for retina vessel segmentation and a +7% increase in IoU for lung nodule\ndetection. Further experiments quantitatively reveal that Frepa enables\nsuperior high-frequency representations and preservation in the embeddings,\nunderscoring its potential for developing more generalized and universal\nmedical image foundation models.\n","authors":["Yuetan Chu","Yilan Zhang","Zhongyi Han","Changchun Yang","Longxi Zhou","Gongning Luo","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2407.14651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17792v2","updated":"2024-07-26T01:16:07Z","published":"2024-07-25T06:03:02Z","title":"Harnessing Temporal Causality for Advanced Temporal Action Detection","summary":" As a fundamental task in long-form video understanding, temporal action\ndetection (TAD) aims to capture inherent temporal relations in untrimmed videos\nand identify candidate actions with precise boundaries. Over the years, various\nnetworks, including convolutions, graphs, and transformers, have been explored\nfor effective temporal modeling for TAD. However, these modules typically treat\npast and future information equally, overlooking the crucial fact that changes\nin action boundaries are essentially causal events. Inspired by this insight,\nwe propose leveraging the temporal causality of actions to enhance TAD\nrepresentation by restricting the model's access to only past or future\ncontext. We introduce CausalTAD, which combines causal attention and causal\nMamba to achieve state-of-the-art performance on multiple benchmarks. Notably,\nwith CausalTAD, we ranked 1st in the Action Recognition, Action Detection, and\nAudio-Based Interaction Detection tracks at the EPIC-Kitchens Challenge 2024,\nas well as 1st in the Moment Queries track at the Ego4D Challenge 2024. 
Our\ncode is available at https://github.com/sming256/OpenTAD/.\n","authors":["Shuming Liu","Lin Sui","Chen-Lin Zhang","Fangzhou Mu","Chen Zhao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2407.17792v2.pdf","comment":"1st in Moment Queries track at the Ego4D Challenge 2024; 1st in\n Action Recognition, Action Detection, and Audio-Based Interaction Detection\n tracks at the EPIC-Kitchens Challenge 2024"},{"id":"http://arxiv.org/abs/2407.18450v1","updated":"2024-07-26T01:13:59Z","published":"2024-07-26T01:13:59Z","title":"Textile Anomaly Detection: Evaluation of the State-of-the-Art for\n Automated Quality Inspection of Carpet","summary":" In this study, state-of-the-art unsupervised detection models were evaluated\nfor the purpose of automated anomaly inspection of wool carpets. A custom\ndataset of four unique types of carpet textures was created to thoroughly test\nthe models and their robustness in detecting subtle anomalies in complex\ntextures. Due to the requirements of an inline inspection system in a\nmanufacturing use case, the metrics of importance in this study were accuracy\nin detecting anomalous areas, the number of false detections, and the inference\ntimes of each model for real-time performance. Of the evaluated models, the\nstudent-teacher network based methods were found on average to yield the\nhighest detection accuracy and lowest false detection rates. When trained on a\nmulti-class dataset the models were found to yield comparable if not better\nresults than single-class training. Finally, in terms of detection speed, with\nexception to the generative model, all other evaluated models were found to\nhave comparable inference times on a GPU, with an average of 0.16s per image.\nOn a CPU, most of these models typically produced results between 1.5 to 2\ntimes the respective GPU inference times.\n","authors":["Briony Forsberg","Dr Henry Williams","Prof Bruce MacDonald","Tracy Chen","Dr Kirstine Hulse"],"pdf_url":"https://arxiv.org/pdf/2407.18450v1.pdf","comment":"Accepted at the 2023 Australasian Conference on Robotics and\n Automation (ACRA 2023) Publication url\n https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184380272&partnerID=40&md5=74fde263f4a24a1bff75d6560b423994\n ISSN: 14482053 Contains 10 pages and three figures"},{"id":"http://arxiv.org/abs/2407.18449v1","updated":"2024-07-26T01:12:54Z","published":"2024-07-26T01:12:54Z","title":"Towards A Generalizable Pathology Foundation Model via Unified Knowledge\n Distillation","summary":" Foundation models pretrained on large-scale datasets are revolutionizing the\nfield of computational pathology (CPath). The generalization ability of\nfoundation models is crucial for the success in various downstream clinical\ntasks. However, current foundation models have only been evaluated on a limited\ntype and number of tasks, leaving their generalization ability and overall\nperformance unclear. To address this gap, we established a most comprehensive\nbenchmark to evaluate the performance of off-the-shelf foundation models across\nsix distinct clinical task types, encompassing a total of 39 specific tasks.\nOur findings reveal that existing foundation models excel at certain task types\nbut struggle to effectively handle the full breadth of clinical tasks. 
To\nimprove the generalization of pathology foundation models, we propose a unified\nknowledge distillation framework consisting of both expert and self knowledge\ndistillation, where the former allows the model to learn from the knowledge of\nmultiple expert models, while the latter leverages self-distillation to enable\nimage representation learning via local-global alignment. Based on this\nframework, a Generalizable Pathology Foundation Model (GPFM) is pretrained on a\nlarge-scale dataset consisting of 190 million images from around 86,000 public\nH\\&E whole slides across 34 major tissue types. Evaluated on the established\nbenchmark, GPFM achieves an impressive average rank of 1.36, with 29 tasks\nranked 1st, while the second-best model, UNI, attains an average rank of\n2.96, with only 4 tasks ranked 1st. The superior generalization of GPFM\ndemonstrates its exceptional modeling capabilities across a wide range of\nclinical tasks, positioning it as a new cornerstone for feature representation\nin CPath.\n","authors":["Jiabo Ma","Zhengrui Guo","Fengtao Zhou","Yihui Wang","Yingxue Xu","Yu Cai","Zhengjie Zhu","Cheng Jin","Yi Lin Xinrui Jiang","Anjia Han","Li Liang","Ronald Cheong Kin Chan","Jiguang Wang","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06174v3","updated":"2024-07-26T01:09:35Z","published":"2024-07-08T17:49:41Z","title":"The Tug-of-War Between Deepfake Generation and Detection","summary":" Multimodal generative models are rapidly evolving, leading to a surge in the\ngeneration of realistic video and audio that offers exciting possibilities but\nalso serious risks. Deepfake videos, which can convincingly impersonate\nindividuals, have particularly garnered attention due to their potential misuse\nin spreading misinformation and creating fraudulent content. This survey paper\nexamines the dual landscape of deepfake video generation and detection,\nemphasizing the need for effective countermeasures against potential abuses. We\nprovide a comprehensive overview of current deepfake generation techniques,\nincluding face swapping, reenactment, and audio-driven animation, which\nleverage cutting-edge technologies like GANs and diffusion models to produce\nhighly realistic fake videos. Additionally, we analyze various detection\napproaches designed to differentiate authentic from altered videos, from\ndetecting visual artifacts to deploying advanced algorithms that pinpoint\ninconsistencies across video and audio signals.\n The effectiveness of these detection methods heavily relies on the diversity\nand quality of datasets used for training and evaluation. We discuss the\nevolution of deepfake datasets, highlighting the importance of robust, diverse,\nand frequently updated collections to enhance the detection accuracy and\ngeneralizability. As deepfakes become increasingly indistinguishable from\nauthentic content, developing advanced detection techniques that can keep pace\nwith generation technologies is crucial. 
We advocate for a proactive approach\nin the \"tug-of-war\" between deepfake creators and detectors, emphasizing the\nneed for continuous research collaboration, standardization of evaluation\nmetrics, and the creation of comprehensive benchmarks.\n","authors":["Hannah Lee","Changyeon Lee","Kevin Farhat","Lin Qiu","Steve Geluso","Aerin Kim","Oren Etzioni"],"pdf_url":"https://arxiv.org/pdf/2407.06174v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18443v1","updated":"2024-07-26T00:51:52Z","published":"2024-07-26T00:51:52Z","title":"HybridDepth: Robust Depth Fusion for Mobile AR by Leveraging Depth from\n Focus and Single-Image Priors","summary":" We propose HYBRIDDEPTH, a robust depth estimation pipeline that addresses the\nunique challenges of depth estimation for mobile AR, such as scale ambiguity,\nhardware heterogeneity, and generalizability. HYBRIDDEPTH leverages the camera\nfeatures available on mobile devices. It effectively combines the scale\naccuracy inherent in Depth from Focus (DFF) methods with the generalization\ncapabilities enabled by strong single-image depth priors. By utilizing the\nfocal planes of a mobile camera, our approach accurately captures depth values\nfrom focused pixels and applies these values to compute scale and shift\nparameters for transforming relative depths into metric depths. We test our\npipeline as an end-to-end system, with a newly developed mobile client to\ncapture focal stacks, which are then sent to a GPU-powered server for depth\nestimation.\n Through comprehensive quantitative and qualitative analyses, we demonstrate\nthat HYBRIDDEPTH not only outperforms state-of-the-art (SOTA) models in common\ndatasets (DDFF12, NYU Depth v2) and a real-world AR dataset ARKitScenes but\nalso demonstrates strong zero-shot generalization. For example, HYBRIDDEPTH\ntrained on NYU Depth v2 achieves comparable performance on the DDFF12 to\nexisting models trained on DDFF12. It also outperforms all the SOTA models in\nzero-shot performance on the ARKitScenes dataset. Additionally, we conduct a\nqualitative comparison between our model and the ARCore framework,\ndemonstrating that our model's output depth maps are significantly more accurate\nin terms of structural details and metric accuracy. The source code of this\nproject is available at github.\n","authors":["Ashkan Ganj","Hang Su","Tian Guo"],"pdf_url":"https://arxiv.org/pdf/2407.18443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18437v1","updated":"2024-07-26T00:19:01Z","published":"2024-07-26T00:19:01Z","title":"Mixed Non-linear Quantization for Vision Transformers","summary":" The majority of quantization methods have been proposed to reduce the model\nsize of Vision Transformers, yet most of them have overlooked the quantization\nof non-linear operations. Only a few works have addressed quantization for\nnon-linear operations, but they applied a single quantization method across all\nnon-linear operations. We believe that this can be further improved by\nemploying a different quantization method for each non-linear operation.\nTherefore, to assign the most error-minimizing quantization method from the\nknown methods to each non-linear layer, we propose a mixed non-linear\nquantization that considers layer-wise quantization sensitivity measured by\nSQNR difference metric. The results show that our method outperforms I-BERT,\nFQ-ViT, and I-ViT in both 8-bit and 6-bit settings for ViT, DeiT, and Swin\nmodels by an average of 0.6%p and 19.6%p, respectively. 
Our method outperforms\nI-BERT and I-ViT by 0.6%p and 20.8%p, respectively, when training time is\nlimited. We plan to release our code at\nhttps://gitlab.com/ones-ai/mixed-non-linear-quantization.\n","authors":["Gihwan Kim","Jemin Lee","Sihyeong Park","Yongin Kwon","Hyungshin Kim"],"pdf_url":"https://arxiv.org/pdf/2407.18437v1.pdf","comment":"16 pages, 4 figures, under review"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.18910v1","updated":"2024-07-26T17:59:32Z","published":"2024-07-26T17:59:32Z","title":"Do We Really Need Graph Convolution During Training? Light Post-Training\n Graph-ODE for Efficient Recommendation","summary":" The efficiency and scalability of graph convolution networks (GCNs) in\ntraining recommender systems (RecSys) have been persistent concerns, hindering\ntheir deployment in real-world applications. This paper presents a critical\nexamination of the necessity of graph convolutions during the training phase\nand introduces an innovative alternative: the Light Post-Training Graph\nOrdinary-Differential-Equation (LightGODE). Our investigation reveals that the\nbenefits of GCNs are more pronounced during testing rather than training.\nMotivated by this, LightGODE utilizes a novel post-training graph convolution\nmethod that bypasses the computation-intensive message passing of GCNs and\nemploys a non-parametric continuous graph ordinary-differential-equation (ODE)\nto dynamically model node representations. This approach drastically reduces\ntraining time while achieving fine-grained post-training graph convolution to\navoid the distortion of the original training embedding space, termed the\nembedding discrepancy issue. We validate our model across several real-world\ndatasets of different scales, demonstrating that LightGODE not only outperforms\nGCN-based models in terms of efficiency and effectiveness but also\nsignificantly mitigates the embedding discrepancy commonly associated with\ndeeper graph convolution layers. Our LightGODE challenges the prevailing\nparadigms in RecSys training and suggests re-evaluating the role of graph\nconvolutions, potentially guiding future developments of efficient large-scale\ngraph-based RecSys.\n","authors":["Weizhi Zhang","Liangwei Yang","Zihe Song","Henry Peng Zou","Ke Xu","Henry Peng Zou","Liancheng Fang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18910v1.pdf","comment":"Accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2407.18898v1","updated":"2024-07-26T17:51:41Z","published":"2024-07-26T17:51:41Z","title":"A Flexible and Scalable Approach for Collecting Wildlife Advertisements\n on the Web","summary":" Wildlife traffickers are increasingly carrying out their activities in\ncyberspace. As they advertise and sell wildlife products in online\nmarketplaces, they leave digital traces of their activity. This creates a new\nopportunity: by analyzing these traces, we can obtain insights into how\ntrafficking networks work as well as how they can be disrupted. However,\ncollecting such information is difficult. Online marketplaces sell a very large\nnumber of products and identifying ads that actually involve wildlife is a\ncomplex task that is hard to automate. Furthermore, given that the volume of\ndata is staggering, we need scalable mechanisms to acquire, filter, and store\nthe ads, as well as to make them available for analysis. In this paper, we\npresent a new approach to collect wildlife trafficking data at scale. 
We\npropose a data collection pipeline that combines scoped crawlers for data\ndiscovery and acquisition with foundational models and machine learning\nclassifiers to identify relevant ads. We describe a dataset we created using\nthis pipeline which is, to the best of our knowledge, the largest of its kind:\nit contains almost a million ads obtained from 41 marketplaces, covering 235\nspecies and 20 languages. The source code is publicly available at\n\\url{https://github.com/VIDA-NYU/wildlife_pipeline}.\n","authors":["Juliana Barbosa","Sunandan Chakraborty","Juliana Freire"],"pdf_url":"https://arxiv.org/pdf/2407.18898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18827v1","updated":"2024-07-26T15:43:52Z","published":"2024-07-26T15:43:52Z","title":"Human-artificial intelligence teaming for scientific information\n extraction from data-driven additive manufacturing research using large\n language models","summary":" Data-driven research in Additive Manufacturing (AM) has gained significant\nsuccess in recent years. This has led to the emergence of a plethora of\nscientific literature. The knowledge in these works consists of AM and Artificial Intelligence\n(AI) contexts that have not been mined and formalized in an integrated way. It\nrequires substantial effort and time to extract scientific information from\nthese works. AM domain experts have contributed over two dozen review papers to\nsummarize these works. However, information specific to AM and AI contexts\nstill requires manual effort to extract. The recent success of foundation\nmodels such as BERT (Bidirectional Encoder Representations from Transformers) or\nGPT (Generative Pre-trained Transformers) on textual data has opened the\npossibility of expediting scientific information extraction. We propose a\nframework that enables collaboration between AM and AI experts to continuously\nextract scientific information from data-driven AM literature. A demonstration\ntool is implemented based on the proposed framework and a case study is\nconducted to extract information relevant to the datasets, modeling, sensing,\nand AM system categories. We show the ability of LLMs (Large Language Models)\nto expedite the extraction of relevant information from data-driven AM\nliterature. In the future, the framework can be used to extract information\nfrom the broader design and manufacturing literature in the engineering\ndiscipline.\n","authors":["Mutahar Safdar","Jiarui Xie","Andrei Mircea","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.18827v1.pdf","comment":"11 pages, 5 Figures, 3 Tables. This paper has been accepted to be\n published in the proceedings of IDETC-CIE 2024"},{"id":"http://arxiv.org/abs/2406.00019v2","updated":"2024-07-26T15:13:08Z","published":"2024-05-23T07:14:21Z","title":"EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively\n Exploring Electronic Health Records","summary":" In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL\ndataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to\naddress critical yet underexplored aspects in text-to-SQL parsing:\ninteractivity, compositionality, and efficiency. To the best of our knowledge,\nEHR-SeqSQL is not only the largest but also the first medical text-to-SQL\ndataset benchmark to include sequential and contextual questions. We provide a\ndata split and the new test set designed to assess compositional generalization\nability. 
Our experiments demonstrate the superiority of a multi-turn approach\nover a single-turn approach in learning compositionality. Additionally, our\ndataset integrates specially crafted tokens into SQL queries to improve\nexecution efficiency. With EHR-SeqSQL, we aim to bridge the gap between\npractical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is\navailable \\href{https://github.com/seonhee99/EHR-SeqSQL}{at this https URL}.\n","authors":["Jaehee Ryu","Seonhee Cho","Gyubok Lee","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2406.00019v2.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2407.18735v1","updated":"2024-07-26T13:44:06Z","published":"2024-07-26T13:44:06Z","title":"AutoRDF2GML: Facilitating RDF Integration in Graph Machine Learning","summary":" In this paper, we introduce AutoRDF2GML, a framework designed to convert RDF\ndata into data representations tailored for graph machine learning tasks.\nAutoRDF2GML enables, for the first time, the creation of both content-based\nfeatures -- i.e., features based on RDF datatype properties -- and\ntopology-based features -- i.e., features based on RDF object properties.\nCharacterized by automated feature extraction, AutoRDF2GML makes it possible\neven for users less familiar with RDF and SPARQL to generate data\nrepresentations ready for graph machine learning tasks, such as link\nprediction, node classification, and graph classification. Furthermore, we\npresent four new benchmark datasets for graph machine learning, created from\nlarge RDF knowledge graphs using our framework. These datasets serve as\nvaluable resources for evaluating graph machine learning approaches, such as\ngraph neural networks. Overall, our framework effectively bridges the gap\nbetween the Graph Machine Learning and Semantic Web communities, paving the way\nfor RDF-based machine learning applications.\n","authors":["Michael Färber","David Lamprecht","Yuni Susanti"],"pdf_url":"https://arxiv.org/pdf/2407.18735v1.pdf","comment":"accepted at ISWC'24"},{"id":"http://arxiv.org/abs/2310.19834v2","updated":"2024-07-26T11:21:24Z","published":"2023-10-29T13:07:33Z","title":"AMIR: Automated MisInformation Rebuttal -- A COVID-19 Vaccination\n Datasets based Recommendation System","summary":" Misinformation has emerged as a major societal threat in recent years in\ngeneral; specifically in the context of the COVID-19 pandemic, it has wreaked\nhavoc, for instance, by fuelling vaccine hesitancy. Cost-effective, scalable\nsolutions for combating misinformation are the need of the hour. This work\nexplored how existing information obtained from social media and augmented with\nmore curated fact checked data repositories can be harnessed to facilitate\nautomated rebuttal of misinformation at scale. While the ideas herein can be\ngeneralized and reapplied in the broader context of misinformation mitigation\nusing a multitude of information sources and catering to the spectrum of social\nmedia platforms, this work serves as a proof of concept, and as such, it is\nconfined in its scope to only rebuttal of tweets, and in the specific context\nof misinformation regarding COVID-19. It leverages two publicly available\ndatasets, viz. 
FaCov (fact-checked articles) and misleading (social media\nTwitter) data on COVID-19 Vaccination.\n","authors":["Shakshi Sharma","Anwitaman Datta","Rajesh Sharma"],"pdf_url":"https://arxiv.org/pdf/2310.19834v2.pdf","comment":"Please cite our published paper on IEEE Transactions on Computational\n Social Systems"},{"id":"http://arxiv.org/abs/2407.18646v1","updated":"2024-07-26T10:28:59Z","published":"2024-07-26T10:28:59Z","title":"Decoding Knowledge Claims: The Evaluation of Scientific Publication\n Contributions through Semantic Analysis","summary":" The surge in scientific publications challenges the use of publication counts\nas a measure of scientific progress, requiring alternative metrics that\nemphasize the quality and novelty of scientific contributions rather than sheer\nquantity. This paper proposes the use of Relaxed Word Mover's Distance (RWMD),\na semantic text similarity measure, to evaluate the novelty of scientific\npapers. We hypothesize that RWMD can more effectively gauge the growth of\nscientific knowledge. To test such an assumption, we apply RWMD to evaluate\nseminal papers, with Hirsch's H-Index paper as a primary case study. We compare\nRWMD results across three groups: 1) H-Index-related papers, 2) scientometric\nstudies, and 3) unrelated papers, aiming to discern redundant literature and\nhype from genuine innovations. Findings suggest that emphasizing knowledge\nclaims offers a deeper insight into scientific contributions, marking RWMD as a\npromising alternative method to traditional citation metrics, thus better\ntracking significant scientific breakthroughs.\n","authors":["Luca D'Aniello","Nicolas Robinson-Garcia","Massimo Aria","Corrado Cuccurullo"],"pdf_url":"https://arxiv.org/pdf/2407.18646v1.pdf","comment":"This paper was submitted to STI 2024 - 28th International Conference\n on Science, Technology and Innovation Indicators STI 2024"},{"id":"http://arxiv.org/abs/2407.18553v1","updated":"2024-07-26T07:05:54Z","published":"2024-07-26T07:05:54Z","title":"REAPER: Reasoning based Retrieval Planning for Complex RAG Systems","summary":" Complex dialog systems often use retrieved evidence to facilitate factual\nresponses. Such RAG (Retrieval Augmented Generation) systems retrieve from\nmassive heterogeneous data stores that are usually architected as multiple\nindexes or APIs instead of a single monolithic source. For a given query,\nrelevant evidence needs to be retrieved from one or a small subset of possible\nretrieval sources. Complex queries can even require multi-step retrieval. For\nexample, a conversational agent on a retail site answering customer questions\nabout past orders will need to retrieve the appropriate customer order first\nand then the evidence relevant to the customer's question in the context of the\nordered product. Most RAG Agents handle such Chain-of-Thought (CoT) tasks by\ninterleaving reasoning and retrieval steps. However, each reasoning step\ndirectly adds to the latency of the system. For large models (>100B parameters)\nthis latency cost is significant -- in the order of multiple seconds.\nMulti-agent systems may classify the query to a single Agent associated with a\nretrieval source, though this means that a (small) classification model\ndictates the performance of a large language model. In this work we present\nREAPER (REAsoning-based PlannER) - an LLM based planner to generate retrieval\nplans in conversational systems. 
We show significant gains in latency over\nAgent-based systems and are able to scale easily to new and unseen use cases as\ncompared to classification-based planning. Though our method can be applied to\nany RAG system, we show our results in the context of Rufus -- Amazon's\nconversational shopping assistant.\n","authors":["Ashutosh Joshi","Sheikh Muhammad Sarwar","Samarth Varshney","Sreyashi Nag","Shrivats Agrawal","Juhi Naik"],"pdf_url":"https://arxiv.org/pdf/2407.18553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18472v1","updated":"2024-07-26T02:48:32Z","published":"2024-07-26T02:48:32Z","title":"FedUD: Exploiting Unaligned Data for Cross-Platform Federated\n Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction plays an important role in online\nadvertising platforms. Most existing methods use data from the advertising\nplatform itself for CTR prediction. As user behaviors also exist on many other\nplatforms, e.g., media platforms, it is beneficial to further exploit such\ncomplementary information for better modeling user interest and for improving\nCTR prediction performance. However, due to privacy concerns, data from\ndifferent platforms cannot be uploaded to a server for centralized model\ntraining. Vertical federated learning (VFL) provides a possible solution which\nis able to keep the raw data on respective participating parties and learn a\ncollaborative model in a privacy-preserving way. However, traditional VFL\nmethods only utilize aligned data with common keys across parties, which\nstrongly restricts their application scope. In this paper, we propose FedUD,\nwhich is able to exploit unaligned data, in addition to aligned data, for more\naccurate federated CTR prediction. FedUD contains two steps. In the first step,\nFedUD utilizes aligned data across parties like traditional VFL, but it\nadditionally includes a knowledge distillation module. This module distills\nuseful knowledge from the guest party's high-level representations and guides\nthe learning of a representation transfer network. In the second step, FedUD\napplies the learned knowledge to enrich the representations of the host party's\nunaligned data such that both aligned and unaligned data can contribute to\nfederated model training. Experiments on two real-world datasets demonstrate\nthe superior performance of FedUD for federated CTR prediction.\n","authors":["Wentao Ouyang","Rui Dong","Ri Tao","Xiangzheng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.18472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18471v1","updated":"2024-07-26T02:44:55Z","published":"2024-07-26T02:44:55Z","title":"Constructing the CORD-19 Vaccine Dataset","summary":" We introduce new dataset 'CORD-19-Vaccination' to cater to scientists\nspecifically looking into COVID-19 vaccine-related research. This dataset is\nextracted from CORD-19 dataset [Wang et al., 2020] and augmented with new\ncolumns for language detail, author demography, keywords, and topic per paper.\nFacebook's fastText model is used to identify languages [Joulin et al., 2016].\nTo establish author demography (author affiliation, lab/institution location,\nand lab/institution country columns) we processed the JSON file for each paper\nand then further enhanced using Google's search API to determine country\nvalues. 'Yake' was used to extract keywords from the title, abstract, and body\nof each paper and the LDA (Latent Dirichlet Allocation) algorithm was used to\nadd topic information [Campos et al., 2020, 2018a,b]. 
To evaluate the dataset,\nwe demonstrate a question-answering task like the one used in the CORD-19\nKaggle challenge [Goldbloom et al., 2022]. For further evaluation, sequential\nsentence classification was performed on each paper's abstract using the model\nfrom Dernoncourt et al. [2016]. We partially hand annotated the training\ndataset and used a pre-trained BERT-PubMed layer. 'CORD-19-Vaccination'\ncontains 30k research papers and can be immensely valuable for NLP research\nsuch as text mining, information extraction, and question answering, specific\nto the domain of COVID-19 vaccine research.\n","authors":["Manisha Singh","Divy Sharma","Alonso Ma","Bridget Tyree","Margaret Mitchell"],"pdf_url":"https://arxiv.org/pdf/2407.18471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18470v1","updated":"2024-07-26T02:39:30Z","published":"2024-07-26T02:39:30Z","title":"Synergizing Knowledge Graphs with Large Language Models: A Comprehensive\n Review and Future Prospects","summary":" Recent advancements have witnessed the ascension of Large Language Models\n(LLMs), endowed with prodigious linguistic capabilities, albeit marred by\nshortcomings including factual inconsistencies and opacity. Conversely,\nKnowledge Graphs (KGs) harbor verifiable knowledge and symbolic reasoning\nprowess, thereby complementing LLMs' deficiencies. Against this backdrop, the\nsynergy between KGs and LLMs emerges as a pivotal research direction. Our\ncontribution in this paper is a comprehensive dissection of the latest\ndevelopments in integrating KGs with LLMs. Through meticulous analysis of their\nconfluence points and methodologies, we introduce a unifying framework designed\nto elucidate and stimulate further exploration among scholars engaged in\ncognate disciplines. This framework serves a dual purpose: it consolidates\nextant knowledge while simultaneously delineating novel avenues for real-world\ndeployment, thereby amplifying the translational impact of academic research.\n","authors":["DaiFeng Li","Fan Xu"],"pdf_url":"https://arxiv.org/pdf/2407.18470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00815v3","updated":"2024-07-26T23:24:39Z","published":"2024-02-25T23:10:20Z","title":"RAM-EHR: Retrieval Augmentation Meets Clinical Predictions on Electronic\n Health Records","summary":" We present RAM-EHR, a Retrieval AugMentation pipeline to improve clinical\npredictions on Electronic Health Records (EHRs). RAM-EHR first collects\nmultiple knowledge sources, converts them into text format, and uses dense\nretrieval to obtain information related to medical concepts. This strategy\naddresses the difficulties associated with complex names for the concepts.\nRAM-EHR then augments the local EHR predictive model co-trained with\nconsistency regularization to capture complementary information from patient\nvisits and summarized knowledge. Experiments on two EHR datasets show the\nefficacy of RAM-EHR over previous knowledge-enhanced baselines (3.4% gain in\nAUROC and 7.2% gain in AUPR), emphasizing the effectiveness of the summarized\nknowledge from RAM-EHR for clinical prediction tasks. The code will be\npublished at \\url{https://github.com/ritaranx/RAM-EHR}.\n","authors":["Ran Xu","Wenqi Shi","Yue Yu","Yuchen Zhuang","Bowen Jin","May D. Wang","Joyce C. 
Ho","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.00815v3.pdf","comment":"ACL 2024 (Oral)"},{"id":"http://arxiv.org/abs/2407.19099v1","updated":"2024-07-26T21:33:57Z","published":"2024-07-26T21:33:57Z","title":"Sponsored is the New Organic: Implications of Sponsored Results on\n Quality of Search Results in the Amazon Marketplace","summary":" Interleaving sponsored results (advertisements) amongst organic results on\nsearch engine result pages (SERP) has become a common practice across multiple\ndigital platforms. Advertisements have catered to consumer satisfaction and\nfostered competition in digital public spaces; making them an appealing gateway\nfor businesses to reach their consumers. However, especially in the context of\ndigital marketplaces, due to the competitive nature of the sponsored results\nwith the organic ones, multiple unwanted repercussions have surfaced affecting\ndifferent stakeholders. From the consumers' perspective the sponsored\nads/results may cause degradation of search quality and nudge consumers to\npotentially irrelevant and costlier products. The sponsored ads may also affect\nthe level playing field of the competition in the marketplaces among sellers.\nTo understand and unravel these potential concerns, we analyse the Amazon\ndigital marketplace in four different countries by simulating 4,800 search\noperations. Our analyses over SERPs consisting 2M organic and 638K sponsored\nresults show items with poor organic ranks (beyond 100th position) appear as\nsponsored results even before the top organic results on the first page of\nAmazon SERP. Moreover, we also observe that in majority of the cases, these top\nsponsored results are costlier and are of poorer quality than the top organic\nresults. We believe these observations can motivate researchers for further\ndeliberation to bring in more transparency and guard rails in the advertising\npractices followed in digital marketplaces.\n","authors":["Abhisek Dash","Saptarshi Ghosh","Animesh Mukherjee","Abhijnan Chakraborty","Krishna P. Gummadi"],"pdf_url":"https://arxiv.org/pdf/2407.19099v1.pdf","comment":"This work has been accepted as a full paper in AAAI/ACM conference on\n Artificial Intelligence, Ethics and Society (AIES) 2024"},{"id":"http://arxiv.org/abs/2407.19090v1","updated":"2024-07-26T21:11:58Z","published":"2024-07-26T21:11:58Z","title":"MetaHive: A Cache-Optimized Metadata Management for Heterogeneous\n Key-Value Stores","summary":" Cloud key-value (KV) stores provide businesses with a cost-effective and\nadaptive alternative to traditional on-premise data management solutions. KV\nstores frequently consist of heterogeneous clusters, characterized by varying\nhardware specifications of the deployment nodes, with each node potentially\nrunning a distinct version of the KV store software. This heterogeneity is\naccompanied by the diverse metadata that they need to manage. In this study, we\nintroduce MetaHive, a cache-optimized approach to managing metadata in\nheterogeneous KV store clusters. MetaHive disaggregates the original data from\nits associated metadata to promote independence between them, while maintaining\ntheir interconnection during usage. This makes the metadata opaque from the\ndownstream processes and the other KV stores in the cluster. MetaHive also\nensures that the KV and metadata entries are stored in the vicinity of each\nother in memory and storage. 
This allows MetaHive to optimally utilize the\ncaching mechanism without extra storage read overhead for metadata retrieval.\nWe deploy MetaHive to ensure data integrity in RocksDB and demonstrate its\nrapid data validation with minimal effect on performance.\n","authors":["Alireza Heidari","Amirhossein Ahmadi","Zefeng Zhi","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.19090v1.pdf","comment":"Cloud Databases"},{"id":"http://arxiv.org/abs/2310.20091v6","updated":"2024-07-26T20:20:31Z","published":"2023-10-31T00:12:13Z","title":"Density-based User Representation using Gaussian Process Regression for\n Multi-interest Personalized Retrieval","summary":" Accurate modeling of the diverse and dynamic interests of users remains a\nsignificant challenge in the design of personalized recommender systems.\nExisting user modeling methods, like single-point and multi-point\nrepresentations, have limitations w.r.t.\\ accuracy, diversity, and\nadaptability. To overcome these deficiencies, we introduce density-based user\nrepresentations (DURs), a novel method that leverages Gaussian process\nregression (GPR) for effective multi-interest recommendation and retrieval. Our\napproach, GPR4DUR, exploits DURs to capture user interest variability without\nmanual tuning, incorporates uncertainty-awareness, and scales well to large\nnumbers of users. Experiments using real-world offline datasets confirm the\nadaptability and efficiency of GPR4DUR, while online experiments with simulated\nusers demonstrate its ability to address the exploration-exploitation trade-off\nby effectively utilizing model uncertainty.\n","authors":["Haolun Wu","Ofer Meshi","Masrour Zoghi","Fernando Diaz","Xue Liu","Craig Boutilier","Maryam Karimzadehgan"],"pdf_url":"https://arxiv.org/pdf/2310.20091v6.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2407.21059v1","updated":"2024-07-26T03:45:30Z","published":"2024-07-26T03:45:30Z","title":"Modular RAG: Transforming RAG Systems into LEGO-like Reconfigurable\n Frameworks","summary":" Retrieval-augmented Generation (RAG) has markedly enhanced the capabilities\nof Large Language Models (LLMs) in tackling knowledge-intensive tasks. The\nincreasing demands of application scenarios have driven the evolution of RAG,\nleading to the integration of advanced retrievers, LLMs and other complementary\ntechnologies, which in turn has amplified the intricacy of RAG systems.\nHowever, the rapid advancements are outpacing the foundational RAG paradigm,\nwith many methods struggling to be unified under the process of\n\"retrieve-then-generate\". In this context, this paper examines the limitations\nof the existing RAG paradigm and introduces the modular RAG framework. By\ndecomposing complex RAG systems into independent modules and specialized\noperators, it facilitates a highly reconfigurable framework. Modular RAG\ntranscends the traditional linear architecture, embracing a more advanced\ndesign that integrates routing, scheduling, and fusion mechanisms. Drawing on\nextensive research, this paper further identifies prevalent RAG\npatterns-linear, conditional, branching, and looping-and offers a comprehensive\nanalysis of their respective implementation nuances. Modular RAG presents\ninnovative opportunities for the conceptualization and deployment of RAG\nsystems. 
Finally, the paper explores the potential emergence of new operators\nand paradigms, establishing a solid theoretical foundation and a practical\nroadmap for the continued evolution and practical deployment of RAG\ntechnologies.\n","authors":["Yunfan Gao","Yun Xiong","Meng Wang","Haofen Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21059v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.18913v1","updated":"2024-07-26T17:59:55Z","published":"2024-07-26T17:59:55Z","title":"SOAP-RL: Sequential Option Advantage Propagation for Reinforcement\n Learning in POMDP Environments","summary":" This work compares ways of extending Reinforcement Learning algorithms to\nPartially Observed Markov Decision Processes (POMDPs) with options. One view of\noptions is as temporally extended action, which can be realized as a memory\nthat allows the agent to retain historical information beyond the policy's\ncontext window. While option assignment could be handled using heuristics and\nhand-crafted objectives, learning temporally consistent options and associated\nsub-policies without explicit supervision is a challenge. Two algorithms, PPOEM\nand SOAP, are proposed and studied in depth to address this problem. PPOEM\napplies the forward-backward algorithm (for Hidden Markov Models) to optimize\nthe expected returns for an option-augmented policy. However, this learning\napproach is unstable during on-policy rollouts. It is also unsuited for\nlearning causal policies without the knowledge of future trajectories, since\noption assignments are optimized for offline sequences where the entire episode\nis available. As an alternative approach, SOAP evaluates the policy gradient\nfor an optimal option assignment. It extends the concept of the generalized\nadvantage estimation (GAE) to propagate option advantages through time, which\nis an analytical equivalent to performing temporal back-propagation of option\npolicy gradients. This option policy is only conditional on the history of the\nagent, not future actions. Evaluated against competing baselines, SOAP\nexhibited the most robust performance, correctly discovering options for POMDP\ncorridor environments, as well as on standard benchmarks including Atari and\nMuJoCo, outperforming PPOEM, as well as LSTM and Option-Critic baselines. The\nopen-sourced code is available at https://github.com/shuishida/SoapRL.\n","authors":["Shu Ishida","João F. Henriques"],"pdf_url":"https://arxiv.org/pdf/2407.18913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18910v1","updated":"2024-07-26T17:59:32Z","published":"2024-07-26T17:59:32Z","title":"Do We Really Need Graph Convolution During Training? Light Post-Training\n Graph-ODE for Efficient Recommendation","summary":" The efficiency and scalability of graph convolution networks (GCNs) in\ntraining recommender systems (RecSys) have been persistent concerns, hindering\ntheir deployment in real-world applications. This paper presents a critical\nexamination of the necessity of graph convolutions during the training phase\nand introduces an innovative alternative: the Light Post-Training Graph\nOrdinary-Differential-Equation (LightGODE). 
Our investigation reveals that the\nbenefits of GCNs are more pronounced during testing rather than training.\nMotivated by this, LightGODE utilizes a novel post-training graph convolution\nmethod that bypasses the computation-intensive message passing of GCNs and\nemploys a non-parametric continuous graph ordinary-differential-equation (ODE)\nto dynamically model node representations. This approach drastically reduces\ntraining time while achieving fine-grained post-training graph convolution to\navoid the distortion of the original training embedding space, termed the\nembedding discrepancy issue. We validate our model across several real-world\ndatasets of different scales, demonstrating that LightGODE not only outperforms\nGCN-based models in terms of efficiency and effectiveness but also\nsignificantly mitigates the embedding discrepancy commonly associated with\ndeeper graph convolution layers. Our LightGODE challenges the prevailing\nparadigms in RecSys training and suggests re-evaluating the role of graph\nconvolutions, potentially guiding future developments of efficient large-scale\ngraph-based RecSys.\n","authors":["Weizhi Zhang","Liangwei Yang","Zihe Song","Henry Peng Zou","Ke Xu","Henry Peng Zou","Liancheng Fang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18910v1.pdf","comment":"Accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2407.18909v1","updated":"2024-07-26T17:59:26Z","published":"2024-07-26T17:59:26Z","title":"Hybrid summary statistics: neural weak lensing inference beyond the\n power spectrum","summary":" In inference problems, we often have domain knowledge which allows us to\ndefine summary statistics that capture most of the information content in a\ndataset. In this paper, we present a hybrid approach, where such physics-based\nsummaries are augmented by a set of compressed neural summary statistics that\nare optimised to extract the extra information that is not captured by the\npredefined summaries. The resulting statistics are very powerful inputs to\nsimulation-based or implicit inference of model parameters. We apply this\ngeneralisation of Information Maximising Neural Networks (IMNNs) to parameter\nconstraints from tomographic weak gravitational lensing convergence maps to\nfind summary statistics that are explicitly optimised to complement angular\npower spectrum estimates. We study several dark matter simulation resolutions\nin low- and high-noise regimes. We show that i) the information-update\nformalism extracts at least $3\\times$ and up to $8\\times$ as much information\nas the angular power spectrum in all noise regimes, ii) the network summaries\nare highly complementary to existing 2-point summaries, and iii) our formalism\nallows for networks with smaller, physically-informed architectures to match\nmuch larger regression networks with far fewer simulations needed to obtain\nasymptotically optimal inference.\n","authors":["T. Lucas Makinen","Tom Charnock","Natalia Porqueres","Axel Lapel","Alan Heavens","Benjamin D. Wandelt"],"pdf_url":"https://arxiv.org/pdf/2407.18909v1.pdf","comment":"16 pages, 11 figures. Submitted to JCAP. We provide publicly\n available code at https://github.com/tlmakinen/hybridStatsWL"},{"id":"http://arxiv.org/abs/2407.18908v1","updated":"2024-07-26T17:59:09Z","published":"2024-07-26T17:59:09Z","title":"Wolf: Captioning Everything with a World Summarization Framework","summary":" We propose Wolf, a WOrLd summarization Framework for accurate video\ncaptioning. 
Wolf is an automated captioning framework that adopts a\nmixture-of-experts approach, leveraging complementary strengths of Vision\nLanguage Models (VLMs). By utilizing both image and video models, our framework\ncaptures different levels of information and summarizes them efficiently. Our\napproach can be applied to enhance video understanding, auto-labeling, and\ncaptioning. To evaluate caption quality, we introduce CapScore, an LLM-based\nmetric to assess the similarity and quality of generated captions compared to\nthe ground truth captions. We further build four human-annotated datasets in\nthree domains: autonomous driving, general scenes, and robotics, to facilitate\ncomprehensive comparisons. We show that Wolf achieves superior captioning\nperformance compared to state-of-the-art approaches from the research community\n(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For\ninstance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise\nby 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally,\nwe establish a benchmark for video captioning and introduce a leaderboard,\naiming to accelerate advancements in video understanding, captioning, and data\nalignment. Leaderboard: https://wolfv0.github.io/leaderboard.html.\n","authors":["Boyi Li","Ligeng Zhu","Ran Tian","Shuhan Tan","Yuxiao Chen","Yao Lu","Yin Cui","Sushant Veer","Max Ehrlich","Jonah Philion","Xinshuo Weng","Fuzhao Xue","Andrew Tao","Ming-Yu Liu","Sanja Fidler","Boris Ivanovic","Trevor Darrell","Jitendra Malik","Song Han","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2407.18908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18906v1","updated":"2024-07-26T17:58:57Z","published":"2024-07-26T17:58:57Z","title":"A Scalable Quantum Non-local Neural Network for Image Classification","summary":" Non-local operations play a crucial role in computer vision enabling the\ncapture of long-range dependencies through weighted sums of features across the\ninput, surpassing the constraints of traditional convolution operations that\nfocus solely on local neighborhoods. Non-local operations typically require\ncomputing pairwise relationships between all elements in a set, leading to\nquadratic complexity in terms of time and memory. Due to the high computational\nand memory demands, scaling non-local neural networks to large-scale problems\ncan be challenging. This article introduces a hybrid quantum-classical scalable\nnon-local neural network, referred to as Quantum Non-Local Neural Network\n(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on\ninherent quantum parallelism to allow the simultaneous processing of a large\nnumber of input features enabling more efficient computations in\nquantum-enhanced feature space and involving pairwise relationships through\nquantum entanglement. We benchmark our proposed QNL-Net with other quantum\ncounterparts to binary classification with datasets MNIST and CIFAR-10. 
The\nsimulation findings showcase our QNL-Net achieves cutting-edge accuracy levels\nin binary image classification among quantum classifiers while utilizing fewer\nqubits.\n","authors":["Sparsh Gupta","Debanjan Konar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2407.18906v1.pdf","comment":"draft, 13 pages (including references and appendix), 5 figures"},{"id":"http://arxiv.org/abs/2407.18902v1","updated":"2024-07-26T17:56:01Z","published":"2024-07-26T17:56:01Z","title":"Lessons from Learning to Spin \"Pens\"","summary":" In-hand manipulation of pen-like objects is an important skill in our daily\nlives, as many tools such as hammers and screwdrivers are similarly shaped.\nHowever, current learning-based methods struggle with this task due to a lack\nof high-quality demonstrations and the significant gap between simulation and\nthe real world. In this work, we push the boundaries of learning-based in-hand\nmanipulation systems by demonstrating the capability to spin pen-like objects.\nWe first use reinforcement learning to train an oracle policy with privileged\ninformation and generate a high-fidelity trajectory dataset in simulation. This\nserves two purposes: 1) pre-training a sensorimotor policy in simulation; 2)\nconducting open-loop trajectory replay in the real world. We then fine-tune the\nsensorimotor policy using these real-world trajectories to adapt it to the real\nworld dynamics. With less than 50 trajectories, our policy learns to rotate\nmore than ten pen-like objects with different physical properties for multiple\nrevolutions. We present a comprehensive analysis of our design choices and\nshare the lessons learned during development.\n","authors":["Jun Wang","Ying Yuan","Haichuan Che","Haozhi Qi","Yi Ma","Jitendra Malik","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18902v1.pdf","comment":"Website: https://penspin.github.io/"},{"id":"http://arxiv.org/abs/2407.18901v1","updated":"2024-07-26T17:55:45Z","published":"2024-07-26T17:55:45Z","title":"AppWorld: A Controllable World of Apps and People for Benchmarking\n Interactive Coding Agents","summary":" Autonomous agents that address day-to-day digital tasks (e.g., ordering\ngroceries for a household), must not only operate multiple apps (e.g., notes,\nmessaging, shopping app) via APIs, but also generate rich code with complex\ncontrol flow in an iterative manner based on their interaction with the\nenvironment. However, existing benchmarks for tool use are inadequate, as they\nonly cover tasks that require a simple sequence of API calls.\n To remedy this gap, we built $\\textbf{AppWorld Engine}$, a high-quality\nexecution environment (60K lines of code) of 9 day-to-day apps operable via 457\nAPIs and populated with realistic digital activities simulating the lives of\n~100 fictitious users. We then created $\\textbf{AppWorld Benchmark}$ (40K lines\nof code), a suite of 750 natural, diverse, and challenging autonomous agent\ntasks requiring rich and interactive code generation. It supports robust\nprogrammatic evaluation with state-based unit tests, allowing for different\nways of completing a task while also checking for unexpected changes, i.e.,\ncollateral damage. The state-of-the-art LLM, GPT-4o, solves only ~49% of our\n'normal' tasks and ~30% of 'challenge' tasks, while other models solve at least\n16% fewer. This highlights the benchmark's difficulty and AppWorld's potential\nto push the frontiers of interactive coding agents. 
The project website is\navailable at https://appworld.dev/.\n","authors":["Harsh Trivedi","Tushar Khot","Mareike Hartmann","Ruskin Manku","Vinty Dong","Edward Li","Shashank Gupta","Ashish Sabharwal","Niranjan Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2407.18901v1.pdf","comment":"ACL'24 Camera Ready"},{"id":"http://arxiv.org/abs/2406.10242v2","updated":"2024-07-26T17:54:59Z","published":"2024-06-05T18:06:57Z","title":"Physics-Guided Actor-Critic Reinforcement Learning for Swimming in\n Turbulence","summary":" Turbulent diffusion causes particles placed in proximity to separate. We\ninvestigate the required swimming efforts to maintain a particle close to its\npassively advected counterpart. We explore optimally balancing these efforts\nwith the intended goal by developing and comparing a novel Physics-Informed\nReinforcement Learning (PIRL) strategy with prescribed control (PC) and\nstandard physics-agnostic Reinforcement Learning strategies. Our PIRL scheme,\ncoined the Actor-Physicist, is an adaptation of the Actor-Critic algorithm in\nwhich the Neural Network parameterized Critic is replaced with an analytically\nderived physical heuristic function (the physicist). This strategy is then\ncompared with an analytically computed optimal PC policy derived from a\nstochastic optimal control formulation and standard physics-agnostic\nActor-Critic type algorithms.\n","authors":["Christopher Koh","Laurent Pagnier","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2406.10242v2.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.18899v1","updated":"2024-07-26T17:51:58Z","published":"2024-07-26T17:51:58Z","title":"Learn from the Learnt: Source-Free Active Domain Adaptation via\n Contrastive Sampling and Visual Persistence","summary":" Domain Adaptation (DA) facilitates knowledge transfer from a source domain to\na related target domain. This paper investigates a practical DA paradigm,\nnamely Source data-Free Active Domain Adaptation (SFADA), where source data\nbecomes inaccessible during adaptation, and a minimum amount of annotation\nbudget is available in the target domain. Without referencing the source data,\nnew challenges emerge in identifying the most informative target samples for\nlabeling, establishing cross-domain alignment during adaptation, and ensuring\ncontinuous performance improvements through the iterative query-and-adaptation\nprocess. In response, we present learn from the learnt (LFTL), a novel paradigm\nfor SFADA to leverage the learnt knowledge from the source pretrained model and\nactively iterated models without extra overhead. We propose Contrastive Active\nSampling to learn from the hypotheses of the preceding model, thereby querying\ntarget samples that are both informative to the current model and persistently\nchallenging throughout active learning. During adaptation, we learn from\nfeatures of actively selected anchors obtained from previous intermediate\nmodels, so that the Visual Persistence-guided Adaptation can facilitate feature\ndistribution alignment and active sample exploitation. Extensive experiments on\nthree widely-used benchmarks show that our LFTL achieves state-of-the-art\nperformance, superior computational efficiency and continuous improvements as\nthe annotation budget increases. 
Our code is available at\nhttps://github.com/lyumengyao/lftl.\n","authors":["Mengyao Lyu","Tianxiang Hao","Xinhao Xu","Hui Chen","Zijia Lin","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2407.18899v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18897v1","updated":"2024-07-26T17:51:33Z","published":"2024-07-26T17:51:33Z","title":"Small Molecule Optimization with Large Language Models","summary":" Recent advancements in large language models have opened new possibilities\nfor generative molecular drug design. We present Chemlactica and Chemma, two\nlanguage models fine-tuned on a novel corpus of 110M molecules with computed\nproperties, totaling 40B tokens. These models demonstrate strong performance in\ngenerating molecules with specified properties and predicting new molecular\ncharacteristics from limited samples. We introduce a novel optimization\nalgorithm that leverages our language models to optimize molecules for\narbitrary properties given limited access to a black box oracle. Our approach\ncombines ideas from genetic algorithms, rejection sampling, and prompt\noptimization. It achieves state-of-the-art performance on multiple molecular\noptimization benchmarks, including an 8% improvement on Practical Molecular\nOptimization compared to previous methods. We publicly release the training\ncorpus, the language models and the optimization algorithm.\n","authors":["Philipp Guevorguian","Menua Bedrosian","Tigran Fahradyan","Gayane Chilingaryan","Hrant Khachatrian","Armen Aghajanyan"],"pdf_url":"https://arxiv.org/pdf/2407.18897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18219v2","updated":"2024-07-26T17:50:27Z","published":"2024-07-25T17:35:59Z","title":"Recursive Introspection: Teaching Language Model Agents How to\n Self-Improve","summary":" A central piece in enabling intelligent agentic behavior in foundation models\nis to make them capable of introspecting upon their behavior, reasoning, and\ncorrecting their mistakes as more computation or interaction is available. Even\nthe strongest proprietary large language models (LLMs) do not quite exhibit the\nability of continually improving their responses sequentially, even in\nscenarios where they are explicitly told that they are making a mistake. In\nthis paper, we develop RISE: Recursive IntroSpEction, an approach for\nfine-tuning LLMs to introduce this capability, despite prior work hypothesizing\nthat this capability may not be possible to attain. Our approach prescribes an\niterative fine-tuning procedure, which attempts to teach the model how to alter\nits response after having executed previously unsuccessful attempts to solve a\nhard test-time problem, with optionally additional environment feedback. RISE\nposes fine-tuning for a single-turn prompt as solving a multi-turn Markov\ndecision process (MDP), where the initial state is the prompt. Inspired by\nprinciples in online imitation learning and reinforcement learning, we propose\nstrategies for multi-turn data collection and training so as to imbue an LLM\nwith the capability to recursively detect and correct its previous mistakes in\nsubsequent iterations. Our experiments show that RISE enables Llama2, Llama3,\nand Mistral models to improve themselves with more turns on math reasoning\ntasks, outperforming several single-turn strategies given an equal amount of\ninference-time computation. We also find that RISE scales well, often attaining\nlarger benefits with more capable models. 
Our analysis shows that RISE makes\nmeaningful improvements to responses to arrive at the correct solution for\nchallenging prompts, without disrupting one-turn abilities as a result of\nexpressing more complex distributions.\n","authors":["Yuxiao Qu","Tianjun Zhang","Naman Garg","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.18219v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18889v1","updated":"2024-07-26T17:40:52Z","published":"2024-07-26T17:40:52Z","title":"On the Pros and Cons of Active Learning for Moral Preference Elicitation","summary":" Computational preference elicitation methods are tools used to learn people's\npreferences quantitatively in a given context. Recent works on preference\nelicitation advocate for active learning as an efficient method to iteratively\nconstruct queries (framed as comparisons between context-specific cases) that\nare likely to be most informative about an agent's underlying preferences. In\nthis work, we argue that the use of active learning for moral preference\nelicitation relies on certain assumptions about the underlying moral\npreferences, which can be violated in practice. Specifically, we highlight the\nfollowing common assumptions (a) preferences are stable over time and not\nsensitive to the sequence of presented queries, (b) the appropriate hypothesis\nclass is chosen to model moral preferences, and (c) noise in the agent's\nresponses is limited. While these assumptions can be appropriate for preference\nelicitation in certain domains, prior research on moral psychology suggests\nthey may not be valid for moral judgments. Through a synthetic simulation of\npreferences that violate the above assumptions, we observe that active learning\ncan have similar or worse performance than a basic random query selection\nmethod in certain settings. Yet, simulation results also demonstrate that\nactive learning can still be viable if the degree of instability or noise is\nrelatively small and when the agent's preferences can be approximately\nrepresented with the hypothesis class used for learning. Our study highlights\nthe nuances associated with effective moral preference elicitation in practice\nand advocates for the cautious use of active learning as a methodology to learn\nmoral preferences.\n","authors":["Vijay Keswani","Vincent Conitzer","Hoda Heidari","Jana Schaich Borg","Walter Sinnott-Armstrong"],"pdf_url":"https://arxiv.org/pdf/2407.18889v1.pdf","comment":"To appear in AIES 2024"},{"id":"http://arxiv.org/abs/2407.18887v1","updated":"2024-07-26T17:36:40Z","published":"2024-07-26T17:36:40Z","title":"Embedding And Clustering Your Data Can Improve Contrastive Pretraining","summary":" Recent studies of large-scale contrastive pretraining in the text embedding\ndomain show that using single-source minibatches, rather than mixed-source\nminibatches, can substantially improve overall model accuracy. In this work, we\nexplore extending training data stratification beyond source granularity by\nleveraging a pretrained text embedding model and the classic k-means clustering\nalgorithm to further split training data apart by the semantic clusters within\neach source. Experimentally, we observe a notable increase in NDCG@10 when\npretraining a BERT-based text embedding model on query-passage pairs from the\nMSMARCO passage retrieval dataset. 
Additionally, we conceptually connect our\nclustering approach to both the Topic Aware Sampling (TAS) aspect of the TAS-B\nmethodology and the nearest-neighbor-based hard-negative mining aspect of the\nANCE methodology and discuss how this unified view motivates future lines of\nresearch on the organization of contrastive pretraining data.\n","authors":["Luke Merrick"],"pdf_url":"https://arxiv.org/pdf/2407.18887v1.pdf","comment":"16 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.14575v2","updated":"2024-07-26T17:35:20Z","published":"2024-07-19T16:19:14Z","title":"Regression prediction algorithm for energy consumption regression in\n cloud computing based on horned lizard algorithm optimised convolutional\n neural network-bidirectional gated recurrent unit","summary":" For this paper, a prediction study of cloud computing energy consumption was\nconducted by optimising the data regression algorithm based on the horned\nlizard optimisation algorithm for Convolutional Neural Networks-Bi-Directional\nGated Recurrent Units. Firstly, through Spearman correlation analysis of CPU\nusage, memory usage, network traffic, power consumption, number of instructions\nexecuted, execution time and energy efficiency, we found that power consumption\nhas the highest degree of positive correlation with energy efficiency, while\nCPU usage has the highest degree of negative correlation with energy\nefficiency. In our experiments, we introduced a random forest model and an\noptimisation model based on the horned lizard optimisation algorithm for\ntesting, and the results show that the optimisation algorithm has better\nprediction results compared to the random forest model. Specifically, the mean\nsquare error (MSE) of the optimisation algorithm is 0.01 smaller than that of\nthe random forest model, and the mean absolute error (MAE) is 0.01 smaller than\nthat of the random forest. The results of the combined metrics show that the\noptimisation algorithm performs more accurately and reliably in predicting\nenergy efficiency. This research result provides new ideas and methods to\nimprove the energy efficiency of cloud computing systems. This research not\nonly expands the scope of application in the field of cloud computing, but also\nprovides strong support for improving the energy use efficiency of the\nsystem.\n","authors":["Feiyang Li","Zinan Cao","Qixuan Yu","Xirui Tang"],"pdf_url":"https://arxiv.org/pdf/2407.14575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08112v2","updated":"2024-07-26T17:31:51Z","published":"2024-07-11T01:08:39Z","title":"How Well Can a Long Sequence Model Model Long Sequences? Comparing\n Architechtural Inductive Biases on Long-Context Abilities","summary":" Long sequences occur in abundance within real-world scenarios, hence properly\nmodelling them opens numerous downstream use cases. Deep neural networks,\nhowever, have often struggled with these for a variety of reasons. Recent\nadvances, both in system engineering as well as model design, have enabled the\nscaling up of models that are purported to support extended context length. In\nparticular, the state-space and linear recurrent neural network families of\nmodels can hypothetically extend to infinite sequence length. However, is this\ntoo good to be true? We conduct an evaluation to show that while such claims\nmay be sound theoretically, there remain large practical gaps that are\nempirically observed.
In particular, recurrent models still suffer in the same\nsettings as long-context LLMs with attention. We further show that different\ninductive biases have inconsistent extrapolation capabilities, highlighting the\nneed to further study such paradigms and investigate why long-context models\nseemingly fail to behave as one might expect.\n","authors":["Jerry Huang"],"pdf_url":"https://arxiv.org/pdf/2407.08112v2.pdf","comment":"Work In Progress. 9 pages"},{"id":"http://arxiv.org/abs/2407.09186v2","updated":"2024-07-26T17:26:45Z","published":"2024-07-12T11:38:41Z","title":"Variational Inference via Smoothed Particle Hydrodynamics","summary":" A new variational inference method, SPH-ParVI, based on smoothed particle\nhydrodynamics (SPH), is proposed for sampling partially known densities (e.g.\nup to a constant) or sampling using gradients. SPH-ParVI simulates the flow of\na fluid under external effects driven by the target density; the transient or\nsteady state of the fluid approximates the target density. The continuum fluid\nis modelled as an interacting particle system (IPS) via SPH, where each\nparticle carries smoothed properties, interacts and evolves as per the\nNavier-Stokes equations. This mesh-free, Lagrangian simulation method offers\nfast, flexible, scalable and deterministic sampling and inference for a class\nof probabilistic models such as those encountered in Bayesian inference and\ngenerative modelling.\n","authors":["Yongchao Huang"],"pdf_url":"https://arxiv.org/pdf/2407.09186v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18879v1","updated":"2024-07-26T17:24:50Z","published":"2024-07-26T17:24:50Z","title":"Utilizing TTS Synthesized Data for Efficient Development of Keyword\n Spotting Model","summary":" This paper explores the use of TTS synthesized training data for the KWS (keyword\nspotting) task while minimizing development cost and time. Keyword spotting\nmodels require a huge amount of training data to be accurate, and obtaining\nsuch training data can be costly. In the current state of the art, TTS models\ncan generate large amounts of natural-sounding data, which can help reduce\ncost and time for KWS model development. Still, TTS-generated data can\nlack diversity compared to real data. To pursue maximizing KWS model\naccuracy under the constraint of limited resources and current TTS capability,\nwe explored various strategies to mix TTS data and real human speech data, with\na focus on minimizing real data use and maximizing diversity of TTS output.
Our\nexperimental results indicate that relatively small amounts of real audio data\nwith speaker diversity (100 speakers, 2k utterances) and large amounts of TTS\nsynthesized data can achieve reasonably high accuracy (within 3x error rate of\nbaseline), compared to the baseline (trained with 3.8M real positive\nutterances).\n","authors":["Hyun Jin Park","Dhruuv Agarwal","Neng Chen","Rentao Sun","Kurt Partridge","Justin Chen","Harry Zhang","Pai Zhu","Jacob Bartel","Kyle Kastner","Gary Wang","Andrew Rosenberg","Quan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18879v1.pdf","comment":"to be published in a Workshop at Interspeech 2024, Synthetic Data's\n Transformative Role in Foundational Speech Models"},{"id":"http://arxiv.org/abs/2407.18878v1","updated":"2024-07-26T17:16:31Z","published":"2024-07-26T17:16:31Z","title":"An Accelerated Multi-level Monte Carlo Approach for Average Reward\n Reinforcement Learning with General Policy Parametrization","summary":" In our study, we delve into average-reward reinforcement learning with\ngeneral policy parametrization. Within this domain, current guarantees are either\nsuboptimal or demand prior knowledge of mixing time.\nTo address these issues, we introduce Randomized Accelerated Natural Actor\nCritic, a method that integrates Multi-level Monte-Carlo and Natural Actor\nCritic. Our approach is the first to achieve a global convergence rate of\n$\\tilde{\\mathcal{O}}(1/\\sqrt{T})$ without requiring knowledge of mixing time,\nsignificantly surpassing the state-of-the-art bound of\n$\\tilde{\\mathcal{O}}(1/T^{1/4})$.\n","authors":["Swetha Ganesh","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2407.18878v1.pdf","comment":"28 pages, 1 table"},{"id":"http://arxiv.org/abs/2407.18875v1","updated":"2024-07-26T17:09:48Z","published":"2024-07-26T17:09:48Z","title":"Generative Adversarial Networks for Imputing Sparse Learning Performance","summary":" Learning performance data, such as correct or incorrect responses to\nquestions in Intelligent Tutoring Systems (ITSs), is crucial for tracking and\nassessing the learners' progress and mastery of knowledge. However, the issue\nof data sparsity, characterized by unexplored questions and missing attempts,\nhampers accurate assessment and the provision of tailored, personalized\ninstruction within ITSs. This paper proposes using the Generative Adversarial\nImputation Networks (GAIN) framework to impute sparse learning performance\ndata, reconstructed into a three-dimensional (3D) tensor representation across\nthe dimensions of learners, questions and attempts. Our customized GAIN-based\nmethod imputes sparse data in a 3D tensor space,\nsignificantly enhanced by convolutional neural networks for its input and\noutput layers. This adaptation also includes the use of a least squares loss\nfunction for optimization and aligns the shapes of the input and output with\nthe dimensions of the questions-attempts matrices along the learners'\ndimension. Through extensive experiments on six datasets from various ITSs,\nincluding AutoTutor, ASSISTments and MATHia, we demonstrate that the GAIN\napproach generally outperforms existing methods such as tensor factorization\nand other generative adversarial network (GAN) based approaches in terms of\nimputation accuracy.
This finding enhances comprehensive learning data modeling\nand analytics in AI-based education.\n","authors":["Liang Zhang","Mohammed Yeasin","Jionghao Lin","Felix Havugimana","Xiangen Hu"],"pdf_url":"https://arxiv.org/pdf/2407.18875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.00568v3","updated":"2024-07-26T17:04:53Z","published":"2022-09-01T16:19:22Z","title":"Distilling Multi-Scale Knowledge for Event Temporal Relation Extraction","summary":" Event Temporal Relation Extraction (ETRE) is paramount but challenging.\nWithin a discourse, event pairs are situated at different distances or the\nso-called proximity bands. The temporal ordering communicated about event pairs\nsituated at more remote (i.e., ``long'') or less remote (i.e., ``short'')\nproximity bands is encoded differently. SOTA models have tended to perform\nwell on events situated at either short or long proximity bands, but not both.\nNonetheless, real-world, natural texts contain all types of temporal\nevent-pairs. In this paper, we present MulCo: Distilling Multi-Scale Knowledge\nvia Contrastive Learning, a knowledge co-distillation approach that shares\nknowledge across multiple event pair proximity bands to improve performance on\nall types of temporal datasets. Our experimental results show that MulCo\nsuccessfully integrates linguistic cues pertaining to temporal reasoning across\nboth short and long proximity bands and achieves new state-of-the-art results\non several ETRE benchmark datasets.\n","authors":["Hao-Ren Yao","Luke Breitfeller","Aakanksha Naik","Chunxiao Zhou","Carolyn Rose"],"pdf_url":"https://arxiv.org/pdf/2209.00568v3.pdf","comment":"Accepted to CIKM 2024 Full Research Track, camera ready version"},{"id":"http://arxiv.org/abs/2407.18865v1","updated":"2024-07-26T16:52:30Z","published":"2024-07-26T16:52:30Z","title":"Downlink CCM Estimation via Representation Learning with Graph\n Regularization","summary":" In this paper, we propose an algorithm for downlink (DL) channel covariance\nmatrix (CCM) estimation for frequency division duplexing (FDD) massive\nmultiple-input multiple-output (MIMO) communication systems with base station\n(BS) possessing a uniform linear array (ULA) antenna structure. We make use of\nthe inherent similarity between the uplink (UL) CCM and the DL CCM due to\nangular reciprocity. We consider a setting where the UL CCM is mapped to DL CCM\nby a mapping function. We first present a theoretical error analysis of\nlearning a nonlinear embedding by constructing a mapping function, which points\nto the importance of the Lipschitz regularity of the mapping function for\nachieving high estimation performance. Then, based on the theoretical ground,\nwe propose a representation learning algorithm as a solution for the estimation\nproblem, where Gaussian RBF kernel interpolators are chosen to map UL CCMs to\ntheir DL counterparts. The proposed algorithm is based on the optimization of\nan objective function that fits a regression model between the DL CCM and UL\nCCM samples in the training dataset and preserves the local geometric structure\nof the data in the UL CCM space, while explicitly regulating the Lipschitz\ncontinuity of the mapping function in light of our theoretical findings.
The\nproposed algorithm surpasses benchmark methods in terms of three error metrics\nas shown by simulations.\n","authors":["Melih Can Zerin","Elif Vural","Ali Özgür Yılmaz"],"pdf_url":"https://arxiv.org/pdf/2407.18865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15871v2","updated":"2024-07-26T16:37:52Z","published":"2024-07-18T18:42:58Z","title":"Semantic Prototypes: Enhancing Transparency Without Black Boxes","summary":" As machine learning (ML) models and datasets increase in complexity, the\ndemand for methods that enhance explainability and interpretability becomes\nparamount. Prototypes, by encapsulating essential characteristics within data,\noffer insights that enable tactical decision-making and enhance transparency.\nTraditional prototype methods often rely on sub-symbolic raw data and opaque\nlatent spaces, reducing explainability and increasing the risk of\nmisinterpretations. This paper presents a novel framework that utilizes\nsemantic descriptions to define prototypes and provide clear explanations,\neffectively addressing the shortcomings of conventional methods. Our approach\nleverages concept-based descriptions to cluster data on the semantic level,\nensuring that prototypes not only represent underlying properties intuitively\nbut are also straightforward to interpret. Our method simplifies the\ninterpretative process and effectively bridges the gap between complex data\nstructures and human cognitive processes, thereby enhancing transparency and\nfostering trust. Our approach outperforms existing widely-used prototype\nmethods in facilitating human understanding and informativeness, as validated\nthrough a user survey.\n","authors":["Orfeas Menis-Mastromichalakis","Giorgos Filandrianos","Jason Liartis","Edmund Dervakos","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2407.15871v2.pdf","comment":"This paper has been accepted for publication as a full paper at the\n 33rd ACM International Conference on Information and Knowledge Management\n (CIKM 2024)"},{"id":"http://arxiv.org/abs/2404.16894v3","updated":"2024-07-26T16:25:15Z","published":"2024-04-25T01:57:11Z","title":"On TinyML and Cybersecurity: Electric Vehicle Charging Infrastructure\n Use Case","summary":" As technology advances, the use of Machine Learning (ML) in cybersecurity is\nbecoming increasingly crucial to tackle the growing complexity of cyber\nthreats. While traditional ML models can enhance cybersecurity, their high\nenergy and resource demands limit their applications, leading to the emergence\nof Tiny Machine Learning (TinyML) as a more suitable solution for\nresource-constrained environments. TinyML is widely applied in areas such as\nsmart homes, healthcare, and industrial automation. TinyML focuses on\noptimizing ML algorithms for small, low-power devices, enabling intelligent\ndata processing directly on edge devices. This paper provides a comprehensive\nreview of common challenges of TinyML techniques, such as power consumption,\nlimited memory, and computational constraints; it also explores potential\nsolutions to these challenges, such as energy harvesting, computational\noptimization techniques, and transfer learning for privacy preservation. On the\nother hand, this paper discusses TinyML's applications in advancing\ncybersecurity for Electric Vehicle Charging Infrastructures (EVCIs) as a\nrepresentative use case. 
It presents an experimental case study that enhances\ncybersecurity in EVCI using TinyML, evaluated against traditional ML in terms\nof reduced delay and memory usage, with a slight trade-off in accuracy.\nAdditionally, the study includes a practical setup using the ESP32\nmicrocontroller in the PlatformIO environment, which provides a hands-on\nassessment of TinyML's application in cybersecurity for EVCI.\n","authors":["Fatemeh Dehrouyeh","Li Yang","Firouz Badrkhani Ajaei","Abdallah Shami"],"pdf_url":"https://arxiv.org/pdf/2404.16894v3.pdf","comment":"Accepted and to appear in IEEE Access; Code is available at GitHub\n link: https://github.com/Western-OC2-Lab/TinyML_EVCI"},{"id":"http://arxiv.org/abs/2407.18847v1","updated":"2024-07-26T16:12:06Z","published":"2024-07-26T16:12:06Z","title":"Enhancing material property prediction with ensemble deep graph\n convolutional networks","summary":" Machine learning (ML) models have emerged as powerful tools for accelerating\nmaterials discovery and design by enabling accurate predictions of properties\nfrom compositional and structural data. These capabilities are vital for\ndeveloping advanced technologies across fields such as energy, electronics, and\nbiomedicine, potentially reducing the time and resources needed for new\nmaterial exploration and promoting rapid innovation cycles. Recent efforts have\nfocused on employing advanced ML algorithms, including deep learning - based\ngraph neural network, for property prediction. Additionally, ensemble models\nhave proven to enhance the generalizability and robustness of ML and DL.\nHowever, the use of such ensemble strategies in deep graph networks for\nmaterial property prediction remains underexplored. Our research provides an\nin-depth evaluation of ensemble strategies in deep learning - based graph\nneural network, specifically targeting material property prediction tasks. By\ntesting the Crystal Graph Convolutional Neural Network (CGCNN) and its\nmultitask version, MT-CGCNN, we demonstrated that ensemble techniques,\nespecially prediction averaging, substantially improve precision beyond\ntraditional metrics for key properties like formation energy per atom ($\\Delta\nE^{f}$), band gap ($E_{g}$) and density ($\\rho$) in 33,990 stable inorganic\nmaterials. These findings support the broader application of ensemble methods\nto enhance predictive accuracy in the field.\n","authors":["Chowdhury Mohammad Abid Rahman","Ghadendra Bhandari","Nasser M Nasrabadi","Aldo H. Romero","Prashnna K. Gyawali"],"pdf_url":"https://arxiv.org/pdf/2407.18847v1.pdf","comment":"9 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.16326v2","updated":"2024-07-26T16:11:23Z","published":"2024-07-23T09:21:38Z","title":"On The Expressive Power of Knowledge Graph Embedding Methods","summary":" Knowledge Graph Embedding (KGE) is a popular approach, which aims to\nrepresent entities and relations of a knowledge graph in latent spaces. Their\nrepresentations are known as embeddings. To measure the plausibility of\ntriplets, score functions are defined over embedding spaces. Despite wide\ndissemination of KGE in various tasks, KGE methods have limitations in\nreasoning abilities. In this paper we propose a mathematical framework to\ncompare reasoning abilities of KGE methods. 
We show that STransE has a higher\ncapability than TransComplEx, and then present a new STransCoRe method, which\nimproves STransE by combining it with TransCoRe insights, thereby reducing\nthe STransE space complexity.\n","authors":["Jiexing Gao","Dmitry Rodin","Vasily Motolygin","Denis Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2407.16326v2.pdf","comment":"This paper may involve data that is not readily available to the\n public"},{"id":"http://arxiv.org/abs/2402.17363v2","updated":"2024-07-26T16:06:39Z","published":"2024-02-27T09:55:34Z","title":"CGGM: A conditional graph generation model with adaptive sparsity for\n node anomaly detection in IoT networks","summary":" Dynamic graphs are extensively employed for detecting anomalous behavior in\nnodes within the Internet of Things (IoT). Generative models are often used to\naddress the issue of imbalanced node categories in dynamic graphs.\nNevertheless, the constraints they face include the monotonicity of adjacency\nrelationships, the difficulty in constructing multi-dimensional features for\nnodes, and the lack of a method for end-to-end generation of multiple\ncategories of nodes. This paper presents a novel graph generation model, called\nCGGM, designed specifically to generate a larger number of nodes belonging to\nthe minority class. The mechanism for generating an adjacency matrix, through\nadaptive sparsity, enhances flexibility in its structure. The feature\ngeneration module, called the multidimensional features generator (MFG), generates\nnode features along with topological information. Labels are transformed into\nembedding vectors, serving as conditional constraints to control the generation\nof synthetic data across multiple categories. Using a multi-stage loss, the\ndistribution of synthetic data is adjusted to closely resemble that of real\ndata. In extensive experiments, we show that CGGM's synthetic data outperforms\nstate-of-the-art methods across various metrics. Our results demonstrate\nefficient generation of diverse data categories, robustly enhancing\nmulti-category classification model performance.\n","authors":["Xianshi Su","Munan Li","Tongbang Jiang","Hao Long"],"pdf_url":"https://arxiv.org/pdf/2402.17363v2.pdf","comment":"13 pages, 19 figures"},{"id":"http://arxiv.org/abs/2407.18841v1","updated":"2024-07-26T16:05:26Z","published":"2024-07-26T16:05:26Z","title":"QT-TDM: Planning with Transformer Dynamics Model and Autoregressive\n Q-Learning","summary":" Inspired by the success of the Transformer architecture in natural language\nprocessing and computer vision, we investigate the use of Transformers in\nReinforcement Learning (RL), specifically in modeling the environment's\ndynamics using Transformer Dynamics Models (TDMs). We evaluate the capabilities\nof TDMs for continuous control in real-time planning scenarios with Model\nPredictive Control (MPC). While Transformers excel in long-horizon prediction,\ntheir tokenization mechanism and autoregressive nature lead to costly planning\nover long horizons, especially as the environment's dimensionality increases.\nTo alleviate this issue, we use a TDM for short-term planning, and learn an\nautoregressive discrete Q-function using a separate Q-Transformer (QT) model to\nestimate a long-term return beyond the short-horizon planning. Our proposed\nmethod, QT-TDM, integrates the robust predictive capabilities of Transformers\nas dynamics models with the efficacy of a model-free Q-Transformer to mitigate\nthe computational burden associated with real-time planning.
Experiments in\ndiverse state-based continuous control tasks show that QT-TDM is superior in\nperformance and sample efficiency compared to existing Transformer-based RL\nmodels while achieving fast and computationally efficient inference.\n","authors":["Mostafa Kotb","Cornelius Weber","Muhammad Burhan Hafez","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2407.18841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18840v1","updated":"2024-07-26T16:04:40Z","published":"2024-07-26T16:04:40Z","title":"The Cross-environment Hyperparameter Setting Benchmark for Reinforcement\n Learning","summary":" This paper introduces a new empirical methodology, the Cross-environment\nHyperparameter Setting Benchmark, that compares RL algorithms across\nenvironments using a single hyperparameter setting, encouraging algorithmic\ndevelopment which is insensitive to hyperparameters. We demonstrate that this\nbenchmark is robust to statistical noise and obtains qualitatively similar\nresults across repeated applications, even when using few samples. This\nrobustness makes the benchmark computationally cheap to apply, allowing\nstatistically sound insights at low cost. We demonstrate two example\ninstantiations of the CHS, on a set of six small control environments (SC-CHS)\nand on the entire DM Control suite of 28 environments (DMC-CHS). Finally, to\nillustrate the applicability of the CHS to modern RL algorithms on challenging\nenvironments, we conduct a novel empirical study of an open question in the\ncontinuous control literature. We show, with high confidence, that there is no\nmeaningful difference in performance between Ornstein-Uhlenbeck noise and\nuncorrelated Gaussian noise for exploration with the DDPG algorithm on the\nDMC-CHS.\n","authors":["Andrew Patterson","Samuel Neumann","Raksha Kumaraswamy","Martha White","Adam White"],"pdf_url":"https://arxiv.org/pdf/2407.18840v1.pdf","comment":"Accepted to RLC 2024"},{"id":"http://arxiv.org/abs/2407.18838v1","updated":"2024-07-26T16:00:20Z","published":"2024-07-26T16:00:20Z","title":"The Role of Temporal Hierarchy in Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) have the potential for rich spatio-temporal\nsignal processing thanks to exploiting both spatial and temporal parameters.\nThe temporal dynamics such as time constants of the synapses and neurons and\ndelays have been recently shown to have computational benefits that help reduce\nthe overall number of parameters required in the network and increase the\naccuracy of the SNNs in solving temporal tasks. Optimizing such temporal\nparameters, for example, through gradient descent, gives rise to a temporal\narchitecture for different problems. As has been shown in machine learning, to\nreduce the cost of optimization, architectural biases can be applied, in this\ncase in the temporal domain. Such inductive biases in temporal parameters have\nbeen found in neuroscience studies, highlighting a hierarchy of temporal\nstructure and input representation in different layers of the cortex. Motivated\nby this, we propose to impose a hierarchy of temporal representation in the\nhidden layers of SNNs, highlighting that such an inductive bias improves their\nperformance. We demonstrate the positive effects of temporal hierarchy in the\ntime constants of feed-forward SNNs applied to temporal tasks (Multi-Time-Scale\nXOR and Keyword Spotting, with a benefit of up to 4.1% in classification\naccuracy). Moreover, we show that such architectural biases, i.e. 
hierarchy of\ntime constants, naturally emerge when optimizing the time constants through\ngradient descent, initialized as homogeneous values. We further pursue this\nproposal in temporal convolutional SNNs, by introducing the hierarchical bias\nin the size and dilation of temporal kernels, giving rise to competitive\nresults in popular temporal spike-based datasets.\n","authors":["Filippo Moro","Pau Vilimelis Aceituno","Laura Kriener","Melika Payvand"],"pdf_url":"https://arxiv.org/pdf/2407.18838v1.pdf","comment":"16 pages, 9 figures, pre-print"},{"id":"http://arxiv.org/abs/2401.10805v3","updated":"2024-07-26T16:00:07Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" We introduce the novel concept of visually Connecting Actions and Their\nEffects (CATE) in video understanding. CATE can have applications in areas like\ntask planning and learning from demonstration. We identify and explore two\ndifferent aspects of the concept of CATE: Action Selection (AS) and\nEffect-Affinity Assessment (EAA), where video understanding models connect\nactions and effects at semantic and fine-grained levels, respectively. We\ndesign various baseline models for AS and EAA. Despite the intuitive nature of\nthe task, we observe that models struggle, and humans outperform them by a\nlarge margin. Our experiments show that in solving AS and EAA, models learn\nintuitive properties like object tracking and pose encoding without explicit\nsupervision. We demonstrate that CATE can be an effective self-supervised task\nfor learning video representations from unlabeled videos. The study aims to\nshowcase the fundamental nature and versatility of CATE, with the hope of\ninspiring advanced formulations and models.\n","authors":["Paritosh Parmar","Eric Peh","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15245v2","updated":"2024-07-26T15:48:23Z","published":"2024-07-21T19:05:30Z","title":"Weyl Calculus and Exactly Solvable Schrödinger Bridges with\n Quadratic State Cost","summary":" Schr\\\"{o}dinger bridge--a stochastic dynamical generalization of optimal mass\ntransport--exhibits a learning-control duality. Viewed as a stochastic control\nproblem, the Schr\\\"{o}dinger bridge finds an optimal control policy that steers\na given joint state statistics to another while minimizing the total control\neffort subject to controlled diffusion and deadline constraints. Viewed as a\nstochastic learning problem, the Schr\\\"{o}dinger bridge finds the most-likely\ndistribution-valued trajectory connecting endpoint distributional observations,\ni.e., solves the two point boundary-constrained maximum likelihood problem over\nthe manifold of probability distributions. Recent works have shown that solving\nthe Schr\\\"{o}dinger bridge problem with state cost requires finding the Markov\nkernel associated with a reaction-diffusion PDE where the state cost appears as\na state-dependent reaction rate. We explain how ideas from Weyl calculus in\nquantum mechanics, specifically the Weyl operator and the Weyl symbol, can help\ndetermine such Markov kernels. We illustrate these ideas by explicitly finding\nthe Markov kernel for the case of quadratic state cost via Weyl calculus,\nrecovering our earlier results but avoiding tedious computation with Hermite\npolynomials.\n","authors":["Alexis M. H. 
Teter","Wenqing Wang","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2407.15245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00019v2","updated":"2024-07-26T15:39:03Z","published":"2024-01-01T13:03:35Z","title":"Diffusion MRI with Machine Learning","summary":" Diffusion-weighted magnetic resonance imaging (dMRI) offers unique\ncapabilities including noninvasive probing of brain's tissue microstructure and\nstructural connectivity. It is widely used for clinical assessment of brain\npathologies and for neuroscience research. Analyzing the dMRI data to extract\nuseful information for medical and scientific purposes can be challenging. The\ndMRI measurements often suffer from strong noise and artifacts, there is\nusually high inter-session and inter-scanner variability in the data, and\nconsiderable inter-subject heterogeneity in brain structure. Moreover, the\nrelationship between measurements and the phenomena of interest can be highly\ncomplex. Recent years have witnessed increasing use of machine learning methods\nfor dMRI analysis. This manuscript aims to assess these efforts, with a focus\non methods that have addressed data preprocessing and harmonization,\nmicrostructure mapping, tractography, and white matter tract analysis. We study\nthe main findings, strengths, and weaknesses of the existing methods and\nsuggest topics for future research. We find that machine learning may be\nexceptionally suited to tackle some of the difficult tasks in dMRI analysis.\nHowever, for this to happen, several shortcomings of existing methods and\ncritical unresolved issues need to be addressed. These include deficient\nevaluation practices, lack of rich training datasets and validation benchmarks,\nas well as model generalizability, reliability, and explainability concerns.\n","authors":["Davood Karimi"],"pdf_url":"https://arxiv.org/pdf/2402.00019v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18821v1","updated":"2024-07-26T15:31:13Z","published":"2024-07-26T15:31:13Z","title":"Deep Companion Learning: Enhancing Generalization Through Historical\n Consistency","summary":" We propose Deep Companion Learning (DCL), a novel training method for Deep\nNeural Networks (DNNs) that enhances generalization by penalizing inconsistent\nmodel predictions compared to its historical performance. To achieve this, we\ntrain a deep-companion model (DCM), by using previous versions of the model to\nprovide forecasts on new inputs. This companion model deciphers a meaningful\nlatent semantic structure within the data, thereby providing targeted\nsupervision that encourages the primary model to address the scenarios it finds\nmost challenging. 
We validate our approach through both theoretical analysis\nand extensive experimentation, including ablation studies, on a variety of\nbenchmark datasets (CIFAR-100, Tiny-ImageNet, ImageNet-1K) using diverse\narchitectural models (ShuffleNetV2, ResNet, Vision Transformer, etc.),\ndemonstrating state-of-the-art performance.\n","authors":["Ruizhao Zhu","Venkatesh Saligrama"],"pdf_url":"https://arxiv.org/pdf/2407.18821v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.18812v1","updated":"2024-07-26T15:20:50Z","published":"2024-07-26T15:20:50Z","title":"Online Planning in POMDPs with State-Requests","summary":" In key real-world problems, full state information is sometimes available but\nonly at a high cost, like activating precise yet energy-intensive sensors or\nconsulting humans, thereby compelling the agent to operate under partial\nobservability. For this scenario, we propose AEMS-SR (Anytime Error\nMinimization Search with State Requests), a principled online planning\nalgorithm tailored for POMDPs with state requests. By representing the search\nspace as a graph instead of a tree, AEMS-SR avoids the exponential growth of\nthe search space originating from state requests. Theoretical analysis\ndemonstrates AEMS-SR's $\\varepsilon$-optimality, ensuring solution quality,\nwhile empirical evaluations illustrate its effectiveness compared with AEMS and\nPOMCP, two SOTA online planning algorithms. AEMS-SR enables efficient planning\nin domains characterized by partial observability and costly state requests\noffering practical benefits across various applications.\n","authors":["Raphael Avalos","Eugenio Bargiacchi","Ann Nowé","Diederik M. Roijers","Frans A. Oliehoek"],"pdf_url":"https://arxiv.org/pdf/2407.18812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18811v1","updated":"2024-07-26T15:20:42Z","published":"2024-07-26T15:20:42Z","title":"Interpreting artificial neural networks to detect genome-wide\n association signals for complex traits","summary":" Investigating the genetic architecture of complex diseases is challenging due\nto the highly polygenic and interactive landscape of genetic and environmental\nfactors. Although genome-wide association studies (GWAS) have identified\nthousands of variants for multiple complex phenotypes, conventional statistical\napproaches can be limited by simplified assumptions such as linearity and lack\nof epistasis models. In this work, we trained artificial neural networks for\npredicting complex traits using both simulated and real genotype/phenotype\ndatasets. We extracted feature importance scores via different post hoc\ninterpretability methods to identify potentially associated loci (PAL) for the\ntarget phenotype. Simulations we performed with various parameters demonstrated\nthat associated loci can be detected with good precision using strict selection\ncriteria, but downstream analyses are required for fine-mapping the exact\nvariants due to linkage disequilibrium, similarly to conventional GWAS. By\napplying our approach to the schizophrenia cohort in the Estonian Biobank, we\nwere able to detect multiple PAL related to this highly polygenic and heritable\ndisorder. We also performed enrichment analyses with PAL in genic regions,\nwhich predominantly identified terms associated with brain morphology. 
With\nfurther improvements in model optimization and confidence measures, artificial\nneural networks can enhance the identification of genomic loci associated with\ncomplex diseases, providing a more comprehensive approach for GWAS and serving\nas initial screening tools for subsequent functional studies.\n Keywords: Deep learning, interpretability, genome-wide association studies,\ncomplex diseases\n","authors":["Burak Yelmen","Maris Alver","Estonian Biobank Research Team","Flora Jay","Lili Milani"],"pdf_url":"https://arxiv.org/pdf/2407.18811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18808v1","updated":"2024-07-26T15:18:29Z","published":"2024-07-26T15:18:29Z","title":"Learning Chaotic Systems and Long-Term Predictions with Neural Jump ODEs","summary":" The Path-dependent Neural Jump ODE (PD-NJ-ODE) is a model for online\nprediction of generic (possibly non-Markovian) stochastic processes with\nirregular (in time) and potentially incomplete (with respect to coordinates)\nobservations. It is a model for which convergence to the $L^2$-optimal\npredictor, which is given by the conditional expectation, is established\ntheoretically. Thereby, the training of the model is solely based on a dataset\nof realizations of the underlying stochastic process, without the need of\nknowledge of the law of the process. In the case where the underlying process\nis deterministic, the conditional expectation coincides with the process\nitself. Therefore, this framework can equivalently be used to learn the\ndynamics of ODE or PDE systems solely from realizations of the dynamical system\nwith different initial conditions. We showcase the potential of our method by\napplying it to the chaotic system of a double pendulum. When training the\nstandard PD-NJ-ODE method, we see that the prediction starts to diverge from\nthe true path after about half of the evaluation time. In this work we enhance\nthe model with two novel ideas, which independently of each other improve the\nperformance of our modelling setup. The resulting dynamics match the true\ndynamics of the chaotic system very closely. The same enhancements can be used\nto provably enable the PD-NJ-ODE to learn long-term predictions for general\nstochastic datasets, where the standard model fails. This is verified in\nseveral experiments.\n","authors":["Florian Krach","Josef Teichmann"],"pdf_url":"https://arxiv.org/pdf/2407.18808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18807v1","updated":"2024-07-26T15:14:22Z","published":"2024-07-26T15:14:22Z","title":"Robust Learning in Bayesian Parallel Branching Graph Neural Networks:\n The Narrow Width Limit","summary":" The infinite width limit of random neural networks is known to result in\nNeural Networks as Gaussian Process (NNGP) (Lee et al. [2018]), characterized\nby task-independent kernels. It is widely accepted that larger network widths\ncontribute to improved generalization (Park et al. [2019]). However, this work\nchallenges this notion by investigating the narrow width limit of the Bayesian\nParallel Branching Graph Neural Network (BPB-GNN), an architecture that\nresembles residual networks. We demonstrate that when the width of a BPB-GNN is\nsignificantly smaller compared to the number of training examples, each branch\nexhibits more robust learning due to a symmetry breaking of branches in kernel\nrenormalization. 
Surprisingly, the performance of a BPB-GNN in the narrow width\nlimit is generally superior or comparable to that achieved in the wide width\nlimit in bias-limited scenarios. Furthermore, the readout norms of each branch\nin the narrow width limit are mostly independent of the architectural\nhyperparameters but generally reflective of the nature of the data. Our results\ncharacterize a newly defined narrow-width regime for parallel branching\nnetworks in general.\n","authors":["Zechen Zhang","Haim Sompolinsky"],"pdf_url":"https://arxiv.org/pdf/2407.18807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18802v1","updated":"2024-07-26T15:05:41Z","published":"2024-07-26T15:05:41Z","title":"Log-Concave Coupling for Sampling Neural Net Posteriors","summary":" In this work, we present a sampling algorithm for single hidden layer neural\nnetworks. This algorithm is built upon a recursive series of Bayesian\nposteriors using a method we call Greedy Bayes. Sampling of the Bayesian\nposterior for neuron weight vectors $w$ of dimension $d$ is challenging because\nof its multimodality. Our algorithm to tackle this problem is based on a\ncoupling of the posterior density for $w$ with an auxiliary random variable\n$\\xi$.\n The resulting reverse conditional $w|\\xi$ of neuron weights given auxiliary\nrandom variable is shown to be log concave. In the construction of the\nposterior distributions we provide some freedom in the choice of the prior. In\nparticular, for Gaussian priors on $w$ with suitably small variance, the\nresulting marginal density of the auxiliary variable $\\xi$ is proven to be\nstrictly log concave for all dimensions $d$. For a uniform prior on the unit\n$\\ell_1$ ball, evidence is given that the density of $\\xi$ is again strictly\nlog concave for sufficiently large $d$.\n The score of the marginal density of the auxiliary random variable $\\xi$ is\ndetermined by an expectation over $w|\\xi$ and thus can be computed by various\nrapidly mixing Markov Chain Monte Carlo methods. Moreover, the computation of\nthe score of $\\xi$ permits methods of sampling $\\xi$ by a stochastic diffusion\n(Langevin dynamics) with drift function built from this score. With such\ndynamics, information-theoretic methods pioneered by Bakry and Emery show that\naccurate sampling of $\\xi$ is obtained rapidly when its density is indeed\nstrictly log-concave. After which, one more draw from $w|\\xi$, produces neuron\nweights $w$ whose marginal distribution is from the desired posterior.\n","authors":["Curtis McDonald","Andrew R Barron"],"pdf_url":"https://arxiv.org/pdf/2407.18802v1.pdf","comment":"This research was presented at the International Symposium on\n Information Theory (ISIT). Athens, Greece, July 11, 2024. The material was\n also presented in the 2024 Shannon Lecture"},{"id":"http://arxiv.org/abs/2407.18792v1","updated":"2024-07-26T14:54:16Z","published":"2024-07-26T14:54:16Z","title":"Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical\n Imaging","summary":" Medical imaging cohorts are often confounded by factors such as acquisition\ndevices, hospital sites, patient backgrounds, and many more. As a result, deep\nlearning models tend to learn spurious correlations instead of causally related\nfeatures, limiting their generalizability to new and unseen data. This problem\ncan be addressed by minimizing dependence measures between intermediate\nrepresentations of task-related and non-task-related variables. 
These measures\ninclude mutual information, distance correlation, and the performance of\nadversarial classifiers. Here, we benchmark such dependence measures for the\ntask of preventing shortcut learning. We study a simplified setting using\nMorpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our\nresults provide insights into how to mitigate confounding factors in medical\nimaging.\n","authors":["Sarah Müller","Louisa Fay","Lisa M. Koch","Sergios Gatidis","Thomas Küstner","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2407.18792v1.pdf","comment":"Accepted to the 15th International Workshop on Machine Learning in\n Medical Imaging (MLMI 2024)"},{"id":"http://arxiv.org/abs/2407.18772v1","updated":"2024-07-26T14:32:18Z","published":"2024-07-26T14:32:18Z","title":"Learning production functions for supply chains with graph neural\n networks","summary":" The global economy relies on the flow of goods over supply chain networks,\nwith nodes as firms and edges as transactions between firms. While we may\nobserve these external transactions, they are governed by unseen production\nfunctions, which determine how firms internally transform the input products\nthey receive into output products that they sell. In this setting, it can be\nextremely valuable to infer these production functions, to better understand\nand improve supply chains, and to forecast future transactions more accurately.\nHowever, existing graph neural networks (GNNs) cannot capture these hidden\nrelationships between nodes' inputs and outputs. Here, we introduce a new class\nof models for this setting, by combining temporal GNNs with a novel inventory\nmodule, which learns production functions via attention weights and a special\nloss function. We evaluate our models extensively on real supply chains data,\nalong with data generated from our new open-source simulator, SupplySim. Our\nmodels successfully infer production functions, with a 6-50% improvement over\nbaselines, and forecast future transactions on real and synthetic data,\noutperforming baselines by 11-62%.\n","authors":["Serina Chang","Zhiyin Lin","Benjamin Yan","Swapnil Bembde","Qi Xiu","Chi Heem Wong","Yu Qin","Frank Kloster","Alex Luo","Raj Palleti","Jure Leskovec"],"pdf_url":"https://arxiv.org/pdf/2407.18772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18759v1","updated":"2024-07-26T14:14:57Z","published":"2024-07-26T14:14:57Z","title":"Unsupervised Reservoir Computing for Multivariate Denoising of Severely\n Contaminated Signals","summary":" The interdependence and high dimensionality of multivariate signals present\nsignificant challenges for denoising, as conventional univariate methods often\nstruggle to capture the complex interactions between variables. A successful\napproach must consider not only the multivariate dependencies of the desired\nsignal but also the multivariate dependencies of the interfering noise. In our\nprevious research, we introduced a method using machine learning to extract the\nmaximum portion of ``predictable information\" from univariate signal. We extend\nthis approach to multivariate signals, with the key idea being to properly\nincorporate the interdependencies of the noise back into the interdependent\nreconstruction of the signal. 
The method works successfully for various\nmultivariate signals, including chaotic signals and highly oscillating\nsinusoidal signals which are corrupted by spatially correlated intensive noise.\nIt consistently outperforms other existing multivariate denoising methods\nacross a wide range of scenarios.\n","authors":["Jaesung Choi","Pilwon Kim"],"pdf_url":"https://arxiv.org/pdf/2407.18759v1.pdf","comment":"6pages, 2figures, 2tables"},{"id":"http://arxiv.org/abs/2407.17671v2","updated":"2024-07-26T14:09:08Z","published":"2024-07-24T23:23:38Z","title":"Unsqueeze [CLS] Bottleneck to Learn Rich Representations","summary":" Distillation-based self-supervised learning typically leads to more\ncompressed representations due to its radical clustering process and the\nimplementation of a sharper target distribution. To overcome this limitation\nand preserve more information from input, we introduce UDI, conceptualized as\nUnsqueezed Distillation-based self-supervised learning (SSL). UDI enriches the\nlearned representation by encouraging multimodal prediction distilled from a\nconsolidated profile of local predictions that are derived via stratified\nsampling. Our evaluations show that UDI not only promotes semantically\nmeaningful representations at instance level, delivering superior or\ncompetitive results to state-of-the-art SSL methods in image classification,\nbut also effectively preserves the nuisance of input, which yields significant\nimprovement in dense prediction tasks, including object detection and\nsegmentation. Additionally, UDI performs competitively in low-shot image\nclassification, improving the scalability of joint-embedding pipelines. Various\nvisualizations and ablation studies are presented to further elucidate the\nmechanisms behind UDI. Our source code is available at\nhttps://github.com/ISL-CV/udi.\n","authors":["Qing Su","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2407.17671v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2311.12715v2","updated":"2024-07-26T14:08:00Z","published":"2023-11-21T16:42:03Z","title":"Attacks on fairness in Federated Learning","summary":" Federated Learning is an important emerging distributed training paradigm\nthat keeps data private on clients. It is now well understood that by\ncontrolling only a small subset of FL clients, it is possible to introduce a\nbackdoor to a federated learning model, in the presence of certain attributes.\nIn this paper, we present a new type of attack that compromises the fairness of\nthe trained model. Fairness is understood to be the attribute-level performance\ndistribution of a trained model. It is particularly salient in domains where,\nfor example, skewed accuracy discrimination between subpopulations could have\ndisastrous consequences. We find that by employing a threat model similar to\nthat of a backdoor attack, an attacker is able to influence the aggregated\nmodel to have an unfair performance distribution between any given set of\nattributes. Furthermore, we find that this attack is possible by controlling\nonly a single client. While combating naturally induced unfairness in FL has\npreviously been discussed in depth, its artificially induced kind has been\nneglected. 
We show that defending against attacks on fairness should be a\ncritical consideration in any situation where unfairness in a trained model\ncould benefit a user who participated in its training.\n","authors":["Joseph Rance","Filip Svoboda"],"pdf_url":"https://arxiv.org/pdf/2311.12715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18750v1","updated":"2024-07-26T14:04:57Z","published":"2024-07-26T14:04:57Z","title":"FLUE: Federated Learning with Un-Encrypted model weights","summary":" Federated Learning enables diverse devices to collaboratively train a shared\nmodel while keeping training data locally stored, avoiding the need for\ncentralized cloud storage. Despite existing privacy measures, concerns arise\nfrom potential reverse engineering of gradients, even with added noise,\nrevealing private data. To address this, recent research emphasizes using\nencrypted model parameters during training. This paper introduces a novel\nfederated learning algorithm, leveraging coded local gradients without\nencryption, exchanging coded proxies for model parameters, and injecting\nsurplus noise for enhanced privacy. Two algorithm variants are presented,\nshowcasing convergence and learning rates adaptable to coding schemes and raw\ndata characteristics. Two encryption-free implementations with fixed and random\ncoding matrices are provided, demonstrating promising simulation results from\nboth federated optimization and machine learning perspectives.\n","authors":["Elie Atallah"],"pdf_url":"https://arxiv.org/pdf/2407.18750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18745v1","updated":"2024-07-26T13:59:20Z","published":"2024-07-26T13:59:20Z","title":"FairAIED: Navigating Fairness, Bias, and Ethics in Educational AI\n Applications","summary":" The integration of Artificial Intelligence (AI) into education has\ntransformative potential, providing tailored learning experiences and creative\ninstructional approaches. However, the inherent biases in AI algorithms hinder\nthis improvement by unintentionally perpetuating prejudice against specific\ndemographics, especially in human-centered applications like education. This\nsurvey delves deeply into the developing topic of algorithmic fairness in\neducational contexts, providing a comprehensive evaluation of the diverse\nliterature on fairness, bias, and ethics in AI-driven educational applications.\nIt identifies the common forms of biases, such as data-related, algorithmic,\nand user-interaction, that fundamentally undermine the accomplishment of\nfairness in AI teaching aids. By outlining existing techniques for mitigating\nthese biases, ranging from varied data gathering to algorithmic fairness\ninterventions, the survey emphasizes the critical role of ethical\nconsiderations and legal frameworks in shaping a more equitable educational\nenvironment. Furthermore, it guides readers through the complexities of\nfairness measurements, methods, and datasets, shedding light on the way to bias\nreduction. Despite these gains, this survey highlights long-standing issues,\nsuch as achieving a balance between fairness and accuracy, as well as the need\nfor diverse datasets. 
Overcoming these challenges and ensuring the ethical and\nfair use of AI's promise in education call for a collaborative,\ninterdisciplinary approach.\n","authors":["Sribala Vidyadhari Chinta","Zichong Wang","Zhipeng Yin","Nhat Hoang","Matthew Gonzalez","Tai Le Quy","Wenbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11762v3","updated":"2024-07-26T13:52:14Z","published":"2023-11-20T13:40:40Z","title":"MUVO: A Multimodal World Model with Spatial Representations for\n Autonomous Driving","summary":" Learning unsupervised world models for autonomous driving has the potential\nto improve the reasoning capabilities of today's systems dramatically. However,\nmost work neglects the physical attributes of the world and focuses on sensor\ndata alone. We propose MUVO, a MUltimodal World Model with spatial VOxel\nrepresentations, to address this challenge. We utilize raw camera and lidar\ndata to learn a sensor-agnostic geometric representation of the world. We\ndemonstrate multimodal future predictions and show that our spatial\nrepresentation improves the prediction quality of both camera images and lidar\npoint clouds.\n","authors":["Daniel Bogdoll","Yitian Yang","Tim Joseph","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.11762v3.pdf","comment":"Daniel Bogdoll and Yitian Yang contributed equally"},{"id":"http://arxiv.org/abs/2403.04202v4","updated":"2024-07-26T13:47:17Z","published":"2024-03-07T04:12:24Z","title":"Dynamics of Moral Behavior in Heterogeneous Populations of Learning\n Agents","summary":" Growing concerns about safety and alignment of AI systems highlight the\nimportance of embedding moral capabilities in artificial agents: a promising\nsolution is the use of learning from experience, i.e., Reinforcement Learning.\nIn multi-agent (social) environments, complex population-level phenomena may\nemerge from interactions between individual learning agents. Many of the\nexisting studies rely on simulated social dilemma environments to study the\ninteractions of independent learning agents; however, they tend to ignore the\nmoral heterogeneity that is likely to be present in societies of agents in\npractice. For example, at different points in time a single learning agent may\nface opponents who are consequentialist (i.e., focused on maximizing outcomes\nover time), norm-based (i.e., conforming to specific norms), or virtue-based\n(i.e., considering a combination of different virtues). The extent to which\nagents' co-development may be impacted by such moral heterogeneity in\npopulations is not well understood. In this paper, we present a study of the\nlearning dynamics of morally heterogeneous populations interacting in a social\ndilemma setting. Using an Iterated Prisoner's Dilemma environment with a\npartner selection mechanism, we investigate the extent to which the prevalence\nof diverse moral agents in populations affects individual agents' learning\nbehaviors and emergent population-level outcomes. 
We observe several types of\nnon-trivial interactions between pro-social and anti-social agents, and find\nthat certain types of moral agents are able to steer selfish agents towards\nmore cooperative behavior.\n","authors":["Elizaveta Tennant","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2403.04202v4.pdf","comment":"Accepted at AIES 2024 (7th AAAI/ACM Conference on AI, Ethics, and\n Society - San Jose, CA, USA)"},{"id":"http://arxiv.org/abs/2402.15374v3","updated":"2024-07-26T13:47:11Z","published":"2024-02-23T15:19:37Z","title":"Outlier detection by ensembling uncertainty with negative objectness","summary":" Outlier detection is an essential capability in safety-critical applications\nof supervised visual recognition. Most of the existing methods deliver best\nresults by encouraging standard closed-set models to produce low-confidence\npredictions in negative training data. However, that approach conflates\nprediction uncertainty with recognition of the negative class. We therefore\nreconsider direct prediction of K+1 logits that correspond to K groundtruth\nclasses and one outlier class. This setup allows us to formulate a novel\nanomaly score as an ensemble of in-distribution uncertainty and the posterior\nof the outlier class which we term negative objectness. Now outliers can be\nindependently detected due to i) high prediction uncertainty or ii) similarity\nwith negative data. We embed our method into a dense prediction architecture\nwith mask-level recognition over K+2 classes. The training procedure encourages\nthe novel K+2-th class to learn negative objectness at pasted negative\ninstances. Our models outperform the current state-of-the art on standard\nbenchmarks for image-wide and pixel-level outlier detection with and without\ntraining on real negative data.\n","authors":["Anja Delić","Matej Grcić","Siniša Šegvić"],"pdf_url":"https://arxiv.org/pdf/2402.15374v3.pdf","comment":"Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2407.18735v1","updated":"2024-07-26T13:44:06Z","published":"2024-07-26T13:44:06Z","title":"AutoRDF2GML: Facilitating RDF Integration in Graph Machine Learning","summary":" In this paper, we introduce AutoRDF2GML, a framework designed to convert RDF\ndata into data representations tailored for graph machine learning tasks.\nAutoRDF2GML enables, for the first time, the creation of both content-based\nfeatures -- i.e., features based on RDF datatype properties -- and\ntopology-based features -- i.e., features based on RDF object properties.\nCharacterized by automated feature extraction, AutoRDF2GML makes it possible\neven for users less familiar with RDF and SPARQL to generate data\nrepresentations ready for graph machine learning tasks, such as link\nprediction, node classification, and graph classification. Furthermore, we\npresent four new benchmark datasets for graph machine learning, created from\nlarge RDF knowledge graphs using our framework. These datasets serve as\nvaluable resources for evaluating graph machine learning approaches, such as\ngraph neural networks. 
Overall, our framework effectively bridges the gap\nbetween the Graph Machine Learning and Semantic Web communities, paving the way\nfor RDF-based machine learning applications.\n","authors":["Michael Färber","David Lamprecht","Yuni Susanti"],"pdf_url":"https://arxiv.org/pdf/2407.18735v1.pdf","comment":"accepted at ISWC'24"},{"id":"http://arxiv.org/abs/2407.18732v1","updated":"2024-07-26T13:35:06Z","published":"2024-07-26T13:35:06Z","title":"A Physics-Informed Neural Network-Based Approach for the Spatial\n Upsampling of Spherical Microphone Arrays","summary":" Spherical microphone arrays are convenient tools for capturing the spatial\ncharacteristics of a sound field. However, achieving superior spatial\nresolution requires arrays with numerous capsules, consequently leading to\nexpensive devices. To address this issue, we present a method for spatially\nupsampling spherical microphone arrays with a limited number of capsules. Our\napproach exploits a physics-informed neural network with Rowdy activation\nfunctions, leveraging physical constraints to provide high-order microphone\narray signals, starting from low-order devices. Results show that, within its\ndomain of application, our approach outperforms a state of the art method based\non signal processing for spherical microphone arrays upsampling.\n","authors":["Federico Miotello","Ferdinando Terminiello","Mirco Pezzoli","Alberto Bernardini","Fabio Antonacci","Augusto Sarti"],"pdf_url":"https://arxiv.org/pdf/2407.18732v1.pdf","comment":"Accepted for publication at IWAENC 2024"},{"id":"http://arxiv.org/abs/2309.03731v2","updated":"2024-07-26T13:33:14Z","published":"2023-09-07T14:17:44Z","title":"Using representation balancing to learn conditional-average dose\n responses from clustered data","summary":" Estimating a unit's responses to interventions with an associated dose, the\n\"conditional average dose response\" (CADR), is relevant in a variety of\ndomains, from healthcare to business, economics, and beyond. Such a response\ntypically needs to be estimated from observational data, which introduces\nseveral challenges. That is why the machine learning (ML) community has\nproposed several tailored CADR estimators. Yet, the proposal of most of these\nmethods requires strong assumptions on the distribution of data and the\nassignment of interventions, which go beyond the standard assumptions in causal\ninference. Whereas previous works have so far focused on smooth shifts in\ncovariate distributions across doses, in this work, we will study estimating\nCADR from clustered data and where different doses are assigned to different\nsegments of a population. On a novel benchmarking dataset, we show the impacts\nof clustered data on model performance and propose an estimator, CBRNet, that\nlearns cluster-agnostic and hence dose-agnostic covariate representations\nthrough representation balancing for unbiased CADR inference. 
We run extensive\nexperiments to illustrate the workings of our method and compare it with the\nstate of the art in ML for CADR estimation.\n","authors":["Christopher Bockel-Rickermann","Toon Vanderschueren","Jeroen Berrevoets","Tim Verdonck","Wouter Verbeke"],"pdf_url":"https://arxiv.org/pdf/2309.03731v2.pdf","comment":"21 pages, 7 figures, v2: updated methodology and experiments"},{"id":"http://arxiv.org/abs/2406.04727v2","updated":"2024-07-26T13:24:41Z","published":"2024-06-07T08:19:59Z","title":"MMPolymer: A Multimodal Multitask Pretraining Framework for Polymer\n Property Prediction","summary":" Polymers are high-molecular-weight compounds constructed by the covalent\nbonding of numerous identical or similar monomers so that their 3D structures\nare complex yet exhibit unignorable regularity. Typically, the properties of a\npolymer, such as plasticity, conductivity, bio-compatibility, and so on, are\nhighly correlated with its 3D structure. However, existing polymer property\nprediction methods heavily rely on the information learned from polymer SMILES\nsequences (P-SMILES strings) while ignoring crucial 3D structural information,\nresulting in sub-optimal performance. In this work, we propose MMPolymer, a\nnovel multimodal multitask pretraining framework incorporating polymer 1D\nsequential and 3D structural information to encourage downstream polymer\nproperty prediction tasks. Besides, considering the scarcity of polymer 3D\ndata, we further introduce the \"Star Substitution\" strategy to extract 3D\nstructural information effectively. During pretraining, in addition to\npredicting masked tokens and recovering clear 3D coordinates, MMPolymer\nachieves the cross-modal alignment of latent representations. Then we further\nfine-tune the pretrained MMPolymer for downstream polymer property prediction\ntasks in the supervised learning paradigm. Experiments show that MMPolymer\nachieves state-of-the-art performance in downstream property prediction tasks.\nMoreover, given the pretrained MMPolymer, utilizing merely a single modality in\nthe fine-tuning phase can also outperform existing methods, showcasing the\nexceptional capability of MMPolymer in polymer feature extraction and\nutilization.\n","authors":["Fanmeng Wang","Wentao Guo","Minjie Cheng","Shen Yuan","Hongteng Xu","Zhifeng Gao"],"pdf_url":"https://arxiv.org/pdf/2406.04727v2.pdf","comment":"Accepted by the 33rd ACM International Conference on Information and\n Knowledge Management (CIKM 2024)"},{"id":"http://arxiv.org/abs/2407.18723v1","updated":"2024-07-26T13:18:42Z","published":"2024-07-26T13:18:42Z","title":"LLASP: Fine-tuning Large Language Models for Answer Set Programming","summary":" Recently, Large Language Models (LLMs) have showcased their potential in\nvarious natural language processing tasks, including code generation. However,\nwhile significant progress has been made in adapting LLMs to generate code for\nseveral imperative programming languages and tasks, there remains a notable gap\nin their application to declarative formalisms, such as Answer Set Programming\n(ASP). In this paper, we move a step towards exploring the capabilities of LLMs\nfor ASP code generation. First, we perform a systematic evaluation of several\nstate-of-the-art LLMs. Despite their power in terms of number of parameters,\ntraining data and computational resources, empirical results demonstrate\ninadequate performances in generating correct ASP programs. 
Therefore, we\npropose LLASP, a fine-tuned lightweight model specifically trained to encode\nfundamental ASP program patterns. To this aim, we create an ad-hoc dataset\ncovering a wide variety of fundamental problem specifications that can be\nencoded in ASP. Our experiments demonstrate that the quality of ASP programs\ngenerated by LLASP is remarkable. This holds true not only when compared to the\nnon-fine-tuned counterpart but also when compared to the majority of eager LLM\ncandidates, particularly from a semantic perspective. All the code and data\nused to perform the experiments are publicly available at\nhttps://anonymous.4open.science/r/LLASP-D86C/.\n","authors":["Erica Coppolillo","Francesco Calimeri","Giuseppe Manco","Simona Perri","Francesco Ricca"],"pdf_url":"https://arxiv.org/pdf/2407.18723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15439v2","updated":"2024-07-26T13:02:24Z","published":"2024-07-22T07:36:27Z","title":"Merit-based Fair Combinatorial Semi-Bandit with Unrestricted Feedback\n Delays","summary":" We study the stochastic combinatorial semi-bandit problem with unrestricted\nfeedback delays under merit-based fairness constraints. This is motivated by\napplications such as crowdsourcing, and online advertising, where immediate\nfeedback is not immediately available and fairness among different choices (or\narms) is crucial. We consider two types of unrestricted feedback delays:\nreward-independent delays where the feedback delays are independent of the\nrewards, and reward-dependent delays where the feedback delays are correlated\nwith the rewards. Furthermore, we introduce merit-based fairness constraints to\nensure a fair selection of the arms. We define the reward regret and the\nfairness regret and present new bandit algorithms to select arms under\nunrestricted feedback delays based on their merits. We prove that our\nalgorithms all achieve sublinear expected reward regret and expected fairness\nregret, with a dependence on the quantiles of the delay distribution. We also\nconduct extensive experiments using synthetic and real-world data and show that\nour algorithms can fairly select arms with different feedback delays.\n","authors":["Ziqun Chen","Kechao Cai","Zhuoyue Chen","Jinbei Zhang","John C. S. Lui"],"pdf_url":"https://arxiv.org/pdf/2407.15439v2.pdf","comment":"28 pages, 9 figures, accepted for 27th European Conference on\n Artificial Intelligence (ECAI 2024), Source code added"},{"id":"http://arxiv.org/abs/2407.18712v1","updated":"2024-07-26T12:57:54Z","published":"2024-07-26T12:57:54Z","title":"Cluster-norm for Unsupervised Probing of Knowledge","summary":" The deployment of language models brings challenges in generating reliable\ninformation, especially when these models are fine-tuned using human\npreferences. To extract encoded knowledge without (potentially) biased human\nlabels, unsupervised probing techniques like Contrast-Consistent Search (CCS)\nhave been developed (Burns et al., 2022). However, salient but unrelated\nfeatures in a given dataset can mislead these probes (Farquhar et al., 2023).\nAddressing this, we propose a cluster normalization method to minimize the\nimpact of such features by clustering and normalizing activations of contrast\npairs before applying unsupervised probing techniques. 
While this approach does\nnot address the issue of differentiating between knowledge in general and\nsimulated knowledge - a major issue in the literature of latent knowledge\nelicitation (Christiano et al., 2021) - it significantly improves the ability\nof unsupervised probes to identify the intended knowledge amidst distractions.\n","authors":["Walter Laurito","Sharan Maiya","Grégoire Dhimoïla"," Owen"," Yeung","Kaarel Hänni"],"pdf_url":"https://arxiv.org/pdf/2407.18712v1.pdf","comment":"34 pages, 35 figures"},{"id":"http://arxiv.org/abs/2407.18707v1","updated":"2024-07-26T12:45:53Z","published":"2024-07-26T12:45:53Z","title":"Finite Neural Networks as Mixtures of Gaussian Processes: From Provable\n Error Bounds to Prior Selection","summary":" Infinitely wide or deep neural networks (NNs) with independent and\nidentically distributed (i.i.d.) parameters have been shown to be equivalent to\nGaussian processes. Because of the favorable properties of Gaussian processes,\nthis equivalence is commonly employed to analyze neural networks and has led to\nvarious breakthroughs over the years. However, neural networks and Gaussian\nprocesses are equivalent only in the limit; in the finite case there are\ncurrently no methods available to approximate a trained neural network with a\nGaussian model with bounds on the approximation error. In this work, we present\nan algorithmic framework to approximate a neural network of finite width and\ndepth, and with not necessarily i.i.d. parameters, with a mixture of Gaussian\nprocesses with error bounds on the approximation error. In particular, we\nconsider the Wasserstein distance to quantify the closeness between\nprobabilistic models and, by relying on tools from optimal transport and\nGaussian processes, we iteratively approximate the output distribution of each\nlayer of the neural network as a mixture of Gaussian processes. Crucially, for\nany NN and $\\epsilon >0$ our approach is able to return a mixture of Gaussian\nprocesses that is $\\epsilon$-close to the NN at a finite set of input points.\nFurthermore, we rely on the differentiability of the resulting error bound to\nshow how our approach can be employed to tune the parameters of a NN to mimic\nthe functional behavior of a given Gaussian process, e.g., for prior selection\nin the context of Bayesian inference. We empirically investigate the\neffectiveness of our results on both regression and classification problems\nwith various neural network architectures. Our experiments highlight how our\nresults can represent an important step towards understanding neural network\npredictions and formally quantifying their uncertainty.\n","authors":["Steven Adams"," Patanè","Morteza Lahijanian","Luca Laurenti"],"pdf_url":"https://arxiv.org/pdf/2407.18707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18698v1","updated":"2024-07-26T12:23:54Z","published":"2024-07-26T12:23:54Z","title":"Adaptive Contrastive Search: Uncertainty-Guided Decoding for Open-Ended\n Text Generation","summary":" Decoding from the output distributions of large language models to produce\nhigh-quality text is a complex challenge in language modeling. Various\napproaches, such as beam search, sampling with temperature, $k-$sampling,\nnucleus $p-$sampling, typical decoding, contrastive decoding, and contrastive\nsearch, have been proposed to address this problem, aiming to improve\ncoherence, diversity, as well as resemblance to human-generated text. 
In this\nstudy, we introduce adaptive contrastive search, a novel decoding strategy\nextending contrastive search by incorporating an adaptive degeneration penalty,\nguided by the estimated uncertainty of the model at each generation step. This\nstrategy is designed to enhance both the creativity and diversity of the\nlanguage modeling process while at the same time producing coherent and\nhigh-quality generated text output. Our findings indicate performance\nenhancement in both aspects, across different model architectures and datasets,\nunderscoring the effectiveness of our method in text generation tasks. Our code\nbase, datasets, and models are publicly available.\n","authors":["Esteban Garces Arias","Julian Rodemann","Meimingwei Li","Christian Heumann","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2407.18698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18693v1","updated":"2024-07-26T12:17:57Z","published":"2024-07-26T12:17:57Z","title":"Deep learning for predicting the occurrence of tipping points","summary":" Tipping points occur in many real-world systems, at which the system shifts\nsuddenly from one state to another. The ability to predict the occurrence of\ntipping points from time series data remains an outstanding challenge and a\nmajor interest in a broad range of research fields. Particularly, the widely\nused methods based on bifurcation theory are neither reliable in prediction\naccuracy nor applicable for irregularly-sampled time series which are commonly\nobserved from real-world systems. Here we address this challenge by developing\na deep learning algorithm for predicting the occurrence of tipping points in\nuntrained systems, by exploiting information about normal forms. Our algorithm\nnot only outperforms traditional methods for regularly-sampled model time\nseries but also achieves accurate predictions for irregularly-sampled model\ntime series and empirical time series. Our ability to predict tipping points\nfor complex systems paves the way for mitigation of risks, prevention of\ncatastrophic failures, and restoration of degraded systems, with broad\napplications in social science, engineering, and biology.\n","authors":["Chengzuo Zhuge","Jiawei Li","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18691v1","updated":"2024-07-26T12:16:53Z","published":"2024-07-26T12:16:53Z","title":"Graph Neural Networks for Virtual Sensing in Complex Systems: Addressing\n Heterogeneous Temporal Dynamics","summary":" Real-time condition monitoring is crucial for the reliable and efficient\noperation of complex systems. However, relying solely on physical sensors can\nbe limited due to their cost, placement constraints, or inability to directly\nmeasure certain critical parameters. Virtual sensing addresses these\nlimitations by leveraging readily available sensor data and system knowledge to\nestimate inaccessible parameters or infer system states. The increasing\ncomplexity of industrial systems necessitates deployments of sensors with\ndiverse modalities to provide a comprehensive understanding of system states.\nThese sensors capture data at varying frequencies to monitor both rapid and\nslowly varying system dynamics, as well as local and global state evolutions of\nthe systems. This leads to heterogeneous temporal dynamics, which, particularly\nunder varying operational and environmental conditions, pose a significant\nchallenge for accurate virtual sensing. 
To address this, we propose a\nHeterogeneous Temporal Graph Neural Network (HTGNN) framework. HTGNN explicitly\nmodels signals from diverse sensors and integrates operating conditions into\nthe model architecture. We evaluate HTGNN using two newly released datasets: a\nbearing dataset with diverse load conditions for bearing load prediction and a\nyear-long simulated dataset for predicting bridge live loads. Our results\ndemonstrate that HTGNN significantly outperforms established baseline methods\nin both tasks, particularly under highly varying operating conditions. These\nresults highlight HTGNN's potential as a robust and accurate virtual sensing\napproach for complex systems, paving the way for improved monitoring,\npredictive maintenance, and enhanced system performance.\n","authors":["Mengjie Zhao","Cees Taal","Stephan Baggerohr","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2407.18691v1.pdf","comment":"This paper extends our previous conference paper (Best Paper at\n European Conference of the PHM Society 2024,\n https://doi.org/10.36001/phme.2024.v8i1.3998)"},{"id":"http://arxiv.org/abs/2302.07868v6","updated":"2024-07-26T11:59:06Z","published":"2023-02-15T18:59:27Z","title":"Target Specific De Novo Design of Drug Candidate Molecules with Graph\n Transformer-based Generative Adversarial Networks","summary":" Discovering novel drug candidate molecules is one of the most fundamental and\ncritical steps in drug development. Generative deep learning models, which\ncreate synthetic data given a probability distribution, offer a high potential\nfor designing de novo molecules. However, for them to be useful in real-life\ndrug development pipelines, these models should be able to design drug-like and\ntarget-centric molecules. In this study, we propose an end-to-end generative\nsystem, DrugGEN, for the de novo design of drug candidate molecules that\ninteract with intended target proteins. The proposed method represents\nmolecules as graphs and processes them via a generative adversarial network\ncomprising graph transformer layers. The system is trained using a large\ndataset of drug-like compounds and target-specific bioactive molecules to\ndesign effective inhibitory molecules against the AKT1 protein, which is\ncritically important in developing treatments for various types of cancer. We\nconducted molecular docking and dynamics to assess the target-centric\ngeneration performance of the model, as well as attention score visualisation\nto examine model interpretability. Results indicate that our de novo molecules\nhave a high potential for interacting with the AKT1 protein at the level of its\nnative ligands. Using the open-access DrugGEN codebase, it is possible to\neasily train models for other druggable proteins, given a dataset of\nexperimentally known bioactive molecules.\n","authors":["Atabey Ünlü","Elif Çevrim","Ahmet Sarıgün","Melih Gökay Yiğit","Hayriye Çelikbilek","Osman Bayram","Heval Ataş Güvenilir","Altay Koyaş","Deniz Cansen Kahraman","Abdurrahman Olğaç","Ahmet Rifaioğlu","Erden Banoğlu","Tunca Doğan"],"pdf_url":"https://arxiv.org/pdf/2302.07868v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18682v1","updated":"2024-07-26T11:56:23Z","published":"2024-07-26T11:56:23Z","title":"Rapid Object Annotation","summary":" In this report we consider the problem of rapidly annotating a video with\nbounding boxes for a novel object. 
We describe a UI and associated workflow\ndesigned to make this process fast for an arbitrary novel target.\n","authors":["Misha Denil"],"pdf_url":"https://arxiv.org/pdf/2407.18682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18213v2","updated":"2024-07-26T11:51:58Z","published":"2024-07-25T17:26:41Z","title":"Exploring Scaling Trends in LLM Robustness","summary":" Language model capabilities predictably improve from scaling a model's size\nand training data. Motivated by this, increasingly large language models have\nbeen trained, yielding an array of impressive capabilities. Yet these models\nare vulnerable to adversarial prompts, such as \"jailbreaks\" that hijack models\nto perform undesired behaviors, posing a significant risk of misuse. Prior work\nindicates that computer vision models become more robust with model and data\nscaling, raising the question: does language model robustness also improve with\nscale? We study this question empirically, finding that larger models respond\nsubstantially better to adversarial training, but there is little to no benefit\nfrom model scale in the absence of explicit defenses.\n","authors":["Nikolaus Howe","Michał Zajac","Ian McKenzie","Oskar Hollinsworth","Tom Tseng","Pierre-Luc Bacon","Adam Gleave"],"pdf_url":"https://arxiv.org/pdf/2407.18213v2.pdf","comment":"31 pages; edit fixed metadata typo (author name)"},{"id":"http://arxiv.org/abs/2407.18676v1","updated":"2024-07-26T11:38:18Z","published":"2024-07-26T11:38:18Z","title":"Right Now, Wrong Then: Non-Stationary Direct Preference Optimization\n under Preference Drift","summary":" Reinforcement learning from human feedback (RLHF) aligns Large Language\nModels (LLMs) with human preferences. However, these preferences can often\nchange over time due to external factors (e.g. environment change and societal\ninfluence). Consequently, what was wrong then might be right now. Current\npreference optimization algorithms do not account for temporal preference drift\nin their modeling, which can lead to severe misalignment. To address this\nlimitation, we use a Dynamic Bradley-Terry model that models preferences via\ntime-dependent reward functions, and propose Non-Stationary Direct Preference\nOptimisation (NS-DPO). By introducing a discount parameter in the loss\nfunction, NS-DPO applies exponential weighting, which proportionally focuses\nlearning on more time-relevant datapoints. We theoretically analyse the\nconvergence of NS-DPO in the offline setting, providing upper bounds on the\nestimation error caused by non-stationary preferences. Finally, we demonstrate\nthe effectiveness of NS-DPO1 for fine-tuning LLMs in scenarios with drifting\npreferences. 
By simulating preference drift using renowned reward models and\nmodifying popular LLM datasets accordingly, we show that NS-DPO fine-tuned LLMs\nremain robust under non-stationarity, significantly outperforming baseline\nalgorithms that ignore temporal preference changes, without sacrificing\nperformance in stationary cases.\n","authors":["Seongho Son","William Bankes","Sayak Ray Chowdhury","Brooks Paige","Ilija Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2407.18676v1.pdf","comment":"30 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.18675v1","updated":"2024-07-26T11:36:05Z","published":"2024-07-26T11:36:05Z","title":"A dual ensemble classifier used to recognise contaminated multi-channel\n EMG and MMG signals in the control of upper limb bioprosthesis","summary":" Myopotential pattern recognition to decode the intent of the user is the most\nadvanced approach to controlling a powered bioprosthesis. Unfortunately, many\nfactors make this a difficult problem and achieving acceptable recognition\nquality in real-world conditions is a serious challenge. The aim of the paper is\nto develop a recognition system that will mitigate factors related to\nmultimodality and multichannel recording of biosignals and their high\nsusceptibility to contamination. The proposed method involves the use of two\nco-operating multiclassifier systems. The first system is composed of one-class\nclassifiers related to individual electromyographic (EMG) and mechanomyographic\n(MMG) biosignal recording channels, and its task is to recognise contaminated\nchannels. The role of the second system is to recognise the class of movement\nresulting from the patient's intention. The ensemble system consists of base\nclassifiers using the representation (extracted features) of biosignals from\ndifferent channels. The system uses a dynamic selection mechanism, eliminating\nthose base classifiers that are associated with biosignal channels that are\nrecognised by the one-class ensemble system as being contaminated. Experimental\nstudies were conducted using signals from an able-bodied person with simulation\nof amputation. The results obtained allow us to reject the null hypothesis that\nthe application of the dual ensemble does not lead to improved classification\nquality.\n","authors":["Pawel Trajdos","Marek Kurzynski"],"pdf_url":"https://arxiv.org/pdf/2407.18675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18673v1","updated":"2024-07-26T11:30:22Z","published":"2024-07-26T11:30:22Z","title":"A Survey on Cell Nuclei Instance Segmentation and Classification:\n Leveraging Context and Attention","summary":" Manually annotating nuclei from the gigapixel Hematoxylin and Eosin\n(H&E)-stained Whole Slide Images (WSIs) is a laborious and costly task, meaning\nautomated algorithms for cell nuclei instance segmentation and classification\ncould alleviate the workload of pathologists and clinical researchers and at\nthe same time facilitate the automatic extraction of clinically interpretable\nfeatures. But due to high intra- and inter-class variability of nuclei\nmorphological and chromatic features, as well as H&E-stains susceptibility to\nartefacts, state-of-the-art algorithms cannot correctly detect and classify\ninstances with the necessary performance. In this work, we hypothesise context\nand attention inductive biases in artificial neural networks (ANNs) could\nincrease the generalization of algorithms for cell nuclei instance segmentation\nand classification. 
We conduct a thorough survey on context and attention\nmethods for cell nuclei instance segmentation and classification from\nH&E-stained microscopy imaging, while providing a comprehensive discussion of\nthe challenges being tackled with context and attention. Besides, we illustrate\nsome limitations of current approaches and present ideas for future research.\nAs a case study, we extend both a general instance segmentation and\nclassification method (Mask-RCNN) and a tailored cell nuclei instance\nsegmentation and classification model (HoVer-Net) with context- and\nattention-based mechanisms, and do a comparative analysis on a multi-centre\ncolon nuclei identification and counting dataset. Although pathologists rely on\ncontext at multiple levels while paying attention to specific Regions of\nInterest (RoIs) when analysing and annotating WSIs, our findings suggest\ntranslating that domain knowledge into algorithm design is no trivial task, but\nto fully exploit these mechanisms, the scientific understanding of these\nmethods should be addressed.\n","authors":["João D. Nunes","Diana Montezuma","Domingos Oliveira","Tania Pereira","Jaime S. Cardoso"],"pdf_url":"https://arxiv.org/pdf/2407.18673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17596v2","updated":"2024-07-26T11:26:43Z","published":"2024-07-24T19:02:01Z","title":"Quality Assured: Rethinking Annotation Strategies in Imaging AI","summary":" This paper does not describe a novel method. Instead, it studies an essential\nfoundation for reliable benchmarking and ultimately real-world application of\nAI-based image analysis: generating high-quality reference annotations.\nPrevious research has focused on crowdsourcing as a means of outsourcing\nannotations. However, little attention has so far been given to annotation\ncompanies, specifically regarding their internal quality assurance (QA)\nprocesses. Therefore, our aim is to evaluate the influence of QA employed by\nannotation companies on annotation quality and devise methodologies for\nmaximizing data annotation efficacy. Based on a total of 57,648 instance\nsegmented images obtained from a total of 924 annotators and 34 QA workers from\nfour annotation companies and Amazon Mechanical Turk (MTurk), we derived the\nfollowing insights: (1) Annotation companies perform better both in terms of\nquantity and quality compared to the widely used platform MTurk. (2) Annotation\ncompanies' internal QA only provides marginal improvements, if any. However,\nimproving labeling instructions instead of investing in QA can substantially\nboost annotation performance. (3) The benefit of internal QA depends on\nspecific image characteristics. Our work could enable researchers to derive\nsubstantially more value from a fixed annotation budget and change the way\nannotation companies conduct internal QA.\n","authors":["Tim Rädsch","Annika Reinke","Vivienn Weru","Minu D. Tizabi","Nicholas Heller","Fabian Isensee","Annette Kopp-Schneider","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2407.17596v2.pdf","comment":"Accepted at ECCV 2024, preprint, Computer Vision, Data Annotation"},{"id":"http://arxiv.org/abs/2309.07986v2","updated":"2024-07-26T11:14:21Z","published":"2023-09-14T18:52:16Z","title":"Viewpoint Textual Inversion: Discovering Scene Representations and 3D\n View Control in 2D Diffusion Models","summary":" Text-to-image diffusion models generate impressive and realistic images, but\ndo they learn to represent the 3D world from only 2D supervision? 
We\ndemonstrate that yes, certain 3D scene representations are encoded in the text\nembedding space of models like Stable Diffusion. Our approach, Viewpoint Neural\nTextual Inversion (ViewNeTI), is to discover 3D view tokens; these tokens\ncontrol the 3D viewpoint - the rendering pose in a scene - of generated images.\nSpecifically, we train a small neural mapper to take continuous camera\nviewpoint parameters and predict a view token (a word embedding). This token\nconditions diffusion generation via cross-attention to produce images with the\ndesired camera viewpoint. Using ViewNeTI as an evaluation tool, we report two\nfindings: first, the text latent space has a continuous view-control manifold\nfor particular 3D scenes; second, we find evidence for a generalized\nview-control manifold for all scenes. We conclude that since the view token\ncontrols the 3D `rendering' viewpoint, there is likely a scene representation\nembedded in frozen 2D diffusion models. Finally, we exploit the 3D scene\nrepresentations for 3D vision tasks, namely, view-controlled text-to-image\ngeneration, and novel view synthesis from a single image, where our approach\nsets state-of-the-art for LPIPS. Code available at\nhttps://github.com/jmhb0/view_neti\n","authors":["James Burgess","Kuan-Chieh Wang","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2309.07986v2.pdf","comment":"ECCV 2024 (European Conference on Computer Vision). Project page:\n https://jmhb0.github.io/view_neti/"},{"id":"http://arxiv.org/abs/2402.06357v3","updated":"2024-07-26T11:08:07Z","published":"2024-02-09T12:07:06Z","title":"The SkipSponge Attack: Sponge Weight Poisoning of Deep Neural Networks","summary":" Sponge attacks aim to increase the energy consumption and computation time of\nneural networks deployed on hardware accelerators. Existing sponge attacks can\nbe performed during inference via sponge examples or during training via Sponge\nPoisoning. Sponge examples leverage perturbations added to the model's input to\nincrease energy and latency, while Sponge Poisoning alters the objective\nfunction of a model to induce inference-time energy effects. In this work, we\npropose a novel sponge attack called SkipSponge. SkipSponge is the first sponge\nattack that is performed directly on the parameters of a pre-trained model\nusing only a few data samples. Our experiments show that SkipSponge can\nsuccessfully increase the energy consumption of image classification models,\nGANs, and autoencoders with fewer samples required than Sponge Poisoning. We\nshow that poisoning defenses are ineffective if not adjusted specifically for\nthe defense against SkipSponge (i.e., they decrease target layer bias values).\nOur work shows that SkipSponge is more effective on the GANs and the\nautoencoders than the state-of-the-art. Additionally, SkipSponge is stealthier\nthan the previous Sponge Poisoning attack as it does not require significant\nchanges in the victim model's weights. 
Our experiments indicate that the\nSkipSponge attack can be performed even when an attacker has access to only 1%\nof the entire dataset and reaches up to 13% energy increase.\n","authors":["Jona te Lintelo","Stefanos Koffas","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2402.06357v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18731v2","updated":"2024-07-26T10:50:43Z","published":"2024-04-29T14:17:52Z","title":"Real Time Multi Organ Classification on Computed Tomography Images","summary":" Organ segmentation is a fundamental task in medical imaging since it is\nuseful for many clinical automation pipelines. However, some tasks do not\nrequire full segmentation. Instead, a classifier can identify the selected\norgan without segmenting the entire volume. In this study, we demonstrate a\nclassifier based method to obtain organ labels in real time by using a large\ncontext size with a sparse data sampling strategy. Although our method operates\nas an independent classifier at query locations, it can generate full\nsegmentations by querying grid locations at any resolution, offering faster\nperformance than segmentation algorithms. We compared our method with existing\nsegmentation techniques, demonstrating its superior runtime potential for\npractical applications in medical imaging.\n","authors":["Halid Ziya Yerebakan","Yoshihisa Shinagawa","Gerardo Hermosillo Valadez"],"pdf_url":"https://arxiv.org/pdf/2404.18731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18658v1","updated":"2024-07-26T10:49:14Z","published":"2024-07-26T10:49:14Z","title":"Adversarial Robustification via Text-to-Image Diffusion Models","summary":" Adversarial robustness has been conventionally believed as a challenging\nproperty to encode for neural networks, requiring plenty of training data. In\nthe recent paradigm of adopting off-the-shelf models, however, access to their\ntraining data is often infeasible or not practical, while most of such models\nare not originally trained concerning adversarial robustness. In this paper, we\ndevelop a scalable and model-agnostic solution to achieve adversarial\nrobustness without using any data. Our intuition is to view recent\ntext-to-image diffusion models as \"adaptable\" denoisers that can be optimized\nto specify target tasks. Based on this, we propose: (a) to initiate a\ndenoise-and-classify pipeline that offers provable guarantees against\nadversarial attacks, and (b) to leverage a few synthetic reference images\ngenerated from the text-to-image model that enables novel adaptation schemes.\nOur experiments show that our data-free scheme applied to the pre-trained CLIP\ncould improve the (provable) adversarial robustness of its diverse zero-shot\nclassification derivatives (while maintaining their accuracy), significantly\nsurpassing prior approaches that utilize the full training data. 
Not only for\nCLIP, we also demonstrate that our framework is easily applicable for\nrobustifying other visual classifiers efficiently.\n","authors":["Daewon Choi","Jongheon Jeong","Huiwon Jang","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2407.18658v1.pdf","comment":"Code is available at https://github.com/ChoiDae1/robustify-T2I"},{"id":"http://arxiv.org/abs/2407.18655v1","updated":"2024-07-26T10:45:27Z","published":"2024-07-26T10:45:27Z","title":"Aspects of importance sampling in parameter selection for neural\n networks using ridgelet transform","summary":" The choice of parameters in neural networks is crucial in the performance,\nand an oracle distribution derived from the ridgelet transform enables us to\nobtain suitable initial parameters. In other words, the distribution of\nparameters is connected to the integral representation of target functions. The\noracle distribution allows us to avoid the conventional backpropagation\nlearning process; only a linear regression is enough to construct the neural\nnetwork in simple cases. This study provides a new look at the oracle\ndistributions and ridgelet transforms, i.e., an aspect of importance sampling.\nIn addition, we propose extensions of the parameter sampling methods. We\ndemonstrate the aspect of importance sampling and the proposed sampling\nalgorithms via one-dimensional and high-dimensional examples; the results imply\nthat the magnitude of weight parameters could be more crucial than the\nintercept parameters.\n","authors":["Hikaru Homma","Jun Ohkubo"],"pdf_url":"https://arxiv.org/pdf/2407.18655v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.18650v1","updated":"2024-07-26T10:37:29Z","published":"2024-07-26T10:37:29Z","title":"Achieving interpretable machine learning by functional decomposition of\n black-box models into explainable predictor effects","summary":" Machine learning (ML) has seen significant growth in both popularity and\nimportance. The high prediction accuracy of ML models is often achieved through\ncomplex black-box architectures that are difficult to interpret. This\ninterpretability problem has been hindering the use of ML in fields like\nmedicine, ecology and insurance, where an understanding of the inner workings\nof the model is paramount to ensure user acceptance and fairness. The need for\ninterpretable ML models has boosted research in the field of interpretable\nmachine learning (IML). Here we propose a novel approach for the functional\ndecomposition of black-box predictions, which is considered a core concept of\nIML. The idea of our method is to replace the prediction function by a\nsurrogate model consisting of simpler subfunctions. Similar to additive\nregression models, these functions provide insights into the direction and\nstrength of the main feature contributions and their interactions. Our method\nis based on a novel concept termed stacked orthogonality, which ensures that\nthe main effects capture as much functional behavior as possible and do not\ncontain information explained by higher-order interactions. Unlike earlier\nfunctional IML approaches, it is neither affected by extrapolation nor by\nhidden feature interactions. 
To compute the subfunctions, we propose an\nalgorithm based on neural additive modeling and an efficient post-hoc\northogonalization procedure.\n","authors":["David Köhler","David Rügamer","Matthias Schmid"],"pdf_url":"https://arxiv.org/pdf/2407.18650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18648v1","updated":"2024-07-26T10:29:16Z","published":"2024-07-26T10:29:16Z","title":"Fast and Reliable Probabilistic Reflectometry Inversion with\n Prior-Amortized Neural Posterior Estimation","summary":" Reconstructing the structure of thin films and multilayers from measurements\nof scattered X-rays or neutrons is key to progress in physics, chemistry, and\nbiology. However, finding all structures compatible with reflectometry data is\ncomputationally prohibitive for standard algorithms, which typically results in\nunreliable analysis with only a single potential solution identified. We\naddress this lack of reliability with a probabilistic deep learning method that\nidentifies all realistic structures in seconds, setting new standards in\nreflectometry. Our method, Prior-Amortized Neural Posterior Estimation (PANPE),\ncombines simulation-based inference with novel adaptive priors that inform the\ninference network about known structural properties and controllable\nexperimental conditions. PANPE networks support key scenarios such as\nhigh-throughput sample characterization, real-time monitoring of evolving\nstructures, or the co-refinement of several experimental data sets, and can be\nadapted to provide fast, reliable, and flexible inference across many other\ninverse problems.\n","authors":["Vladimir Starostin","Maximilian Dax","Alexander Gerlach","Alexander Hinderhofer","Álvaro Tejero-Cantero","Frank Schreiber"],"pdf_url":"https://arxiv.org/pdf/2407.18648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18645v1","updated":"2024-07-26T10:26:44Z","published":"2024-07-26T10:26:44Z","title":"Contrastive Learning of Asset Embeddings from Financial Time Series","summary":" Representation learning has emerged as a powerful paradigm for extracting\nvaluable latent features from complex, high-dimensional data. In financial\ndomains, learning informative representations for assets can be used for tasks\nlike sector classification, and risk management. However, the complex and\nstochastic nature of financial markets poses unique challenges. We propose a\nnovel contrastive learning framework to generate asset embeddings from\nfinancial time series data. Our approach leverages the similarity of asset\nreturns over many subwindows to generate informative positive and negative\nsamples, using a statistical sampling strategy based on hypothesis testing to\naddress the noisy nature of financial data. We explore various contrastive loss\nfunctions that capture the relationships between assets in different ways to\nlearn a discriminative representation space. Experiments on real-world datasets\ndemonstrate the effectiveness of the learned asset embeddings on benchmark\nindustry classification and portfolio optimization tasks. 
In each case our\nnovel approaches significantly outperform existing baselines highlighting the\npotential for contrastive learning to capture meaningful and actionable\nrelationships in financial data.\n","authors":["Rian Dolphin","Barry Smyth","Ruihai Dong"],"pdf_url":"https://arxiv.org/pdf/2407.18645v1.pdf","comment":"9 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.18639v1","updated":"2024-07-26T10:09:44Z","published":"2024-07-26T10:09:44Z","title":"Vulnerability Detection in Ethereum Smart Contracts via Machine\n Learning: A Qualitative Analysis","summary":" Smart contracts are central to a myriad of critical blockchain applications,\nfrom financial transactions to supply chain management. However, their adoption\nis hindered by security vulnerabilities that can result in significant\nfinancial losses. Most vulnerability detection tools and methods available\nnowadays leverage either static analysis methods or machine learning.\nUnfortunately, as valuable as they are, both approaches suffer from limitations\nthat make them only partially effective. In this survey, we analyze the state\nof the art in machine-learning vulnerability detection for Ethereum smart\ncontracts, by categorizing existing tools and methodologies, evaluating them,\nand highlighting their limitations. Our critical assessment unveils issues such\nas restricted vulnerability coverage and dataset construction flaws, providing\nus with new metrics to overcome the difficulties that restrain a sound\ncomparison of existing solutions. Driven by our findings, we discuss best\npractices to enhance the accuracy, scope, and efficiency of vulnerability\ndetection in smart contracts. Our guidelines address the known flaws while at\nthe same time opening new avenues for research and development. By shedding\nlight on current challenges and offering novel directions for improvement, we\ncontribute to the advancement of secure smart contract development and\nblockchain technology as a whole.\n","authors":["Dalila Ressi","Alvise Spanò","Lorenzo Benetollo","Carla Piazza","Michele Bugliesi","Sabina Rossi"],"pdf_url":"https://arxiv.org/pdf/2407.18639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18632v1","updated":"2024-07-26T09:55:34Z","published":"2024-07-26T09:55:34Z","title":"Robust VAEs via Generating Process of Noise Augmented Data","summary":" Advancing defensive mechanisms against adversarial attacks in generative\nmodels is a critical research topic in machine learning. Our study focuses on a\nspecific type of generative models - Variational Auto-Encoders (VAEs). Contrary\nto common beliefs and existing literature which suggest that noise injection\ntowards training data can make models more robust, our preliminary experiments\nrevealed that naive usage of noise augmentation technique did not substantially\nimprove VAE robustness. In fact, it even degraded the quality of learned\nrepresentations, making VAEs more susceptible to adversarial perturbations.\nThis paper introduces a novel framework that enhances robustness by\nregularizing the latent space divergence between original and noise-augmented\ndata. Through incorporating a paired probabilistic prior into the standard\nvariational lower bound, our method significantly boosts defense against\nadversarial attacks. 
Our empirical evaluations demonstrate that this approach,\ntermed Robust Augmented Variational Auto-ENcoder (RAVEN), yields superior\nperformance in resisting adversarial inputs on widely-recognized benchmark\ndatasets.\n","authors":["Hiroo Irobe","Wataru Aoki","Kimihiro Yamazaki","Yuhui Zhang","Takumi Nakagawa","Hiroki Waida","Yuichiro Wada","Takafumi Kanamori"],"pdf_url":"https://arxiv.org/pdf/2407.18632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17000v6","updated":"2024-07-26T09:49:42Z","published":"2023-05-26T14:59:28Z","title":"DistriBlock: Identifying adversarial audio samples by leveraging\n characteristics of the output distribution","summary":" Adversarial attacks can mislead automatic speech recognition (ASR) systems\ninto predicting an arbitrary target text, thus posing a clear security threat.\nTo prevent such attacks, we propose DistriBlock, an efficient detection\nstrategy applicable to any ASR system that predicts a probability distribution\nover output tokens in each time step. We measure a set of characteristics of\nthis distribution: the median, maximum, and minimum over the output\nprobabilities, the entropy of the distribution, as well as the Kullback-Leibler\nand the Jensen-Shannon divergence with respect to the distributions of the\nsubsequent time step. Then, by leveraging the characteristics observed for both\nbenign and adversarial data, we apply binary classifiers, including simple\nthreshold-based classification, ensembles of such classifiers, and neural\nnetworks. Through extensive analysis across different state-of-the-art ASR\nsystems and language data sets, we demonstrate the supreme performance of this\napproach, with a mean area under the receiver operating characteristic curve\nfor distinguishing target adversarial examples against clean and noisy data of\n99% and 97%, respectively. To assess the robustness of our method, we show that\nadaptive adversarial examples that can circumvent DistriBlock are much noisier,\nwhich makes them easier to detect through filtering and creates another avenue\nfor preserving the system's robustness.\n","authors":["Matías P. Pizarro B.","Dorothea Kolossa","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2305.17000v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18629v1","updated":"2024-07-26T09:40:30Z","published":"2024-07-26T09:40:30Z","title":"CardioLab: Laboratory Values Estimation from Electrocardiogram Features\n -- An Exploratory Study","summary":" Introduction: Laboratory value represents a cornerstone of medical\ndiagnostics, but suffers from slow turnaround times, and high costs and only\nprovides information about a single point in time. The continuous estimation of\nlaboratory values from non-invasive data such as electrocardiogram (ECG) would\ntherefore mark a significant frontier in healthcare monitoring. Despite its\ntransformative potential, this domain remains relatively underexplored within\nthe medical community.\n Methods: In this preliminary study, we used a publicly available dataset\n(MIMIC-IV-ECG) to investigate the feasibility of inferring laboratory values\nfrom ECG features and patient demographics using tree-based models (XGBoost).\nWe define the prediction task as a binary prediction problem of predicting\nwhether the lab value falls into low or high abnormalities. 
The model\nperformance can then be assessed using AUROC.\n Results: Our findings demonstrate promising results in the estimation of\nlaboratory values related to different organ systems based on a small yet\ncomprehensive set of features. While further research and validation are\nwarranted to fully assess the clinical utility and generalizability of\nECG-based estimation in healthcare monitoring, our findings lay the groundwork\nfor future investigations into approaches to laboratory value estimation using\nECG data. Such advancements hold promise for revolutionizing predictive\nhealthcare applications, offering faster, non-invasive, and more affordable\nmeans of patient monitoring.\n","authors":["Juan Miguel Lopez Alcaraz","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2407.18629v1.pdf","comment":"5 pages, code under https://github.com/AI4HealthUOL/CardioLab"},{"id":"http://arxiv.org/abs/2311.05241v2","updated":"2024-07-26T09:39:01Z","published":"2023-11-09T09:49:50Z","title":"When Meta-Learning Meets Online and Continual Learning: A Survey","summary":" Over the past decade, deep neural networks have demonstrated significant\nsuccess using the training scheme that involves mini-batch stochastic gradient\ndescent on extensive datasets. Expanding upon this accomplishment, there has\nbeen a surge in research exploring the application of neural networks in other\nlearning scenarios. One notable framework that has garnered significant\nattention is meta-learning. Often described as \"learning to learn,\"\nmeta-learning is a data-driven approach to optimize the learning algorithm.\nOther branches of interest are continual learning and online learning, both of\nwhich involve incrementally updating a model with streaming data. While these\nframeworks were initially developed independently, recent works have started\ninvestigating their combinations, proposing novel problem settings and learning\nalgorithms. However, due to the elevated complexity and lack of unified\nterminology, discerning differences between the learning frameworks can be\nchallenging even for experienced researchers. To facilitate a clear\nunderstanding, this paper provides a comprehensive survey that organizes\nvarious problem settings using consistent terminology and formal descriptions.\nBy offering an overview of these learning paradigms, our work aims to foster\nfurther advancements in this promising area of research.\n","authors":["Jaehyeon Son","Soochan Lee","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2311.05241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18627v1","updated":"2024-07-26T09:35:50Z","published":"2024-07-26T09:35:50Z","title":"Multi-Agent Deep Reinforcement Learning for Energy Efficient Multi-Hop\n STAR-RIS-Assisted Transmissions","summary":" Simultaneously transmitting and reflecting reconfigurable intelligent surface\n(STAR-RIS) provides a promising way to expand coverage in wireless\ncommunications. However, limitation of single STAR-RIS inspire us to integrate\nthe concept of multi-hop transmissions, as focused on RIS in existing research.\nTherefore, we propose the novel architecture of multi-hop STAR-RISs to achieve\na wider range of full-plane service coverage. In this paper, we intend to solve\nactive beamforming of the base station and passive beamforming of STAR-RISs,\naiming for maximizing the energy efficiency constrained by hardware limitation\nof STAR-RISs. Furthermore, we investigate the impact of the on-off state of\nSTAR-RIS elements on energy efficiency. 
To tackle the complex problem, a\nMulti-Agent Global and locAl deep Reinforcement learning (MAGAR) algorithm is\ndesigned. The global agent elevates the collaboration among local agents, which\nfocus on individual learning. In numerical results, we observe the significant\nimprovement of MAGAR compared to the other benchmarks, including Q-learning,\nmulti-agent deep Q network (DQN) with global reward, and multi-agent DQN with\nlocal rewards. Moreover, the proposed architecture of multi-hop STAR-RISs\nachieves the highest energy efficiency compared to mode switching based\nSTAR-RISs, conventional RISs and deployment without RISs or STAR-RISs.\n","authors":["Pei-Hsiang Liao","Li-Hsiang Shen","Po-Chen Wu","Kai-Ten Feng"],"pdf_url":"https://arxiv.org/pdf/2407.18627v1.pdf","comment":"Accepted by Proc. IEEE VTC-fall"},{"id":"http://arxiv.org/abs/2407.18624v1","updated":"2024-07-26T09:33:53Z","published":"2024-07-26T09:33:53Z","title":"Dual-Decoupling Learning and Metric-Adaptive Thresholding for\n Semi-Supervised Multi-Label Learning","summary":" Semi-supervised multi-label learning (SSMLL) is a powerful framework for\nleveraging unlabeled data to reduce the expensive cost of collecting precise\nmulti-label annotations. Unlike semi-supervised learning, one cannot select the\nmost probable label as the pseudo-label in SSMLL due to multiple semantics\ncontained in an instance. To solve this problem, the mainstream method\ndeveloped an effective thresholding strategy to generate accurate\npseudo-labels. Unfortunately, the method neglected the quality of model\npredictions and its potential impact on pseudo-labeling performance. In this\npaper, we propose a dual-perspective method to generate high-quality\npseudo-labels. To improve the quality of model predictions, we perform\ndual-decoupling to boost the learning of correlative and discriminative\nfeatures, while refining the generation and utilization of pseudo-labels. To\nobtain proper class-wise thresholds, we propose the metric-adaptive\nthresholding strategy to estimate the thresholds, which maximize the\npseudo-label performance for a given metric on labeled data. Experiments on\nmultiple benchmark datasets show the proposed method can achieve the\nstate-of-the-art performance and outperform the comparative methods with a\nsignificant margin.\n","authors":["Jia-Hao Xiao","Ming-Kun Xie","Heng-Bo Fan","Gang Niu","Masashi Sugiyama","Sheng-Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2407.18624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18609v1","updated":"2024-07-26T09:00:18Z","published":"2024-07-26T09:00:18Z","title":"Denoising Lévy Probabilistic Models","summary":" Investigating noise distribution beyond Gaussian in diffusion generative\nmodels is an open problem. The Gaussian case has seen success experimentally\nand theoretically, fitting a unified SDE framework for score-based and\ndenoising formulations. Recent studies suggest heavy-tailed noise distributions\ncan address mode collapse and manage datasets with class imbalance, heavy\ntails, or outliers. Yoon et al. (NeurIPS 2023) introduced the L\\'evy-Ito model\n(LIM), extending the SDE framework to heavy-tailed SDEs with $\\alpha$-stable\nnoise. Despite its theoretical elegance and performance gains, LIM's complex\nmathematics may limit its accessibility and broader adoption. This study takes\na simpler approach by extending the denoising diffusion probabilistic model\n(DDPM) with $\\alpha$-stable noise, creating the denoising L\\'evy probabilistic\nmodel (DLPM). 
Using elementary proof techniques, we show DLPM reduces to\nrunning vanilla DDPM with minimal changes, allowing the use of existing\nimplementations with minimal changes. DLPM and LIM have different training\nalgorithms and, unlike the Gaussian case, they admit different backward\nprocesses and sampling algorithms. Our experiments demonstrate that DLPM\nachieves better coverage of data distribution tail, improved generation of\nunbalanced datasets, and faster computation times with fewer backward steps.\n","authors":["Dario Shariatian","Umut Simsekli","Alain Durmus"],"pdf_url":"https://arxiv.org/pdf/2407.18609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18607v1","updated":"2024-07-26T08:59:26Z","published":"2024-07-26T08:59:26Z","title":"Using GPT-4 to guide causal machine learning","summary":" Since its introduction to the public, ChatGPT has had an unprecedented\nimpact. While some experts praised AI advancements and highlighted their\npotential risks, others have been critical about the accuracy and usefulness of\nLarge Language Models (LLMs). In this paper, we are interested in the ability\nof LLMs to identify causal relationships. We focus on the well-established\nGPT-4 (Turbo) and evaluate its performance under the most restrictive\nconditions, by isolating its ability to infer causal relationships based solely\non the variable labels without being given any context, demonstrating the\nminimum level of effectiveness one can expect when it is provided with\nlabel-only information. We show that questionnaire participants judge the GPT-4\ngraphs as the most accurate in the evaluated categories, closely followed by\nknowledge graphs constructed by domain experts, with causal Machine Learning\n(ML) far behind. We use these results to highlight the important limitation of\ncausal ML, which often produces causal graphs that violate common sense,\naffecting trust in them. However, we show that pairing GPT-4 with causal ML\novercomes this limitation, resulting in graphical structures learnt from real\ndata that align more closely with those identified by domain experts, compared\nto structures learnt by causal ML alone. Overall, our findings suggest that\ndespite GPT-4 not being explicitly designed to reason causally, it can still be\na valuable tool for causal representation, as it improves the causal discovery\nprocess of causal ML algorithms that are designed to do just that.\n","authors":["Anthony C. Constantinou","Neville K. Kitson","Alessio Zanga"],"pdf_url":"https://arxiv.org/pdf/2407.18607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18606v1","updated":"2024-07-26T08:56:13Z","published":"2024-07-26T08:56:13Z","title":"A data balancing approach designing of an expert system for Heart\n Disease Prediction","summary":" Heart disease is a major global health concern that results in millions of\ndeaths annually. Prevention and effective treatment of heart-related problems\ndepend heavily on early detection and accurate prediction. It was previously\npredicted accurately with machine learning methods. This innovative development\nin healthcare has the power to transform preventative care and save a great\ndeal of lives. The study starts with a thorough assessment of the literature\nthat covers a wide range of topics, including pre-processing techniques,\nperformance evaluation measures, datasets used in heart disease research,\npredictive modeling strategies, diagnostic methodologies, and current issues in\nthe field. 
Building on these fundamental understandings, the background section\ndescribes the particular actions conducted in this investigation, such as the\ndescription of the dataset, data pre-treatment techniques, label encoding,\nfeature selection methodology, algorithm selection tactics, and stringent\nperformance evaluation techniques. The results indicate that ensemble methods,\nparticularly random forests, outperformed individual classifiers in predicting\nheart disease. Key predictors identified included hypertension, cholesterol\nlevels, smoking status, and physical inactivity. The Decision Tree and Random\nForest model achieved an accuracy of 99.83%. This work demonstrates how machine\nlearning models, particularly ensemble approaches, can increase the precision\nof heart disease prediction. In comparison to conventional techniques, the\nmodels offer a more reliable risk assessment since they integrate a wide range\nof variables and sophisticated algorithms. The results open the door to\ntailored healthcare treatments that facilitate early identification and\ntreatment of cardiac disease.\n","authors":["Rahul Karmakar","Udita Ghosh","Arpita Pal","Sattwiki Dey","Debraj Malik","Priyabrata Sain"],"pdf_url":"https://arxiv.org/pdf/2407.18606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18601v1","updated":"2024-07-26T08:41:58Z","published":"2024-07-26T08:41:58Z","title":"Climbing the Complexity Ladder with Expressive Attention","summary":" Attention involves comparing query and key vectors in terms of a scalar\nproduct, $\\mathbf{Q}^T\\mathbf{K}$, together with a subsequent softmax\nnormalization. Classically, parallel/orthogonal/antiparallel queries and keys\nlead to large/intermediate/small attention weights. Here we study expressive\nattention (EA), which is based on $(\\mathbf{Q}^T\\mathbf{K})^2$, the squared dot\nproduct. In this case attention is enhanced when query and key are either\nparallel or antiparallel, and suppressed for orthogonal configurations. For a\nseries of autoregressive prediction tasks, we find that EA performs at least as\nwell as the standard mechanism, dot-product attention (DPA). Increasing task\ncomplexity, EA is observed to outperform DPA with increasing margins, which\nalso holds for multi-task settings. For a given model size, EA manages to\nachieve 100\\% performance for a range of complexity levels not accessible to\nDPA.\n","authors":["Claudius Gros"],"pdf_url":"https://arxiv.org/pdf/2407.18601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18597v1","updated":"2024-07-26T08:37:14Z","published":"2024-07-26T08:37:14Z","title":"Reinforcement Learning for Sustainable Energy: A Survey","summary":" The transition to sustainable energy is a key challenge of our time,\nrequiring modifications in the entire pipeline of energy production, storage,\ntransmission, and consumption. At every stage, new sequential decision-making\nchallenges emerge, ranging from the operation of wind farms to the management\nof electrical grids or the scheduling of electric vehicle charging stations.\nAll such problems are well suited for reinforcement learning, the branch of\nmachine learning that learns behavior from data. Therefore, numerous studies\nhave explored the use of reinforcement learning for sustainable energy. This\npaper surveys this literature with the intention of bridging both the\nunderlying research communities: energy and machine learning. 
After a brief\nintroduction of both fields, we systematically list relevant sustainability\nchallenges, how they can be modeled as a reinforcement learning problem, and\nwhat solution approaches currently exist in the literature. Afterwards, we zoom\nout and identify overarching reinforcement learning themes that appear\nthroughout sustainability, such as multi-agent, offline, and safe reinforcement\nlearning. Lastly, we also cover standardization of environments, which will be\ncrucial for connecting both research fields, and highlight potential directions\nfor future work. In summary, this survey provides an extensive overview of\nreinforcement learning methods for sustainable energy, which may play a vital\nrole in the energy transition.\n","authors":["Koen Ponse","Felix Kleuker","Márton Fejér","Álvaro Serra-Gómez","Aske Plaat","Thomas Moerland"],"pdf_url":"https://arxiv.org/pdf/2407.18597v1.pdf","comment":"22 pages excluding references, 40 pages including references, 7\n images"},{"id":"http://arxiv.org/abs/2311.10777v5","updated":"2024-07-26T08:22:07Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis: Domains,\n Methods, and Trends","summary":" Aspect-based Sentiment Analysis (ABSA) is a fine-grained type of sentiment\nanalysis that identifies aspects and their associated opinions from a given\ntext. With the surge of digital opinionated text data, ABSA gained increasing\npopularity for its ability to mine more detailed and targeted insights. Many\nreview papers on ABSA subtasks and solution methodologies exist, however, few\nfocus on trends over time or systemic issues relating to research application\ndomains, datasets, and solution approaches. To fill the gap, this paper\npresents a Systematic Literature Review (SLR) of ABSA studies with a focus on\ntrends and high-level relationships among these fundamental components. This\nreview is one of the largest SLRs on ABSA. To our knowledge, it is also the\nfirst to systematically examine the interrelations among ABSA research and data\ndistribution across domains, as well as trends in solution paradigms and\napproaches. Our sample includes 727 primary studies screened from 8550 search\nresults without time constraints via an innovative automatic filtering process.\nOur quantitative analysis not only identifies trends in nearly two decades of\nABSA research development but also unveils a systemic lack of dataset and\ndomain diversity as well as domain mismatch that may hinder the development of\nfuture ABSA research. We discuss these findings and their implications and\npropose suggestions for future research.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17856v2","updated":"2024-07-26T08:18:27Z","published":"2024-07-25T08:21:46Z","title":"MDS-ED: Multimodal Decision Support in the Emergency Department -- a\n Benchmark Dataset for Diagnoses and Deterioration Prediction in Emergency\n Medicine","summary":" Background: Benchmarking medical decision support algorithms often struggles\ndue to limited access to datasets, narrow prediction tasks, and restricted\ninput modalities. 
These limitations affect their clinical relevance and\nperformance in high-stakes areas like emergency care, complicating replication,\nvalidation, and improvement of benchmarks.\n Methods: We introduce a dataset based on MIMIC-IV, benchmarking protocol, and\ninitial results for evaluating multimodal decision support in the emergency\ndepartment (ED). We use diverse data modalities from the first 1.5 hours of\npatient arrival, including demographics, biometrics, vital signs, lab values,\nand electrocardiogram waveforms. We analyze 1443 clinical labels across two\ncontexts: predicting diagnoses with ICD-10 codes and forecasting patient\ndeterioration.\n Results: Our multimodal diagnostic model achieves an AUROC score over 0.8 in\na statistically significant manner for 357 out of 1428 conditions, including\ncardiac issues like myocardial infarction and non-cardiac conditions such as\nrenal disease and diabetes. The deterioration model scores above 0.8 in a\nstatistically significant manner for 13 out of 15 targets, including critical\nevents like cardiac arrest and mechanical ventilation, ICU admission as well as\nshort- and long-term mortality. Incorporating raw waveform data significantly\nimproves model performance, which represents one of the first robust\ndemonstrations of this effect.\n Conclusions: This study highlights the uniqueness of our dataset, which\nencompasses a wide range of clinical tasks and utilizes a comprehensive set of\nfeatures collected early during the emergency after arriving at the ED. The\nstrong performance, as evidenced by high AUROC scores across diagnostic and\ndeterioration targets, underscores the potential of our approach to\nrevolutionize decision-making in acute and emergency medicine.\n","authors":["Juan Miguel Lopez Alcaraz","Hjalmar Bouma","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2407.17856v2.pdf","comment":"14 pages, 1 figure, code available under\n https://github.com/AI4HealthUOL/MDS-ED"},{"id":"http://arxiv.org/abs/2407.00465v2","updated":"2024-07-26T07:57:18Z","published":"2024-06-29T15:21:20Z","title":"Characterizing Continual Learning Scenarios and Strategies for Audio\n Analysis","summary":" Audio analysis is useful in many application scenarios. The state-of-the-art\naudio analysis approaches assume the data distribution at training and\ndeployment time will be the same. However, due to various real-life challenges,\nthe data may encounter drift in its distribution or can encounter new classes\nin the late future. Thus, a one-time trained model might not perform\nadequately. Continual learning (CL) approaches are devised to handle such\nchanges in data distribution. There have been a few attempts to use CL\napproaches for audio analysis. Yet, there is a lack of a systematic evaluation\nframework. In this paper, we create a comprehensive CL dataset and characterize\nCL approaches for audio-based monitoring tasks. We have investigated the\nfollowing CL and non-CL approaches: EWC, LwF, SI, GEM, A-GEM, GDumb, Replay,\nNaive, Cumulative, and Joint training. The study is very beneficial for\nresearchers and practitioners working in the area of audio analysis for\ndeveloping adaptive models. We observed that Replay achieved better results\nthan other methods in the DCASE challenge data. 
It achieved an accuracy of\n70.12% for the domain incremental scenario and an accuracy of 96.98% for the\nclass incremental scenario.\n","authors":["Ruchi Bhatt","Pratibha Kumari","Dwarikanath Mahapatra","Abdulmotaleb El Saddik","Mukesh Saini"],"pdf_url":"https://arxiv.org/pdf/2407.00465v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18569v1","updated":"2024-07-26T07:51:11Z","published":"2024-07-26T07:51:11Z","title":"PP-TIL: Personalized Planning for Autonomous Driving with Instance-based\n Transfer Imitation Learning","summary":" Personalized motion planning holds significant importance within urban\nautomated driving, catering to the unique requirements of individual users.\nNevertheless, prior endeavors have frequently encountered difficulties in\nsimultaneously addressing two crucial aspects: personalized planning within\nintricate urban settings and enhancing planning performance through data\nutilization. The challenge arises from the expensive and limited nature of user\ndata, coupled with the scene state space tending towards infinity. These\nfactors contribute to overfitting and poor generalization problems during model\ntraining. Henceforth, we propose an instance-based transfer imitation learning\napproach. This method facilitates knowledge transfer from extensive expert\ndomain data to the user domain, presenting a fundamental resolution to these\nissues. We initially train a pre-trained model using large-scale expert data.\nSubsequently, during the fine-tuning phase, we feed the batch data, which\ncomprises expert and user data. Employing the inverse reinforcement learning\ntechnique, we extract the style feature distribution from user demonstrations,\nconstructing the regularization term for the approximation of user style. In\nour experiments, we conducted extensive evaluations of the proposed method.\nCompared to the baseline methods, our approach mitigates the overfitting issue\ncaused by sparse user data. Furthermore, we discovered that integrating the\ndriving model with a differentiable nonlinear optimizer as a safety protection\nlayer for end-to-end personalized fine-tuning results in superior planning\nperformance.\n","authors":["Fangze Lin","Ying He","Fei Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18564v1","updated":"2024-07-26T07:40:54Z","published":"2024-07-26T07:40:54Z","title":"Unveiling Privacy Vulnerabilities: Investigating the Role of Structure\n in Graph Data","summary":" The public sharing of user information opens the door for adversaries to\ninfer private data, leading to privacy breaches and facilitating malicious\nactivities. While numerous studies have concentrated on privacy leakage via\npublic user attributes, the threats associated with the exposure of user\nrelationships, particularly through network structure, are often neglected.\nThis study aims to fill this critical gap by advancing the understanding and\nprotection against privacy risks emanating from network structure, moving\nbeyond direct connections with neighbors to include the broader implications of\nindirect network structural patterns. To achieve this, we first investigate the\nproblem of Graph Privacy Leakage via Structure (GPS), and introduce a novel\nmeasure, the Generalized Homophily Ratio, to quantify the various mechanisms\ncontributing to privacy breach risks in GPS. 
Based on this insight, we develop\na novel graph private attribute inference attack, which acts as a pivotal tool\nfor evaluating the potential for privacy leakage through network structures\nunder worst-case scenarios. To protect users' private data from such\nvulnerabilities, we propose a graph data publishing method incorporating a\nlearnable graph sampling technique, effectively transforming the original graph\ninto a privacy-preserving version. Extensive experiments demonstrate that our\nattack model poses a significant threat to user privacy, and our graph data\npublishing method successfully achieves the optimal privacy-utility trade-off\ncompared to baselines.\n","authors":["Hanyang Yuan","Jiarong Xu","Cong Wang","Ziqi Yang","Chunping Wang","Keting Yin","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.18564v1.pdf","comment":"In KDD'24; with full appendix"},{"id":"http://arxiv.org/abs/2407.18556v1","updated":"2024-07-26T07:10:27Z","published":"2024-07-26T07:10:27Z","title":"Look Globally and Reason: Two-stage Path Reasoning over Sparse Knowledge\n Graphs","summary":" Sparse Knowledge Graphs (KGs), frequently encountered in real-world\napplications, contain fewer facts in the form of (head entity, relation, tail\nentity) compared to more populated KGs. The sparse KG completion task, which\nreasons answers for given queries in the form of (head entity, relation, ?) for\nsparse KGs, is particularly challenging due to the necessity of reasoning\nmissing facts based on limited facts. Path-based models, known for excellent\nexplainability, are often employed for this task. However, existing path-based\nmodels typically rely on external models to fill in missing facts and\nsubsequently perform path reasoning. This approach introduces unexplainable\nfactors or necessitates meticulous rule design. In light of this, this paper\nproposes an alternative approach by looking inward instead of seeking external\nassistance. We introduce a two-stage path reasoning model called LoGRe (Look\nGlobally and Reason) over sparse KGs. LoGRe constructs a relation-path\nreasoning schema by globally analyzing the training data to alleviate the\nsparseness problem. Based on this schema, LoGRe then aggregates paths to reason\nout answers. Experimental results on five benchmark sparse KG datasets\ndemonstrate the effectiveness of the proposed LoGRe model.\n","authors":["Saiping Guan","Jiyao Wei","Xiaolong Jin","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.18556v1.pdf","comment":"Accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2312.14552v2","updated":"2024-07-26T07:08:24Z","published":"2023-12-22T09:28:30Z","title":"Machine learning for structure-guided materials and process design","summary":" In recent years, there has been a growing interest in accelerated materials\ninnovation in the context of the process-structure-property chain. In this\nregard, it is essential to take into account manufacturing processes and tailor\nmaterials design approaches to support downstream process design approaches. As\na major step into this direction, we present a holistic optimization approach\nthat covers the entire process-structure-property chain in materials\nengineering. Our approach specifically employs machine learning to address two\ncritical identification problems: a materials design problem, which involves\nidentifying near-optimal material structures that exhibit desired properties,\nand a process design problem that is to find an optimal processing path to\nmanufacture these structures. 
Both identification problems are typically\nill-posed, which presents a significant challenge for solution approaches.\nHowever, the non-unique nature of these problems offers an important advantage\nfor processing: By having several target structures that perform similarly\nwell, processes can be efficiently guided towards manufacturing the best\nreachable structure. The functionality of the approach will be demonstrated\nmanufacturing crystallographic textures with desired properties in a metal\nforming process.\n","authors":["Lukas Morand","Tarek Iraki","Johannes Dornheim","Stefan Sandfeld","Norbert Link","Dirk Helm"],"pdf_url":"https://arxiv.org/pdf/2312.14552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12902v2","updated":"2024-07-26T07:08:19Z","published":"2023-11-21T11:04:13Z","title":"Enhancing Solutions for Complex PDEs: Introducing Complementary\n Convolution and Equivariant Attention in Fourier Neural Operators","summary":" Neural operators improve conventional neural networks by expanding their\ncapabilities of functional mappings between different function spaces to solve\npartial differential equations (PDEs). One of the most notable methods is the\nFourier Neural Operator (FNO), which draws inspiration from Green's function\nmethod and directly approximates operator kernels in the frequency domain.\nHowever, after empirical observation followed by theoretical validation, we\ndemonstrate that the FNO approximates kernels primarily in a relatively\nlow-frequency domain. This suggests a limited capability in solving complex\nPDEs, particularly those characterized by rapid coefficient changes and\noscillations in the solution space. Such cases are crucial in specific\nscenarios, like atmospheric convection and ocean circulation. To address this\nchallenge, inspired by the translation equivariant of the convolution kernel,\nwe propose a novel hierarchical Fourier neural operator along with\nconvolution-residual layers and attention mechanisms to make them complementary\nin the frequency domain to solve complex PDEs. We perform experiments on\nforward and reverse problems of multiscale elliptic equations, Navier-Stokes\nequations, and other physical scenarios, and find that the proposed method\nachieves superior performance in these PDE benchmarks, especially for equations\ncharacterized by rapid coefficient variations.\n","authors":["Xuanle Zhao","Yue Sun","Tielin Zhang","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12902v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.18552v1","updated":"2024-07-26T07:05:04Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. 
Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Vigya Sharma","Santhosh Malarvannan"],"pdf_url":"https://arxiv.org/pdf/2407.18552v1.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2407.18544v1","updated":"2024-07-26T06:50:17Z","published":"2024-07-26T06:50:17Z","title":"Utilising Explainable Techniques for Quality Prediction in a Complex\n Textiles Manufacturing Use Case","summary":" This paper develops an approach to classify instances of product failure in a\ncomplex textiles manufacturing dataset using explainable techniques. The\ndataset used in this study was obtained from a New Zealand manufacturer of\nwoollen carpets and rugs. In investigating the trade-off between accuracy and\nexplainability, three different tree-based classification algorithms were\nevaluated: a Decision Tree and two ensemble methods, Random Forest and XGBoost.\nAdditionally, three feature selection methods were also evaluated: the\nSelectKBest method, using chi-squared as the scoring function, the Pearson\nCorrelation Coefficient, and the Boruta algorithm. Not surprisingly, the\nensemble methods typically produced better results than the Decision Tree\nmodel. The Random Forest model yielded the best results overall when combined\nwith the Boruta feature selection technique. Finally, a tree ensemble\nexplaining technique was used to extract rule lists to capture necessary and\nsufficient conditions for classification by a trained model that could be\neasily interpreted by a human. Notably, several features that were in the\nextracted rule lists were statistical features and calculated features that\nwere added to the original dataset. This demonstrates the influence that\nbringing in additional information during the data preprocessing stages can\nhave on the ultimate model performance.\n","authors":["Briony Forsberg","Dr Henry Williams","Prof Bruce MacDonald","Tracy Chen","Dr Reza Hamzeh","Dr Kirstine Hulse"],"pdf_url":"https://arxiv.org/pdf/2407.18544v1.pdf","comment":"Accepted at the 2024 IEEE 20th International Conference on Automation\n Science and Engineering (CASE 2024), awaiting publication Contains seven\n pages and five figures"},{"id":"http://arxiv.org/abs/2403.09793v3","updated":"2024-07-26T06:41:45Z","published":"2024-03-14T18:25:40Z","title":"Socially Integrated Navigation: A Social Acting Robot with Deep\n Reinforcement Learning","summary":" Mobile robots are being used on a large scale in various crowded situations\nand become part of our society. The socially acceptable navigation behavior of\na mobile robot with individual human consideration is an essential requirement\nfor scalable applications and human acceptance. Deep Reinforcement Learning\n(DRL) approaches are recently used to learn a robot's navigation policy and to\nmodel the complex interactions between robots and humans. 
We propose to divide\nexisting DRL-based navigation approaches based on the robot's exhibited social\nbehavior and distinguish between social collision avoidance with a lack of\nsocial behavior and socially aware approaches with explicit predefined social\nbehavior. In addition, we propose a novel socially integrated navigation\napproach where the robot's social behavior is adaptive and emerges from the\ninteraction with humans. The formulation of our approach is derived from a\nsociological definition, which states that social acting is oriented toward the\nacting of others. The DRL policy is trained in an environment where other\nagents interact socially integrated and reward the robot's behavior\nindividually. The simulation results indicate that the proposed socially\nintegrated navigation approach outperforms a socially aware approach in terms\nof ego navigation performance while significantly reducing the negative impact\non all agents within the environment.\n","authors":["Daniel Flögel","Lars Fischer","Thomas Rudolf","Tobias Schürmann","Sören Hohmann"],"pdf_url":"https://arxiv.org/pdf/2403.09793v3.pdf","comment":"Accepted at 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS)"},{"id":"http://arxiv.org/abs/2402.07729v2","updated":"2024-07-26T06:30:47Z","published":"2024-02-12T15:41:22Z","title":"AIR-Bench: Benchmarking Large Audio-Language Models via Generative\n Comprehension","summary":" Recently, instruction-following audio-language models have received broad\nattention for human-audio interaction. However, the absence of benchmarks\ncapable of evaluating audio-centric interaction capabilities has impeded\nadvancements in this field. Previous models primarily focus on assessing\ndifferent fundamental tasks, such as Automatic Speech Recognition (ASR), and\nlack an assessment of the open-ended generative capabilities centered around\naudio. Thus, it is challenging to track the progression in the Large\nAudio-Language Models (LALMs) domain and to provide guidance for future\nimprovement. In this paper, we introduce AIR-Bench (\\textbf{A}udio\n\\textbf{I}nst\\textbf{R}uction \\textbf{Bench}mark), the first benchmark designed\nto evaluate the ability of LALMs to understand various types of audio signals\n(including human speech, natural sounds, and music), and furthermore, to\ninteract with humans in the textual format. AIR-Bench encompasses two\ndimensions: \\textit{foundation} and \\textit{chat} benchmarks. The former\nconsists of 19 tasks with approximately 19k single-choice questions, intending\nto inspect the basic single-task ability of LALMs. The latter one contains 2k\ninstances of open-ended question-and-answer data, directly assessing the\ncomprehension of the model on complex audio and its capacity to follow\ninstructions. Both benchmarks require the model to generate hypotheses\ndirectly. We design a unified framework that leverages advanced language\nmodels, such as GPT-4, to evaluate the scores of generated hypotheses given the\nmeta-information of the audio. Experimental results demonstrate a high level of\nconsistency between GPT-4-based evaluation and human evaluation. 
By revealing\nthe limitations of existing LALMs through evaluation results, AIR-Bench can\nprovide insights into the direction of future research.\n","authors":["Qian Yang","Jin Xu","Wenrui Liu","Yunfei Chu","Ziyue Jiang","Xiaohuan Zhou","Yichong Leng","Yuanjun Lv","Zhou Zhao","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.07729v2.pdf","comment":"Code and Data: https://github.com/OFA-Sys/AIR-Bench. Accepted by ACL\n 2024"},{"id":"http://arxiv.org/abs/2407.18526v1","updated":"2024-07-26T06:16:11Z","published":"2024-07-26T06:16:11Z","title":"Constructing Enhanced Mutual Information for Online Class-Incremental\n Learning","summary":" Online Class-Incremental continual Learning (OCIL) addresses the challenge of\ncontinuously learning from a single-channel data stream, adapting to new tasks\nwhile mitigating catastrophic forgetting. Recently, Mutual Information\n(MI)-based methods have shown promising performance in OCIL. However, existing\nMI-based methods treat various knowledge components in isolation, ignoring the\nknowledge confusion across tasks. This narrow focus on simple MI knowledge\nalignment may lead to old tasks being easily forgotten with the introduction of\nnew tasks, risking the loss of common parts between past and present\nknowledge. To address this, we analyze the MI relationships from the\nperspectives of diversity, representativeness, and separability, and propose an\nEnhanced Mutual Information (EMI) method based on knowledge decoupling. EMI\nconsists of Diversity Mutual Information (DMI), Representativeness Mutual\nInformation (RMI) and Separability Mutual Information (SMI). DMI diversifies\nintra-class sample features by considering the similarity relationships among\ninter-class sample features to enable the network to learn more general\nknowledge. RMI summarizes representative features for each category and aligns\nsample features with these representative features, making the intra-class\nsample distribution more compact. SMI establishes MI relationships for\ninter-class representative features, enhancing the stability of representative\nfeatures while increasing the distinction between inter-class representative\nfeatures, thus creating clear boundaries between classes. Extensive experimental\nresults on widely used benchmark datasets demonstrate the superior performance\nof EMI over state-of-the-art baseline methods.\n","authors":["Huan Zhang","Fan Lyu","Shenghua Fan","Yujin Zheng","Dingwen Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18525v1","updated":"2024-07-26T06:09:10Z","published":"2024-07-26T06:09:10Z","title":"Is larger always better? Evaluating and prompting large language models\n for non-generative medical tasks","summary":" The use of Large Language Models (LLMs) in medicine is growing, but their\nability to handle both structured Electronic Health Record (EHR) data and\nunstructured clinical notes is not well-studied. This study benchmarks various\nmodels, including GPT-based LLMs, BERT-based models, and traditional clinical\npredictive models, for non-generative medical tasks utilizing renowned\ndatasets. 
We assessed 14 language models (9 GPT-based and 5 BERT-based) and 7\ntraditional predictive models using the MIMIC dataset (ICU patient records) and\nthe TJH dataset (early COVID-19 EHR data), focusing on tasks such as mortality\nand readmission prediction, disease hierarchy reconstruction, and biomedical\nsentence matching, comparing both zero-shot and finetuned performance. Results\nindicated that LLMs exhibited robust zero-shot predictive capabilities on\nstructured EHR data when using well-designed prompting strategies, frequently\nsurpassing traditional models. However, for unstructured medical texts, LLMs\ndid not outperform finetuned BERT models, which excelled in both supervised and\nunsupervised tasks. Consequently, while LLMs are effective for zero-shot\nlearning on structured data, finetuned BERT models are more suitable for\nunstructured texts, underscoring the importance of selecting models based on\nspecific task requirements and data characteristics to optimize the application\nof NLP technology in healthcare.\n","authors":["Yinghao Zhu","Junyi Gao","Zixiang Wang","Weibin Liao","Xiaochen Zheng","Lifang Liang","Yasha Wang","Chengwei Pan","Ewen M. Harrison","Liantao Ma"],"pdf_url":"https://arxiv.org/pdf/2407.18525v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.01713"},{"id":"http://arxiv.org/abs/2407.18523v1","updated":"2024-07-26T05:46:23Z","published":"2024-07-26T05:46:23Z","title":"DTFormer: A Transformer-Based Method for Discrete-Time Dynamic Graph\n Representation Learning","summary":" Discrete-Time Dynamic Graphs (DTDGs), which are prevalent in real-world\nimplementations and notable for their ease of data acquisition, have garnered\nconsiderable attention from both academic researchers and industry\npractitioners. The representation learning of DTDGs has been extensively\napplied to model the dynamics of temporally changing entities and their\nevolving connections. Currently, DTDG representation learning predominantly\nrelies on GNN+RNN architectures, which manifest the inherent limitations of\nboth Graph Neural Networks (GNNs) and Recurrent Neural Networks (RNNs). GNNs\nsuffer from the over-smoothing issue as the models architecture goes deeper,\nwhile RNNs struggle to capture long-term dependencies effectively. GNN+RNN\narchitectures also grapple with scaling to large graph sizes and long\nsequences. Additionally, these methods often compute node representations\nseparately and focus solely on individual node characteristics, thereby\noverlooking the behavior intersections between the two nodes whose link is\nbeing predicted, such as instances where the two nodes appear together in the\nsame context or share common neighbors.\n This paper introduces a novel representation learning method DTFormer for\nDTDGs, pivoting from the traditional GNN+RNN framework to a Transformer-based\narchitecture. Our approach exploits the attention mechanism to concurrently\nprocess topological information within the graph at each timestamp and temporal\ndynamics of graphs along the timestamps, circumventing the aforementioned\nfundamental weakness of both GNNs and RNNs. Moreover, we enhance the model's\nexpressive capability by incorporating the intersection relationships among\nnodes and integrating a multi-patching module. 
Extensive experiments conducted\non six public dynamic graph benchmark datasets confirm our model's efficacy,\nachieving the SOTA performance.\n","authors":["Xi Chen","Yun Xiong","Siwei Zhang","Jiawei Zhang","Yao Zhang","Shiyang Zhou","Xixi Wu","Mingyang Zhang","Tengfei Liu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18523v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.18519v1","updated":"2024-07-26T05:27:26Z","published":"2024-07-26T05:27:26Z","title":"TCGPN: Temporal-Correlation Graph Pre-trained Network for Stock\n Forecasting","summary":" Recently, the incorporation of both temporal features and the correlation\nacross time series has become an effective approach in time series prediction.\nSpatio-Temporal Graph Neural Networks (STGNNs) demonstrate good performance on\nmany Temporal-correlation Forecasting Problems. However, when applied to tasks\nlacking periodicity, such as stock data prediction, the effectiveness and\nrobustness of STGNNs are found to be unsatisfactory. Moreover, STGNNs are limited by\nmemory constraints so that they cannot handle problems with a large number of nodes. In\nthis paper, we propose a novel approach called the Temporal-Correlation Graph\nPre-trained Network (TCGPN) to address these limitations. TCGPN utilizes a\nTemporal-correlation fusion encoder to get a mixed representation and a\npre-training method with carefully designed temporal and correlation\npre-training tasks. The entire structure is independent of the number and order of\nnodes, so better results can be obtained through various data enhancements. And\nmemory consumption during training can be significantly reduced through\nmultiple sampling. Experiments are conducted on real stock market data sets\nCSI300 and CSI500 that exhibit minimal periodicity. We fine-tune a simple MLP\nin downstream tasks and achieve state-of-the-art results, validating the\ncapability to capture more robust temporal correlation patterns.\n","authors":["Wenbo Yan","Ying Tan"],"pdf_url":"https://arxiv.org/pdf/2407.18519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03733v3","updated":"2024-07-26T05:26:40Z","published":"2024-06-06T04:12:57Z","title":"Credit Card Fraud Detection Using Advanced Transformer Model","summary":" With the proliferation of various online and mobile payment systems, credit\ncard fraud has emerged as a significant threat to financial security. This\nstudy focuses on innovative applications of the latest Transformer models for\nmore robust and precise fraud detection. To ensure the reliability of the data,\nwe meticulously processed the data sources, balancing the dataset to address\nthe issue of data sparsity significantly. We also selected highly correlated\nvectors to strengthen the training process. To guarantee the reliability and\npracticality of the new Transformer model, we conducted performance comparisons\nwith several widely adopted models, including Support Vector Machine (SVM),\nRandom Forest, Neural Network, and Logistic Regression. We rigorously compared\nthese models using metrics such as Precision, Recall, and F1 Score. Through\nthese detailed analyses and comparisons, we present to the readers a highly\nefficient and powerful anti-fraud mechanism with promising prospects. 
The\nresults demonstrate that the Transformer model not only excels in traditional\napplications but also shows great potential in niche areas like fraud\ndetection, offering a substantial advancement in the field.\n","authors":["Chang Yu","Yongshun Xu","Jin Cao","Ye Zhang","Yinxin Jin","Mengran Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.03733v3.pdf","comment":"This paper have been received by https://ieee-metacom.org/"},{"id":"http://arxiv.org/abs/2407.18518v1","updated":"2024-07-26T05:23:55Z","published":"2024-07-26T05:23:55Z","title":"WorkR: Occupation Inference for Intelligent Task Assistance","summary":" Occupation information can be utilized by digital assistants to provide\noccupation-specific personalized task support, including interruption\nmanagement, task planning, and recommendations. Prior research in the digital\nworkplace assistant domain requires users to input their occupation information\nfor effective support. However, as many individuals switch between multiple\noccupations daily, current solutions falter without continuous user input. To\naddress this, this study introduces WorkR, a framework that leverages passive\nsensing to capture pervasive signals from various task activities, addressing\nthree challenges: the lack of a passive sensing architecture, personalization\nof occupation characteristics, and discovering latent relationships among\noccupation variables. We argue that signals from application usage, movements,\nsocial interactions, and the environment can inform a user's occupation. WorkR\nuses a Variational Autoencoder (VAE) to derive latent features for training\nmodels to infer occupations. Our experiments with an anonymized, context-rich\nactivity and task log dataset demonstrate that our models can accurately infer\noccupations with more than 91% accuracy across six ISO occupation categories.\n","authors":["Yonchanok Khaokaew","Hao Xue","Mohammad Saiedur Rahaman","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2407.18518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04658v2","updated":"2024-07-26T05:07:22Z","published":"2024-06-07T05:56:43Z","title":"Advanced Payment Security System:XGBoost, LightGBM and SMOTE Integrated","summary":" With the rise of various online and mobile payment systems, transaction fraud\nhas become a significant threat to financial security. This study explores the\napplication of advanced machine learning models, specifically based on XGBoost\nand LightGBM, for developing a more accurate and robust Payment Security\nProtection Model. To enhance data reliability, we meticulously processed the\ndata sources and applied SMOTE (Synthetic Minority Over-sampling Technique) to\naddress class imbalance and improve data representation. By selecting highly\ncorrelated features, we aimed to strengthen the training process and boost\nmodel performance. We conducted thorough performance evaluations of our\nproposed models, comparing them against traditional methods including Random\nForest, Neural Network, and Logistic Regression. Using metrics such as\nPrecision, Recall, and F1 Score, we rigorously assessed their effectiveness.\nOur detailed analyses and comparisons reveal that the combination of SMOTE with\nXGBoost and LightGBM offers a highly efficient and powerful mechanism for\npayment security protection. Moreover, the integration of XGBoost and LightGBM\nin a Local Ensemble model further demonstrated outstanding performance. 
After\nincorporating SMOTE, the new combined model achieved a significant improvement\nof nearly 6\\% over traditional models and around 5\\% over its sub-models,\nshowcasing remarkable results.\n","authors":["Qi Zheng","Chang Yu","Jin Cao","Yongshun Xu","Qianwen Xing","Yinxin Jin"],"pdf_url":"https://arxiv.org/pdf/2406.04658v2.pdf","comment":"This paper is received by https://ieee-metacom.org"},{"id":"http://arxiv.org/abs/2406.19136v5","updated":"2024-07-26T04:47:15Z","published":"2024-06-27T12:40:29Z","title":"YZS-model: A Predictive Model for Organic Drug Solubility Based on Graph\n Convolutional Networks and Transformer-Attention","summary":" Accurate prediction of drug molecule solubility is crucial for therapeutic\neffectiveness and safety. Traditional methods often miss complex molecular\nstructures, leading to inaccuracies. We introduce the YZS-Model, a deep\nlearning framework integrating Graph Convolutional Networks (GCN), Transformer\narchitectures, and Long Short-Term Memory (LSTM) networks to enhance prediction\nprecision. GCNs excel at capturing intricate molecular topologies by modeling\nthe relationships between atoms and bonds. Transformers, with their\nself-attention mechanisms, effectively identify long-range dependencies within\nmolecules, capturing global interactions. LSTMs process sequential data,\npreserving long-term dependencies and integrating temporal information within\nmolecular sequences. This multifaceted approach leverages the strengths of each\ncomponent, resulting in a model that comprehensively understands and predicts\nmolecular properties. Trained on 9,943 compounds and tested on an anticancer\ndataset, the YZS-Model achieved an $R^2$ of 0.59 and an RMSE of 0.57,\noutperforming benchmark models ($R^2$ of 0.52 and RMSE of 0.61). In an\nindependent test, it demonstrated an RMSE of 1.05, improving accuracy by 45.9%.\nThe integration of these deep learning techniques allows the YZS-Model to learn\nvaluable features from complex data without predefined parameters, handle large\ndatasets efficiently, and adapt to various molecular types. This comprehensive\ncapability significantly improves predictive accuracy and model\ngeneralizability. Its precision in solubility predictions can expedite drug\ndevelopment by optimizing candidate selection, reducing costs, and enhancing\nefficiency. Our research underscores deep learning's transformative potential\nin pharmaceutical science, particularly for solubility prediction and drug\ndesign.\n","authors":["Chenxu Wang","Haowei Ming","Jian He","Yao Lu","Junhong Chen"],"pdf_url":"https://arxiv.org/pdf/2406.19136v5.pdf","comment":"23 pages, 16 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.18501v1","updated":"2024-07-26T04:18:36Z","published":"2024-07-26T04:18:36Z","title":"The formation of perceptual space in early phonetic acquisition: a\n cross-linguistic modeling approach","summary":" This study investigates how learners organize perceptual space in early\nphonetic acquisition by advancing previous studies in two key aspects. Firstly,\nit examines the shape of the learned hidden representation as well as its\nability to categorize phonetic categories. Secondly, it explores the impact of\ntraining models on context-free acoustic information, without involving\ncontextual cues, on phonetic acquisition, closely mimicking the early language\nlearning stage. 
Using a cross-linguistic modeling approach, autoencoder models\nare trained on English and Mandarin and evaluated in both native and non-native\nconditions, following experimental conditions used in infant language\nperception studies. The results demonstrate that unsupervised bottom-up\ntraining on context-free acoustic information leads to comparable learned\nrepresentations of perceptual space between native and non-native conditions\nfor both English and Mandarin, resembling the early stage of universal\nlistening in infants. These findings provide insights into the organization of\nperceptual space during early phonetic acquisition and contribute to our\nunderstanding of the formation and representation of phonetic categories.\n","authors":["Frank Lihui Tan","Youngah Do"],"pdf_url":"https://arxiv.org/pdf/2407.18501v1.pdf","comment":"51 pages"},{"id":"http://arxiv.org/abs/2407.18496v1","updated":"2024-07-26T04:01:27Z","published":"2024-07-26T04:01:27Z","title":"Towards More Accurate Prediction of Human Empathy and Emotion in Text\n and Multi-turn Conversations by Combining Advanced NLP, Transformers-based\n Networks, and Linguistic Methodologies","summary":" Based on the WASSA 2022 Shared Task on Empathy Detection and Emotion\nClassification, we predict the level of empathic concern and personal distress\ndisplayed in essays. For the first stage of this project we implemented a\nFeed-Forward Neural Network using sentence-level embeddings as features. We\nexperimented with four different embedding models for generating the inputs to\nthe neural network. The subsequent stage builds upon the previous work and we\nhave implemented three types of revisions. The first revision focuses on the\nenhancements to the model architecture and the training approach. The second\nrevision focuses on handling class imbalance using stratified data sampling.\nThe third revision focuses on leveraging lexical resources, where we apply four\ndifferent resources to enrich the features associated with the dataset. During\nthe final stage of this project, we have created the final end-to-end system\nfor the primary task using an ensemble of models to revise primary task\nperformance. Additionally, as part of the final stage, these approaches have\nbeen adapted to the WASSA 2023 Shared Task on Empathy Emotion and Personality\nDetection in Interactions, in which the empathic concern, emotion polarity, and\nemotion intensity in dyadic text conversations are predicted.\n","authors":["Manisha Singh","Divy Sharma","Alonso Ma","Nora Goldfine"],"pdf_url":"https://arxiv.org/pdf/2407.18496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05638v2","updated":"2024-07-26T03:59:24Z","published":"2024-05-09T09:27:18Z","title":"A Correlation-induced Finite Difference Estimator","summary":" Estimating stochastic gradients is pivotal in fields like service systems\nwithin operations research. The classical method for this estimation is the\nfinite difference approximation, which entails generating samples at perturbed\ninputs. Nonetheless, practical challenges persist in determining the\nperturbation and obtaining an optimal finite difference estimator in the sense\nof possessing the smallest mean squared error (MSE). To tackle this problem, we\npropose a double sample-recycling approach in this paper. Firstly, pilot\nsamples are recycled to estimate the optimal perturbation. 
Secondly, recycling\nthese pilot samples again and generating new samples at the estimated\nperturbation, lead to an efficient finite difference estimator. We analyze its\nbias, variance and MSE. Our analyses demonstrate a reduction in asymptotic\nvariance, and in some cases, a decrease in asymptotic bias, compared to the\noptimal finite difference estimator. Therefore, our proposed estimator\nconsistently coincides with, or even outperforms the optimal finite difference\nestimator. In numerical experiments, we apply the estimator in several\nexamples, and numerical results demonstrate its robustness, as well as\ncoincidence with the theory presented, especially in the case of small sample\nsizes.\n","authors":["Guo Liang","Guangwu Liu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.05638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18488v1","updated":"2024-07-26T03:43:10Z","published":"2024-07-26T03:43:10Z","title":"Conversational Dueling Bandits in Generalized Linear Models","summary":" Conversational recommendation systems elicit user preferences by interacting\nwith users to obtain their feedback on recommended commodities. Such systems\nutilize a multi-armed bandit framework to learn user preferences in an online\nmanner and have received great success in recent years. However, existing\nconversational bandit methods have several limitations. First, they only enable\nusers to provide explicit binary feedback on the recommended items or\ncategories, leading to ambiguity in interpretation. In practice, users are\nusually faced with more than one choice. Relative feedback, known for its\ninformativeness, has gained increasing popularity in recommendation system\ndesign. Moreover, current contextual bandit methods mainly work under linear\nreward assumptions, ignoring practical non-linear reward structures in\ngeneralized linear models. Therefore, in this paper, we introduce relative\nfeedback-based conversations into conversational recommendation systems through\nthe integration of dueling bandits in generalized linear models (GLM) and\npropose a novel conversational dueling bandit algorithm called ConDuel.\nTheoretical analyses of regret upper bounds and empirical validations on\nsynthetic and real-world data underscore ConDuel's efficacy. We also\ndemonstrate the potential to extend our algorithm to multinomial logit bandits\nwith theoretical and experimental guarantees, which further proves the\napplicability of the proposed framework.\n","authors":["Shuhua Yang","Hui Yuan","Xiaoying Zhang","Mengdi Wang","Hong Zhang","Huazheng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13817v4","updated":"2024-07-26T03:39:50Z","published":"2023-11-23T05:52:28Z","title":"Enhancing Peak Assignment in 13C NMR Spectroscopy: A Novel Approach\n Using Multimodal Alignment","summary":" Nuclear magnetic resonance (NMR) spectroscopy plays an essential role in\ndeciphering molecular structure and dynamic behaviors. While AI-enhanced NMR\nprediction models hold promise, challenges still persist in tasks such as\nmolecular retrieval, isomer recognition, and peak assignment. In response, this\npaper introduces a novel solution, Multi-Level Multimodal Alignment with\nKnowledge-Guided Instance-Wise Discrimination (K-M3AID), which establishes\ncorrespondences between two heterogeneous modalities: molecular graphs and NMR\nspectra. 
K-M3AID employs a dual-coordinated contrastive learning architecture\nwith three key modules: a graph-level alignment module, a node-level alignment\nmodule, and a communication channel. Notably, K-M3AID introduces\nknowledge-guided instance-wise discrimination into contrastive learning within\nthe node-level alignment module. In addition, K-M3AID demonstrates that skills\nacquired during node-level alignment have a positive impact on graph-level\nalignment, acknowledging meta-learning as an inherent property. Empirical\nvalidation underscores K-M3AID's effectiveness in multiple zero-shot tasks.\n","authors":["Hao Xu","Zhengyang Zhou","Pengyu Hong"],"pdf_url":"https://arxiv.org/pdf/2311.13817v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18482v1","updated":"2024-07-26T03:17:41Z","published":"2024-07-26T03:17:41Z","title":"Practical Attribution Guidance for Rashomon Sets","summary":" Different prediction models might perform equally well (Rashomon set) in the\nsame task, but offer conflicting interpretations and conclusions about the\ndata. The Rashomon effect in the context of Explainable AI (XAI) has been\nrecognized as a critical factor. Although the Rashomon set has been introduced\nand studied in various contexts, its practical application is at its infancy\nstage and lacks adequate guidance and evaluation. We study the problem of the\nRashomon set sampling from a practical viewpoint and identify two fundamental\naxioms - generalizability and implementation sparsity that exploring methods\nought to satisfy in practical usage. These two axioms are not satisfied by most\nknown attribution methods, which we consider to be a fundamental weakness. We\nuse the norms to guide the design of an $\\epsilon$-subgradient-based sampling\nmethod. We apply this method to a fundamental mathematical problem as a proof\nof concept and to a set of practical datasets to demonstrate its ability\ncompared with existing sampling methods.\n","authors":["Sichao Li","Amanda S. Barnard","Quanling Deng"],"pdf_url":"https://arxiv.org/pdf/2407.18482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18480v1","updated":"2024-07-26T03:14:13Z","published":"2024-07-26T03:14:13Z","title":"Scalable Graph Compressed Convolutions","summary":" Designing effective graph neural networks (GNNs) with message passing has two\nfundamental challenges, i.e., determining optimal message-passing pathways and\ndesigning local aggregators. Previous methods of designing optimal pathways are\nlimited with information loss on the input features. On the other hand,\nexisting local aggregators generally fail to extract multi-scale features and\napproximate diverse operators under limited parameter scales. In contrast to\nthese methods, Euclidean convolution has been proven as an expressive\naggregator, making it a perfect candidate for GNN construction. However, the\nchallenges of generalizing Euclidean convolution to graphs arise from the\nirregular structure of graphs. To bridge the gap between Euclidean space and\ngraph topology, we propose a differentiable method that applies permutations to\ncalibrate input graphs for Euclidean convolution. The permutations constrain\nall nodes in a row regardless of their input order and therefore enable the\nflexible generalization of Euclidean convolution to graphs. Based on the graph\ncalibration, we propose the Compressed Convolution Network (CoCN) for\nhierarchical graph representation learning. 
CoCN follows local feature-learning\nand global parameter-sharing mechanisms of convolution neural networks. The\nwhole model can be trained end-to-end, with compressed convolution applied to\nlearn individual node features and their corresponding structure features. CoCN\ncan further borrow successful practices from Euclidean convolution, including\nresidual connection and inception mechanism. We validate CoCN on both\nnode-level and graph-level benchmarks. CoCN achieves superior performance over\ncompetitive GNN baselines. Codes are available at\nhttps://github.com/sunjss/CoCN.\n","authors":["Junshu Sun","Chenxue Yang","Shuhui Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2407.18480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08012v2","updated":"2024-07-26T03:12:42Z","published":"2024-02-12T19:21:14Z","title":"Online Differentially Private Synthetic Data Generation","summary":" We present a polynomial-time algorithm for online differentially private\nsynthetic data generation. For a data stream within the hypercube $[0,1]^d$ and\nan infinite time horizon, we develop an online algorithm that generates a\ndifferentially private synthetic dataset at each time $t$. This algorithm\nachieves a near-optimal accuracy bound of $O(\\log(t)t^{-1/d})$ for $d\\geq 2$\nand $O(\\log^{4.5}(t)t^{-1})$ for $d=1$ in the 1-Wasserstein distance. This\nresult extends the previous work on the continual release model for counting\nqueries to Lipschitz queries. Compared to the offline case, where the entire\ndataset is available at once, our approach requires only an extra polylog\nfactor in the accuracy bound.\n","authors":["Yiyun He","Roman Vershynin","Yizhe Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.08012v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2407.18472v1","updated":"2024-07-26T02:48:32Z","published":"2024-07-26T02:48:32Z","title":"FedUD: Exploiting Unaligned Data for Cross-Platform Federated\n Click-Through Rate Prediction","summary":" Click-through rate (CTR) prediction plays an important role in online\nadvertising platforms. Most existing methods use data from the advertising\nplatform itself for CTR prediction. As user behaviors also exist on many other\nplatforms, e.g., media platforms, it is beneficial to further exploit such\ncomplementary information for better modeling user interest and for improving\nCTR prediction performance. However, due to privacy concerns, data from\ndifferent platforms cannot be uploaded to a server for centralized model\ntraining. Vertical federated learning (VFL) provides a possible solution which\nis able to keep the raw data on respective participating parties and learn a\ncollaborative model in a privacy-preserving way. However, traditional VFL\nmethods only utilize aligned data with common keys across parties, which\nstrongly restricts their application scope. In this paper, we propose FedUD,\nwhich is able to exploit unaligned data, in addition to aligned data, for more\naccurate federated CTR prediction. FedUD contains two steps. In the first step,\nFedUD utilizes aligned data across parties like traditional VFL, but it\nadditionally includes a knowledge distillation module. This module distills\nuseful knowledge from the guest party's high-level representations and guides\nthe learning of a representation transfer network. 
In the second step, FedUD\napplies the learned knowledge to enrich the representations of the host party's\nunaligned data such that both aligned and unaligned data can contribute to\nfederated model training. Experiments on two real-world datasets demonstrate\nthe superior performance of FedUD for federated CTR prediction.\n","authors":["Wentao Ouyang","Rui Dong","Ri Tao","Xiangzheng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.18472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18471v1","updated":"2024-07-26T02:44:55Z","published":"2024-07-26T02:44:55Z","title":"Constructing the CORD-19 Vaccine Dataset","summary":" We introduce new dataset 'CORD-19-Vaccination' to cater to scientists\nspecifically looking into COVID-19 vaccine-related research. This dataset is\nextracted from CORD-19 dataset [Wang et al., 2020] and augmented with new\ncolumns for language detail, author demography, keywords, and topic per paper.\nFacebook's fastText model is used to identify languages [Joulin et al., 2016].\nTo establish author demography (author affiliation, lab/institution location,\nand lab/institution country columns) we processed the JSON file for each paper\nand then further enhanced using Google's search API to determine country\nvalues. 'Yake' was used to extract keywords from the title, abstract, and body\nof each paper and the LDA (Latent Dirichlet Allocation) algorithm was used to\nadd topic information [Campos et al., 2020, 2018a,b]. To evaluate the dataset,\nwe demonstrate a question-answering task like the one used in the CORD-19\nKaggle challenge [Goldbloom et al., 2022]. For further evaluation, sequential\nsentence classification was performed on each paper's abstract using the model\nfrom Dernoncourt et al. [2016]. We partially hand annotated the training\ndataset and used a pre-trained BERT-PubMed layer. 'CORD- 19-Vaccination'\ncontains 30k research papers and can be immensely valuable for NLP research\nsuch as text mining, information extraction, and question answering, specific\nto the domain of COVID-19 vaccine research.\n","authors":["Manisha Singh","Divy Sharma","Alonso Ma","Bridget Tyree","Margaret Mitchell"],"pdf_url":"https://arxiv.org/pdf/2407.18471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13671v2","updated":"2024-07-26T02:42:21Z","published":"2024-04-21T14:22:04Z","title":"FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and\n High-Quality Localization","summary":" Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies\ndirectly without access to any known normal or abnormal samples within the\ntarget item categories. Existing approaches typically rely on the robust\ngeneralization capabilities of multimodal pretrained models, computing\nsimilarities between manually crafted textual features representing \"normal\" or\n\"abnormal\" semantics and image features to detect anomalies and localize\nanomalous patches. However, the generic descriptions of \"abnormal\" often fail\nto precisely match diverse types of anomalies across different object\ncategories. Additionally, computing feature similarities for single patches\nstruggles to pinpoint specific locations of anomalies with various sizes and\nscales. To address these issues, we propose a novel ZSAD method called FiLo,\ncomprising two components: adaptively learned Fine-Grained Description (FG-Des)\nand position-enhanced High-Quality Localization (HQ-Loc). 
FG-Des introduces\nfine-grained anomaly descriptions for each category using Large Language Models\n(LLMs) and employs adaptively learned textual templates to enhance the accuracy\nand interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for\npreliminary localization, position-enhanced text prompts, and Multi-scale\nMulti-shape Cross-modal Interaction (MMCI) module, facilitates more accurate\nlocalization of anomalies of different sizes and shapes. Experimental results\non datasets like MVTec and VisA demonstrate that FiLo significantly improves\nthe performance of ZSAD in both detection and localization, achieving\nstate-of-the-art performance with an image-level AUC of 83.9% and a pixel-level\nAUC of 95.9% on the VisA dataset. Code is available at\nhttps://github.com/CASIA-IVA-Lab/FiLo.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Hao Li","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13671v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.18468v1","updated":"2024-07-26T02:34:25Z","published":"2024-07-26T02:34:25Z","title":"Diffusion-Driven Semantic Communication for Generative Models with\n Bandwidth Constraints","summary":" Diffusion models have been extensively utilized in AI-generated content\n(AIGC) in recent years, thanks to the superior generation capabilities.\nCombining with semantic communications, diffusion models are used for tasks\nsuch as denoising, data reconstruction, and content generation. However,\nexisting diffusion-based generative models do not consider the stringent\nbandwidth limitation, which limits its application in wireless communication.\nThis paper introduces a diffusion-driven semantic communication framework with\nadvanced VAE-based compression for bandwidth-constrained generative model. Our\ndesigned architecture utilizes the diffusion model, where the signal\ntransmission process through the wireless channel acts as the forward process\nin diffusion. To reduce bandwidth requirements, we incorporate a downsampling\nmodule and a paired upsampling module based on a variational auto-encoder with\nreparameterization at the receiver to ensure that the recovered features\nconform to the Gaussian distribution. Furthermore, we derive the loss function\nfor our proposed system and evaluate its performance through comprehensive\nexperiments. Our experimental results demonstrate significant improvements in\npixel-level metrics such as peak signal to noise ratio (PSNR) and semantic\nmetrics like learned perceptual image patch similarity (LPIPS). These\nenhancements are more profound regarding the compression rates and SNR compared\nto deep joint source-channel coding (DJSCC).\n","authors":["Lei Guo","Wei Chen","Yuxuan Sun","Bo Ai","Nikolaos Pappas","Tony Quek"],"pdf_url":"https://arxiv.org/pdf/2407.18468v1.pdf","comment":"13 pages, 7 figures, submitted to IEEE for possible publication"},{"id":"http://arxiv.org/abs/2407.18467v1","updated":"2024-07-26T02:28:32Z","published":"2024-07-26T02:28:32Z","title":"Machine Unlearning using a Multi-GAN based Model","summary":" This article presents a new machine unlearning approach that utilizes\nmultiple Generative Adversarial Network (GAN) based models. The proposed method\ncomprises two phases: i) data reorganization in which synthetic data using the\nGAN model is introduced with inverted class labels of the forget datasets, and\nii) fine-tuning the pre-trained model. The GAN models consist of two pairs of\ngenerators and discriminators. 
The generator discriminator pairs generate\nsynthetic data for the retain and forget datasets. Then, a pre-trained model is\nutilized to get the class labels of the synthetic datasets. The class labels of\nsynthetic and original forget datasets are inverted. Finally, all combined\ndatasets are used to fine-tune the pre-trained model to get the unlearned\nmodel. We have performed the experiments on the CIFAR-10 dataset and tested the\nunlearned models using Membership Inference Attacks (MIA). The inverted class\nlabels procedure and synthetically generated data help to acquire valuable\ninformation that enables the model to outperform state-of-the-art models and\nother standard unlearning classifiers.\n","authors":["Amartya Hatua","Trung T. Nguyen","Andrew H. Sung"],"pdf_url":"https://arxiv.org/pdf/2407.18467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18462v1","updated":"2024-07-26T02:09:32Z","published":"2024-07-26T02:09:32Z","title":"MistralBSM: Leveraging Mistral-7B for Vehicular Networks Misbehavior\n Detection","summary":" Vehicular networks are exposed to various threats resulting from malicious\nattacks. These threats compromise the security and reliability of\ncommunications among road users, thereby jeopardizing road and traffic safety.\nOne of the main vectors of these attacks within vehicular networks is\nmisbehaving vehicles. To address this challenge, we propose deploying a\npretrained Large Language Model (LLM)-empowered Misbehavior Detection System\n(MDS) within an edge-cloud detection framework. Specifically, we fine-tune\nMistral-7B, a state-of-the-art LLM, as the edge component to enable real-time\ndetection, whereas a larger LLM deployed in the cloud can conduct a more\ncomprehensive analysis. Our experiments conducted on the extended VeReMi\ndataset demonstrate Mistral-7B's superior performance, achieving 98\\% accuracy\ncompared to other LLMs such as LLAMA2-7B and RoBERTa. Additionally, we\ninvestigate the impact of window size on computational costs to optimize\ndeployment efficiency. Leveraging LLMs in MDS shows interesting results in\nimproving the detection of vehicle misbehavior, consequently strengthening\nvehicular network security to ensure the safety of road users.\n","authors":["Wissal Hamhoum","Soumaya Cherkaoui"],"pdf_url":"https://arxiv.org/pdf/2407.18462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14207v3","updated":"2024-07-26T02:03:00Z","published":"2024-07-19T11:12:08Z","title":"Longhorn: State Space Models are Amortized Online Learners","summary":" The most fundamental capability of modern AI methods such as Large Language\nModels (LLMs) is the ability to predict the next token in a long sequence of\ntokens, known as ``sequence modeling.\" Although the Transformers model is the\ncurrent dominant approach to sequence modeling, its quadratic computational\ncost with respect to sequence length is a significant drawback. State-space\nmodels (SSMs) offer a promising alternative due to their linear decoding\nefficiency and high parallelizability during training. However, existing SSMs\noften rely on seemingly ad hoc linear recurrence designs. In this work, we\nexplore SSM design through the lens of online learning, conceptualizing SSMs as\nmeta-modules for specific online learning problems. This approach links SSM\ndesign to formulating precise online learning objectives, with state transition\nrules derived from optimizing these objectives. 
Based on this insight, we\nintroduce a novel deep SSM architecture based on the implicit update for\noptimizing an online regression objective. Our experimental results show that\nour models outperform state-of-the-art SSMs, including the Mamba model, on\nstandard sequence modeling benchmarks and language modeling tasks.\n","authors":["Bo Liu","Rui Wang","Lemeng Wu","Yihao Feng","Peter Stone","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.14207v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05140v3","updated":"2024-07-26T01:28:16Z","published":"2024-02-06T20:11:54Z","title":"Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains","summary":" Large Language Models (LLMs) have demonstrated remarkable proficiency in\nunderstanding and generating natural language. However, their capabilities wane\nin highly specialized domains underrepresented in the pretraining corpus, such\nas physical and biomedical sciences. This work explores how to repurpose\ngeneral LLMs into effective task solvers for specialized domains. We introduce\na novel, model-agnostic framework for learning custom input tags, which are\nparameterized as continuous vectors appended to the LLM's embedding layer, to\ncondition the LLM. We design two types of input tags: domain tags are used to\ndelimit specialized representations (e.g., chemical formulas) and provide\ndomain-relevant context; function tags are used to represent specific functions\n(e.g., predicting molecular properties) and compress function-solving\ninstructions. We develop a three-stage protocol to learn these tags using\nauxiliary data and domain knowledge. By explicitly disentangling task domains\nfrom task functions, our method enables zero-shot generalization to unseen\nproblems through diverse combinations of the input tags. It also boosts LLM's\nperformance in various specialized domains, such as predicting protein or\nchemical properties and modeling drug-target interactions, outperforming expert\nmodels tailored to these tasks.\n","authors":["Junhong Shen","Neil Tenenholtz","James Brian Hall","David Alvarez-Melis","Nicolo Fusi"],"pdf_url":"https://arxiv.org/pdf/2402.05140v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2407.18454v1","updated":"2024-07-26T01:21:25Z","published":"2024-07-26T01:21:25Z","title":"Fairness Definitions in Language Models Explained","summary":" Language Models (LMs) have demonstrated exceptional performance across\nvarious Natural Language Processing (NLP) tasks. Despite these advancements,\nLMs can inherit and amplify societal biases related to sensitive attributes\nsuch as gender and race, limiting their adoption in real-world applications.\nTherefore, fairness has been extensively explored in LMs, leading to the\nproposal of various fairness notions. However, the lack of clear agreement on\nwhich fairness definition to apply in specific contexts (\\textit{e.g.,}\nmedium-sized LMs versus large-sized LMs) and the complexity of understanding\nthe distinctions between these definitions can create confusion and impede\nfurther progress. To this end, this paper proposes a systematic survey that\nclarifies the definitions of fairness as they apply to LMs. Specifically, we\nbegin with a brief introduction to LMs and fairness in LMs, followed by a\ncomprehensive, up-to-date overview of existing fairness notions in LMs and the\nintroduction of a novel taxonomy that categorizes these concepts based on their\nfoundational principles and operational distinctions. 
We further illustrate\neach definition through experiments, showcasing their practical implications\nand outcomes. Finally, we discuss current research challenges and open\nquestions, aiming to foster innovative ideas and advance the field. The\nimplementation and additional resources are publicly available at\nhttps://github.com/LavinWong/Fairness-in-Large-Language-Models/tree/main/definitions.\n","authors":["Thang Viet Doan","Zhibo Chu","Zichong Wang","Wenbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17790v2","updated":"2024-07-26T01:14:52Z","published":"2024-07-25T05:52:48Z","title":"Exploring the Limitations of Kolmogorov-Arnold Networks in\n Classification: Insights to Software Training and Hardware Implementation","summary":" Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have\nrecently gained popularity and attention due to the ability to substitute\nmulti-layer perceptions (MLPs) in artificial intelligence (AI) with higher\naccuracy and interoperability. However, KAN assessment is still limited and\ncannot provide an in-depth analysis of a specific domain. Furthermore, no study\nhas been conducted on the implementation of KANs in hardware design, which\nwould directly demonstrate whether KANs are truly superior to MLPs in practical\napplications. As a result, in this paper, we focus on verifying KANs for\nclassification issues, which are a common but significant topic in AI using\nfour different types of datasets. Furthermore, the corresponding hardware\nimplementation is considered using the Vitis high-level synthesis (HLS) tool.\nTo the best of our knowledge, this is the first article to implement hardware\nfor KAN. The results indicate that KANs cannot achieve more accuracy than MLPs\nin high complex datasets while utilizing substantially higher hardware\nresources. Therefore, MLP remains an effective approach for achieving accuracy\nand efficiency in software and hardware implementation.\n","authors":["Van Duy Tran","Tran Xuan Hieu Le","Thi Diem Tran","Hoai Luan Pham","Vu Trung Duong Le","Tuan Hai Vu","Van Tinh Nguyen","Yasuhiko Nakashima"],"pdf_url":"https://arxiv.org/pdf/2407.17790v2.pdf","comment":"6 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.18450v1","updated":"2024-07-26T01:13:59Z","published":"2024-07-26T01:13:59Z","title":"Textile Anomaly Detection: Evaluation of the State-of-the-Art for\n Automated Quality Inspection of Carpet","summary":" In this study, state-of-the-art unsupervised detection models were evaluated\nfor the purpose of automated anomaly inspection of wool carpets. A custom\ndataset of four unique types of carpet textures was created to thoroughly test\nthe models and their robustness in detecting subtle anomalies in complex\ntextures. Due to the requirements of an inline inspection system in a\nmanufacturing use case, the metrics of importance in this study were accuracy\nin detecting anomalous areas, the number of false detections, and the inference\ntimes of each model for real-time performance. Of the evaluated models, the\nstudent-teacher network based methods were found on average to yield the\nhighest detection accuracy and lowest false detection rates. When trained on a\nmulti-class dataset the models were found to yield comparable if not better\nresults than single-class training. 
Finally, in terms of detection speed, with\nexception to the generative model, all other evaluated models were found to\nhave comparable inference times on a GPU, with an average of 0.16s per image.\nOn a CPU, most of these models typically produced results between 1.5 to 2\ntimes the respective GPU inference times.\n","authors":["Briony Forsberg","Dr Henry Williams","Prof Bruce MacDonald","Tracy Chen","Dr Kirstine Hulse"],"pdf_url":"https://arxiv.org/pdf/2407.18450v1.pdf","comment":"Accepted at the 2023 Australasian Conference on Robotics and\n Automation (ACRA 2023) Publication url\n https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184380272&partnerID=40&md5=74fde263f4a24a1bff75d6560b423994\n ISSN: 14482053 Contains 10 pages and three figures"},{"id":"http://arxiv.org/abs/2407.18449v1","updated":"2024-07-26T01:12:54Z","published":"2024-07-26T01:12:54Z","title":"Towards A Generalizable Pathology Foundation Model via Unified Knowledge\n Distillation","summary":" Foundation models pretrained on large-scale datasets are revolutionizing the\nfield of computational pathology (CPath). The generalization ability of\nfoundation models is crucial for the success in various downstream clinical\ntasks. However, current foundation models have only been evaluated on a limited\ntype and number of tasks, leaving their generalization ability and overall\nperformance unclear. To address this gap, we established a most comprehensive\nbenchmark to evaluate the performance of off-the-shelf foundation models across\nsix distinct clinical task types, encompassing a total of 39 specific tasks.\nOur findings reveal that existing foundation models excel at certain task types\nbut struggle to effectively handle the full breadth of clinical tasks. To\nimprove the generalization of pathology foundation models, we propose a unified\nknowledge distillation framework consisting of both expert and self knowledge\ndistillation, where the former allows the model to learn from the knowledge of\nmultiple expert models, while the latter leverages self-distillation to enable\nimage representation learning via local-global alignment. Based on this\nframework, a Generalizable Pathology Foundation Model (GPFM) is pretrained on a\nlarge-scale dataset consisting of 190 million images from around 86,000 public\nH\\&E whole slides across 34 major tissue types. Evaluated on the established\nbenchmark, GPFM achieves an impressive average rank of 1.36, with 29 tasks\nranked 1st, while the the second-best model, UNI, attains an average rank of\n2.96, with only 4 tasks ranked 1st. The superior generalization of GPFM\ndemonstrates its exceptional modeling capabilities across a wide range of\nclinical tasks, positioning it as a new cornerstone for feature representation\nin CPath.\n","authors":["Jiabo Ma","Zhengrui Guo","Fengtao Zhou","Yihui Wang","Yingxue Xu","Yu Cai","Zhengjie Zhu","Cheng Jin","Yi Lin Xinrui Jiang","Anjia Han","Li Liang","Ronald Cheong Kin Chan","Jiguang Wang","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13929v2","updated":"2024-07-26T01:12:39Z","published":"2024-01-25T04:03:32Z","title":"HMM for Discovering Decision-Making Dynamics Using Reinforcement\n Learning Experiments","summary":" Major depressive disorder (MDD) presents challenges in diagnosis and\ntreatment due to its complex and heterogeneous nature. Emerging evidence\nindicates that reward processing abnormalities may serve as a behavioral marker\nfor MDD. 
To measure reward processing, patients perform computer-based\nbehavioral tasks that involve making choices or responding to stimulants that\nare associated with different outcomes. Reinforcement learning (RL) models are\nfitted to extract parameters that measure various aspects of reward processing\nto characterize how patients make decisions in behavioral tasks. Recent\nfindings suggest the inadequacy of characterizing reward learning solely based\non a single RL model; instead, there may be a switching of decision-making\nprocesses between multiple strategies. An important scientific question is how\nthe dynamics of learning strategies in decision-making affect the reward\nlearning ability of individuals with MDD. Motivated by the probabilistic reward\ntask (PRT) within the EMBARC study, we propose a novel RL-HMM framework for\nanalyzing reward-based decision-making. Our model accommodates learning\nstrategy switching between two distinct approaches under a hidden Markov model\n(HMM): subjects making decisions based on the RL model or opting for random\nchoices. We account for continuous RL state space and allow time-varying\ntransition probabilities in the HMM. We introduce a computationally efficient\nEM algorithm for parameter estimation and employ a nonparametric bootstrap for\ninference. We apply our approach to the EMBARC study to show that MDD patients\nare less engaged in RL compared to the healthy controls, and engagement is\nassociated with brain activities in the negative affect circuitry during an\nemotional conflict task.\n","authors":["Xingche Guo","Donglin Zeng","Yuanjia Wang"],"pdf_url":"https://arxiv.org/pdf/2401.13929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18439v1","updated":"2024-07-26T00:38:51Z","published":"2024-07-26T00:38:51Z","title":"Impact of Recurrent Neural Networks and Deep Learning Frameworks on\n Real-time Lightweight Time Series Anomaly Detection","summary":" Real-time lightweight time series anomaly detection has become increasingly\ncrucial in cybersecurity and many other domains. Its ability to adapt to\nunforeseen pattern changes and swiftly identify anomalies enables prompt\nresponses and critical decision-making. While several such anomaly detection\napproaches have been introduced in recent years, they primarily utilize a\nsingle type of recurrent neural networks (RNNs) and have been implemented in\nonly one deep learning framework. It is unclear how the use of different types\nof RNNs available in various deep learning frameworks affects the performance\nof these anomaly detection approaches due to the absence of comprehensive\nevaluations. Arbitrarily choosing a RNN variant and a deep learning framework\nto implement an anomaly detection approach may not reflect its true performance\nand could potentially mislead users into favoring one approach over another. In\nthis paper, we aim to study the influence of various types of RNNs available in\npopular deep learning frameworks on real-time lightweight time series anomaly\ndetection. We reviewed several state-of-the-art approaches and implemented a\nrepresentative anomaly detection approach using well-known RNN variants\nsupported by three widely recognized deep learning frameworks. A comprehensive\nevaluation is then conducted to analyze the performance of each implementation\nacross real-world, open-source time series datasets. 
The evaluation results\nprovide valuable guidance for selecting the appropriate RNN variant and deep\nlearning framework for real-time, lightweight time series anomaly detection.\n","authors":["Ming-Chang Lee","Jia-Chun Lin","Sokratis Katsikas"],"pdf_url":"https://arxiv.org/pdf/2407.18439v1.pdf","comment":"20 pages, 4 figures, 7 tables, The 26th International Conference on\n Information and Communications Security, 26-28 August, 2024, Mytilene,\n Lesvos, Greece (ICICS2024)"},{"id":"http://arxiv.org/abs/2406.02611v2","updated":"2024-07-26T00:26:10Z","published":"2024-06-03T07:56:58Z","title":"LOLA: LLM-Assisted Online Learning Algorithm for Content Experiments","summary":" In the rapidly evolving digital content landscape, media firms and news\npublishers require automated and efficient methods to enhance user engagement.\nThis paper introduces the LLM-Assisted Online Learning Algorithm (LOLA), a\nnovel framework that integrates Large Language Models (LLMs) with adaptive\nexperimentation to optimize content delivery. Leveraging a large-scale dataset\nfrom Upworthy, which includes 17,681 headline A/B tests, we first investigate\nthree pure-LLM approaches: prompt-based methods, embedding-based classification\nmodels, and fine-tuned open-source LLMs. We find that prompt-based approaches\nperform poorly, achieving no more than 65\\% accuracy in identifying the\ncatchier headline. In contrast, both OpenAI-embedding-based classification\nmodels and fine-tuned Llama-3 with 8 billion parameters achieve an accuracy of\naround 82-84\\%. We then introduce LOLA, which combines the best pure-LLM\napproach with the Upper Confidence Bound algorithm to allocate traffic and\nmaximize clicks adaptively. Our numerical experiments on Upworthy data show\nthat LOLA outperforms the standard A/B test method (the current status quo at\nUpworthy), pure bandit algorithms, and pure-LLM approaches, particularly in\nscenarios with limited experimental traffic. Our approach is scalable and\napplicable to content experiments across various settings where firms seek to\noptimize user engagement, including digital advertising and social media\nrecommendations.\n","authors":["Zikun Ye","Hema Yoganarasimhan","Yufeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.02611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10959v3","updated":"2024-07-26T00:15:47Z","published":"2024-06-16T14:31:26Z","title":"On Convergence Analysis of Policy Iteration Algorithms for\n Entropy-Regularized Stochastic Control Problems","summary":" In this paper we investigate the issues regarding the convergence of the\nPolicy Iteration Algorithm(PIA) for a class of general continuous-time\nentropy-regularized stochastic control problems. In particular, instead of\nemploying sophisticated PDE estimates for the iterative PDEs involved in the\nPIA (see, e.g., Huang-Wang-Zhou(2023)), we shall provide a simple proof from\nscratch for the convergence of the PIA. Our approach builds on probabilistic\nrepresentation formulae for solutions of PDEs and their derivatives. Moreover,\nin the infinite horizon model with large discount factor and in the finite\nhorizon model, the similar arguments lead to the exponential rate of\nconvergence of PIA without tear. Finally, with some extra efforts we show that\nour approach can also be extended to the case when diffusion contains control,\nin the one dimensional setting but without much extra constraints on the\ncoefficients. 
We believe that these results are new in the literature.\n","authors":["Jin Ma","Gaozhan Wang","Jianfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.10959v3.pdf","comment":"In this version, we have added results on convergence and rate of\n convergence for the diffusion control problem in the scalar case"},{"id":"http://arxiv.org/abs/2407.18436v1","updated":"2024-07-26T00:13:30Z","published":"2024-07-26T00:13:30Z","title":"A Model for Combinatorial Dictionary Learning and Inference","summary":" We are often interested in decomposing complex, structured data into simple\ncomponents that explain the data. The linear version of this problem is\nwell-studied as dictionary learning and factor analysis. In this work, we\npropose a combinatorial model in which to study this question, motivated by the\nway objects occlude each other in a scene to form an image. First, we identify\na property we call \"well-structuredness\" of a set of low-dimensional components\nwhich ensures that no two components in the set are too similar. We show how\nwell-structuredness is sufficient for learning the set of latent components\ncomprising a set of sample instances. We then consider the problem: given a set\nof components and an instance generated from some unknown subset of them,\nidentify which parts of the instance arise from which components. We consider\ntwo variants: (1) determine the minimal number of components required to\nexplain the instance; (2) determine the correct explanation for as many\nlocations as possible. For the latter goal, we also devise a version that is\nrobust to adversarial corruptions, with just a slightly stronger assumption on\nthe components. Finally, we show that the learning problem is computationally\ninfeasible in the absence of any assumptions.\n","authors":["Avrim Blum","Kavya Ravichandran"],"pdf_url":"https://arxiv.org/pdf/2407.18436v1.pdf","comment":"31 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.18433v1","updated":"2024-07-26T00:00:53Z","published":"2024-07-26T00:00:53Z","title":"Investigating the Privacy Risk of Using Robot Vacuum Cleaners in Smart\n Environments","summary":" Robot vacuum cleaners have become increasingly popular and are widely used in\nvarious smart environments. To improve user convenience, manufacturers also\nintroduced smartphone applications that enable users to customize cleaning\nsettings or access information about their robot vacuum cleaners. While this\nintegration enhances the interaction between users and their robot vacuum\ncleaners, it results in potential privacy concerns because users' personal\ninformation may be exposed. To address these concerns, end-to-end encryption is\nimplemented between the application, cloud service, and robot vacuum cleaners\nto secure the exchanged information. Nevertheless, network header metadata\nremains unencrypted and it is still vulnerable to network eavesdropping. In\nthis paper, we investigate the potential risk of private information exposure\nthrough such metadata. A popular robot vacuum cleaner was deployed in a real\nsmart environment where passive network eavesdropping was conducted during\nseveral selected cleaning events. 
Our extensive analysis, based on Association\nRule Learning, demonstrates that it is feasible to identify certain events\nusing only the captured Internet traffic metadata, thereby potentially exposing\nprivate user information and raising privacy concerns.\n","authors":["Benjamin Ulsmaag","Jia-Chun Lin","Ming-Chang Lee"],"pdf_url":"https://arxiv.org/pdf/2407.18433v1.pdf","comment":"18 pages, 11 figures, 4 tables, The 26th International Conference on\n Information and Communications Security, 26-28 August, 2024, Mytilene,\n Lesvos, Greece (ICICS2024)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.16977v2","updated":"2024-07-26T15:52:46Z","published":"2024-07-24T03:45:35Z","title":"Selective Vision-Language Subspace Projection for Few-shot CLIP","summary":" Vision-language models such as CLIP are capable of mapping the different\nmodality data into a unified feature space, enabling zero/few-shot inference by\nmeasuring the similarity of given images and texts. However, most existing\nmethods overlook modality gaps in CLIP's encoded features, which is shown as\nthe text and image features lie far apart from each other, resulting in limited\nclassification performance. To tackle this issue, we introduce a method called\nSelective Vision-Language Subspace Projection (SSP), which incorporates local\nimage features and utilizes them as a bridge to enhance the alignment between\nimage-text pairs. Specifically, our SSP framework comprises two parallel\nmodules: a vision projector and a language projector. Both projectors utilize\nlocal image features to span the respective subspaces for image and texts,\nthereby projecting the image and text features into their respective subspaces\nto achieve alignment. Moreover, our approach entails only training-free matrix\ncalculations and can be seamlessly integrated into advanced CLIP-based few-shot\nlearning frameworks. Extensive experiments on 11 datasets have demonstrated\nSSP's superior text-image alignment capabilities, outperforming the\nstate-of-the-art alignment methods. The code is available at\nhttps://github.com/zhuhsingyuu/SSP\n","authors":["Xingyu Zhu","Beier Zhu","Yi Tan","Shuo Wang","Yanbin Hao","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16977v2.pdf","comment":"Accepted as an Oral Paper at ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.18626v1","updated":"2024-07-26T09:35:36Z","published":"2024-07-26T09:35:36Z","title":"Every Part Matters: Integrity Verification of Scientific Figures Based\n on Multimodal Large Language Models","summary":" This paper tackles a key issue in the interpretation of scientific figures:\nthe fine-grained alignment of text and figures. It advances beyond prior\nresearch that primarily dealt with straightforward, data-driven visualizations\nsuch as bar and pie charts and only offered a basic understanding of diagrams\nthrough captioning and classification. We introduce a novel task, Figure\nIntegrity Verification, designed to evaluate the precision of technologies in\naligning textual knowledge with visual elements in scientific figures. To\nsupport this, we develop a semi-automated method for constructing a large-scale\ndataset, Figure-seg, specifically designed for this task. Additionally, we\npropose an innovative framework, Every Part Matters (EPM), which leverages\nMultimodal Large Language Models (MLLMs) to not only incrementally improve the\nalignment and verification of text-figure integrity but also enhance integrity\nthrough analogical reasoning. 
Our comprehensive experiments show that these\ninnovations substantially improve upon existing methods, allowing for more\nprecise and thorough analysis of complex scientific figures. This progress not\nonly enhances our understanding of multimodal technologies but also stimulates\nfurther research and practical applications across fields requiring the\naccurate interpretation of complex visual data.\n","authors":["Xiang Shi","Jiawei Liu","Yinpeng Liu","Qikai Cheng","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2407.18626v1.pdf","comment":"28 pages, 11 figures, under review"},{"id":"http://arxiv.org/abs/2407.18614v1","updated":"2024-07-26T09:15:29Z","published":"2024-07-26T09:15:29Z","title":"LookupForensics: A Large-Scale Multi-Task Dataset for Multi-Phase\n Image-Based Fact Verification","summary":" Amid the proliferation of forged images, notably the tsunami of deepfake\ncontent, extensive research has been conducted on using artificial intelligence\n(AI) to identify forged content in the face of continuing advancements in\ncounterfeiting technologies. We have investigated the use of AI to provide the\noriginal authentic image after deepfake detection, which we believe is a\nreliable and persuasive solution. We call this \"image-based automated fact\nverification,\" a name that originated from a text-based fact-checking system\nused by journalists. We have developed a two-phase open framework that\nintegrates detection and retrieval components. Additionally, inspired by a\ndataset proposed by Meta Fundamental AI Research, we further constructed a\nlarge-scale dataset that is specifically designed for this task. This dataset\nsimulates real-world conditions and includes both content-preserving and\ncontent-aware manipulations that present a range of difficulty levels and have\npotential for ongoing research. This multi-task dataset is fully annotated,\nenabling it to be utilized for sub-tasks within the forgery identification and\nfact retrieval domains. This paper makes two main contributions: (1) We\nintroduce a new task, \"image-based automated fact verification,\" and present a\nnovel two-phase open framework combining \"forgery identification\" and \"fact\nretrieval.\" (2) We present a large-scale dataset tailored for this new task\nthat features various hand-crafted image edits and machine learning-driven\nmanipulations, with extensive annotations suitable for various sub-tasks.\nExtensive experimental results validate its practicality for fact verification\nresearch and clarify its difficulty levels for various sub-tasks.\n","authors":["Shuhan Cui","Huy H. Nguyen","Trung-Nghia Le","Chun-Shien Lu","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2407.18614v1.pdf","comment":"Pages 1-13 are the main body of the paper, and pages 14-16 are the\n supplementary material"},{"id":"http://arxiv.org/abs/2407.16307v2","updated":"2024-07-26T08:39:19Z","published":"2024-07-23T09:00:52Z","title":"Multimodal Unlearnable Examples: Protecting Data against Multimodal\n Contrastive Learning","summary":" Multimodal contrastive learning (MCL) has shown remarkable advances in\nzero-shot classification by learning from millions of image-caption pairs\ncrawled from the Internet. However, this reliance poses privacy risks, as\nhackers may unauthorizedly exploit image-text data for model training,\npotentially including personal and privacy-sensitive information. Recent works\npropose generating unlearnable examples by adding imperceptible perturbations\nto training images to build shortcuts for protection. 
However, they are\ndesigned for unimodal classification, which remains largely unexplored in MCL.\nWe first explore this context by evaluating the performance of existing methods\non image-caption pairs, and they do not generalize effectively to multimodal\ndata and exhibit limited impact to build shortcuts due to the lack of labels\nand the dispersion of pairs in MCL. In this paper, we propose Multi-step Error\nMinimization (MEM), a novel optimization process for generating multimodal\nunlearnable examples. It extends the Error-Minimization (EM) framework to\noptimize both image noise and an additional text trigger, thereby enlarging the\noptimized space and effectively misleading the model to learn the shortcut\nbetween the noise features and the text trigger. Specifically, we adopt\nprojected gradient descent to solve the noise minimization problem and use\nHotFlip to approximate the gradient and replace words to find the optimal text\ntrigger. Extensive experiments demonstrate the effectiveness of MEM, with\npost-protection retrieval results nearly half of random guessing, and its high\ntransferability across different models. Our code is available on the\nhttps://github.com/thinwayliu/Multimodal-Unlearnable-Examples\n","authors":["Xinwei Liu","Xiaojun Jia","Yuan Xun","Siyuan Liang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2407.16307v2.pdf","comment":"ACM MM2024"},{"id":"http://arxiv.org/abs/2407.18552v1","updated":"2024-07-26T07:05:04Z","published":"2024-07-26T07:05:04Z","title":"Multimodal Emotion Recognition using Audio-Video Transformer Fusion with\n Cross Attention","summary":" Understanding emotions is a fundamental aspect of human communication.\nIntegrating audio and video signals offers a more comprehensive understanding\nof emotional states compared to traditional methods that rely on a single data\nsource, such as speech or facial expressions. Despite its potential, multimodal\nemotion recognition faces significant challenges, particularly in\nsynchronization, feature extraction, and fusion of diverse data sources. To\naddress these issues, this paper introduces a novel transformer-based model\nnamed Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA\nmodel employs a transformer fusion approach to effectively capture and\nsynchronize interlinked features from both audio and video inputs, thereby\nresolving synchronization problems. Additionally, the Cross Attention mechanism\nwithin AVT-CA selectively extracts and emphasizes critical features while\ndiscarding irrelevant ones from both modalities, addressing feature extraction\nand fusion challenges. Extensive experimental analysis conducted on the\nCMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the\nproposed model. The results underscore the importance of AVT-CA in developing\nprecise and reliable multimodal emotion recognition systems for practical\napplications.\n","authors":["Joe Dhanith P R","Shravan Venkatraman","Vigya Sharma","Santhosh Malarvannan"],"pdf_url":"https://arxiv.org/pdf/2407.18552v1.pdf","comment":"38 Pages, 9 Tables, 12 Figures"},{"id":"http://arxiv.org/abs/2407.19034v1","updated":"2024-07-26T18:21:30Z","published":"2024-07-26T18:21:30Z","title":"MangaUB: A Manga Understanding Benchmark for Large Multimodal Models","summary":" Manga is a popular medium that combines stylized drawings and text to convey\nstories. As manga panels differ from natural images, computational systems\ntraditionally had to be designed specifically for manga. 
Recently, the adaptive\nnature of modern large multimodal models (LMMs) shows possibilities for more\ngeneral approaches. To provide an analysis of the current capability of LMMs\nfor manga understanding tasks and identifying areas for their improvement, we\ndesign and evaluate MangaUB, a novel manga understanding benchmark for LMMs.\nMangaUB is designed to assess the recognition and understanding of content\nshown in a single panel as well as conveyed across multiple panels, allowing\nfor a fine-grained analysis of a model's various capabilities required for\nmanga understanding. Our results show strong performance on the recognition of\nimage content, while understanding the emotion and information conveyed across\nmultiple panels is still challenging, highlighting future work towards LMMs for\nmanga understanding.\n","authors":["Hikaru Ikuta","Leslie Wöhler","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2407.19034v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.18995v1","updated":"2024-07-26T09:50:13Z","published":"2024-07-26T09:50:13Z","title":"SWIFT: Semantic Watermarking for Image Forgery Thwarting","summary":" This paper proposes a novel approach towards image authentication and\ntampering detection by using watermarking as a communication channel for\nsemantic information. We modify the HiDDeN deep-learning watermarking\narchitecture to embed and extract high-dimensional real vectors representing\nimage captions. Our method improves significantly robustness on both malign and\nbenign edits. We also introduce a local confidence metric correlated with\nMessage Recovery Rate, enhancing the method's practical applicability. This\napproach bridges the gap between traditional watermarking and passive forensic\nmethods, offering a robust solution for image integrity verification.\n","authors":["Gautier Evennou","Vivien Chappelier","Ewa Kijak","Teddy Furon"],"pdf_url":"https://arxiv.org/pdf/2407.18995v1.pdf","comment":"Code will be released"}]},"2024-07-29T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2405.17430v2","updated":"2024-07-29T17:59:28Z","published":"2024-05-27T17:59:56Z","title":"Matryoshka Multimodal Models","summary":" Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in\nvisual-linguistic reasoning. These models first embed images into a fixed large\nnumber of visual tokens and then feed them into a Large Language Model (LLM).\nHowever, this design causes an excessive number of tokens for dense visual\nscenarios such as high-resolution images and videos, leading to great\ninefficiency. While token pruning/merging methods do exist, they produce a\nsingle length output for each image and do not afford flexibility in trading\noff information density v.s. efficiency. Inspired by the concept of Matryoshka\nDolls, we propose M3: Matryoshka Multimodal Models, which learns to represent\nvisual content as nested sets of visual tokens that capture information across\nmultiple coarse-to-fine granularities. Our approach offers several unique\nbenefits for LMMs: (1) One can explicitly control the visual granularity per\ntest instance during inference, e.g. 
, adjusting the number of tokens used to\nrepresent an image based on the anticipated complexity or simplicity of the\ncontent; (2) M3 provides a framework for analyzing the granularity needed for\nexisting datasets, where we find that COCO-style benchmarks only need around ~9\nvisual tokens to obtain accuracy similar to that of using all 576 tokens; (3)\nOur approach provides a foundation to explore the best trade-off between\nperformance and visual token length at sample level, where our investigation\nreveals that a large gap exists between the oracle upper bound and current\nfixed-scale representations.\n","authors":["Mu Cai","Jianwei Yang","Jianfeng Gao","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2405.17430v2.pdf","comment":"Project Page: https://matryoshka-mm.github.io/"},{"id":"http://arxiv.org/abs/2407.20224v1","updated":"2024-07-29T17:58:06Z","published":"2024-07-29T17:58:06Z","title":"Can Editing LLMs Inject Harm?","summary":" Knowledge editing techniques have been increasingly adopted to efficiently\ncorrect the false or outdated knowledge in Large Language Models (LLMs), due to\nthe high cost of retraining from scratch. Meanwhile, one critical but\nunder-explored question is: can knowledge editing be used to inject harm into\nLLMs? In this paper, we propose to reformulate knowledge editing as a new type\nof safety threat for LLMs, namely Editing Attack, and conduct a systematic\ninvestigation with a newly constructed dataset EditAttack. Specifically, we\nfocus on two typical safety risks of Editing Attack including Misinformation\nInjection and Bias Injection. For the risk of misinformation injection, we\nfirst categorize it into commonsense misinformation injection and long-tail\nmisinformation injection. Then, we find that editing attacks can inject both\ntypes of misinformation into LLMs, and the effectiveness is particularly high\nfor commonsense misinformation injection. For the risk of bias injection, we\ndiscover that not only can biased sentences be injected into LLMs with high\neffectiveness, but also one single biased sentence injection can cause a high\nbias increase in general outputs of LLMs, which are even highly irrelevant to\nthe injected sentence, indicating a catastrophic impact on the overall fairness\nof LLMs. Then, we further illustrate the high stealthiness of editing attacks,\nmeasured by their impact on the general knowledge and reasoning capacities of\nLLMs, and show the hardness of defending editing attacks with empirical\nevidence. Our discoveries demonstrate the emerging misuse risks of knowledge\nediting techniques on compromising the safety alignment of LLMs.\n","authors":["Canyu Chen","Baixiang Huang","Zekun Li","Zhaorun Chen","Shiyang Lai","Xiongxiao Xu","Jia-Chen Gu","Jindong Gu","Huaxiu Yao","Chaowei Xiao","Xifeng Yan","William Yang Wang","Philip Torr","Dawn Song","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2407.20224v1.pdf","comment":"The first two authors contributed equally. 9 pages for main paper, 36\n pages including appendix. The code, results, dataset for this paper and more\n resources are on the project website: https://llm-editing.github.io"},{"id":"http://arxiv.org/abs/2407.20207v1","updated":"2024-07-29T17:39:08Z","published":"2024-07-29T17:39:08Z","title":"QAEA-DR: A Unified Text Augmentation Framework for Dense Retrieval","summary":" In dense retrieval, embedding long texts into dense vectors can result in\ninformation loss, leading to inaccurate query-text matching. 
Additionally,\nlow-quality texts with excessive noise or sparse key information are unlikely\nto align well with relevant queries. Recent studies mainly focus on improving\nthe sentence embedding model or retrieval process. In this work, we introduce a\nnovel text augmentation framework for dense retrieval. This framework\ntransforms raw documents into information-dense text formats, which supplement\nthe original texts to effectively address the aforementioned issues without\nmodifying embedding or retrieval methodologies. Two text representations are\ngenerated via large language models (LLMs) zero-shot prompting: question-answer\npairs and element-driven events. We term this approach QAEA-DR: unifying\nquestion-answer generation and event extraction in a text augmentation\nframework for dense retrieval. To further enhance the quality of generated\ntexts, a scoring-based evaluation and regeneration mechanism is introduced in\nLLM prompting. Our QAEA-DR model has a positive impact on dense retrieval,\nsupported by both theoretical analysis and empirical experiments.\n","authors":["Hongming Tan","Shaoxiong Zhan","Hai Lin","Hai-Tao Zheng","Wai Kin"," Chan"],"pdf_url":"https://arxiv.org/pdf/2407.20207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12620v2","updated":"2024-07-29T17:19:43Z","published":"2024-07-17T14:46:37Z","title":"Harnessing the Power of Artificial Intelligence to Vitalize Endangered\n Indigenous Languages: Technologies and Experiences","summary":" Since 2022 we have been exploring application areas and technologies in which\nArtificial Intelligence (AI) and modern Natural Language Processing (NLP), such\nas Large Language Models (LLMs), can be employed to foster the usage and\nfacilitate the documentation of Indigenous languages which are in danger of\ndisappearing. We start by discussing the decreasing diversity of languages in\nthe world and how working with Indigenous languages poses unique ethical\nchallenges for AI and NLP. To address those challenges, we propose an\nalternative development AI cycle based on community engagement and usage. Then,\nwe report encouraging results in the development of high-quality machine\nlearning translators for Indigenous languages by fine-tuning state-of-the-art\n(SOTA) translators with tiny amounts of data and discuss how to avoid some\ncommon pitfalls in the process. We also present prototypes we have built in\nprojects done in 2023 and 2024 with Indigenous communities in Brazil, aimed at\nfacilitating writing, and discuss the development of Indigenous Language Models\n(ILMs) as a replicable and scalable way to create spell-checkers, next-word\npredictors, and similar tools. Finally, we discuss how we envision a future for\nlanguage documentation where dying languages are preserved as interactive\nlanguage models.\n","authors":["Claudio Pinhanez","Paulo Cavalin","Luciana Storto","Thomas Finbow","Alexander Cobbinah","Julio Nogima","Marisa Vasconcelos","Pedro Domingues","Priscila de Souza Mizukami","Nicole Grell","Majoí Gongora","Isabel Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2407.12620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16251v3","updated":"2024-07-29T17:16:19Z","published":"2024-04-24T23:39:58Z","title":"Prompt Leakage effect and defense strategies for multi-turn LLM\n interactions","summary":" Prompt leakage poses a compelling security and privacy threat in LLM\napplications. Leakage of system prompts may compromise intellectual property,\nand act as adversarial reconnaissance for an attacker. 
A systematic evaluation\nof prompt leakage threats and mitigation strategies is lacking, especially for\nmulti-turn LLM interactions. In this paper, we systematically investigate LLM\nvulnerabilities against prompt leakage for 10 closed- and open-source LLMs,\nacross four domains. We design a unique threat model which leverages the LLM\nsycophancy effect and elevates the average attack success rate (ASR) from 17.7%\nto 86.2% in a multi-turn setting. Our standardized setup further allows\ndissecting leakage of specific prompt contents such as task instructions and\nknowledge documents. We measure the mitigation effect of 7 black-box defense\nstrategies, along with finetuning an open-source model to defend against\nleakage attempts. We present different combination of defenses against our\nthreat model, including a cost analysis. Our study highlights key takeaways for\nbuilding secure LLM applications and provides directions for research in\nmulti-turn LLM interactions\n","authors":["Divyansh Agarwal","Alexander R. Fabbri","Ben Risher","Philippe Laban","Shafiq Joty","Chien-Sheng Wu"],"pdf_url":"https://arxiv.org/pdf/2404.16251v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20189v1","updated":"2024-07-29T17:14:36Z","published":"2024-07-29T17:14:36Z","title":"Aligning Query Representation with Rewritten Query and Relevance\n Judgments in Conversational Search","summary":" Conversational search supports multi-turn user-system interactions to solve\ncomplex information needs. Different from the traditional single-turn ad-hoc\nsearch, conversational search encounters a more challenging problem of\ncontext-dependent query understanding with the lengthy and long-tail\nconversational history context. While conversational query rewriting methods\nleverage explicit rewritten queries to train a rewriting model to transform the\ncontext-dependent query into a stand-stone search query, this is usually done\nwithout considering the quality of search results. Conversational dense\nretrieval methods use fine-tuning to improve a pre-trained ad-hoc query\nencoder, but they are limited by the conversational search data available for\ntraining. In this paper, we leverage both rewritten queries and relevance\njudgments in the conversational search data to train a better query\nrepresentation model. The key idea is to align the query representation with\nthose of rewritten queries and relevant documents. The proposed model -- Query\nRepresentation Alignment Conversational Dense Retriever, QRACDR, is tested on\neight datasets, including various settings in conversational search and ad-hoc\nsearch. The results demonstrate the strong performance of QRACDR compared with\nstate-of-the-art methods, and confirm the effectiveness of representation\nalignment.\n","authors":["Fengran Mo","Chen Qu","Kelong Mao","Yihong Wu","Zhan Su","Kaiyu Huang","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2407.20189v1.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2407.20183v1","updated":"2024-07-29T17:12:40Z","published":"2024-07-29T17:12:40Z","title":"MindSearch: Mimicking Human Minds Elicits Deep AI Searcher","summary":" Information seeking and integration is a complex cognitive task that consumes\nenormous time and effort. Inspired by the remarkable progress of Large Language\nModels, recent works attempt to solve this task by combining LLMs and search\nengines. 
However, these methods still obtain unsatisfying performance due to\nthree challenges: (1) complex requests often cannot be accurately and\ncompletely retrieved by the search engine once (2) corresponding information to\nbe integrated is spread over multiple web pages along with massive noise, and\n(3) a large number of web pages with long contents may quickly exceed the\nmaximum context length of LLMs. Inspired by the cognitive process when humans\nsolve these problems, we introduce MindSearch to mimic the human minds in web\ninformation seeking and integration, which can be instantiated by a simple yet\neffective LLM-based multi-agent framework. The WebPlanner models the human mind\nof multi-step information seeking as a dynamic graph construction process: it\ndecomposes the user query into atomic sub-questions as nodes in the graph and\nprogressively extends the graph based on the search result from WebSearcher.\nTasked with each sub-question, WebSearcher performs hierarchical information\nretrieval with search engines and collects valuable information for WebPlanner.\nThe multi-agent design of MindSearch enables the whole framework to seek and\nintegrate information parallelly from larger-scale (e.g., more than 300) web\npages in 3 minutes, which is worth 3 hours of human effort. MindSearch\ndemonstrates significant improvement in the response quality in terms of depth\nand breadth, on both close-set and open-set QA problems. Besides, responses\nfrom MindSearch based on InternLM2.5-7B are preferable by humans to ChatGPT-Web\nand Perplexity.ai applications, which implies that MindSearch can already\ndeliver a competitive solution to the proprietary AI search engine.\n","authors":["Zehui Chen","Kuikun Liu","Qiuchen Wang","Jiangning Liu","Wenwei Zhang","Kai Chen","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.20183v1.pdf","comment":"Technical Report. Project Page: https://mindsearch.netlify.app Code:\n https://github.com/InternLM/MindSearch"},{"id":"http://arxiv.org/abs/2407.20177v1","updated":"2024-07-29T17:06:30Z","published":"2024-07-29T17:06:30Z","title":"AutoScale: Automatic Prediction of Compute-optimal Data Composition for\n Training LLMs","summary":" To ensure performance on a diverse set of downstream tasks, LLMs are\npretrained via data mixtures over different domains. In this work, we\ndemonstrate that the optimal data composition for a fixed compute budget varies\ndepending on the scale of the training data, suggesting that the common\npractice of empirically determining an optimal composition using small-scale\nexperiments will not yield the optimal data mixtures when scaling up to the\nfinal model. To address this challenge, we propose *AutoScale*, an automated\ntool that finds a compute-optimal data composition for training at any desired\ntarget scale. AutoScale first determines the optimal composition at a small\nscale using a novel bilevel optimization framework, Direct Data Optimization\n(*DDO*), and then fits a predictor to estimate the optimal composition at\nlarger scales. The predictor's design is inspired by our theoretical analysis\nof scaling laws related to data composition, which could be of independent\ninterest. In empirical studies with pre-training 774M Decoder-only LMs (GPT-2\nLarge) on RedPajama dataset, AutoScale decreases validation perplexity at least\n25% faster than any baseline with up to 38% speed up compared to without\nreweighting, achieving the best overall performance across downstream tasks. 
On\npre-training Encoder-only LMs (BERT) with masked language modeling, DDO is\nshown to decrease loss on all domains while visibly improving average task\nperformance on GLUE benchmark by 8.7% and on large-scale QA dataset (SQuAD) by\n5.9% compared with without reweighting. AutoScale speeds up training by up to\n28%. Our codes are open-sourced.\n","authors":["Feiyang Kang","Yifan Sun","Bingbing Wen","Si Chen","Dawn Song","Rafid Mahmood","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2407.20177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02481v2","updated":"2024-07-29T16:30:17Z","published":"2024-06-04T16:49:06Z","title":"Large Language Models as Carriers of Hidden Messages","summary":" With the help of simple fine-tuning, one can artificially embed hidden text\ninto large language models (LLMs). This text is revealed only when triggered by\na specific query to the LLM. Two primary applications are LLM fingerprinting\nand steganography. In the context of LLM fingerprinting, a unique text\nidentifier (fingerprint) is embedded within the model to verify licensing\ncompliance. In the context of steganography, the LLM serves as a carrier for\nhidden messages that can be disclosed through a chosen trigger question.\n Our work demonstrates that embedding hidden text in the LLM via fine-tuning,\nthough seemingly secure due to the vast number of potential triggers (any\nsequence of characters or tokens could serve as a trigger), is susceptible to\nextraction through analysis of the LLM's output decoding process. We propose an\nextraction attack called Unconditional Token Forcing (UTF). It is premised on\nthe hypothesis that iteratively feeding each token from the LLM's vocabulary\ninto the model should reveal output sequences with abnormally high token\nprobabilities, indicating potential hidden text candidates. We also present a\ndefense method to hide text in such a way that it is resistant to both UTF and\nattacks based on sampling decoding methods, which we named Unconditional Token\nForcing Confusion (UTFC). To the best of our knowledge, there is no attack\nmethod that can extract text hidden with UTFC. UTFC has both benign\napplications (improving LLM fingerprinting) and malign applications (using LLMs\nto create covert communication channels). Code is available at\ngithub.com/j-hoscilowic/zurek-stegano\n","authors":["Jakub Hoscilowicz","Pawel Popiolek","Jan Rudkowski","Jedrzej Bieniasz","Artur Janicki"],"pdf_url":"https://arxiv.org/pdf/2406.02481v2.pdf","comment":"Work in progress. Code is available at\n https://github.com/j-hoscilowic/zurek-stegano"},{"id":"http://arxiv.org/abs/2309.00237v4","updated":"2024-07-29T15:52:22Z","published":"2023-09-01T04:01:20Z","title":"Publicly Shareable Clinical Large Language Model Built on Synthetic\n Clinical Notes","summary":" The development of large language models tailored for handling patients'\nclinical notes is often hindered by the limited accessibility and usability of\nthese notes due to strict privacy regulations. To address these challenges, we\nfirst create synthetic large-scale clinical notes using publicly available case\nreports extracted from biomedical literature. We then use these synthetic notes\nto train our specialized clinical large language model, Asclepius. While\nAsclepius is trained on synthetic data, we assess its potential performance in\nreal-world applications by evaluating it using real clinical notes. 
We\nbenchmark Asclepius against several other large language models, including\nGPT-3.5-turbo and other open-source alternatives. To further validate our\napproach using synthetic notes, we also compare Asclepius with its variants\ntrained on real clinical notes. Our findings convincingly demonstrate that\nsynthetic clinical notes can serve as viable substitutes for real ones when\nconstructing high-performing clinical language models. This conclusion is\nsupported by detailed evaluations conducted by both GPT-4 and medical\nprofessionals. All resources including weights, codes, and data used in the\ndevelopment of Asclepius are made publicly accessible for future research.\n(https://github.com/starmpcc/Asclepius)\n","authors":["Sunjun Kweon","Junu Kim","Jiyoun Kim","Sujeong Im","Eunbyeol Cho","Seongsu Bae","Jungwoo Oh","Gyubok Lee","Jong Hak Moon","Seng Chan You","Seungjin Baek","Chang Hoon Han","Yoon Bin Jung","Yohan Jo","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2309.00237v4.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2407.20083v1","updated":"2024-07-29T15:07:19Z","published":"2024-07-29T15:07:19Z","title":"An Energy-based Model for Word-level AutoCompletion in Computer-aided\n Translation","summary":" Word-level AutoCompletion(WLAC) is a rewarding yet challenging task in\nComputer-aided Translation. Existing work addresses this task through a\nclassification model based on a neural network that maps the hidden vector of\nthe input context into its corresponding label (i.e., the candidate target word\nis treated as a label). Since the context hidden vector itself does not take\nthe label into account and it is projected to the label through a linear\nclassifier, the model can not sufficiently leverage valuable information from\nthe source sentence as verified in our experiments, which eventually hinders\nits overall performance. To alleviate this issue, this work proposes an\nenergy-based model for WLAC, which enables the context hidden vector to capture\ncrucial information from the source sentence. Unfortunately, training and\ninference suffer from efficiency and effectiveness challenges, thereby we\nemploy three simple yet effective strategies to put our model into practice.\nExperiments on four standard benchmarks demonstrate that our reranking-based\napproach achieves substantial improvements (about 6.07%) over the previous\nstate-of-the-art model. Further analyses show that each strategy of our\napproach contributes to the final performance.\n","authors":["Cheng Yang","Guoping Huang","Mo Yu","Zhirui Zhang","Siheng Li","Mingming Yang","Shuming Shi","Yujiu Yang","Lemao Liu"],"pdf_url":"https://arxiv.org/pdf/2407.20083v1.pdf","comment":"Accepted to TACL 2024"},{"id":"http://arxiv.org/abs/2407.20076v1","updated":"2024-07-29T15:02:51Z","published":"2024-07-29T15:02:51Z","title":"Investigating the Impact of Semi-Supervised Methods with Data\n Augmentation on Offensive Language Detection in Romanian Language","summary":" Offensive language detection is a crucial task in today's digital landscape,\nwhere online platforms grapple with maintaining a respectful and inclusive\nenvironment. However, building robust offensive language detection models\nrequires large amounts of labeled data, which can be expensive and\ntime-consuming to obtain. Semi-supervised learning offers a feasible solution\nby utilizing labeled and unlabeled data to create more accurate and robust\nmodels. 
In this paper, we explore a few different semi-supervised methods, as\nwell as data augmentation techniques. Concretely, we implemented eight\nsemi-supervised methods and ran experiments for them using only the available\ndata in the RO-Offense dataset and applying five augmentation techniques before\nfeeding the data to the models. Experimental results demonstrate that some of\nthem benefit more from augmentations than others.\n","authors":["Elena Beatrice Nicola","Dumitru Clementin Cercel","Florin Pop"],"pdf_url":"https://arxiv.org/pdf/2407.20076v1.pdf","comment":"10 pages, 3 figures, 28th International Conference on Knowledge-Based\n and Intelligent Information & Engineering Systems"},{"id":"http://arxiv.org/abs/2407.20046v1","updated":"2024-07-29T14:30:39Z","published":"2024-07-29T14:30:39Z","title":"Exploring Large Language Models to generate Easy to Read content","summary":" Ensuring text accessibility and understandability are essential goals,\nparticularly for individuals with cognitive impairments and intellectual\ndisabilities, who encounter challenges in accessing information across various\nmediums such as web pages, newspapers, administrative tasks, or health\ndocuments. Initiatives like Easy to Read and Plain Language guidelines aim to\nsimplify complex texts; however, standardizing these guidelines remains\nchallenging and often involves manual processes. This work presents an\nexploratory investigation into leveraging Artificial Intelligence (AI) and\nNatural Language Processing (NLP) approaches to systematically simplify Spanish\ntexts into Easy to Read formats, with a focus on utilizing Large Language\nModels (LLMs) for simplifying texts, especially in generating Easy to Read\ncontent. The study contributes a parallel corpus of Spanish adapted for Easy To\nRead format, which serves as a valuable resource for training and testing text\nsimplification systems. Additionally, several text simplification experiments\nusing LLMs and the collected corpus are conducted, involving fine-tuning and\ntesting a Llama2 model to generate Easy to Read content. A qualitative\nevaluation, guided by an expert in text adaptation for Easy to Read content, is\ncarried out to assess the automatically simplified texts. This research\ncontributes to advancing text accessibility for individuals with cognitive\nimpairments, highlighting promising strategies for leveraging LLMs while\nresponsibly managing energy usage.\n","authors":["Paloma Martínez","Lourdes Moreno","Alberto Ramos"],"pdf_url":"https://arxiv.org/pdf/2407.20046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00326v3","updated":"2024-07-29T13:40:11Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging LLM Agents for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM agents have\nrevolutionised data engineering and have been applied creatively in many\ndomains, their potential for OM remains underexplored. This study introduces a\nnovel agent-powered LLM-based design paradigm for OM systems. With\nconsideration of several specific challenges in leveraging LLM agents for OM,\nwe propose a generic framework, namely Agent-OM (w.r.t. 
Agent for Ontology\nMatching), consisting of two Siamese agents for retrieval and matching, with a\nset of simple OM tools. Our framework is implemented in a proof-of-concept\nsystem. Evaluations of three Ontology Alignment Evaluation Initiative (OAEI)\ntracks over state-of-the-art OM systems show that our system can achieve\nresults very close to the long-standing best performance on simple OM tasks and\ncan significantly improve the performance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v3.pdf","comment":"19 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.19998v1","updated":"2024-07-29T13:29:43Z","published":"2024-07-29T13:29:43Z","title":"Do LLMs Really Adapt to Domains? An Ontology Learning Perspective","summary":" Large Language Models (LLMs) have demonstrated unprecedented prowess across\nvarious natural language processing tasks in various application domains.\nRecent studies show that LLMs can be leveraged to perform lexical semantic\ntasks, such as Knowledge Base Completion (KBC) or Ontology Learning (OL).\nHowever, it has not effectively been verified whether their success is due to\ntheir ability to reason over unstructured or semi-structured data, or their\neffective learning of linguistic patterns and senses alone. This unresolved\nquestion is particularly crucial when dealing with domain-specific data, where\nthe lexical senses and their meaning can completely differ from what a LLM has\nlearned during its training stage. This paper investigates the following\nquestion: Do LLMs really adapt to domains and remain consistent in the\nextraction of structured knowledge, or do they only learn lexical senses\ninstead of reasoning? To answer this question and, we devise a controlled\nexperiment setup that uses WordNet to synthesize parallel corpora, with English\nand gibberish terms. We examine the differences in the outputs of LLMs for each\ncorpus in two OL tasks: relation extraction and taxonomy discovery. Empirical\nresults show that, while adapting to the gibberish corpora, off-the-shelf LLMs\ndo not consistently reason over semantic relationships between concepts, and\ninstead leverage senses and their frame. However, fine-tuning improves the\nperformance of LLMs on lexical semantic tasks even when the domain-specific\nterms are arbitrary and unseen during pre-training, hinting at the\napplicability of pre-trained LLMs for OL.\n","authors":["Huu Tan Mai","Cuong Xuan Chu","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2407.19998v1.pdf","comment":"Accepted at ISWC 2024"},{"id":"http://arxiv.org/abs/2407.19984v1","updated":"2024-07-29T13:18:23Z","published":"2024-07-29T13:18:23Z","title":"Confidence Estimation for Automatic Detection of Depression and\n Alzheimer's Disease Based on Clinical Interviews","summary":" Speech-based automatic detection of Alzheimer's disease (AD) and depression\nhas attracted increased attention. Confidence estimation is crucial for a\ntrust-worthy automatic diagnostic system which informs the clinician about the\nconfidence of model predictions and helps reduce the risk of misdiagnosis. This\npaper investigates confidence estimation for automatic detection of AD and\ndepression based on clinical interviews. A novel Bayesian approach is proposed\nwhich uses a dynamic Dirichlet prior distribution to model the second-order\nprobability of the predictive distribution. 
Experimental results on the\npublicly available ADReSS and DAIC-WOZ datasets demonstrate that the proposed\nmethod outperforms a range of baselines for both classification accuracy and\nconfidence estimation.\n","authors":["Wen Wu","Chao Zhang","Philip C. Woodland"],"pdf_url":"https://arxiv.org/pdf/2407.19984v1.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.10908v2","updated":"2024-07-29T13:05:00Z","published":"2024-06-16T12:11:46Z","title":"MICL: Improving In-Context Learning through Multiple-Label Words in\n Demonstration","summary":" In-context learning (ICL) enables large language models (LLMs) to perform new\ntasks by using sample-label pairs as demonstrations. However, variations in\ndemonstrations can lead to significantly different performances. Current\nresearch mainly focuses on selecting demonstration samples, preassuming the\nclass name to be the label word when creating sample-label pairs. However, the\nchoice of label words is crucial for ICL performance. In addition, we observe\nthat using a single class name in demonstration may not yield optimal results.\nIn this paper, we propose to use multiple label words in one sample-label pair\nto enhance ICL performance. Further, we select and order sample-label pairs\nbased on LLM's output distribution, aiming to optimize the demonstration\nexamples from both the samples' and labels' perspectives. Evaluation results on\nseven classification datasets show that the use of multiple label words,\nstrategically organized by their selection, order and quantity, improves ICL\nperformance through diverse label information.\n","authors":["Zhu Zixiao","Feng Zijian","Zhou Hanzhang","Qian Junlang","Mao Kezhi"],"pdf_url":"https://arxiv.org/pdf/2406.10908v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.19967v1","updated":"2024-07-29T13:00:36Z","published":"2024-07-29T13:00:36Z","title":"A Temporal Psycholinguistics Approach to Identity Resolution of Social\n Media Users","summary":" In this thesis, we propose an approach to identity resolution across social\nmedia platforms using the topics, sentiments, and timings of the posts on the\nplatforms. After collecting the public posts of around 5000 profiles from\nDisqus and Twitter, we analyze their posts to match their profiles across the\ntwo platforms. We pursue both temporal and non-temporal methods in our\nanalysis. While neither approach proves definitively superior, the temporal\napproach generally performs better. We found that the temporal window size\ninfluences results more than the shifting amount. On the other hand, our\nsentiment analysis shows that the inclusion of sentiment makes little\ndifference, probably due to flawed data extraction methods. We also\nexperimented with a distance-based reward-and-punishment-focused scoring model,\nwhich achieved an accuracy of 24.198% and an average rank of 158.217 out of\n2525 in our collected corpus. 
Future work includes refining sentiment analysis\nby evaluating sentiments per topic, extending temporal analysis with additional\nphases, and improving the scoring model through weight adjustments and modified\nrewards.\n","authors":["Md Touhidul Islam"],"pdf_url":"https://arxiv.org/pdf/2407.19967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19947v1","updated":"2024-07-29T12:29:29Z","published":"2024-07-29T12:29:29Z","title":"Inference acceleration for large language models using \"stairs\" assisted\n greedy generation","summary":" Large Language Models (LLMs) with billions of parameters are known for their\nimpressive predicting capabilities but require lots of resources to run. With\ntheir massive rise in popularity, even a small reduction in required resources\ncould have an impact on environment. On the other hand, smaller models require\nfewer resources but may sacrifice accuracy. In this work, we are proposing an\nimplementation of ``stairs'' assisted greedy generation. It is a modified\nassisted generation methodology that makes use of a smaller model's fast\ngeneration, large model's batch prediction, and \"stairs\" validation in order to\nachieve a speed up in prediction generation. Results show between 9.58 and\n17.24 percent inference time reduction compared to a stand-alone large LLM\nprediction in a text generation task without a loss in accuracy.\n","authors":["Domas Grigaliūnas","Mantas Lukoševičius"],"pdf_url":"https://arxiv.org/pdf/2407.19947v1.pdf","comment":"Accepted at the 29th International Conference on Information Society\n and University Studies (IVUS 2024)"},{"id":"http://arxiv.org/abs/2405.16247v2","updated":"2024-07-29T12:16:56Z","published":"2024-05-25T14:11:44Z","title":"AutoManual: Generating Instruction Manuals by LLM Agents via Interactive\n Environmental Learning","summary":" Large Language Models (LLM) based agents have shown promise in autonomously\ncompleting tasks across various domains, e.g., robotics, games, and web\nnavigation. However, these agents typically require elaborate design and expert\nprompts to solve tasks in specific domains, which limits their adaptability. We\nintroduce AutoManual, a framework enabling LLM agents to autonomously build\ntheir understanding through interaction and adapt to new environments.\nAutoManual categorizes environmental knowledge into diverse rules and optimizes\nthem in an online fashion by two agents: 1) The Planner codes actionable plans\nbased on current rules for interacting with the environment. 2) The Builder\nupdates the rules through a well-structured rule system that facilitates online\nrule management and essential detail retention. To mitigate hallucinations in\nmanaging rules, we introduce a case-conditioned prompting strategy for the\nBuilder. Finally, the Formulator agent compiles these rules into a\ncomprehensive manual. The self-generated manual can not only improve the\nadaptability but also guide the planning of smaller LLMs while being\nhuman-readable. Given only one simple demonstration, AutoManual significantly\nimproves task success rates, achieving 97.4\\% with GPT-4-turbo and 86.2\\% with\nGPT-3.5-turbo on ALFWorld benchmark tasks. 
The code is available at\nhttps://github.com/minghchen/automanual.\n","authors":["Minghao Chen","Yihang Li","Yanting Yang","Shiyu Yu","Binbin Lin","Xiaofei He"],"pdf_url":"https://arxiv.org/pdf/2405.16247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05574v3","updated":"2024-07-29T12:05:36Z","published":"2024-02-26T09:10:34Z","title":"HealMe: Harnessing Cognitive Reframing in Large Language Models for\n Psychotherapy","summary":" Large Language Models (LLMs) can play a vital role in psychotherapy by\nadeptly handling the crucial task of cognitive reframing and overcoming\nchallenges such as shame, distrust, therapist skill variability, and resource\nscarcity. Previous LLMs in cognitive reframing mainly converted negative\nemotions to positive ones, but these approaches have limited efficacy, often\nnot promoting clients' self-discovery of alternative perspectives. In this\npaper, we unveil the Helping and Empowering through Adaptive Language in Mental\nEnhancement (HealMe) model. This novel cognitive reframing therapy method\neffectively addresses deep-rooted negative thoughts and fosters rational,\nbalanced perspectives. Diverging from traditional LLM methods, HealMe employs\nempathetic dialogue based on psychotherapeutic frameworks. It systematically\nguides clients through distinguishing circumstances from feelings,\nbrainstorming alternative viewpoints, and developing empathetic, actionable\nsuggestions. Moreover, we adopt the first comprehensive and expertly crafted\npsychological evaluation metrics, specifically designed to rigorously assess\nthe performance of cognitive reframing, in both AI-simulated dialogues and\nreal-world therapeutic conversations. Experimental results show that our model\noutperforms others in terms of empathy, guidance, and logical coherence,\ndemonstrating its effectiveness and potential positive impact on psychotherapy.\n","authors":["Mengxi Xiao","Qianqian Xie","Ziyan Kuang","Zhicheng Liu","Kailai Yang","Min Peng","Weiguang Han","Jimin Huang"],"pdf_url":"https://arxiv.org/pdf/2403.05574v3.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.19914v1","updated":"2024-07-29T11:44:21Z","published":"2024-07-29T11:44:21Z","title":"Sentiment Analysis of Lithuanian Online Reviews Using Large Language\n Models","summary":" Sentiment analysis is a widely researched area within Natural Language\nProcessing (NLP), attracting significant interest due to the advent of\nautomated solutions. Despite this, the task remains challenging because of the\ninherent complexity of languages and the subjective nature of sentiments. It is\neven more challenging for less-studied and less-resourced languages such as\nLithuanian. Our review of existing Lithuanian NLP research reveals that\ntraditional machine learning methods and classification algorithms have limited\neffectiveness for the task. In this work, we address sentiment analysis of\nLithuanian five-star-based online reviews from multiple domains that we collect\nand clean. We apply transformer models to this task for the first time,\nexploring the capabilities of pre-trained multilingual Large Language Models\n(LLMs), specifically focusing on fine-tuning BERT and T5 models. Given the\ninherent difficulty of the task, the fine-tuned models perform quite well,\nespecially when the sentiments themselves are less ambiguous: 80.74% and 89.61%\ntesting recognition accuracy of the most popular one- and five-star reviews\nrespectively. 
They significantly outperform current commercial state-of-the-art\ngeneral-purpose LLM GPT-4. We openly share our fine-tuned LLMs online.\n","authors":["Brigita Vileikytė","Mantas Lukoševičius","Lukas Stankevičius"],"pdf_url":"https://arxiv.org/pdf/2407.19914v1.pdf","comment":"Accepted at the 29th International Conference on Information Society\n and University Studies (IVUS 2024)"},{"id":"http://arxiv.org/abs/2407.19897v1","updated":"2024-07-29T11:21:17Z","published":"2024-07-29T11:21:17Z","title":"BEExAI: Benchmark to Evaluate Explainable AI","summary":" Recent research in explainability has given rise to numerous post-hoc\nattribution methods aimed at enhancing our comprehension of the outputs of\nblack-box machine learning models. However, evaluating the quality of\nexplanations lacks a cohesive approach and a consensus on the methodology for\nderiving quantitative metrics that gauge the efficacy of explainability\npost-hoc attribution methods. Furthermore, with the development of increasingly\ncomplex deep learning models for diverse data applications, the need for a\nreliable way of measuring the quality and correctness of explanations is\nbecoming critical. We address this by proposing BEExAI, a benchmark tool that\nallows large-scale comparison of different post-hoc XAI methods, employing a\nset of selected evaluation metrics.\n","authors":["Samuel Sithakoul","Sara Meftah","Clément Feutry"],"pdf_url":"https://arxiv.org/pdf/2407.19897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05590v2","updated":"2024-07-29T11:20:42Z","published":"2024-04-08T15:03:57Z","title":"MedExpQA: Multilingual Benchmarking of Large Language Models for Medical\n Question Answering","summary":" Large Language Models (LLMs) have the potential of facilitating the\ndevelopment of Artificial Intelligence technology to assist medical experts for\ninteractive decision support, which has been demonstrated by their competitive\nperformances in Medical QA. However, while impressive, the required quality bar\nfor medical applications remains far from being achieved. Currently, LLMs\nremain challenged by outdated knowledge and by their tendency to generate\nhallucinated content. Furthermore, most benchmarks to assess medical knowledge\nlack reference gold explanations which means that it is not possible to\nevaluate the reasoning of LLMs predictions. Finally, the situation is\nparticularly grim if we consider benchmarking LLMs for languages other than\nEnglish which remains, as far as we know, a totally neglected topic. In order\nto address these shortcomings, in this paper we present MedExpQA, the first\nmultilingual benchmark based on medical exams to evaluate LLMs in Medical\nQuestion Answering. To the best of our knowledge, MedExpQA includes for the\nfirst time reference gold explanations written by medical doctors which can be\nleveraged to establish various gold-based upper-bounds for comparison with LLMs\nperformance. Comprehensive multilingual experimentation using both the gold\nreference explanations and Retrieval Augmented Generation (RAG) approaches show\nthat performance of LLMs still has large room for improvement, especially for\nlanguages other than English. Furthermore, and despite using state-of-the-art\nRAG methods, our results also demonstrate the difficulty of obtaining and\nintegrating readily available medical knowledge that may positively impact\nresults on downstream evaluations for Medical Question Answering. 
So far the\nbenchmark is available in four languages, but we hope that this work may\nencourage further development to other languages.\n","authors":["Iñigo Alonso","Maite Oronoz","Rodrigo Agerri"],"pdf_url":"https://arxiv.org/pdf/2404.05590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19884v1","updated":"2024-07-29T11:01:17Z","published":"2024-07-29T11:01:17Z","title":"Preliminary WMT24 Ranking of General MT Systems and LLMs","summary":" This is the preliminary ranking of WMT24 General MT systems based on\nautomatic metrics. The official ranking will be a human evaluation, which is\nsuperior to the automatic ranking and supersedes it. The purpose of this report\nis not to interpret any findings but only provide preliminary results to the\nparticipants of the General MT task that may be useful during the writing of\nthe system submission.\n","authors":["Tom Kocmi","Eleftherios Avramidis","Rachel Bawden","Ondrej Bojar","Anton Dvorkovich","Christian Federmann","Mark Fishel","Markus Freitag","Thamme Gowda","Roman Grundkiewicz","Barry Haddow","Marzena Karpinska","Philipp Koehn","Benjamin Marie","Kenton Murray","Masaaki Nagata","Martin Popel","Maja Popovic","Mariya Shmatova","Steinþór Steingrímsson","Vilém Zouhar"],"pdf_url":"https://arxiv.org/pdf/2407.19884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19842v1","updated":"2024-07-29T09:55:34Z","published":"2024-07-29T09:55:34Z","title":"Detecting and Understanding Vulnerabilities in Language Models via\n Mechanistic Interpretability","summary":" Large Language Models (LLMs), characterized by being trained on broad amounts\nof data in a self-supervised manner, have shown impressive performance across a\nwide range of tasks. Indeed, their generative abilities have aroused interest\non the application of LLMs across a wide range of contexts. However, neural\nnetworks in general, and LLMs in particular, are known to be vulnerable to\nadversarial attacks, where an imperceptible change to the input can mislead the\noutput of the model. This is a serious concern that impedes the use of LLMs on\nhigh-stakes applications, such as healthcare, where a wrong prediction can\nimply serious consequences. Even though there are many efforts on making LLMs\nmore robust to adversarial attacks, there are almost no works that study\n\\emph{how} and \\emph{where} these vulnerabilities that make LLMs prone to\nadversarial attacks happen. Motivated by these facts, we explore how to\nlocalize and understand vulnerabilities, and propose a method, based on\nMechanistic Interpretability (MI) techniques, to guide this process.\nSpecifically, this method enables us to detect vulnerabilities related to a\nconcrete task by (i) obtaining the subset of the model that is responsible for\nthat task, (ii) generating adversarial samples for that task, and (iii) using\nMI techniques together with the previous samples to discover and understand the\npossible vulnerabilities. 
We showcase our method on a pretrained GPT-2 Small\nmodel carrying out the task of predicting 3-letter acronyms to demonstrate its\neffectiveness on locating and understanding concrete vulnerabilities of the\nmodel.\n","authors":["Jorge García-Carrasco","Alejandro Maté","Juan Trujillo"],"pdf_url":"https://arxiv.org/pdf/2407.19842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19835v1","updated":"2024-07-29T09:45:34Z","published":"2024-07-29T09:45:34Z","title":"ATHAR: A High-Quality and Diverse Dataset for Classical Arabic to\n English Translation","summary":" Classical Arabic represents a significant era, encompassing the golden age of\nArab culture, philosophy, and scientific literature. With a broad consensus on\nthe importance of translating these literatures to enrich knowledge\ndissemination across communities, the advent of large language models (LLMs)\nand translation systems offers promising tools to facilitate this goal.\nHowever, we have identified a scarcity of translation datasets in Classical\nArabic, which are often limited in scope and topics, hindering the development\nof high-quality translation systems. In response, we present the ATHAR dataset,\ncomprising 66,000 high-quality Classical Arabic to English translation samples\nthat cover a wide array of subjects including science, culture, and philosophy.\nFurthermore, we assess the performance of current state-of-the-art LLMs under\nvarious settings, concluding that there is a need for such datasets in current\nsystems. Our findings highlight how models can benefit from fine-tuning or\nincorporating this dataset into their pretraining pipelines. The dataset is\npublicly available on the HuggingFace Data Hub at\n\\url{https://huggingface.co/datasets/mohamed-khalil/ATHAR}.\n","authors":["Mohammed Khalil","Mohammed Sabry"],"pdf_url":"https://arxiv.org/pdf/2407.19835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19832v1","updated":"2024-07-29T09:38:15Z","published":"2024-07-29T09:38:15Z","title":"ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2","summary":" Multimodal Large Language Models (MLLMs) have attracted much attention due to\ntheir multifunctionality. However, traditional Transformer architectures incur\nsignificant overhead due to their secondary computational complexity. To\naddress this issue, we introduce ML-Mamba, a multimodal language model that\nutilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known\nfor its linear extension and fast processing of long sequences. We replace the\nTransformer based backbone with a pre-trained Mamba-2 model and explore methods\nfor integrating 2D visual selective scanning mechanisms into multimodal\nlearning. We also try various visual encoders and Mamba-2 model variants. Our\nextensive experiments conducted in various multimodal benchmark tests have\ndemonstrated the competitive performance of ML-Mamba and highlighted the\npotential of state space models in multimodal tasks. 
The experimental results\nshow that: (1) ML-Mamba achieves performance comparable to state-of-the-art\nmethods such as TinyLaVA and MobileVLM v2 through its linear sequential\nmodeling, while also having faster inference speed; (2) ML-Mamba performs well\nin visual hallucinations and spatial relationship judgment in closed set\nbenchmark tests; (3) ML-Mamba achieves performance comparable to LLaVA while\nreducing the number of parameters by 40\\%.(4) Compared to the multimodal model\nusing the original Mamba model, the Mamba-2 based large-scale multimodal\nlanguage model has stronger inference performance and effectiveness.\n","authors":["Wenjun Huang","Jianguo Hu"],"pdf_url":"https://arxiv.org/pdf/2407.19832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17900v2","updated":"2024-07-29T09:33:01Z","published":"2024-07-25T09:42:24Z","title":"The Power of Combining Data and Knowledge: GPT-4o is an Effective\n Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of\n Lung Cancer","summary":" Lymph node metastasis (LNM) is a crucial factor in determining the initial\ntreatment for patients with lung cancer, yet accurate preoperative diagnosis of\nLNM remains challenging. Recently, large language models (LLMs) have garnered\nsignificant attention due to their remarkable text generation capabilities.\nLeveraging the extensive medical knowledge learned from vast corpora, LLMs can\nestimate probabilities for clinical problems, though their performance has\nhistorically been inferior to data-driven machine learning models. In this\npaper, we propose a novel ensemble method that combines the medical knowledge\nacquired by LLMs with the latent patterns identified by machine learning models\nto enhance LNM prediction performance. Initially, we developed machine learning\nmodels using patient data. We then designed a prompt template to integrate the\npatient data with the predicted probability from the machine learning model.\nSubsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,\nto estimate the likelihood of LNM based on patient data and then adjust the\nestimate using the machine learning output. Finally, we collected three outputs\nfrom the GPT-4o using the same prompt and ensembled these results as the final\nprediction. Using the proposed method, our models achieved an AUC value of\n0.765 and an AP value of 0.415 for LNM prediction, significantly improving\npredictive performance compared to baseline machine learning models. The\nexperimental results indicate that GPT-4o can effectively leverage its medical\nknowledge and the probabilities predicted by machine learning models to achieve\nmore accurate LNM predictions. 
These findings demonstrate that LLMs can perform\nwell in clinical risk prediction tasks, offering a new paradigm for integrating\nmedical knowledge and patient data in clinical predictions.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19825v1","updated":"2024-07-29T09:21:52Z","published":"2024-07-29T09:21:52Z","title":"Concise Thoughts: Impact of Output Length on LLM Reasoning and Cost","summary":" Today's large language models (LLMs) can solve challenging question-answering\ntasks, and prompt engineering techniques, such as chain-of-thought (CoT), have\ngained attention for enhancing the explanation and correctness of outputs.\nNevertheless, models require significant time to generate answers augmented\nwith lengthy reasoning details. To address this issue, this paper analyzes the\nimpact of output lengths on LLM inference pipelines and proposes novel metrics\nto evaluate them in terms of \\textit{correct conciseness}. It also examines the\nimpact of controlling output length through a refined prompt engineering\nstrategy, Constrained-CoT (CCoT), which encourages the model to limit output\nlength. Experiments on pre-trained LLMs demonstrated the benefit of the\nproposed metrics and the effectiveness of CCoT across different models. For\ninstance, constraining the reasoning of LLaMA2-70b to 100 words improves the\naccuracy from 36.01\\% (CoT) to 41.07\\% (CCoT) on the GSM8K dataset, while\nreducing the average output length by 28 words.\n","authors":["Sania Nayab","Giulio Rossolini","Giorgio Buttazzo","Nicolamaria Manes","Fabrizio Giacomelli"],"pdf_url":"https://arxiv.org/pdf/2407.19825v1.pdf","comment":"Preprint version, under review"},{"id":"http://arxiv.org/abs/2407.19816v1","updated":"2024-07-29T09:08:40Z","published":"2024-07-29T09:08:40Z","title":"Comparative Analysis of Encoder-Based NER and Large Language Models for\n Skill Extraction from Russian Job Vacancies","summary":" The labor market is undergoing rapid changes, with increasing demands on job\nseekers and a surge in job openings. Identifying essential skills and\ncompetencies from job descriptions is challenging due to varying employer\nrequirements and the omission of key skills. This study addresses these\nchallenges by comparing traditional Named Entity Recognition (NER) methods\nbased on encoders with Large Language Models (LLMs) for extracting skills from\nRussian job vacancies. Using a labeled dataset of 4,000 job vacancies for\ntraining and 1,472 for testing, the performance of both approaches is\nevaluated. Results indicate that traditional NER models, especially DeepPavlov\nRuBERT NER tuned, outperform LLMs across various metrics including accuracy,\nprecision, recall, and inference time. The findings suggest that traditional\nNER models provide more effective and efficient solutions for skill extraction,\nenhancing job requirement clarity and aiding job seekers in aligning their\nqualifications with employer expectations. 
This research contributes to the\nfield of natural language processing (NLP) and its application in the labor\nmarket, particularly in non-English contexts.\n","authors":["Nikita Matkin","Aleksei Smirnov","Mikhail Usanin","Egor Ivanov","Kirill Sobyanin","Sofiia Paklina","Petr Parshakov"],"pdf_url":"https://arxiv.org/pdf/2407.19816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19813v1","updated":"2024-07-29T09:05:10Z","published":"2024-07-29T09:05:10Z","title":"Improving Retrieval Augmented Language Model with Self-Reasoning","summary":" The Retrieval-Augmented Language Model (RALM) has shown remarkable\nperformance on knowledge-intensive tasks by incorporating external knowledge\nduring inference, which mitigates the factual hallucinations inherited in large\nlanguage models (LLMs). Despite these advancements, challenges persist in the\nimplementation of RALMs, particularly concerning their reliability and\ntraceability. To be specific, the irrelevant document retrieval may result in\nunhelpful response generation or even deteriorate the performance of LLMs,\nwhile the lack of proper citations in generated outputs complicates efforts to\nverify the trustworthiness of the models. To this end, we propose a novel\nself-reasoning framework aimed at improving the reliability and traceability of\nRALMs, whose core idea is to leverage reasoning trajectories generated by the\nLLM itself. The framework involves constructing self-reason trajectories with\nthree processes: a relevance-aware process, an evidence-aware selective\nprocess, and a trajectory analysis process. We have evaluated our framework\nacross four public datasets (two short-form QA datasets, one long-form QA\ndataset, and one fact verification dataset) to demonstrate the superiority of\nour method, which can outperform existing state-of-art models and can achieve\ncomparable performance with GPT-4, while only using 2,000 training samples.\n","authors":["Yuan Xia","Jingbo Zhou","Zhenhui Shi","Jun Chen","Haifeng Huang"],"pdf_url":"https://arxiv.org/pdf/2407.19813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19808v1","updated":"2024-07-29T09:02:38Z","published":"2024-07-29T09:02:38Z","title":"Segmentation en phrases : ouvrez les guillemets sans perdre le fil","summary":" This paper presents a graph cascade for sentence segmentation of XML\ndocuments. Our proposal offers sentences inside sentences for cases introduced\nby quotation marks and hyphens, and also pays particular attention to\nsituations involving incises introduced by parentheses and lists introduced by\ncolons. We present how the tool works and compare the results obtained with\nthose available in 2019 on the same dataset, together with an evaluation of the\nsystem's performance on a test corpus\n","authors":["Sandrine Ollinger","Denis Maurel"],"pdf_url":"https://arxiv.org/pdf/2407.19808v1.pdf","comment":"in French language"},{"id":"http://arxiv.org/abs/2407.19807v1","updated":"2024-07-29T09:02:19Z","published":"2024-07-29T09:02:19Z","title":"Cool-Fusion: Fuse Large Language Models without Training","summary":" We focus on the problem of fusing two or more heterogeneous large language\nmodels (LLMs) to facilitate their complementary strengths. One of the\nchallenges on model fusion is high computational load, i.e. to fine-tune or to\nalign vocabularies via combinatorial optimization. 
To this end, we propose\n\\emph{Cool-Fusion}, a simple yet effective approach that fuses the knowledge of\nheterogeneous source LLMs to leverage their complementary strengths.\n\\emph{Cool-Fusion} is the first method that does not require any type of\ntraining like the ensemble approaches. But unlike ensemble methods, it is\napplicable to any set of source LLMs that have different vocabularies. The\nbasic idea is to have each source LLM individually generate tokens until the\ntokens can be decoded into a text segment that ends at word boundaries common\nto all source LLMs. Then, the source LLMs jointly rerank the generated text\nsegment and select the best one, which is the fused text generation in one\nstep. Extensive experiments are conducted across a variety of benchmark\ndatasets. On \\emph{GSM8K}, \\emph{Cool-Fusion} increases accuracy from three\nstrong source LLMs by a significant 8\\%-17.8\\%.\n","authors":["Cong Liu","Xiaojun Quan","Yan Pan","Liang Lin","Weigang Wu","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2407.19807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19798v1","updated":"2024-07-29T08:43:48Z","published":"2024-07-29T08:43:48Z","title":"Teaching LLMs at Charles University: Assignments and Activities","summary":" This paper presents teaching materials, particularly assignments and ideas\nfor classroom activities, from a new course on large language models (LLMs)\ntaught at Charles University. The assignments include experiments with LLM\ninference for weather report generation and machine translation. The classroom\nactivities include class quizzes, focused research on downstream tasks and\ndatasets, and an interactive \"best paper\" session aimed at reading and\ncomprehension of research papers.\n","authors":["Jindřich Helcl","Zdeněk Kasner","Ondřej Dušek","Tomasz Limisiewicz","Dominik Macháček","Tomáš Musil","Jindřich Libovický"],"pdf_url":"https://arxiv.org/pdf/2407.19798v1.pdf","comment":"6th TeachNLP workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2407.19795v1","updated":"2024-07-29T08:38:46Z","published":"2024-07-29T08:38:46Z","title":"VolDoGer: LLM-assisted Datasets for Domain Generalization in\n Vision-Language Tasks","summary":" Domain generalizability is a crucial aspect of a deep learning model since it\ndetermines the capability of the model to perform well on data from unseen\ndomains. However, research on the domain generalizability of deep learning\nmodels for vision-language tasks remains limited, primarily because of the lack\nof required datasets. To address these challenges, we propose VolDoGer:\nVision-Language Dataset for Domain Generalization, a dedicated dataset designed\nfor domain generalization that addresses three vision-language tasks: image\ncaptioning, visual question answering, and visual entailment. We constructed\nVolDoGer by extending LLM-based data annotation techniques to vision-language\ntasks, thereby alleviating the burden of recruiting human annotators. 
We\nevaluated the domain generalizability of various models, ranging from\nfine-tuned models to a recent multimodal large language model, through\nVolDoGer.\n","authors":["Juhwan Choi","Junehyoung Kwon","JungMin Yun","Seunguk Yu","YoungBin Kim"],"pdf_url":"https://arxiv.org/pdf/2407.19795v1.pdf","comment":"31 pages, 5 figures, 20 tables"},{"id":"http://arxiv.org/abs/2407.19794v1","updated":"2024-07-29T08:38:14Z","published":"2024-07-29T08:38:14Z","title":"Introducing a new hyper-parameter for RAG: Context Window Utilization","summary":" This paper introduces a new hyper-parameter for Retrieval-Augmented\nGeneration (RAG) systems called Context Window Utilization. RAG systems enhance\ngenerative models by incorporating relevant information retrieved from external\nknowledge bases, improving the factual accuracy and contextual relevance of\ngenerated responses. The size of the text chunks retrieved and processed is a\ncritical factor influencing RAG performance. This study aims to identify the\noptimal chunk size that maximizes answer generation quality. Through systematic\nexperimentation, we analyze the effects of varying chunk sizes on the\nefficiency and effectiveness of RAG frameworks. Our findings reveal that an\noptimal chunk size balances the trade-off between providing sufficient context\nand minimizing irrelevant information. These insights are crucial for enhancing\nthe design and implementation of RAG systems, underscoring the importance of\nselecting an appropriate chunk size to achieve superior performance.\n","authors":["Kush Juvekar","Anupam Purwar"],"pdf_url":"https://arxiv.org/pdf/2407.19794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14873v3","updated":"2024-07-29T08:27:34Z","published":"2024-02-21T17:13:41Z","title":"Technical Report on the Pangram AI-Generated Text Classifier","summary":" We present Pangram Text, a transformer-based neural network trained to\ndistinguish text written by large language models from text written by humans.\nPangram Text outperforms zero-shot methods such as DetectGPT as well as leading\ncommercial AI detection tools with over 38 times lower error rates on a\ncomprehensive benchmark comprised of 10 text domains (student writing, creative\nwriting, scientific writing, books, encyclopedias, news, email, scientific\npapers, short-form Q&A) and 8 open- and closed-source large language models. We\npropose a training algorithm, hard negative mining with synthetic mirrors, that\nenables our classifier to achieve orders of magnitude lower false positive\nrates on high-data domains such as reviews. Finally, we show that Pangram Text\nis not biased against nonnative English speakers and generalizes to domains and\nmodels unseen during training.\n","authors":["Bradley Emi","Max Spero"],"pdf_url":"https://arxiv.org/pdf/2402.14873v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18752v2","updated":"2024-07-29T08:27:33Z","published":"2024-07-26T14:07:00Z","title":"Knowledge Graph Structure as Prompt: Improving Small Language Models\n Capabilities for Knowledge-based Causal Discovery","summary":" Causal discovery aims to estimate causal structures among variables based on\nobservational data. Large Language Models (LLMs) offer a fresh perspective to\ntackle the causal discovery problem by reasoning on the metadata associated\nwith variables rather than their actual data values, an approach referred to as\nknowledge-based causal discovery. 
In this paper, we investigate the\ncapabilities of Small Language Models (SLMs, defined as LLMs with fewer than 1\nbillion parameters) with prompt-based learning for knowledge-based causal\ndiscovery. Specifically, we present KG Structure as Prompt, a novel approach\nfor integrating structural information from a knowledge graph, such as common\nneighbor nodes and metapaths, into prompt-based learning to enhance the\ncapabilities of SLMs. Experimental results on three types of biomedical and\nopen-domain datasets under few-shot settings demonstrate the effectiveness of\nour approach, surpassing most baselines and even conventional fine-tuning\napproaches trained on full datasets. Our findings further highlight the strong\ncapabilities of SLMs: in combination with knowledge graphs and prompt-based\nlearning, SLMs demonstrate the potential to surpass LLMs with larger number of\nparameters. Our code and datasets are available on GitHub.\n","authors":["Yuni Susanti","Michael Färber"],"pdf_url":"https://arxiv.org/pdf/2407.18752v2.pdf","comment":"accepted at ISWC'24"},{"id":"http://arxiv.org/abs/2407.19779v1","updated":"2024-07-29T08:21:42Z","published":"2024-07-29T08:21:42Z","title":"Synthesizing Scientific Summaries: An Extractive and Abstractive\n Approach","summary":" The availability of a vast array of research papers in any area of study,\nnecessitates the need of automated summarisation systems that can present the\nkey research conducted and their corresponding findings. Scientific paper\nsummarisation is a challenging task for various reasons including token length\nlimits in modern transformer models and corresponding memory and compute\nrequirements for long text. A significant amount of work has been conducted in\nthis area, with approaches that modify the attention mechanisms of existing\ntransformer models and others that utilise discourse information to capture\nlong range dependencies in research papers. In this paper, we propose a hybrid\nmethodology for research paper summarisation which incorporates an extractive\nand abstractive approach. We use the extractive approach to capture the key\nfindings of research, and pair it with the introduction of the paper which\ncaptures the motivation for research. We use two models based on unsupervised\nlearning for the extraction stage and two transformer language models,\nresulting in four combinations for our hybrid approach. The performances of the\nmodels are evaluated on three metrics and we present our findings in this\npaper. We find that using certain combinations of hyper parameters, it is\npossible for automated summarisation systems to exceed the abstractiveness of\nsummaries written by humans. Finally, we state our future scope of research in\nextending this methodology to summarisation of generalised long documents.\n","authors":["Grishma Sharma","Aditi Paretkar","Deepak Sharma"],"pdf_url":"https://arxiv.org/pdf/2407.19779v1.pdf","comment":"the paper consists of 10 pages , 5 figures and 4 tables"},{"id":"http://arxiv.org/abs/2407.19775v1","updated":"2024-07-29T08:18:48Z","published":"2024-07-29T08:18:48Z","title":"Model Agnostic Hybrid Sharding For Heterogeneous Distributed Inference","summary":" The rapid growth of large-scale AI models, particularly large language models\nhas brought significant challenges in data privacy, computational resources,\nand accessibility. Traditional centralized architectures often struggle to meet\nrequired data security and scalability needs which hinders the democratization\nof AI systems. 
Nesa introduces a model-agnostic sharding framework designed for\ndecentralized AI inference. Our framework uses blockchain-based sequential deep\nneural network sharding to distribute computational tasks across a diverse\nnetwork of nodes based on a personalised heuristic and routing mechanism. This\nenables efficient distributed training and inference for recent large-scale\nmodels even on consumer-grade hardware. We use compression techniques like\ndynamic blockwise quantization and mixed matrix decomposition to reduce data\ntransfer and memory needs. We also integrate robust security measures,\nincluding hardware-based trusted execution environments to ensure data\nintegrity and confidentiality. Evaluating our system across various natural\nlanguage processing and vision tasks shows that these compression strategies do\nnot compromise model accuracy. Our results highlight the potential to\ndemocratize access to cutting-edge AI technologies by enabling secure and\nefficient inference on a decentralized network.\n","authors":["Claudio Angione","Yue Zhao","Harry Yang","Ahmad Farhan","Fielding Johnston","James Buban","Patrick Colangelo"],"pdf_url":"https://arxiv.org/pdf/2407.19775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19760v1","updated":"2024-07-29T07:51:43Z","published":"2024-07-29T07:51:43Z","title":"Legal Minds, Algorithmic Decisions: How LLMs Apply Constitutional\n Principles in Complex Scenarios","summary":" In this paper, we conduct an empirical analysis of how large language models\n(LLMs), specifically GPT-4, interpret constitutional principles in complex\ndecision-making scenarios. We examine rulings from the Italian Constitutional\nCourt on bioethics issues that involve trade-offs between competing values and\ncompare model-generated legal arguments on these issues to those presented by\nthe State, the Court, and the applicants. Our results indicate that GPT-4\nconsistently aligns more closely with progressive interpretations of the\nConstitution, often overlooking competing values and mirroring the applicants'\nviews rather than the more conservative perspectives of the State or the\nCourt's moderate positions. Our experiments reveal a distinct tendency of GPT-4\nto favor progressive legal interpretations, underscoring the influence of\nunderlying data biases. We thus underscore the importance of testing alignment\nin real-world scenarios and considering the implications of deploying LLMs in\ndecision-making processes.\n","authors":["Camilla Bignotti","Carolina Camassa"],"pdf_url":"https://arxiv.org/pdf/2407.19760v1.pdf","comment":"Accepted at AIES24"},{"id":"http://arxiv.org/abs/2407.19740v1","updated":"2024-07-29T07:07:37Z","published":"2024-07-29T07:07:37Z","title":"KNOWCOMP POKEMON Team at DialAM-2024: A Two-Stage Pipeline for Detecting\n Relations in Dialogical Argument Mining","summary":" Dialogical Argument Mining(DialAM) is an important branch of Argument\nMining(AM). DialAM-2024 is a shared task focusing on dialogical argument\nmining, which requires us to identify argumentative relations and illocutionary\nrelations among proposition nodes and locution nodes. To accomplish this, we\npropose a two-stage pipeline, which includes the Two-Step S-Node Prediction\nModel in Stage 1 and the YA-Node Prediction Model in Stage 2. We also augment\nthe training data in both stages and introduce context in Stage 2. We\nsuccessfully completed the task and achieved good results. 
Our team Pokemon\nranked 1st in the ARI Focused score and 4th in the Global Focused score.\n","authors":["Zihao Zheng","Zhaowei Wang","Qing Zong","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2407.19740v1.pdf","comment":"Published on the 11th Workshop on Argument Mining"},{"id":"http://arxiv.org/abs/2407.19726v1","updated":"2024-07-29T06:13:28Z","published":"2024-07-29T06:13:28Z","title":"Do Text-to-Vis Benchmarks Test Real Use of Visualisations?","summary":" Large language models are able to generate code for visualisations in\nresponse to user requests. This is a useful application, and an appealing one\nfor NLP research because plots of data provide grounding for language. However,\nthere are relatively few benchmarks, and it is unknown whether those that exist\nare representative of what people do in practice. This paper aims to answer\nthat question through an empirical study comparing benchmark datasets and code\nfrom public repositories. Our findings reveal a substantial gap in datasets,\nwith evaluations not testing the same distribution of chart types, attributes,\nand the number of actions. The only representative dataset requires\nmodification to become an end-to-end and practical benchmark. This shows that\nnew, more benchmarks are needed to support the development of systems that\ntruly address users' visualisation needs. These observations will guide future\ndata creation, highlighting which features hold genuine significance for users.\n","authors":["Hy Nguyen","Xuefei He","Andrew Reeson","Cecile Paris","Josiah Poon","Jonathan K. Kummerfeld"],"pdf_url":"https://arxiv.org/pdf/2407.19726v1.pdf","comment":"ARR AE score of 4"},{"id":"http://arxiv.org/abs/2309.16701v3","updated":"2024-07-29T06:03:24Z","published":"2023-08-15T17:38:55Z","title":"MVMR: A New Framework for Evaluating Faithfulness of Video Moment\n Retrieval against Multiple Distractors","summary":" With the explosion of multimedia content, video moment retrieval (VMR), which\naims to detect a video moment that matches a given text query from a video, has\nbeen studied intensively as a critical problem. However, the existing VMR\nframework evaluates video moment retrieval performance, assuming that a video\nis given, which may not reveal whether the models exhibit overconfidence in the\nfalsely given video. In this paper, we propose the MVMR (Massive Videos Moment\nRetrieval for Faithfulness Evaluation) task that aims to retrieve video moments\nwithin a massive video set, including multiple distractors, to evaluate the\nfaithfulness of VMR models. For this task, we suggest an automated massive\nvideo pool construction framework to categorize negative (distractors) and\npositive (false-negative) video sets using textual and visual semantic distance\nverification methods. We extend existing VMR datasets using these methods and\nnewly construct three practical MVMR datasets. To solve the task, we further\npropose a strong informative sample-weighted learning method, CroCs, which\nemploys two contrastive learning mechanisms: (1) weakly-supervised potential\nnegative learning and (2) cross-directional hard-negative learning.\nExperimental results on the MVMR datasets reveal that existing VMR models are\neasily distracted by the misinformation (distractors), whereas our model shows\nsignificantly robust performance, demonstrating that CroCs is essential to\ndistinguishing positive moments against distractors. 
Our code and datasets are\npublicly available: https://github.com/yny0506/Massive-Videos-Moment-Retrieval.\n","authors":["Nakyeong Yang","Minsung Kim","Seunghyun Yoon","Joongbo Shin","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2309.16701v3.pdf","comment":"accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2407.19705v1","updated":"2024-07-29T05:00:48Z","published":"2024-07-29T05:00:48Z","title":"CollectiveSFT: Scaling Large Language Models for Chinese Medical\n Benchmark with Collective Instructions in Healthcare","summary":" The rapid progress in Large Language Models (LLMs) has prompted the creation\nof numerous benchmarks to evaluate their capabilities.This study focuses on the\nComprehensive Medical Benchmark in Chinese (CMB), showcasing how dataset\ndiversity and distribution in supervised fine-tuning (SFT) may enhance LLM\nperformance.Remarkably, We successfully trained a smaller base model to achieve\nscores comparable to larger models, indicating that a diverse and\nwell-distributed dataset can optimize performance regardless of model size.This\nstudy suggests that even smaller models may reach high performance levels with\ncarefully curated and varied datasets.By integrating a wide range of\ninstructional content, our approach addresses potential issues such as data\nquality inconsistencies. Our results imply that a broader spectrum of training\ndata may enhance a model's ability to generalize and perform effectively across\ndifferent medical scenarios, highlighting the importance of dataset quality and\ndiversity in fine-tuning processes.\n","authors":["Jingwei Zhu","Minghuan Tan","Min Yang","Ruixue Li","Hamid Alinejad-Rokny"],"pdf_url":"https://arxiv.org/pdf/2407.19705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19687v1","updated":"2024-07-29T04:10:13Z","published":"2024-07-29T04:10:13Z","title":"Efficiently and Effectively: A Two-stage Approach to Balance Plaintext\n and Encrypted Text for Traffic Classification","summary":" Encrypted traffic classification is the task of identifying the application\nor service associated with encrypted network traffic. One effective approach\nfor this task is to use deep learning methods to encode the raw traffic bytes\ndirectly and automatically extract features for classification (byte-based\nmodels). However, current byte-based models input raw traffic bytes, whether\nplaintext or encrypted text, for automated feature extraction, neglecting the\ndistinct impacts of plaintext and encrypted text on downstream tasks.\nAdditionally, these models primarily focus on improving classification\naccuracy, with little emphasis on the efficiency of models. In this paper, for\nthe first time, we analyze the impact of plaintext and encrypted text on the\nmodel's effectiveness and efficiency. Based on our observations and findings,\nwe propose a two-phase approach to balance the trade-off between plaintext and\nencrypted text in traffic classification. Specifically, Stage one is to\nDetermine whether the Plain text is enough to be accurately Classified (DPC)\nusing the proposed DPC Selector. This stage quickly identifies samples that can\nbe classified using plaintext, leveraging explicit byte features in plaintext\nto enhance model's efficiency. Stage two aims to adaptively make a\nclassification with the result from stage one. This stage incorporates\nencrypted text information for samples that cannot be classified using\nplaintext alone, ensuring the model's effectiveness on traffic classification\ntasks. 
Experiments on two datasets demonstrate that our proposed model achieves\nstate-of-the-art results in both effectiveness and efficiency.\n","authors":["Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2407.19687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05908v2","updated":"2024-07-29T04:03:22Z","published":"2023-07-12T04:28:41Z","title":"Predictive Pipelined Decoding: A Compute-Latency Trade-off for Exact LLM\n Decoding","summary":" This paper presents \"Predictive Pipelined Decoding (PPD),\" an approach that\nspeeds up greedy decoding in Large Language Models (LLMs) while maintaining the\nexact same output as the original decoding. Unlike conventional strategies, PPD\nemploys additional compute resources to parallelize the initiation of\nsubsequent token decoding during the current token decoding. This method\nreduces decoding latency and reshapes the understanding of trade-offs in LLM\ndecoding strategies. We have developed a theoretical framework that allows us\nto analyze the trade-off between computation and latency. Using this framework,\nwe can analytically estimate the potential reduction in latency associated with\nour proposed method, achieved through the assessment of the match rate,\nrepresented as p_correct. The results demonstrate that the use of extra\ncomputational resources has the potential to accelerate LLM decoding.\nAdditionally, we implement PPD and conduct preliminary experiments to\nempirically validate its efficacy, addressing potential practical overheads not\ncovered by theoretical analysis.\n","authors":["Seongjun Yang","Gibbeum Lee","Jaewoong Cho","Dimitris Papailiopoulos","Kangwook Lee"],"pdf_url":"https://arxiv.org/pdf/2307.05908v2.pdf","comment":"ES-FoMo Workshop at ICML 2023 / Published in TMLR"},{"id":"http://arxiv.org/abs/2305.11527v4","updated":"2024-07-29T03:41:34Z","published":"2023-05-19T08:51:11Z","title":"InstructIE: A Bilingual Instruction-based Information Extraction Dataset","summary":" Large language models can perform well on general natural language tasks, but\ntheir effectiveness is still suboptimal for information extraction (IE). Recent\nworks indicate that the main reason lies in the lack of extensive data on IE\ninstructions. Note that the existing datasets on IE instructions not only have\nlimited coverage but also involve high construction costs. To address this\nissue, we introduce InstructIE, a bilingual instruction-based IE dataset, which\ncovers 12 diverse domains. We propose KG2Instruction, a framework specifically\nfor the automatic generation of such datasets. Additionally, we manually\nannotate the test set. Experimental results demonstrate that large language\nmodels trained with InstructIE can not only obtain better IE capabilities but\nalso enhance zero-shot performance compared with baselines.\n","authors":["Honghao Gui","Shuofei Qiao","Jintian Zhang","Hongbin Ye","Mengshu Sun","Lei Liang","Jeff Z. 
Pan","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.11527v4.pdf","comment":"ISWC 2024; project homepage:\n https://www.zjukg.org/project/InstructIE/ dataset:\n https://huggingface.co/datasets/zjunlp/InstructIE"},{"id":"http://arxiv.org/abs/2407.19672v1","updated":"2024-07-29T03:26:22Z","published":"2024-07-29T03:26:22Z","title":"SeaLLMs 3: Open Foundation and Chat Multilingual Large Language Models\n for Southeast Asian Languages","summary":" Large Language Models (LLMs) have shown remarkable abilities across various\ntasks, yet their development has predominantly centered on high-resource\nlanguages like English and Chinese, leaving low-resource languages underserved.\nTo address this disparity, we present SeaLLMs 3, the latest iteration of the\nSeaLLMs model family, tailored for Southeast Asian languages. This region,\ncharacterized by its rich linguistic diversity, has lacked adequate language\ntechnology support. SeaLLMs 3 aims to bridge this gap by covering a\ncomprehensive range of languages spoken in this region, including English,\nChinese, Indonesian, Vietnamese, Thai, Tagalog, Malay, Burmese, Khmer, Lao,\nTamil, and Javanese. Leveraging efficient language enhancement techniques and a\nspecially constructed instruction tuning dataset, SeaLLMs 3 significantly\nreduces training costs while maintaining high performance and versatility. Our\nmodel excels in tasks such as world knowledge, mathematical reasoning,\ntranslation, and instruction following, achieving state-of-the-art performance\namong similarly sized models. Additionally, we prioritized safety and\nreliability by addressing both general and culture-specific considerations and\nincorporated mechanisms to reduce hallucinations. This work underscores the\nimportance of inclusive AI, showing that advanced LLM capabilities can benefit\nunderserved linguistic and cultural communities.\n","authors":["Wenxuan Zhang","Hou Pong Chan","Yiran Zhao","Mahani Aljunied","Jianyu Wang","Chaoqun Liu","Yue Deng","Zhiqiang Hu","Weiwen Xu","Yew Ken Chia","Xin Li","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2407.19672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18483v2","updated":"2024-07-29T03:16:13Z","published":"2024-07-26T03:23:31Z","title":"A Role-specific Guided Large Language Model for Ophthalmic Consultation\n Based on Stylistic Differentiation","summary":" Ophthalmology consultations are crucial for diagnosing, treating, and\npreventing eye diseases. However, the growing demand for consultations exceeds\nthe availability of ophthalmologists. By leveraging large pre-trained language\nmodels, we can design effective dialogues for specific scenarios, aiding in\nconsultations. Traditional fine-tuning strategies for question-answering tasks\nare impractical due to increasing model size and often ignoring patient-doctor\nrole function during consultations. In this paper, we propose EyeDoctor, an\nophthalmic medical questioning large language model that enhances accuracy\nthrough doctor-patient role perception guided and an augmented knowledge base\nwith external disease information. Experimental results show EyeDoctor achieves\nhigher question-answering precision in ophthalmology consultations. Notably,\nEyeDoctor demonstrated a 7.25% improvement in Rouge-1 scores and a 10.16%\nimprovement in F1 scores on multi-round datasets compared to second best model\nChatGPT, highlighting the importance of doctor-patient role differentiation and\ndynamic knowledge base expansion for intelligent medical consultations. 
EyeDoc\nalso serves as a free available web based service and souce code is available\nat https://github.com/sperfu/EyeDoc.\n","authors":["Laiyi Fu","Binbin Fan","Hongkai Du","Yanxiang Feng","Chunhua Li","Huping Song"],"pdf_url":"https://arxiv.org/pdf/2407.18483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19670v1","updated":"2024-07-29T03:14:57Z","published":"2024-07-29T03:14:57Z","title":"Overview of PerpectiveArg2024: The First Shared Task on Perspective\n Argument Retrieval","summary":" Argument retrieval is the task of finding relevant arguments for a given\nquery. While existing approaches rely solely on the semantic alignment of\nqueries and arguments, this first shared task on perspective argument retrieval\nincorporates perspectives during retrieval, accounting for latent influences in\nargumentation. We present a novel multilingual dataset covering demographic and\nsocio-cultural (socio) variables, such as age, gender, and political attitude,\nrepresenting minority and majority groups in society. We distinguish between\nthree scenarios to explore how retrieval systems consider explicitly (in both\nquery and corpus) and implicitly (only in query) formulated perspectives. This\npaper provides an overview of this shared task and summarizes the results of\nthe six submitted systems. We find substantial challenges in incorporating\nperspectivism, especially when aiming for personalization based solely on the\ntext of arguments without explicitly providing socio profiles. Moreover,\nretrieval systems tend to be biased towards the majority group but partially\nmitigate bias for the female gender. While we bootstrap perspective argument\nretrieval, further research is essential to optimize retrieval systems to\nfacilitate personalization and reduce polarization.\n","authors":["Neele Falk","Andreas Waldis","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2407.19670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19669v1","updated":"2024-07-29T03:12:28Z","published":"2024-07-29T03:12:28Z","title":"mGTE: Generalized Long-Context Text Representation and Reranking Models\n for Multilingual Text Retrieval","summary":" We present systematic efforts in building long-context multilingual text\nrepresentation model (TRM) and reranker from scratch for text retrieval. We\nfirst introduce a text encoder (base size) enhanced with RoPE and unpadding,\npre-trained in a native 8192-token context (longer than 512 of previous\nmultilingual encoders). Then we construct a hybrid TRM and a cross-encoder\nreranker by contrastive learning. Evaluations show that our text encoder\noutperforms the same-sized previous state-of-the-art XLM-R. Meanwhile, our TRM\nand reranker match the performance of large-sized state-of-the-art BGE-M3\nmodels and achieve better results on long-context retrieval benchmarks. Further\nanalysis demonstrate that our proposed models exhibit higher efficiency during\nboth training and inference. 
We believe their efficiency and effectiveness\ncould benefit various researches and industrial applications.\n","authors":["Xin Zhang","Yanzhao Zhang","Dingkun Long","Wen Xie","Ziqi Dai","Jialong Tang","Huan Lin","Baosong Yang","Pengjun Xie","Fei Huang","Meishan Zhang","Wenjie Li","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.19669v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.18416v2","updated":"2024-07-29T02:30:35Z","published":"2024-07-25T22:24:45Z","title":"PersonaGym: Evaluating Persona Agents and LLMs","summary":" Persona agents, which are LLM agents that act according to an assigned\npersona, have demonstrated impressive contextual response capabilities across\nvarious applications. These persona agents offer significant enhancements\nacross diverse sectors, such as education, healthcare, and entertainment, where\nmodel developers can align agent responses to different user requirements\nthereby broadening the scope of agent applications. However, evaluating persona\nagent performance is incredibly challenging due to the complexity of assessing\npersona adherence in free-form interactions across various environments that\nare relevant to each persona agent. We introduce PersonaGym, the first dynamic\nevaluation framework for assessing persona agents, and PersonaScore, the first\nautomated human-aligned metric grounded in decision theory for comprehensive\nlarge-scale evaluation of persona agents. Our evaluation of 6 open and\nclosed-source LLMs, using a benchmark encompassing 200 personas and 10,000\nquestions, reveals significant opportunities for advancement in persona agent\ncapabilities across state-of-the-art models. For example, Claude 3.5 Sonnet\nonly has a 2.97% relative improvement in PersonaScore than GPT 3.5 despite\nbeing a much more advanced model. Importantly, we find that increased model\nsize and complexity do not necessarily imply enhanced persona agent\ncapabilities thereby highlighting the pressing need for algorithmic and\narchitectural invention towards faithful and performant persona agents.\n","authors":["Vinay Samuel","Henry Peng Zou","Yue Zhou","Shreyas Chaudhari","Ashwin Kalyan","Tanmay Rajpurohit","Ameet Deshpande","Karthik Narasimhan","Vishvak Murahari"],"pdf_url":"https://arxiv.org/pdf/2407.18416v2.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.19638v1","updated":"2024-07-29T01:45:05Z","published":"2024-07-29T01:45:05Z","title":"From Pre-training Corpora to Large Language Models: What Factors\n Influence LLM Performance in Causal Discovery Tasks?","summary":" Recent advances in artificial intelligence have seen Large Language Models\n(LLMs) demonstrate notable proficiency in causal discovery tasks. This study\nexplores the factors influencing the performance of LLMs in causal discovery\ntasks. Utilizing open-source LLMs, we examine how the frequency of causal\nrelations within their pre-training corpora affects their ability to accurately\nrespond to causal discovery queries. Our findings reveal that a higher\nfrequency of causal mentions correlates with better model performance,\nsuggesting that extensive exposure to causal information during training\nenhances the models' causal discovery capabilities. Additionally, we\ninvestigate the impact of context on the validity of causal relations. Our\nresults indicate that LLMs might exhibit divergent predictions for identical\ncausal relations when presented in different contexts. 
This paper provides the\nfirst comprehensive analysis of how different factors contribute to LLM\nperformance in causal discovery tasks.\n","authors":["Tao Feng","Lizhen Qu","Niket Tandon","Zhuang Li","Xiaoxi Kang","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2407.19638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14309v2","updated":"2024-07-29T01:19:12Z","published":"2024-07-19T13:42:56Z","title":"How to Engage Your Readers? Generating Guiding Questions to Promote\n Active Reading","summary":" Using questions in written text is an effective strategy to enhance\nreadability. However, what makes an active reading question good, what the\nlinguistic role of these questions is, and what is their impact on human\nreading remains understudied. We introduce GuidingQ, a dataset of 10K in-text\nquestions from textbooks and scientific articles. By analyzing the dataset, we\npresent a comprehensive understanding of the use, distribution, and linguistic\ncharacteristics of these questions. Then, we explore various approaches to\ngenerate such questions using language models. Our results highlight the\nimportance of capturing inter-question relationships and the challenge of\nquestion position identification in generating these questions. Finally, we\nconduct a human study to understand the implication of such questions on\nreading comprehension. We find that the generated questions are of high quality\nand are almost as effective as human-written questions in terms of improving\nreaders' memorization and comprehension.\n","authors":["Peng Cui","Vilém Zouhar","Xiaoyu Zhang","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2407.14309v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.19625v1","updated":"2024-07-29T01:06:45Z","published":"2024-07-29T01:06:45Z","title":"LoginMEA: Local-to-Global Interaction Network for Multi-modal Entity\n Alignment","summary":" Multi-modal entity alignment (MMEA) aims to identify equivalent entities\nbetween two multi-modal knowledge graphs (MMKGs), whose entities can be\nassociated with relational triples and related images. Most previous studies\ntreat the graph structure as a special modality, and fuse different modality\ninformation with separate uni-modal encoders, neglecting valuable relational\nassociations in modalities. Other studies refine each uni-modal information\nwith graph structures, but may introduce unnecessary relations in specific\nmodalities. To this end, we propose a novel local-to-global interaction network\nfor MMEA, termed as LoginMEA. Particularly, we first fuse local multi-modal\ninteractions to generate holistic entity semantics and then refine them with\nglobal relational interactions of entity neighbors. In this design, the\nuni-modal information is fused adaptively, and can be refined with relations\naccordingly. To enrich local interactions of multi-modal entity information, we\ndevice modality weights and low-rank interactive fusion, allowing diverse\nimpacts and element-level interactions among modalities. 
To capture global\ninteractions of graph structures, we adopt relation reflection graph attention\nnetworks, which fully capture relational associations between entities.\nExtensive experiments demonstrate superior results of our method over 5\ncross-KG or bilingual benchmark datasets, indicating the effectiveness of\ncapturing local and global interactions.\n","authors":["Taoyu Su","Xinghua Zhang","Jiawei Sheng","Zhenyu Zhang","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19625v1.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2407.19616v1","updated":"2024-07-29T00:18:17Z","published":"2024-07-29T00:18:17Z","title":"TopicTag: Automatic Annotation of NMF Topic Models Using Chain of\n Thought and Prompt Tuning with LLMs","summary":" Topic modeling is a technique for organizing and extracting themes from large\ncollections of unstructured text. Non-negative matrix factorization (NMF) is a\ncommon unsupervised approach that decomposes a term frequency-inverse document\nfrequency (TF-IDF) matrix to uncover latent topics and segment the dataset\naccordingly. While useful for highlighting patterns and clustering documents,\nNMF does not provide explicit topic labels, necessitating subject matter\nexperts (SMEs) to assign labels manually. We present a methodology for\nautomating topic labeling in documents clustered via NMF with automatic model\ndetermination (NMFk). By leveraging the output of NMFk and employing prompt\nengineering, we utilize large language models (LLMs) to generate accurate topic\nlabels. Our case study on over 34,000 scientific abstracts on Knowledge Graphs\ndemonstrates the effectiveness of our method in enhancing knowledge management\nand document organization.\n","authors":["Selma Wanna","Ryan Barron","Nick Solovyev","Maksim E. Eren","Manish Bhattarai","Kim Rasmussen","Boian S. Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2407.19616v1.pdf","comment":"Accepted to ACM Symposium on Document Engineering 2024 (DocEng 24),\n 2024"},{"id":"http://arxiv.org/abs/2407.20454v1","updated":"2024-07-29T23:18:55Z","published":"2024-07-29T23:18:55Z","title":"CoMMIT: Coordinated Instruction Tuning for Multimodal Large Language\n Models","summary":" Instruction tuning in multimodal large language models (MLLMs) aims to\nsmoothly integrate a backbone LLM with a pre-trained feature encoder for\ndownstream tasks. The major challenge is how to efficiently find the synergy\nthrough cooperative learning where LLMs adapt their reasoning abilities in\ndownstream tasks while feature encoders adjust their encoding to provide more\nrelevant modal information. In this paper, we analyze the MLLM instruction\ntuning from both theoretical and empirical perspectives, where we find\nunbalanced learning between the two components, i.e., the feature encoder and\nthe LLM, can cause diminishing learning gradients that slow the model\nconvergence and often lead to sub-optimal results due to insufficient learning.\nInspired by our findings, we propose a measurement to quantitatively evaluate\nthe learning balance, based on which we further design a dynamic learning\nscheduler that better coordinates the learning. In addition, we introduce an\nauxiliary loss regularization method to promote updating of the generation\ndistribution of MLLMs considering the learning state of each model component,\nwhich potentially prevents each component from gradient diminishing and enables\na more accurate estimation of the learning balance coefficient. 
We conduct\nexperiments with multiple LLM backbones and feature encoders, where our\ntechniques are model-agnostic and can be generically integrated with various\nMLLM backbones. Experiment results on multiple downstream tasks and modalities\nin vision and audio, demonstrate the proposed method's better efficiency and\neffectiveness in MLLM instruction tuning.\n","authors":["Junda Wu","Xintong Li","Tong Yu","Yu Wang","Xiang Chen","Jiuxiang Gu","Lina Yao","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2407.20454v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2305.18290v3","updated":"2024-07-29T22:26:36Z","published":"2023-05-29T17:57:46Z","title":"Direct Preference Optimization: Your Language Model is Secretly a Reward\n Model","summary":" While large-scale unsupervised language models (LMs) learn broad world\nknowledge and some reasoning skills, achieving precise control of their\nbehavior is difficult due to the completely unsupervised nature of their\ntraining. Existing methods for gaining such steerability collect human labels\nof the relative quality of model generations and fine-tune the unsupervised LM\nto align with these preferences, often with reinforcement learning from human\nfeedback (RLHF). However, RLHF is a complex and often unstable procedure, first\nfitting a reward model that reflects the human preferences, and then\nfine-tuning the large unsupervised LM using reinforcement learning to maximize\nthis estimated reward without drifting too far from the original model. In this\npaper we introduce a new parameterization of the reward model in RLHF that\nenables extraction of the corresponding optimal policy in closed form, allowing\nus to solve the standard RLHF problem with only a simple classification loss.\nThe resulting algorithm, which we call Direct Preference Optimization (DPO), is\nstable, performant, and computationally lightweight, eliminating the need for\nsampling from the LM during fine-tuning or performing significant\nhyperparameter tuning. Our experiments show that DPO can fine-tune LMs to align\nwith human preferences as well as or better than existing methods. Notably,\nfine-tuning with DPO exceeds PPO-based RLHF in ability to control sentiment of\ngenerations, and matches or improves response quality in summarization and\nsingle-turn dialogue while being substantially simpler to implement and train.\n","authors":["Rafael Rafailov","Archit Sharma","Eric Mitchell","Stefano Ermon","Christopher D. Manning","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2305.18290v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20438v1","updated":"2024-07-29T22:10:51Z","published":"2024-07-29T22:10:51Z","title":"Generating Gender Alternatives in Machine Translation","summary":" Machine translation (MT) systems often translate terms with ambiguous gender\n(e.g., English term \"the nurse\") into the gendered form that is most prevalent\nin the systems' training data (e.g., \"enfermera\", the Spanish term for a female\nnurse). This often reflects and perpetuates harmful stereotypes present in\nsociety. With MT user interfaces in mind that allow for resolving gender\nambiguity in a frictionless manner, we study the problem of generating all\ngrammatically correct gendered translation alternatives. We open source train\nand test datasets for five language pairs and establish benchmarks for this\ntask. 
Our key technical contribution is a novel semi-supervised solution for\ngenerating alternatives that integrates seamlessly with standard MT models and\nmaintains high performance without requiring additional components or\nincreasing inference overhead.\n","authors":["Sarthak Garg","Mozhdeh Gheini","Clara Emmanuel","Tatiana Likhomanenko","Qin Gao","Matthias Paulik"],"pdf_url":"https://arxiv.org/pdf/2407.20438v1.pdf","comment":"GeBNLP 2024"},{"id":"http://arxiv.org/abs/2407.20413v1","updated":"2024-07-29T20:52:26Z","published":"2024-07-29T20:52:26Z","title":"Through the Looking Glass, and what Horn Clause Programs Found There","summary":" Dual Horn clauses mirror key properties of Horn clauses. This paper explores\nthe ``other side of the looking glass'' to reveal some expected and unexpected\nsymmetries and their practical uses.\n We revisit Dual Horn clauses as enablers of a form of constructive negation\nthat supports goal-driven forward reasoning and is valid both\nintuitionistically and classically. In particular, we explore the ability to\nfalsify a counterfactual hypothesis in the context of a background theory\nexpressed as a Dual Horn clause program.\n With Dual Horn clause programs, by contrast to negation as failure, the\nvariable bindings in their computed answers provide explanations for the\nreasons why a statement is successfully falsified. Moreover, in the\npropositional case, by contrast to negation as failure as implemented with\nstable models semantics in ASP systems, and similarly to Horn clause programs,\nDual Horn clause programs have polynomial complexity.\n After specifying their execution model with a metainterpreter, we devise a\ncompilation scheme from Dual Horn clause programs to Horn clause programs,\nensuring their execution with no performance penalty and we design the embedded\nSymLP language to support combined Horn clause and Dual Horn clause programs.\n As a (motivating) application, we cast LLM reasoning chains into\npropositional Horn and Dual Horn clauses that work together to constructively\nprove and disprove goals and enhance Generative AI with explainability of\nreasoning chains.\n","authors":["Paul Tarau"],"pdf_url":"https://arxiv.org/pdf/2407.20413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06979v3","updated":"2024-07-29T20:51:25Z","published":"2023-09-13T14:15:03Z","title":"Auto-Regressive Next-Token Predictors are Universal Learners","summary":" Large language models display remarkable capabilities in logical and\nmathematical reasoning, allowing them to solve complex tasks. Interestingly,\nthese abilities emerge in networks trained on the simple task of next-token\nprediction. In this work, we present a theoretical framework for studying\nauto-regressive next-token predictors. We demonstrate that even simple models\nsuch as linear next-token predictors, trained on Chain-of-Thought (CoT) data,\ncan approximate any function efficiently computed by a Turing machine. We\nintroduce a new complexity measure -- length complexity -- which measures the\nnumber of intermediate tokens in a CoT sequence required to approximate some\ntarget function, and analyze the interplay between length complexity and other\nnotions of complexity. Finally, we show experimentally that simple next-token\npredictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs),\ndisplay non-trivial performance on text generation and arithmetic tasks. 
Our\nresults demonstrate that the power of today's LLMs can be attributed, to a\ngreat extent, to the auto-regressive next-token training scheme, and not\nnecessarily to a particular choice of architecture.\n","authors":["Eran Malach"],"pdf_url":"https://arxiv.org/pdf/2309.06979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02418v2","updated":"2024-07-29T20:29:32Z","published":"2024-04-03T02:56:52Z","title":"Auxiliary task demands mask the capabilities of smaller language models","summary":" Developmental psychologists have argued about when cognitive capacities such\nas language understanding or theory of mind emerge. These debates often hinge\non the concept of \"task demands\" -- the auxiliary challenges associated with\nperforming a particular evaluation -- that may mask the child's underlying\nability. The same issues arise when measuring the capacities of language models\n(LMs): performance on a task is a function of the model's underlying knowledge,\ncombined with the model's ability to interpret and perform the task given its\navailable resources. Here, we show that for analogical reasoning, reflective\nreasoning, word prediction, and grammaticality judgments, evaluation methods\nwith greater task demands yield lower performance than evaluations with reduced\ndemands. This \"demand gap\" is most pronounced for models with fewer parameters\nand less training data. Our results illustrate that LM performance should not\nbe interpreted as a direct indication of intelligence (or lack thereof), but as\na reflection of capacities seen through the lens of researchers' design\nchoices.\n","authors":["Jennifer Hu","Michael C. Frank"],"pdf_url":"https://arxiv.org/pdf/2404.02418v2.pdf","comment":"Published at the 1st Conference on Language Modeling (COLM 2024)"},{"id":"http://arxiv.org/abs/2312.04474v4","updated":"2024-07-29T20:21:37Z","published":"2023-12-07T17:51:43Z","title":"Chain of Code: Reasoning with a Language Model-Augmented Code Emulator","summary":" Code provides a general syntactic structure to build complex programs and\nperform precise computations when paired with a code interpreter - we\nhypothesize that language models (LMs) can leverage code-writing to improve\nChain of Thought reasoning not only for logic and arithmetic tasks, but also\nfor semantic ones (and in particular, those that are a mix of both). For\nexample, consider prompting an LM to write code that counts the number of times\nit detects sarcasm in an essay: the LM may struggle to write an implementation\nfor \"detect_sarcasm(string)\" that can be executed by the interpreter (handling\nthe edge cases would be insurmountable). However, LMs may still produce a valid\nsolution if they not only write code, but also selectively \"emulate\" the\ninterpreter by generating the expected output of \"detect_sarcasm(string)\". In\nthis work, we propose Chain of Code (CoC), a simple yet surprisingly effective\nextension that improves LM code-driven reasoning. The key idea is to encourage\nLMs to format semantic sub-tasks in a program as flexible pseudocode that the\ninterpreter can explicitly catch undefined behaviors and hand off to simulate\nwith an LM (as an \"LMulator\"). Experiments demonstrate that Chain of Code\noutperforms Chain of Thought and other baselines across a variety of\nbenchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of 12% over\nChain of Thought. 
In a nutshell, CoC broadens the scope of reasoning questions\nthat LMs can answer by \"thinking in code\".\n","authors":["Chengshu Li","Jacky Liang","Andy Zeng","Xinyun Chen","Karol Hausman","Dorsa Sadigh","Sergey Levine","Li Fei-Fei","Fei Xia","Brian Ichter"],"pdf_url":"https://arxiv.org/pdf/2312.04474v4.pdf","comment":"ICML 2024 Oral; Project webpage: https://chain-of-code.github.io"},{"id":"http://arxiv.org/abs/2407.11277v2","updated":"2024-07-29T19:35:36Z","published":"2024-07-15T22:55:27Z","title":"Target conversation extraction: Source separation using turn-taking\n dynamics","summary":" Extracting the speech of participants in a conversation amidst interfering\nspeakers and noise presents a challenging problem. In this paper, we introduce\nthe novel task of target conversation extraction, where the goal is to extract\nthe audio of a target conversation based on the speaker embedding of one of its\nparticipants. To accomplish this, we propose leveraging temporal patterns\ninherent in human conversations, particularly turn-taking dynamics, which\nuniquely characterize speakers engaged in conversation and distinguish them\nfrom interfering speakers and noise. Using neural networks, we show the\nfeasibility of our approach on English and Mandarin conversation datasets. In\nthe presence of interfering speakers, our results show an 8.19 dB improvement\nin signal-to-noise ratio for 2-speaker conversations and a 7.92 dB improvement\nfor 2-4-speaker conversations. Code, dataset available at\nhttps://github.com/chentuochao/Target-Conversation-Extraction.\n","authors":["Tuochao Chen","Qirui Wang","Bohan Wu","Malek Itani","Sefik Emre Eskimez","Takuya Yoshioka","Shyamnath Gollakota"],"pdf_url":"https://arxiv.org/pdf/2407.11277v2.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.20382v1","updated":"2024-07-29T19:12:18Z","published":"2024-07-29T19:12:18Z","title":"What if Red Can Talk? Dynamic Dialogue Generation Using Large Language\n Models","summary":" Role-playing games (RPGs) provide players with a rich, interactive world to\nexplore. Dialogue serves as the primary means of communication between\ndevelopers and players, manifesting in various forms such as guides, NPC\ninteractions, and storytelling. While most games rely on written scripts to\ndefine the main story and character personalities, player immersion can be\nsignificantly enhanced through casual interactions between characters. With the\nadvent of large language models (LLMs), we introduce a dialogue filler\nframework that utilizes LLMs enhanced by knowledge graphs to generate dynamic\nand contextually appropriate character interactions. We test this framework\nwithin the environments of Final Fantasy VII Remake and Pokemon, providing\nqualitative and quantitative evidence that demonstrates GPT-4's capability to\nact with defined personalities and generate dialogue. However, some flaws\nremain, such as GPT-4 being overly positive or more subtle personalities, such\nas maturity, tend to be of lower quality compared to more overt traits like\ntimidity. 
This study aims to assist developers in crafting more nuanced filler\ndialogues, thereby enriching player immersion and enhancing the overall RPG\nexperience.\n","authors":["Navapat Nananukul","Wichayaporn Wongkamjan"],"pdf_url":"https://arxiv.org/pdf/2407.20382v1.pdf","comment":"ACL Wordplay 2024"},{"id":"http://arxiv.org/abs/2407.20371v1","updated":"2024-07-29T18:42:39Z","published":"2024-07-29T18:42:39Z","title":"Gender, Race, and Intersectional Bias in Resume Screening via Language\n Model Retrieval","summary":" Artificial intelligence (AI) hiring tools have revolutionized resume\nscreening, and large language models (LLMs) have the potential to do the same.\nHowever, given the biases which are embedded within LLMs, it is unclear whether\nthey can be used in this scenario without disadvantaging groups based on their\nprotected attributes. In this work, we investigate the possibilities of using\nLLMs in a resume screening setting via a document retrieval framework that\nsimulates job candidate selection. Using that framework, we then perform a\nresume audit study to determine whether a selection of Massive Text Embedding\n(MTE) models are biased in resume screening scenarios. We simulate this for\nnine occupations, using a collection of over 500 publicly available resumes and\n500 job descriptions. We find that the MTEs are biased, significantly favoring\nWhite-associated names in 85.1\\% of cases and female-associated names in only\n11.1\\% of cases, with a minority of cases showing no statistically significant\ndifferences. Further analyses show that Black males are disadvantaged in up to\n100\\% of cases, replicating real-world patterns of bias in employment settings,\nand validate three hypotheses of intersectionality. We also find an impact of\ndocument length as well as the corpus frequency of names in the selection of\nresumes. These findings have implications for widely used AI tools that are\nautomating employment, fairness, and tech policy.\n","authors":["Kyra Wilson","Aylin Caliskan"],"pdf_url":"https://arxiv.org/pdf/2407.20371v1.pdf","comment":"To be published in Proceedings of the 2024 AAAI/ACM Conference on AI,\n Ethics, and Society; code available at\n https://github.com/kyrawilson/Resume-Screening-Bias"},{"id":"http://arxiv.org/abs/2406.00045v2","updated":"2024-07-29T18:19:35Z","published":"2024-05-28T05:10:40Z","title":"Personalized Steering of Large Language Models: Versatile Steering\n Vectors Through Bi-directional Preference Optimization","summary":" Researchers have been studying approaches to steer the behavior of Large\nLanguage Models (LLMs) and build personalized LLMs tailored for various\napplications. While fine-tuning seems to be a direct solution, it requires\nsubstantial computational resources and may significantly affect the utility of\nthe original LLM. Recent endeavors have introduced more lightweight strategies,\nfocusing on extracting \"steering vectors\" to guide the model's output toward\ndesired behaviors by adjusting activations within specific layers of the LLM's\ntransformer architecture. However, such steering vectors are directly extracted\nfrom the activations of human preference data and thus often lead to suboptimal\nresults and occasional failures, especially in alignment-related scenarios.\nThis work proposes an innovative approach that could produce more effective\nsteering vectors through bi-directional preference optimization. 
Our method is\ndesigned to allow steering vectors to directly influence the generation\nprobability of contrastive human preference data pairs, thereby offering a more\nprecise representation of the target behavior. By carefully adjusting the\ndirection and magnitude of the steering vector, we enabled personalized control\nover the desired behavior across a spectrum of intensities. Extensive\nexperimentation across various open-ended generation tasks, particularly\nfocusing on steering AI personas, has validated the efficacy of our approach.\nMoreover, we comprehensively investigate critical alignment-concerning\nscenarios, such as managing truthfulness, mitigating hallucination, and\naddressing jailbreaking attacks. Remarkably, our method can still demonstrate\noutstanding steering effectiveness across these scenarios. Furthermore, we\nshowcase the transferability of our steering vectors across different\nmodels/LoRAs and highlight the synergistic benefits of applying multiple\nvectors simultaneously.\n","authors":["Yuanpu Cao","Tianrong Zhang","Bochuan Cao","Ziyi Yin","Lu Lin","Fenglong Ma","Jinghui Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20341v1","updated":"2024-07-29T18:00:17Z","published":"2024-07-29T18:00:17Z","title":"BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger\n Visual Cues","summary":" Effectively aligning with human judgment when evaluating machine-generated\nimage captions represents a complex yet intriguing challenge. Existing\nevaluation metrics like CIDEr or CLIP-Score fall short in this regard as they\ndo not take into account the corresponding image or lack the capability of\nencoding fine-grained details and penalizing hallucinations. To overcome these\nissues, in this paper, we propose BRIDGE, a new learnable and reference-free\nimage captioning metric that employs a novel module to map visual features into\ndense vectors and integrates them into multi-modal pseudo-captions which are\nbuilt during the evaluation process. This approach results in a multimodal\nmetric that properly incorporates information from the input image without\nrelying on reference captions, bridging the gap between human judgment and\nmachine-generated image captions. Experiments spanning several datasets\ndemonstrate that our proposal achieves state-of-the-art results compared to\nexisting reference-free evaluation scores. Our source code and trained models\nare publicly available at: https://github.com/aimagelab/bridge-score.\n","authors":["Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2407.20341v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20311v1","updated":"2024-07-29T17:52:40Z","published":"2024-07-29T17:52:40Z","title":"Physics of Language Models: Part 2.1, Grade-School Math and the Hidden\n Reasoning Process","summary":" Recent advances in language models have demonstrated their capability to\nsolve mathematical reasoning problems, achieving near-perfect accuracy on\ngrade-school level math benchmarks like GSM8K. In this paper, we formally study\nhow language models solve these problems. We design a series of controlled\nexperiments to address several fundamental questions: (1) Can language models\ntruly develop reasoning skills, or do they simply memorize templates? (2) What\nis the model's hidden (mental) reasoning process? (3) Do models solve math\nquestions using skills similar to or different from humans? 
(4) Do models\ntrained on GSM8K-like datasets develop reasoning skills beyond those necessary\nfor solving GSM8K problems? (5) What mental process causes models to make\nreasoning mistakes? (6) How large or deep must a model be to effectively solve\nGSM8K-level math questions?\n Our study uncovers many hidden mechanisms by which language models solve\nmathematical questions, providing insights that extend beyond current\nunderstandings of LLMs.\n","authors":["Tian Ye","Zicheng Xu","Yuanzhi Li","Zeyuan Allen-Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.20311v1.pdf","comment":"video appeared in ICML 2024 tutorial"},{"id":"http://arxiv.org/abs/2407.21077v1","updated":"2024-07-29T20:42:59Z","published":"2024-07-29T20:42:59Z","title":"Genetic Instruct: Scaling up Synthetic Generation of Coding Instructions\n for Large Language Models","summary":" Large Language Models (LLMs) rely on instruction samples for alignment, but\ncreating these datasets poses challenges, particularly in expert-dependent\ntasks like coding, which can be cost-prohibitive. One approach to mitigate\nthese challenges is synthesizing data using another LLM. In this paper, we\nintroduce a scalable method for generating synthetic instructions to enhance\nthe code generation capability of LLMs. The proposed algorithm,\nGenetic-Instruct, mimics evolutionary processes, utilizing self-instruction to\ncreate numerous synthetic samples from a limited number of seeds.\nGenetic-Instruct is designed for efficient scaling of the generation process.\nFine-tuning multiple coding LLMs with the synthetic samples demonstrates a\nsignificant improvement in their code generation accuracy compared to the\nbaselines.\n","authors":["Somshubra Majumdar","Vahid Noroozi","Sean Narenthiran","Aleksander Ficek","Jagadeesh Balam","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2407.21077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21075v1","updated":"2024-07-29T18:38:49Z","published":"2024-07-29T18:38:49Z","title":"Apple Intelligence Foundation Language Models","summary":" We present foundation language models developed to power Apple Intelligence\nfeatures, including a ~3 billion parameter model designed to run efficiently on\ndevices and a large server-based language model designed for Private Cloud\nCompute. These models are designed to perform a wide range of tasks\nefficiently, accurately, and responsibly. This report describes the model\narchitecture, the data used to train the model, the training process, how the\nmodels are optimized for inference, and the evaluation results. 
We highlight\nour focus on Responsible AI and how the principles are applied throughout the\nmodel development.\n","authors":["Tom Gunter","Zirui Wang","Chong Wang","Ruoming Pang","Andy Narayanan","Aonan Zhang","Bowen Zhang","Chen Chen","Chung-Cheng Chiu","David Qiu","Deepak Gopinath","Dian Ang Yap","Dong Yin","Feng Nan","Floris Weers","Guoli Yin","Haoshuo Huang","Jianyu Wang","Jiarui Lu","John Peebles","Ke Ye","Mark Lee","Nan Du","Qibin Chen","Quentin Keunebroek","Sam Wiseman","Syd Evans","Tao Lei","Vivek Rathod","Xiang Kong","Xianzhi Du","Yanghao Li","Yongqiang Wang","Yuan Gao","Zaid Ahmed","Zhaoyang Xu","Zhiyun Lu","Al Rashid","Albin Madappally Jose","Alec Doane","Alfredo Bencomo","Allison Vanderby","Andrew Hansen","Ankur Jain","Anupama Mann Anupama","Areeba Kamal","Bugu Wu","Carolina Brum","Charlie Maalouf","Chinguun Erdenebileg","Chris Dulhanty","Dominik Moritz","Doug Kang","Eduardo Jimenez","Evan Ladd","Fangping Shi","Felix Bai","Frank Chu","Fred Hohman","Hadas Kotek","Hannah Gillis Coleman","Jane Li","Jeffrey Bigham","Jeffery Cao","Jeff Lai","Jessica Cheung","Jiulong Shan","Joe Zhou","John Li","Jun Qin","Karanjeet Singh","Karla Vega","Kelvin Zou","Laura Heckman","Lauren Gardiner","Margit Bowler","Maria Cordell","Meng Cao","Nicole Hay","Nilesh Shahdadpuri","Otto Godwin","Pranay Dighe","Pushyami Rachapudi","Ramsey Tantawi","Roman Frigg","Sam Davarnia","Sanskruti Shah","Saptarshi Guha","Sasha Sirovica","Shen Ma","Shuang Ma","Simon Wang","Sulgi Kim","Suma Jayaram","Vaishaal Shankar","Varsha Paidi","Vivek Kumar","Xin Wang","Xin Zheng","Walker Cheng","Yael Shrager","Yang Ye","Yasu Tanaka","Yihao Guo","Yunsong Meng","Zhao Tang Luo","Zhi Ouyang","Alp Aygar","Alvin Wan","Andrew Walkingshaw","Andy Narayanan","Antonie Lin","Arsalan Farooq","Brent Ramerth","Colorado Reed","Chris Bartels","Chris Chaney","David Riazati","Eric Liang Yang","Erin Feldman","Gabriel Hochstrasser","Guillaume Seguin","Irina Belousova","Joris Pelemans","Karen Yang","Keivan Alizadeh Vahid","Liangliang Cao","Mahyar Najibi","Marco Zuliani","Max Horton","Minsik Cho","Nikhil Bhendawade","Patrick Dong","Piotr Maj","Pulkit Agrawal","Qi Shan","Qichen Fu","Regan Poston","Sam Xu","Shuangning Liu","Sushma Rao","Tashweena Heeramun","Thomas Merth","Uday Rayala","Victor Cui","Vivek Rangarajan Sridhar","Wencong Zhang","Wenqi Zhang","Wentao Wu","Xingyu Zhou","Xinwen Liu","Yang Zhao","Yin Xia","Zhile Ren","Zhongzheng Ren"],"pdf_url":"https://arxiv.org/pdf/2407.21075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20076v1","updated":"2024-07-29T15:02:51Z","published":"2024-07-29T15:02:51Z","title":"Investigating the Impact of Semi-Supervised Methods with Data\n Augmentation on Offensive Language Detection in Romanian Language","summary":" Offensive language detection is a crucial task in today's digital landscape,\nwhere online platforms grapple with maintaining a respectful and inclusive\nenvironment. However, building robust offensive language detection models\nrequires large amounts of labeled data, which can be expensive and\ntime-consuming to obtain. Semi-supervised learning offers a feasible solution\nby utilizing labeled and unlabeled data to create more accurate and robust\nmodels. In this paper, we explore a few different semi-supervised methods, as\nwell as data augmentation techniques. Concretely, we implemented eight\nsemi-supervised methods and ran experiments for them using only the available\ndata in the RO-Offense dataset and applying five augmentation techniques before\nfeeding the data to the models. 
Experimental results demonstrate that some of\nthem benefit more from augmentations than others.\n","authors":["Elena-Beatrice Nicola","Dumitru-Clementin Cercel","Florin Pop"],"pdf_url":"https://arxiv.org/pdf/2407.20076v1.pdf","comment":"Accepted at KES 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.20232v1","updated":"2024-07-29T17:59:57Z","published":"2024-07-29T17:59:57Z","title":"Specify and Edit: Overcoming Ambiguity in Text-Based Image Editing","summary":" Text-based editing diffusion models exhibit limited performance when the\nuser's input instruction is ambiguous. To solve this problem, we propose\n$\\textit{Specify ANd Edit}$ (SANE), a zero-shot inference pipeline for\ndiffusion-based editing systems. We use a large language model (LLM) to\ndecompose the input instruction into specific instructions, i.e. well-defined\ninterventions to apply to the input image to satisfy the user's request. We\nbenefit from the LLM-derived instructions along the original one, thanks to a\nnovel denoising guidance strategy specifically designed for the task. Our\nexperiments with three baselines and on two datasets demonstrate the benefits\nof SANE in all setups. Moreover, our pipeline improves the interpretability of\nediting models, and boosts the output diversity. We also demonstrate that our\napproach can be applied to any edit, whether ambiguous or not. Our code is\npublic at https://github.com/fabvio/SANE.\n","authors":["Ekaterina Iakovleva","Fabio Pizzati","Philip Torr","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2407.20232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20230v1","updated":"2024-07-29T17:59:50Z","published":"2024-07-29T17:59:50Z","title":"SAPG: Split and Aggregate Policy Gradients","summary":" Despite extreme sample inefficiency, on-policy reinforcement learning, aka\npolicy gradients, has become a fundamental tool in decision-making problems.\nWith the recent advances in GPU-driven simulation, the ability to collect large\namounts of data for RL training has scaled exponentially. However, we show that\ncurrent RL methods, e.g. PPO, fail to ingest the benefit of parallelized\nenvironments beyond a certain point and their performance saturates. To address\nthis, we propose a new on-policy RL algorithm that can effectively leverage\nlarge-scale environments by splitting them into chunks and fusing them back\ntogether via importance sampling. Our algorithm, termed SAPG, shows\nsignificantly higher performance across a variety of challenging environments\nwhere vanilla PPO and other strong baselines fail to achieve high performance.\nWebsite at https://sapg-rl.github.io/\n","authors":["Jayesh Singla","Ananye Agarwal","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2407.20230v1.pdf","comment":"In ICML 2024 (Oral). Website at https://sapg-rl.github.io/"},{"id":"http://arxiv.org/abs/2405.17430v2","updated":"2024-07-29T17:59:28Z","published":"2024-05-27T17:59:56Z","title":"Matryoshka Multimodal Models","summary":" Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in\nvisual-linguistic reasoning. These models first embed images into a fixed large\nnumber of visual tokens and then feed them into a Large Language Model (LLM).\nHowever, this design causes an excessive number of tokens for dense visual\nscenarios such as high-resolution images and videos, leading to great\ninefficiency. 
While token pruning/merging methods do exist, they produce a\nsingle length output for each image and do not afford flexibility in trading\noff information density v.s. efficiency. Inspired by the concept of Matryoshka\nDolls, we propose M3: Matryoshka Multimodal Models, which learns to represent\nvisual content as nested sets of visual tokens that capture information across\nmultiple coarse-to-fine granularities. Our approach offers several unique\nbenefits for LMMs: (1) One can explicitly control the visual granularity per\ntest instance during inference, e.g. , adjusting the number of tokens used to\nrepresent an image based on the anticipated complexity or simplicity of the\ncontent; (2) M3 provides a framework for analyzing the granularity needed for\nexisting datasets, where we find that COCO-style benchmarks only need around ~9\nvisual tokens to obtain accuracy similar to that of using all 576 tokens; (3)\nOur approach provides a foundation to explore the best trade-off between\nperformance and visual token length at sample level, where our investigation\nreveals that a large gap exists between the oracle upper bound and current\nfixed-scale representations.\n","authors":["Mu Cai","Jianwei Yang","Jianfeng Gao","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2405.17430v2.pdf","comment":"Project Page: https://matryoshka-mm.github.io/"},{"id":"http://arxiv.org/abs/2407.20229v1","updated":"2024-07-29T17:59:21Z","published":"2024-07-29T17:59:21Z","title":"Improving 2D Feature Representations by 3D-Aware Fine-Tuning","summary":" Current visual foundation models are trained purely on unstructured 2D data,\nlimiting their understanding of 3D structure of objects and scenes. In this\nwork, we show that fine-tuning on 3D-aware data improves the quality of\nemerging semantic features. We design a method to lift semantic 2D features\ninto an efficient 3D Gaussian representation, which allows us to re-render them\nfor arbitrary views. Using the rendered 3D-aware features, we design a\nfine-tuning strategy to transfer such 3D awareness into a 2D foundation model.\nWe demonstrate that models fine-tuned in that way produce features that readily\nimprove downstream task performance in semantic segmentation and depth\nestimation through simple linear probing. Notably, though fined-tuned on a\nsingle indoor dataset, the improvement is transferable to a variety of indoor\ndatasets and out-of-domain datasets. We hope our study encourages the community\nto consider injecting 3D awareness when training 2D foundation models. Project\npage: https://ywyue.github.io/FiT3D.\n","authors":["Yuanwen Yue","Anurag Das","Francis Engelmann","Siyu Tang","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2407.20229v1.pdf","comment":"ECCV 2024. Project page: https://ywyue.github.io/FiT3D"},{"id":"http://arxiv.org/abs/2407.20228v1","updated":"2024-07-29T17:59:05Z","published":"2024-07-29T17:59:05Z","title":"FlexAttention for Efficient High-Resolution Vision-Language Models","summary":" Current high-resolution vision-language models encode images as\nhigh-resolution image tokens and exhaustively take all these tokens to compute\nattention, which significantly increases the computational cost. To address\nthis problem, we propose FlexAttention, a flexible attention mechanism for\nefficient high-resolution vision-language models. 
Specifically, a\nhigh-resolution image is encoded both as high-resolution tokens and\nlow-resolution tokens, where only the low-resolution tokens and a few selected\nhigh-resolution tokens are utilized to calculate the attention map, which\ngreatly shrinks the computational cost. The high-resolution tokens are selected\nvia a high-resolution selection module which could retrieve tokens of relevant\nregions based on an input attention map. The selected high-resolution tokens\nare then concatenated to the low-resolution tokens and text tokens, and input\nto a hierarchical self-attention layer which produces an attention map that\ncould be used for the next-step high-resolution token selection. The\nhierarchical self-attention process and high-resolution token selection process\nare performed iteratively for each attention layer. Experiments on multimodal\nbenchmarks prove that our FlexAttention outperforms existing high-resolution\nVLMs (e.g., relatively ~9% in V* Bench, ~7% in TextVQA), while also\nsignificantly reducing the computational cost by nearly 40%.\n","authors":["Junyan Li","Delin Chen","Tianle Cai","Peihao Chen","Yining Hong","Zhenfang Chen","Yikang Shen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2407.20228v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20223v1","updated":"2024-07-29T17:57:38Z","published":"2024-07-29T17:57:38Z","title":"Correspondence-Free SE(3) Point Cloud Registration in RKHS via\n Unsupervised Equivariant Learning","summary":" This paper introduces a robust unsupervised SE(3) point cloud registration\nmethod that operates without requiring point correspondences. The method frames\npoint clouds as functions in a reproducing kernel Hilbert space (RKHS),\nleveraging SE(3)-equivariant features for direct feature space registration. A\nnovel RKHS distance metric is proposed, offering reliable performance amidst\nnoise, outliers, and asymmetrical data. An unsupervised training approach is\nintroduced to effectively handle limited ground truth data, facilitating\nadaptation to real datasets. The proposed method outperforms classical and\nsupervised methods in terms of registration accuracy on both synthetic\n(ModelNet40) and real-world (ETH3D) noisy, outlier-rich datasets. To our best\nknowledge, this marks the first instance of successful real RGB-D odometry data\nregistration using an equivariant method. The code is available at\n{https://sites.google.com/view/eccv24-equivalign}\n","authors":["Ray Zhang","Zheming Zhou","Min Sun","Omid Ghasemalizadeh","Cheng-Hao Kuo","Ryan Eustice","Maani Ghaffari","Arnie Sen"],"pdf_url":"https://arxiv.org/pdf/2407.20223v1.pdf","comment":"10 pages, to be published in ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20219v1","updated":"2024-07-29T17:54:24Z","published":"2024-07-29T17:54:24Z","title":"Global Structure-from-Motion Revisited","summary":" Recovering 3D structure and camera motion from images has been a\nlong-standing focus of computer vision research and is known as\nStructure-from-Motion (SfM). Solutions to this problem are categorized into\nincremental and global approaches. Until now, the most popular systems follow\nthe incremental paradigm due to its superior accuracy and robustness, while\nglobal approaches are drastically more scalable and efficient. With this work,\nwe revisit the problem of global SfM and propose GLOMAP as a new\ngeneral-purpose system that outperforms the state of the art in global SfM. 
In\nterms of accuracy and robustness, we achieve results on-par or superior to\nCOLMAP, the most widely used incremental SfM, while being orders of magnitude\nfaster. We share our system as an open-source implementation at\n{https://github.com/colmap/glomap}.\n","authors":["Linfei Pan","Dániel Baráth","Marc Pollefeys","Johannes L. Schönberger"],"pdf_url":"https://arxiv.org/pdf/2407.20219v1.pdf","comment":"accepted at ECCV2024"},{"id":"http://arxiv.org/abs/2407.20214v1","updated":"2024-07-29T17:44:34Z","published":"2024-07-29T17:44:34Z","title":"SANGRIA: Surgical Video Scene Graph Optimization for Surgical Workflow\n Prediction","summary":" Graph-based holistic scene representations facilitate surgical workflow\nunderstanding and have recently demonstrated significant success. However, this\ntask is often hindered by the limited availability of densely annotated\nsurgical scene data. In this work, we introduce an end-to-end framework for the\ngeneration and optimization of surgical scene graphs on a downstream task. Our\napproach leverages the flexibility of graph-based spectral clustering and the\ngeneralization capability of foundation models to generate unsupervised scene\ngraphs with learnable properties. We reinforce the initial spatial graph with\nsparse temporal connections using local matches between consecutive frames to\npredict temporally consistent clusters across a temporal neighborhood. By\njointly optimizing the spatiotemporal relations and node features of the\ndynamic scene graph with the downstream task of phase segmentation, we address\nthe costly and annotation-burdensome task of semantic scene comprehension and\nscene graph generation in surgical videos using only weak surgical phase\nlabels. Further, by incorporating effective intermediate scene representation\ndisentanglement steps within the pipeline, our solution outperforms the SOTA on\nthe CATARACTS dataset by 8% accuracy and 10% F1 score in surgical workflow\nrecognition\n","authors":["Çağhan Köksal","Ghazal Ghazaei","Felix Holm","Azade Farshad","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2407.20214v1.pdf","comment":"9 pages, 3 figures, 3 tables, MICCAI GRAIL Workshop paper"},{"id":"http://arxiv.org/abs/2407.20213v1","updated":"2024-07-29T17:42:45Z","published":"2024-07-29T17:42:45Z","title":"Registering Neural 4D Gaussians for Endoscopic Surgery","summary":" The recent advance in neural rendering has enabled the ability to reconstruct\nhigh-quality 4D scenes using neural networks. Although 4D neural reconstruction\nis popular, registration for such representations remains a challenging task,\nespecially for dynamic scene registration in surgical planning and simulation.\nIn this paper, we propose a novel strategy for dynamic surgical neural scene\nregistration. We first utilize 4D Gaussian Splatting to represent the surgical\nscene and capture both static and dynamic scenes effectively. Then, a spatial\naware feature aggregation method, Spatially Weight Cluttering (SWC) is proposed\nto accurately align the feature between surgical scenes, enabling precise and\nrealistic surgical simulations. Lastly, we present a novel strategy of\ndeformable scene registration to register two dynamic scenes. By incorporating\nboth spatial and temporal information for correspondence matching, our approach\nachieves superior performance compared to existing registration methods for\nimplicit neural representation. 
The proposed method has the potential to\nimprove surgical planning and training, ultimately leading to better patient\noutcomes.\n","authors":["Yiming Huang","Beilei Cui","Ikemura Kei","Jiekai Zhang","Long Bai","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2407.20213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20198v1","updated":"2024-07-29T17:24:52Z","published":"2024-07-29T17:24:52Z","title":"SpaER: Learning Spatio-temporal Equivariant Representations for Fetal\n Brain Motion Tracking","summary":" In this paper, we introduce SpaER, a pioneering method for fetal motion\ntracking that leverages equivariant filters and self-attention mechanisms to\neffectively learn spatio-temporal representations. Different from conventional\napproaches that statically estimate fetal brain motions from pairs of images,\nour method dynamically tracks the rigid movement patterns of the fetal head\nacross temporal and spatial dimensions. Specifically, we first develop an\nequivariant neural network that efficiently learns rigid motion sequences\nthrough low-dimensional spatial representations of images. Subsequently, we\nlearn spatio-temporal representations by incorporating time encoding and\nself-attention neural network layers. This approach allows for the capture of\nlong-term dependencies of fetal brain motion and addresses alignment errors due\nto contrast changes and severe motion artifacts. Our model also provides a\ngeometric deformation estimation that properly addresses image distortions\namong all time frames. To the best of our knowledge, our approach is the first\nto learn spatial-temporal representations via deep neural networks for fetal\nmotion tracking without data augmentation. We validated our model using real\nfetal echo-planar images with simulated and real motions. Our method carries\nsignificant potential value in accurately measuring, tracking, and correcting\nfetal motion in fetal MRI sequences.\n","authors":["Jian Wang","Razieh Faghihpirayesh","Polina Golland","Ali Ghoulipour"],"pdf_url":"https://arxiv.org/pdf/2407.20198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20179v1","updated":"2024-07-29T17:08:21Z","published":"2024-07-29T17:08:21Z","title":"Theia: Distilling Diverse Vision Foundation Models for Robot Learning","summary":" Vision-based robot policy learning, which maps visual inputs to actions,\nnecessitates a holistic understanding of diverse visual tasks beyond\nsingle-task needs like classification or segmentation. Inspired by this, we\nintroduce Theia, a vision foundation model for robot learning that distills\nmultiple off-the-shelf vision foundation models trained on varied vision tasks.\nTheia's rich visual representations encode diverse visual knowledge, enhancing\ndownstream robot learning. Extensive experiments demonstrate that Theia\noutperforms its teacher models and prior robot learning models using less\ntraining data and smaller model sizes. Additionally, we quantify the quality of\npre-trained visual representations and hypothesize that higher entropy in\nfeature norm distributions leads to improved robot learning performance. Code\nand models are available at https://github.com/bdaiinstitute/theia.\n","authors":["Jinghuan Shang","Karl Schmeckpeper","Brandon B. 
May","Maria Vittoria Minniti","Tarik Kelestemur","David Watkins","Laura Herlant"],"pdf_url":"https://arxiv.org/pdf/2407.20179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20174v1","updated":"2024-07-29T17:04:34Z","published":"2024-07-29T17:04:34Z","title":"Advancing Multimodal Large Language Models in Chart Question Answering\n with Visualization-Referenced Instruction Tuning","summary":" Emerging multimodal large language models (MLLMs) exhibit great potential for\nchart question answering (CQA). Recent efforts primarily focus on scaling up\ntraining datasets (i.e., charts, data tables, and question-answer (QA) pairs)\nthrough data collection and synthesis. However, our empirical study on existing\nMLLMs and CQA datasets reveals notable gaps. First, current data collection and\nsynthesis focus on data volume and lack consideration of fine-grained visual\nencodings and QA tasks, resulting in unbalanced data distribution divergent\nfrom practical CQA scenarios. Second, existing work follows the training recipe\nof the base MLLMs initially designed for natural images, under-exploring the\nadaptation to unique chart characteristics, such as rich text elements. To fill\nthe gap, we propose a visualization-referenced instruction tuning approach to\nguide the training dataset enhancement and model development. Specifically, we\npropose a novel data engine to effectively filter diverse and high-quality data\nfrom existing datasets and subsequently refine and augment the data using\nLLM-based generation techniques to better align with practical QA tasks and\nvisual encodings. Then, to facilitate the adaptation to chart characteristics,\nwe utilize the enriched data to train an MLLM by unfreezing the vision encoder\nand incorporating a mixture-of-resolution adaptation strategy for enhanced\nfine-grained recognition. Experimental results validate the effectiveness of\nour approach. Even with fewer training examples, our model consistently\noutperforms state-of-the-art CQA models on established benchmarks. We also\ncontribute a dataset split as a benchmark for future research. Source codes and\ndatasets of this paper are available at\nhttps://github.com/zengxingchen/ChartQA-MLLM.\n","authors":["Xingchen Zeng","Haichuan Lin","Yilin Ye","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2407.20174v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.20172v1","updated":"2024-07-29T17:00:32Z","published":"2024-07-29T17:00:32Z","title":"LatentArtiFusion: An Effective and Efficient Histological Artifacts\n Restoration Framework","summary":" Histological artifacts pose challenges for both pathologists and\nComputer-Aided Diagnosis (CAD) systems, leading to errors in analysis. Current\napproaches for histological artifact restoration, based on Generative\nAdversarial Networks (GANs) and pixel-level Diffusion Models, suffer from\nperformance limitations and computational inefficiencies. In this paper, we\npropose a novel framework, LatentArtiFusion, which leverages the latent\ndiffusion model (LDM) to reconstruct histological artifacts with high\nperformance and computational efficiency. Unlike traditional pixel-level\ndiffusion frameworks, LatentArtiFusion executes the restoration process in a\nlower-dimensional latent space, significantly improving computational\nefficiency. Moreover, we introduce a novel regional artifact reconstruction\nalgorithm in latent space to prevent mistransfer in non-artifact regions,\ndistinguishing our approach from GAN-based methods. 
Through extensive\nexperiments on real-world histology datasets, LatentArtiFusion demonstrates\nremarkable speed, outperforming state-of-the-art pixel-level diffusion\nframeworks by more than 30X. It also consistently surpasses GAN-based methods\nby at least 5% across multiple evaluation metrics. Furthermore, we evaluate the\neffectiveness of our proposed framework in downstream tissue classification\ntasks, showcasing its practical utility. Code is available at\nhttps://github.com/bugs-creator/LatentArtiFusion.\n","authors":["Zhenqi He","Wenrui Liu","Minghao Yin","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2407.20172v1.pdf","comment":"Accept to DGM4MICCAI2024"},{"id":"http://arxiv.org/abs/2407.20171v1","updated":"2024-07-29T17:00:09Z","published":"2024-07-29T17:00:09Z","title":"Diffusion Feedback Helps CLIP See Better","summary":" Contrastive Language-Image Pre-training (CLIP), which excels at abstracting\nopen-world representations across domains and modalities, has become a\nfoundation for a variety of vision and multimodal tasks. However, recent\nstudies reveal that CLIP has severe visual shortcomings, such as which can\nhardly distinguish orientation, quantity, color, structure, etc. These visual\nshortcomings also limit the perception capabilities of multimodal large\nlanguage models (MLLMs) built on CLIP. The main reason could be that the\nimage-text pairs used to train CLIP are inherently biased, due to the lack of\nthe distinctiveness of the text and the diversity of images. In this work, we\npresent a simple post-training approach for CLIP models, which largely\novercomes its visual shortcomings via a self-supervised diffusion process. We\nintroduce DIVA, which uses the DIffusion model as a Visual Assistant for CLIP.\nSpecifically, DIVA leverages generative feedback from text-to-image diffusion\nmodels to optimize CLIP representations, with only images (without\ncorresponding text). We demonstrate that DIVA improves CLIP's performance on\nthe challenging MMVP-VLM benchmark which assesses fine-grained visual abilities\nto a large extent (e.g., 3-7%), and enhances the performance of MLLMs and\nvision models on multimodal understanding and segmentation tasks. Extensive\nevaluation on 29 image classification and retrieval benchmarks confirms that\nour framework preserves CLIP's strong zero-shot capabilities. The code will be\navailable at https://github.com/baaivision/DIVA.\n","authors":["Wenxuan Wang","Quan Sun","Fan Zhang","Yepeng Tang","Jing Liu","Xinlong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10779v3","updated":"2024-07-29T16:28:41Z","published":"2022-06-22T00:10:06Z","title":"Not Just Streaks: Towards Ground Truth for Single Image Deraining","summary":" We propose a large-scale dataset of real-world rainy and clean image pairs\nand a method to remove degradations, induced by rain streaks and rain\naccumulation, from the image. As there exists no real-world dataset for\nderaining, current state-of-the-art methods rely on synthetic data and thus are\nlimited by the sim2real domain gap; moreover, rigorous evaluation remains a\nchallenge due to the absence of a real paired dataset. We fill this gap by\ncollecting a real paired deraining dataset through meticulous control of\nnon-rain variations. Our dataset enables paired training and quantitative\nevaluation for diverse real-world rain phenomena (e.g. rain streaks and rain\naccumulation). 
To learn a representation robust to rain phenomena, we propose a\ndeep neural network that reconstructs the underlying scene by minimizing a\nrain-robust loss between rainy and clean images. Extensive experiments\ndemonstrate that our model outperforms the state-of-the-art deraining methods\non real rainy images under various conditions. Project website:\nhttps://visual.ee.ucla.edu/gt_rain.htm/.\n","authors":["Yunhao Ba","Howard Zhang","Ethan Yang","Akira Suzuki","Arnold Pfahnl","Chethan Chinder Chandrappa","Celso de Melo","Suya You","Stefano Soatto","Alex Wong","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2206.10779v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20141v1","updated":"2024-07-29T16:11:21Z","published":"2024-07-29T16:11:21Z","title":"DDAP: Dual-Domain Anti-Personalization against Text-to-Image Diffusion\n Models","summary":" Diffusion-based personalized visual content generation technologies have\nachieved significant breakthroughs, allowing for the creation of specific\nobjects by just learning from a few reference photos. However, when misused to\nfabricate fake news or unsettling content targeting individuals, these\ntechnologies could cause considerable societal harm. To address this problem,\ncurrent methods generate adversarial samples by adversarially maximizing the\ntraining loss, thereby disrupting the output of any personalized generation\nmodel trained with these samples. However, the existing methods fail to achieve\neffective defense and maintain stealthiness, as they overlook the intrinsic\nproperties of diffusion models. In this paper, we introduce a novel Dual-Domain\nAnti-Personalization framework (DDAP). Specifically, we have developed Spatial\nPerturbation Learning (SPL) by exploiting the fixed and perturbation-sensitive\nnature of the image encoder in personalized generation. Subsequently, we have\ndesigned a Frequency Perturbation Learning (FPL) method that utilizes the\ncharacteristics of diffusion models in the frequency domain. The SPL disrupts\nthe overall texture of the generated images, while the FPL focuses on image\ndetails. By alternating between these two methods, we construct the DDAP\nframework, effectively harnessing the strengths of both domains. To further\nenhance the visual quality of the adversarial samples, we design a localization\nmodule to accurately capture attentive areas while ensuring the effectiveness\nof the attack and avoiding unnecessary disturbances in the background.\nExtensive experiments on facial benchmarks have shown that the proposed DDAP\nenhances the disruption of personalized generation models while also\nmaintaining high quality in adversarial samples, making it more effective in\nprotecting privacy in practical applications.\n","authors":["Jing Yang","Runping Xi","Yingxin Lai","Xun Lin","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2407.20141v1.pdf","comment":"Accepted by IJCB 2024"},{"id":"http://arxiv.org/abs/2405.05953v3","updated":"2024-07-29T15:57:27Z","published":"2024-05-09T17:46:22Z","title":"Frame Interpolation with Consecutive Brownian Bridge Diffusion","summary":" Recent work in Video Frame Interpolation (VFI) tries to formulate VFI as a\ndiffusion-based conditional image generation problem, synthesizing the\nintermediate frame given a random noise and neighboring frames. 
Due to the\nrelatively high resolution of videos, Latent Diffusion Models (LDMs) are\nemployed as the conditional generation model, where the autoencoder compresses\nimages into latent representations for diffusion and then reconstructs images\nfrom these latent representations. Such a formulation poses a crucial\nchallenge: VFI expects that the output is deterministically equal to the ground\ntruth intermediate frame, but LDMs randomly generate a diverse set of different\nimages when the model runs multiple times. The reason for the diverse\ngeneration is that the cumulative variance (variance accumulated at each step\nof generation) of generated latent representations in LDMs is large. This makes\nthe sampling trajectory random, resulting in diverse rather than deterministic\ngenerations. To address this problem, we propose our unique solution: Frame\nInterpolation with Consecutive Brownian Bridge Diffusion. Specifically, we\npropose consecutive Brownian Bridge diffusion that takes a deterministic\ninitial value as input, resulting in a much smaller cumulative variance of\ngenerated latent representations. Our experiments suggest that our method can\nimprove together with the improvement of the autoencoder and achieve\nstate-of-the-art performance in VFI, leaving strong potential for further\nenhancement.\n","authors":["Zonglin Lyu","Ming Li","Jianbo Jiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.05953v3.pdf","comment":"corrected typo"},{"id":"http://arxiv.org/abs/2407.20114v1","updated":"2024-07-29T15:44:22Z","published":"2024-07-29T15:44:22Z","title":"FiCo-ITR: bridging fine-grained and coarse-grained image-text retrieval\n for comparative performance analysis","summary":" In the field of Image-Text Retrieval (ITR), recent advancements have\nleveraged large-scale Vision-Language Pretraining (VLP) for Fine-Grained (FG)\ninstance-level retrieval, achieving high accuracy at the cost of increased\ncomputational complexity. For Coarse-Grained (CG) category-level retrieval,\nprominent approaches employ Cross-Modal Hashing (CMH) to prioritise efficiency,\nalbeit at the cost of retrieval performance. Due to differences in\nmethodologies, FG and CG models are rarely compared directly within evaluations\nin the literature, resulting in a lack of empirical data quantifying the\nretrieval performance-efficiency tradeoffs between the two. This paper\naddresses this gap by introducing the \\texttt{FiCo-ITR} library, which\nstandardises evaluation methodologies for both FG and CG models, facilitating\ndirect comparisons. We conduct empirical evaluations of representative models\nfrom both subfields, analysing precision, recall, and computational complexity\nacross varying data scales. Our findings offer new insights into the\nperformance-efficiency trade-offs between recent representative FG and CG\nmodels, highlighting their respective strengths and limitations. 
These findings\nprovide the foundation necessary to make more informed decisions regarding\nmodel selection for specific retrieval tasks and highlight avenues for future\nresearch into hybrid systems that leverage the strengths of both FG and CG\napproaches.\n","authors":["Mikel Williams-Lekuona","Georgina Cosma"],"pdf_url":"https://arxiv.org/pdf/2407.20114v1.pdf","comment":"19 pages, submitted to International Journal of Multimedia\n Information Retrieval"},{"id":"http://arxiv.org/abs/2407.08680v3","updated":"2024-07-29T15:38:47Z","published":"2024-07-11T17:13:15Z","title":"Generalizable Implicit Motion Modeling for Video Frame Interpolation","summary":" Motion modeling is critical in flow-based Video Frame Interpolation (VFI).\nExisting paradigms either consider linear combinations of bidirectional flows\nor directly predict bilateral flows for given timestamps without exploring\nfavorable motion priors, thus lacking the capability of effectively modeling\nspatiotemporal dynamics in real-world videos. To address this limitation, in\nthis study, we introduce Generalizable Implicit Motion Modeling (GIMM), a novel\nand effective approach to motion modeling for VFI. Specifically, to enable GIMM\nas an effective motion modeling paradigm, we design a motion encoding pipeline\nto model spatiotemporal motion latent from bidirectional flows extracted from\npre-trained flow estimators, effectively representing input-specific motion\npriors. Then, we implicitly predict arbitrary-timestep optical flows within two\nadjacent input frames via an adaptive coordinate-based neural network, with\nspatiotemporal coordinates and motion latent as inputs. Our GIMM can be\nsmoothly integrated with existing flow-based VFI works without further\nmodifications. We show that GIMM performs better than the current state of the\nart on the VFI benchmarks.\n","authors":["Zujin Guo","Wei Li","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2407.08680v3.pdf","comment":"Project Page: https://gseancdat.github.io/projects/GIMMVFI"},{"id":"http://arxiv.org/abs/2407.20108v1","updated":"2024-07-29T15:35:35Z","published":"2024-07-29T15:35:35Z","title":"Classification, Regression and Segmentation directly from k-Space in\n Cardiac MRI","summary":" Cardiac Magnetic Resonance Imaging (CMR) is the gold standard for diagnosing\ncardiovascular diseases. Clinical diagnoses predominantly rely on\nmagnitude-only Digital Imaging and Communications in Medicine (DICOM) images,\nomitting crucial phase information that might provide additional diagnostic\nbenefits. In contrast, k-space is complex-valued and encompasses both magnitude\nand phase information, while humans cannot directly perceive. In this work, we\npropose KMAE, a Transformer-based model specifically designed to process\nk-space data directly, eliminating conventional intermediary conversion steps\nto the image domain. KMAE can handle critical cardiac disease classification,\nrelevant phenotype regression, and cardiac morphology segmentation tasks. We\nutilize this model to investigate the potential of k-space-based diagnosis in\ncardiac MRI. Notably, this model achieves competitive classification and\nregression performance compared to image-domain methods e.g. Masked\nAutoencoders (MAEs) and delivers satisfactory segmentation performance with a\nmyocardium dice score of 0.884. 
Last but not least, our model exhibits robust\nperformance with consistent results even when the k-space is 8* undersampled.\nWe encourage the MR community to explore the untapped potential of k-space and\npursue end-to-end, automated diagnosis with reduced human intervention.\n","authors":["Ruochen Li","Jiazhen Pan","Youxiang Zhu","Juncheng Ni","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2407.20108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20099v1","updated":"2024-07-29T15:26:15Z","published":"2024-07-29T15:26:15Z","title":"RSC-SNN: Exploring the Trade-off Between Adversarial Robustness and\n Accuracy in Spiking Neural Networks via Randomized Smoothing Coding","summary":" Spiking Neural Networks (SNNs) have received widespread attention due to\ntheir unique neuronal dynamics and low-power nature. Previous research\nempirically shows that SNNs with Poisson coding are more robust than Artificial\nNeural Networks (ANNs) on small-scale datasets. However, it is still unclear in\ntheory how the adversarial robustness of SNNs is derived, and whether SNNs can\nstill maintain its adversarial robustness advantage on large-scale dataset\ntasks. This work theoretically demonstrates that SNN's inherent adversarial\nrobustness stems from its Poisson coding. We reveal the conceptual equivalence\nof Poisson coding and randomized smoothing in defense strategies, and analyze\nin depth the trade-off between accuracy and adversarial robustness in SNNs via\nthe proposed Randomized Smoothing Coding (RSC) method. Experiments demonstrate\nthat the proposed RSC-SNNs show remarkable adversarial robustness, surpassing\nANNs and achieving state-of-the-art robustness results on large-scale dataset\nImageNet. Our open-source implementation code is available at this https URL:\nhttps://github.com/KemingWu/RSC-SNN.\n","authors":["Keming Wu","Man Yao","Yuhong Chou","Xuerui Qiu","Rui Yang","Bo Xu","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2407.20099v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.20090v1","updated":"2024-07-29T15:22:02Z","published":"2024-07-29T15:22:02Z","title":"Infrared Small Target Detection based on Adjustable Sensitivity Strategy\n and Multi-Scale Fusion","summary":" Recently, deep learning-based single-frame infrared small target (SIRST)\ndetection technology has made significant progress. However, existing infrared\nsmall target detection methods are often optimized for a fixed image\nresolution, a single wavelength, or a specific imaging system, limiting their\nbreadth and flexibility in practical applications. Therefore, we propose a\nrefined infrared small target detection scheme based on an adjustable\nsensitivity (AS) strategy and multi-scale fusion. Specifically, a multi-scale\nmodel fusion framework based on multi-scale direction-aware network (MSDA-Net)\nis constructed, which uses input images of multiple scales to train multiple\nmodels and fuses them. Multi-scale fusion helps characterize the shape, edge,\nand texture features of the target from different scales, making the model more\naccurate and reliable in locating the target. At the same time, we fully\nconsider the characteristics of the infrared small target detection task and\nconstruct an edge enhancement difficulty mining (EEDM) loss. The EEDM loss\nhelps alleviate the problem of category imbalance and guides the network to pay\nmore attention to difficult target areas and edge features during training. 
In\naddition, we propose an adjustable sensitivity strategy for post-processing.\nThis strategy significantly improves the detection rate of infrared small\ntargets while ensuring segmentation accuracy. Extensive experimental results\nshow that the proposed scheme achieves the best performance. Notably, this\nscheme won the first prize in the PRCV 2024 wide-area infrared small target\ndetection competition.\n","authors":["Jinmiao Zhao","Zelin Shi","Chuang Yu","Yunpeng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.20090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20086v1","updated":"2024-07-29T15:16:08Z","published":"2024-07-29T15:16:08Z","title":"Segmenting Fetal Head with Efficient Fine-tuning Strategies in\n Low-resource Settings: an empirical study with U-Net","summary":" Accurate measurement of fetal head circumference is crucial for estimating\nfetal growth during routine prenatal screening. Prior to measurement, it is\nnecessary to accurately identify and segment the region of interest,\nspecifically the fetal head, in ultrasound images. Recent advancements in deep\nlearning techniques have shown significant progress in segmenting the fetal\nhead using encoder-decoder models. Among these models, U-Net has become a\nstandard approach for accurate segmentation. However, training an\nencoder-decoder model can be a time-consuming process that demands substantial\ncomputational resources. Moreover, fine-tuning these models is particularly\nchallenging when there is a limited amount of data available. There are still\nno \"best-practice\" guidelines for optimal fine-tuning of U-net for fetal\nultrasound image segmentation. This work summarizes existing fine-tuning\nstrategies with various backbone architectures, model components, and\nfine-tuning strategies across ultrasound data from Netherlands, Spain, Malawi,\nEgypt and Algeria. Our study shows that (1) fine-tuning U-Net leads to better\nperformance than training from scratch, (2) fine-tuning strategies in decoder\nare superior to other strategies, (3) network architecture with less number of\nparameters can achieve similar or better performance. We also demonstrate the\neffectiveness of fine-tuning strategies in low-resource settings and further\nexpand our experiments into few-shot learning. Lastly, we publicly released our\ncode and specific fine-tuned weights.\n","authors":["Fangyijie Wang","Guénolé Silvestre","Kathleen M. Curran"],"pdf_url":"https://arxiv.org/pdf/2407.20086v1.pdf","comment":"5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.20080v1","updated":"2024-07-29T15:04:53Z","published":"2024-07-29T15:04:53Z","title":"UniTTA: Unified Benchmark and Versatile Framework Towards Realistic\n Test-Time Adaptation","summary":" Test-Time Adaptation (TTA) aims to adapt pre-trained models to the target\ndomain during testing. In reality, this adaptability can be influenced by\nmultiple factors. Researchers have identified various challenging scenarios and\ndeveloped diverse methods to address these challenges, such as dealing with\ncontinual domain shifts, mixed domains, and temporally correlated or imbalanced\nclass distributions. Despite these efforts, a unified and comprehensive\nbenchmark has yet to be established. To this end, we propose a Unified\nTest-Time Adaptation (UniTTA) benchmark, which is comprehensive and widely\napplicable. Each scenario within the benchmark is fully described by a Markov\nstate transition matrix for sampling from the original dataset. 
The UniTTA\nbenchmark considers both domain and class as two independent dimensions of data\nand addresses various combinations of imbalance/balance and\ni.i.d./non-i.i.d./continual conditions, covering a total of \\( (2 \\times 3)^2 =\n36 \\) scenarios. It establishes a comprehensive evaluation benchmark for\nrealistic TTA and provides a guideline for practitioners to select the most\nsuitable TTA method. Alongside this benchmark, we propose a versatile UniTTA\nframework, which includes a Balanced Domain Normalization (BDN) layer and a\nCOrrelated Feature Adaptation (COFA) method--designed to mitigate distribution\ngaps in domain and class, respectively. Extensive experiments demonstrate that\nour UniTTA framework excels within the UniTTA benchmark and achieves\nstate-of-the-art performance on average. Our code is available at\n\\url{https://github.com/LeapLabTHU/UniTTA}.\n","authors":["Chaoqun Du","Yulin Wang","Jiayi Guo","Yizeng Han","Jie Zhou","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2407.20080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01445v2","updated":"2024-07-29T15:04:15Z","published":"2024-07-01T16:37:18Z","title":"FastCLIP: A Suite of Optimization Techniques to Accelerate CLIP Training\n with Limited Resources","summary":" Existing studies of training state-of-the-art Contrastive Language-Image\nPretraining (CLIP) models on large-scale data involve hundreds of or even\nthousands of GPUs due to the requirement of a large batch size. However, such a\nlarge amount of resources is not accessible to most people. While advanced\ncompositional optimization techniques for optimizing global contrastive losses\nhave been demonstrated effective for removing the requirement of large batch\nsize, their performance on large-scale data remains underexplored and not\noptimized. To bridge the gap, this paper explores several aspects of CLIP\ntraining with limited resources (e.g., up to tens of GPUs). First, we introduce\nFastCLIP, a general CLIP training framework built on advanced compositional\noptimization techniques while designed and optimized for the distributed\nsetting. Our framework is equipped with an efficient gradient reduction\nstrategy to reduce communication overhead. Second, to further boost training\nefficiency, we investigate three components of the framework from an\noptimization perspective: the schedule of the inner learning rate, the update\nrules of the temperature parameter and the model parameters, respectively.\nExperiments on different strategies for each component shed light on how to\nconduct CLIP training more efficiently. Finally, we benchmark the performance\nof FastCLIP and the state-of-the-art training baseline (OpenCLIP) on different\ncompute scales up to 32 GPUs on 8 nodes, and three data scales ranging from 2.7\nmillion, 9.1 million to 315 million image-text pairs to demonstrate the\nsignificant improvement of FastCLIP in the resource-limited setting. 
We release\nthe code of FastCLIP at https://github.com/Optimization-AI/fast_clip .\n","authors":["Xiyuan Wei","Fanjiang Ye","Ori Yonay","Xingyu Chen","Baixi Sun","Dingwen Tao","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01445v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2407.20078v1","updated":"2024-07-29T15:03:27Z","published":"2024-07-29T15:03:27Z","title":"Background Semantics Matter: Cross-Task Feature Exchange Network for\n Clustered Infrared Small Target Detection With Sky-Annotated Dataset","summary":" Infrared small target detection poses unique challenges due to the scarcity\nof intrinsic target features and the abundance of similar background\ndistractors. We argue that background semantics play a pivotal role in\ndistinguishing visually similar objects for this task. To address this, we\nintroduce a new task -- clustered infrared small target detection, and present\nDenseSIRST, a novel benchmark dataset that provides per-pixel semantic\nannotations for background regions, enabling the transition from sparse to\ndense target detection. Leveraging this dataset, we propose the\nBackground-Aware Feature Exchange Network (BAFE-Net), which transforms the\ndetection paradigm from a single task focused on the foreground to a multi-task\narchitecture that jointly performs target detection and background semantic\nsegmentation. BAFE-Net introduces a cross-task feature hard-exchange mechanism\nto embed target and background semantics between the two tasks. Furthermore, we\npropose the Background-Aware Gaussian Copy-Paste (BAG-CP) method, which\nselectively pastes small targets into sky regions during training, avoiding the\ncreation of false alarm targets in complex non-sky backgrounds. Extensive\nexperiments validate the effectiveness of BAG-CP and BAFE-Net in improving\ntarget detection accuracy while reducing false alarms. The DenseSIRST dataset,\ncode, and trained models are available at https://github.com/GrokCV/BAFE-Net.\n","authors":["Yimian Dai","Mengxuan Xiao","Yiming Zhu","Huan Wang","Kehua Guo","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2407.20078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17572v2","updated":"2024-07-29T14:55:33Z","published":"2024-07-24T18:05:13Z","title":"CityX: Controllable Procedural Content Generation for Unbounded 3D\n Cities","summary":" Generating a realistic, large-scale 3D virtual city remains a complex\nchallenge due to the involvement of numerous 3D assets, various city styles,\nand strict layout constraints. Existing approaches provide promising attempts\nat procedural content generation to create large-scale scenes using Blender\nagents. However, they face crucial issues such as difficulties in scaling up\ngeneration capability and achieving fine-grained control at the semantic layout\nlevel. To address these problems, we propose a novel multi-modal controllable\nprocedural content generation method, named CityX, which enhances realistic,\nunbounded 3D city generation guided by multiple layout conditions, including\nOSM, semantic maps, and satellite images. Specifically, the proposed method\ncontains a general protocol for integrating various PCG plugins and a\nmulti-agent framework for transforming instructions into executable Blender\nactions. Through this effective framework, CityX shows the potential to build\nan innovative ecosystem for 3D scene generation by bridging the gap between the\nquality of generated assets and industrial requirements. 
Extensive experiments\nhave demonstrated the effectiveness of our method in creating high-quality,\ndiverse, and unbounded cities guided by multi-modal conditions. Our project\npage: https://cityx-lab.github.io.\n","authors":["Shougao Zhang","Mengqi Zhou","Yuxi Wang","Chuanchen Luo","Rongyu Wang","Yiwei Li","Xucheng Yin","Zhaoxiang Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2407.17572v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08061v2","updated":"2024-07-29T14:49:42Z","published":"2024-07-10T21:51:50Z","title":"Geospecific View Generation -- Geometry-Context Aware High-resolution\n Ground View Inference from Satellite Views","summary":" Predicting realistic ground views from satellite imagery in urban scenes is a\nchallenging task due to the significant view gaps between satellite and\nground-view images. We propose a novel pipeline to tackle this challenge, by\ngenerating geospecifc views that maximally respect the weak geometry and\ntexture from multi-view satellite images. Different from existing approaches\nthat hallucinate images from cues such as partial semantics or geometry from\noverhead satellite images, our method directly predicts ground-view images at\ngeolocation by using a comprehensive set of information from the satellite\nimage, resulting in ground-level images with a resolution boost at a factor of\nten or more. We leverage a novel building refinement method to reduce geometric\ndistortions in satellite data at ground level, which ensures the creation of\naccurate conditions for view synthesis using diffusion networks. Moreover, we\nproposed a novel geospecific prior, which prompts distribution learning of\ndiffusion models to respect image samples that are closer to the geolocation of\nthe predicted images. We demonstrate our pipeline is the first to generate\nclose-to-real and geospecific ground views merely based on satellite images.\n","authors":["Ningli Xu","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2407.08061v2.pdf","comment":"11 figures"},{"id":"http://arxiv.org/abs/2407.20062v1","updated":"2024-07-29T14:48:34Z","published":"2024-07-29T14:48:34Z","title":"SalNAS: Efficient Saliency-prediction Neural Architecture Search with\n self-knowledge distillation","summary":" Recent advancements in deep convolutional neural networks have significantly\nimproved the performance of saliency prediction. However, the manual\nconfiguration of the neural network architectures requires domain knowledge\nexpertise and can still be time-consuming and error-prone. To solve this, we\npropose a new Neural Architecture Search (NAS) framework for saliency\nprediction with two contributions. Firstly, a supernet for saliency prediction\nis built with a weight-sharing network containing all candidate architectures,\nby integrating a dynamic convolution into the encoder-decoder in the supernet,\ntermed SalNAS. Secondly, despite the fact that SalNAS is highly efficient\n(20.98 million parameters), it can suffer from the lack of generalization. To\nsolve this, we propose a self-knowledge distillation approach, termed Self-KD,\nthat trains the student SalNAS with the weighted average information between\nthe ground truth and the prediction from the teacher model. The teacher model,\nwhile sharing the same architecture, contains the best-performing weights\nchosen by cross-validation. Self-KD can generalize well without the need to\ncompute the gradient in the teacher model, enabling an efficient training\nsystem. 
By utilizing Self-KD, SalNAS outperforms other state-of-the-art\nsaliency prediction models in most evaluation rubrics across seven benchmark\ndatasets while being a lightweight model. The code will be available at\nhttps://github.com/chakkritte/SalNAS\n","authors":["Chakkrit Termritthikun","Ayaz Umer","Suwichaya Suwanwimolkul","Feng Xia","Ivan Lee"],"pdf_url":"https://arxiv.org/pdf/2407.20062v1.pdf","comment":"Published in Engineering Applications of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2407.20034v1","updated":"2024-07-29T14:21:07Z","published":"2024-07-29T14:21:07Z","title":"MaskInversion: Localized Embeddings via Optimization of Explainability\n Maps","summary":" Vision-language foundation models such as CLIP have achieved tremendous\nresults in global vision-language alignment, but still show some limitations in\ncreating representations for specific image regions. % To address this problem,\nwe propose MaskInversion, a method that leverages the feature representations\nof pre-trained foundation models, such as CLIP, to generate a context-aware\nembedding for a query image region specified by a mask at test time.\nMaskInversion starts with initializing an embedding token and compares its\nexplainability map, derived from the foundation model, to the query mask. The\nembedding token is then subsequently refined to approximate the query region by\nminimizing the discrepancy between its explainability map and the query mask.\nDuring this process, only the embedding vector is updated, while the underlying\nfoundation model is kept frozen allowing to use MaskInversion with any\npre-trained model. As deriving the explainability map involves computing its\ngradient, which can be expensive, we propose a gradient decomposition strategy\nthat simplifies this computation. The learned region representation can be used\nfor a broad range of tasks, including open-vocabulary class retrieval,\nreferring expression comprehension, as well as for localized captioning and\nimage generation. We evaluate the proposed method on all those tasks on several\ndatasets such as PascalVOC, MSCOCO, RefCOCO, and OpenImagesV7 and show its\ncapabilities compared to other SOTA approaches.\n","authors":["Walid Bousselham","Sofian Chaybouti","Christian Rupprecht","Vittorio Ferrari","Hilde Kuehne"],"pdf_url":"https://arxiv.org/pdf/2407.20034v1.pdf","comment":"Project page: https://walidbousselham.com/MaskInversion"},{"id":"http://arxiv.org/abs/2407.20021v1","updated":"2024-07-29T13:57:40Z","published":"2024-07-29T13:57:40Z","title":"MimiQ: Low-Bit Data-Free Quantization of Vision Transformers","summary":" Data-free quantization (DFQ) is a technique that creates a lightweight\nnetwork from its full-precision counterpart without the original training data,\noften through a synthetic dataset. Although several DFQ methods have been\nproposed for vision transformer (ViT) architectures, they fail to achieve\nefficacy in low-bit settings. Examining the existing methods, we identify that\ntheir synthetic data produce misaligned attention maps, while those of the real\nsamples are highly aligned. From the observation of aligned attention, we find\nthat aligning attention maps of synthetic data helps to improve the overall\nperformance of quantized ViTs. Motivated by this finding, we devise \\aname, a\nnovel DFQ method designed for ViTs that focuses on inter-head attention\nsimilarity. First, we generate synthetic data by aligning head-wise attention\nresponses in relation to spatial query patches. 
Then, we apply head-wise\nstructural attention distillation to align the attention maps of the quantized\nnetwork to those of the full-precision teacher. The experimental results show\nthat the proposed method significantly outperforms baselines, setting a new\nstate-of-the-art performance for data-free ViT quantization.\n","authors":["Kanghyun Choi","Hye Yoon Lee","Dain Kwon","SunJong Park","Kyuyeun Kim","Noseong Park","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2407.20021v1.pdf","comment":"Author Preprint"},{"id":"http://arxiv.org/abs/2407.20020v1","updated":"2024-07-29T13:57:24Z","published":"2024-07-29T13:57:24Z","title":"ImagiNet: A Multi-Content Dataset for Generalizable Synthetic Image\n Detection via Contrastive Learning","summary":" Generative models, such as diffusion models (DMs), variational autoencoders\n(VAEs), and generative adversarial networks (GANs), produce images with a level\nof authenticity that makes them nearly indistinguishable from real photos and\nartwork. While this capability is beneficial for many industries, the\ndifficulty of identifying synthetic images leaves online media platforms\nvulnerable to impersonation and misinformation attempts. To support the\ndevelopment of defensive methods, we introduce ImagiNet, a high-resolution and\nbalanced dataset for synthetic image detection, designed to mitigate potential\nbiases in existing resources. It contains 200K examples, spanning four content\ncategories: photos, paintings, faces, and uncategorized. Synthetic images are\nproduced with open-source and proprietary generators, whereas real counterparts\nof the same content type are collected from public datasets. The structure of\nImagiNet allows for a two-track evaluation system: i) classification as real or\nsynthetic and ii) identification of the generative model. To establish a\nbaseline, we train a ResNet-50 model using a self-supervised contrastive\nobjective (SelfCon) for each track. The model demonstrates state-of-the-art\nperformance and high inference speed across established benchmarks, achieving\nan AUC of up to 0.99 and balanced accuracy ranging from 86% to 95%, even under\nsocial network conditions that involve compression and resizing. Our data and\ncode are available at https://github.com/delyan-boychev/imaginet.\n","authors":["Delyan Boychev","Radostin Cholakov"],"pdf_url":"https://arxiv.org/pdf/2407.20020v1.pdf","comment":"24 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2306.11377v2","updated":"2024-07-29T13:46:35Z","published":"2023-06-20T08:36:08Z","title":"HabiCrowd: A High Performance Simulator for Crowd-Aware Visual\n Navigation","summary":" Visual navigation, a foundational aspect of Embodied AI (E-AI), has been\nsignificantly studied in the past few years. While many 3D simulators have been\nintroduced to support visual navigation tasks, scarcely works have been\ndirected towards combining human dynamics, creating the gap between simulation\nand real-world applications. Furthermore, current 3D simulators incorporating\nhuman dynamics have several limitations, particularly in terms of computational\nefficiency, which is a promise of E-AI simulators. To overcome these\nshortcomings, we introduce HabiCrowd, the first standard benchmark for\ncrowd-aware visual navigation that integrates a crowd dynamics model with\ndiverse human settings into photorealistic environments. 
Empirical evaluations\ndemonstrate that our proposed human dynamics model achieves state-of-the-art\nperformance in collision avoidance, while exhibiting superior computational\nefficiency compared to its counterparts. We leverage HabiCrowd to conduct\nseveral comprehensive studies on crowd-aware visual navigation tasks and\nhuman-robot interactions. The source code and data can be found at\nhttps://habicrowd.github.io/.\n","authors":["An Dinh Vuong","Toan Tien Nguyen","Minh Nhat VU","Baoru Huang","Dzung Nguyen","Huynh Thi Thanh Binh","Thieu Vo","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2306.11377v2.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2407.20013v1","updated":"2024-07-29T13:45:23Z","published":"2024-07-29T13:45:23Z","title":"Classification of freshwater snails of the genus \\emph{Radomaniola} with\n multimodal triplet networks","summary":" In this paper, we present our first proposal of a machine learning system for\nthe classification of freshwater snails of the genus \\emph{Radomaniola}. We\nelaborate on the specific challenges encountered during system design, and how\nwe tackled them; namely a small, very imbalanced dataset with a high number of\nclasses and high visual similarity between classes. We then show how we\nemployed triplet networks and the multiple input modalities of images,\nmeasurements, and genetic information to overcome these challenges and reach a\nperformance comparable to that of a trained domain expert.\n","authors":["Dennis Vetter","Muhammad Ahsan","Diana Delicado","Thomas A. Neubauer","Thomas Wilke","Gemma Roig"],"pdf_url":"https://arxiv.org/pdf/2407.20013v1.pdf","comment":"Spotlight at ICML 2024 AI for Science workshop"},{"id":"http://arxiv.org/abs/2407.14500v2","updated":"2024-07-29T13:32:14Z","published":"2024-07-18T17:59:17Z","title":"ViLLa: Video Reasoning Segmentation with Large Language Model","summary":" Although video perception models have made remarkable advancements in recent\nyears, they still heavily rely on explicit text descriptions or pre-defined\ncategories to identify target instances before executing video perception\ntasks. These models, however, fail to proactively comprehend and reason the\nuser's intentions via textual input. Even though previous works attempt to\ninvestigate solutions to incorporate reasoning with image segmentation, they\nfail to reason with videos due to the video's complexity in object motion. To\nbridge the gap between image and video, in this work, we propose a new video\nsegmentation task - video reasoning segmentation. The task is designed to\noutput tracklets of segmentation masks given a complex input text query. What's\nmore, to promote research in this unexplored area, we construct a reasoning\nvideo segmentation benchmark. Finally, we present ViLLa: Video reasoning\nsegmentation with a Large Language Model, which incorporates the language\ngeneration capabilities of multimodal Large Language Models (LLMs) while\nretaining the capabilities of detecting, segmenting, and tracking multiple\ninstances. We use a temporal-aware context aggregation module to incorporate\ncontextual visual cues to text embeddings and propose a video-frame decoder to\nbuild temporal correlations across segmentation tokens. Remarkably, our ViLLa\ndemonstrates capability in handling complex reasoning and referring video\nsegmentation. Also, our model shows impressive ability in different temporal\nunderstanding benchmarks. 
Both quantitative and qualitative experiments show\nour method effectively unlocks new video reasoning segmentation capabilities\nfor multimodal LLMs. The code and dataset will be available at\nhttps://github.com/rkzheng99/ViLLa.\n","authors":["Rongkun Zheng","Lu Qi","Xi Chen","Yi Wang","Kun Wang","Yu Qiao","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.14500v2.pdf","comment":"15 pages,6 figures"},{"id":"http://arxiv.org/abs/2407.19996v1","updated":"2024-07-29T13:27:44Z","published":"2024-07-29T13:27:44Z","title":"Reproducibility Study of \"ITI-GEN: Inclusive Text-to-Image Generation\"","summary":" Text-to-image generative models often present issues regarding fairness with\nrespect to certain sensitive attributes, such as gender or skin tone. This\nstudy aims to reproduce the results presented in \"ITI-GEN: Inclusive\nText-to-Image Generation\" by Zhang et al. (2023a), which introduces a model to\nimprove inclusiveness in these kinds of models. We show that most of the claims\nmade by the authors about ITI-GEN hold: it improves the diversity and quality\nof generated images, it is scalable to different domains, it has plug-and-play\ncapabilities, and it is efficient from a computational point of view. However,\nITI-GEN sometimes uses undesired attributes as proxy features and it is unable\nto disentangle some pairs of (correlated) attributes such as gender and\nbaldness. In addition, when the number of considered attributes increases, the\ntraining time grows exponentially and ITI-GEN struggles to generate inclusive\nimages for all elements in the joint distribution. To solve these issues, we\npropose using Hard Prompt Search with negative prompting, a method that does\nnot require training and that handles negation better than vanilla Hard Prompt\nSearch. Nonetheless, Hard Prompt Search (with or without negative prompting)\ncannot be used for continuous attributes that are hard to express in natural\nlanguage, an area where ITI-GEN excels as it is guided by images during\ntraining. Finally, we propose combining ITI-GEN and Hard Prompt Search with\nnegative prompting.\n","authors":["Daniel Gallo Fernández","Răzvan-Andrei Matisan","Alejandro Monroy Muñoz","Janusz Partyka"],"pdf_url":"https://arxiv.org/pdf/2407.19996v1.pdf","comment":"Accepted to TMLR, see https://openreview.net/forum?id=d3Vj360Wi2"},{"id":"http://arxiv.org/abs/2407.19992v1","updated":"2024-07-29T13:24:55Z","published":"2024-07-29T13:24:55Z","title":"More precise edge detections","summary":" Image Edge detection (ED) is a base task in computer vision. While the\nperformance of the ED algorithm has been improved greatly by introducing\nCNN-based models, current models still suffer from unsatisfactory precision\nrates especially when only a low error toleration distance is allowed.\nTherefore, model architecture for more precise predictions still needs an\ninvestigation. On the other hand, the unavoidable noise training data provided\nby humans would lead to unsatisfactory model predictions even when inputs are\nedge maps themselves, which also needs improvement. In this paper, more precise\nED models are presented with cascaded skipping density blocks (CSDB). Our\nmodels obtain state-of-the-art(SOTA) predictions in several datasets,\nespecially in average precision rate (AP), which is confirmed by extensive\nexperiments. Moreover, our models do not include down-sample operations,\ndemonstrating those widely believed operations are not necessary. 
Also, a novel\nmodification on data augmentation for training is employed, which allows\nnoiseless data to be employed in model training and thus improves the\nperformance of models predicting on edge maps themselves.\n","authors":["Hao Shu","Guo-Ping Qiu"],"pdf_url":"https://arxiv.org/pdf/2407.19992v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2407.19985v1","updated":"2024-07-29T13:19:31Z","published":"2024-07-29T13:19:31Z","title":"Mixture of Nested Experts: Adaptive Processing of Visual Tokens","summary":" The visual medium (images and videos) naturally contains a large amount of\ninformation redundancy, thereby providing a great opportunity for leveraging\nefficiency in processing. While Vision Transformer (ViT) based models scale\neffectively to large data regimes, they fail to capitalize on this inherent\nredundancy, leading to higher computational costs. Mixture of Experts (MoE)\nnetworks demonstrate scalability while maintaining same inference-time costs,\nbut they come with a larger parameter footprint. We present Mixture of Nested\nExperts (MoNE), which utilizes a nested structure for experts, wherein\nindividual experts fall on an increasing compute-accuracy curve. Given a\ncompute budget, MoNE learns to dynamically choose tokens in a priority order,\nand thus redundant tokens are processed through cheaper nested experts. Using\nthis framework, we achieve equivalent performance as the baseline models, while\nreducing inference time compute by over two-fold. We validate our approach on\nstandard image and video datasets - ImageNet-21K, Kinetics400, and\nSomething-Something-v2. We further highlight MoNE$'$s adaptability by\nshowcasing its ability to maintain strong performance across different\ninference-time compute budgets on videos, using only a single trained model.\n","authors":["Gagan Jain","Nidhi Hegde","Aditya Kusupati","Arsha Nagrani","Shyamal Buch","Prateek Jain","Anurag Arnab","Sujoy Paul"],"pdf_url":"https://arxiv.org/pdf/2407.19985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19981v1","updated":"2024-07-29T13:15:51Z","published":"2024-07-29T13:15:51Z","title":"Adversarial Robustness in RGB-Skeleton Action Recognition: Leveraging\n Attention Modality Reweighter","summary":" Deep neural networks (DNNs) have been applied in many computer vision tasks\nand achieved state-of-the-art (SOTA) performance. However, misclassification\nwill occur when DNNs predict adversarial examples which are created by adding\nhuman-imperceptible adversarial noise to natural examples. This limits the\napplication of DNN in security-critical fields. In order to enhance the\nrobustness of models, previous research has primarily focused on the unimodal\ndomain, such as image recognition and video understanding. Although multi-modal\nlearning has achieved advanced performance in various tasks, such as action\nrecognition, research on the robustness of RGB-skeleton action recognition\nmodels is scarce. In this paper, we systematically investigate how to improve\nthe robustness of RGB-skeleton action recognition models. We initially\nconducted empirical analysis on the robustness of different modalities and\nobserved that the skeleton modality is more robust than the RGB modality.\nMotivated by this observation, we propose the \\formatword{A}ttention-based\n\\formatword{M}odality \\formatword{R}eweighter (\\formatword{AMR}), which\nutilizes an attention layer to re-weight the two modalities, enabling the model\nto learn more robust features. 
Our AMR is plug-and-play, allowing easy\nintegration with multimodal models. To demonstrate the effectiveness of AMR, we\nconducted extensive experiments on various datasets. For example, compared to\nthe SOTA methods, AMR exhibits a 43.77\\% improvement against PGD20 attacks on\nthe NTU-RGB+D 60 dataset. Furthermore, it effectively balances the differences\nin robustness between different modalities.\n","authors":["Chao Liu","Xin Liu","Zitong Yu","Yonghong Hou","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19981v1.pdf","comment":"Accepted by IJCB 2024"},{"id":"http://arxiv.org/abs/2407.19970v1","updated":"2024-07-29T13:01:20Z","published":"2024-07-29T13:01:20Z","title":"From Flat to Spatial: Comparison of 4 methods constructing 3D, 2 and\n 1/2D Models from 2D Plans with neural networks","summary":" In the field of architecture, the conversion of single images into 2 and 1/2D\nand 3D meshes is a promising technology that enhances design visualization and\nefficiency. This paper evaluates four innovative methods: \"One-2-3-45,\" \"CRM:\nSingle Image to 3D Textured Mesh with Convolutional Reconstruction Model,\"\n\"Instant Mesh,\" and \"Image-to-Mesh.\" These methods are at the forefront of this\ntechnology, focusing on their applicability in architectural design and\nvisualization. They streamline the creation of 3D architectural models,\nenabling rapid prototyping and detailed visualization from minimal initial\ninputs, such as photographs or simple sketches.One-2-3-45 leverages a\ndiffusion-based approach to generate multi-view reconstructions, ensuring high\ngeometric fidelity and texture quality. CRM utilizes a convolutional network to\nintegrate geometric priors into its architecture, producing detailed and\ntextured meshes quickly and efficiently. Instant Mesh combines the strengths of\nmulti-view diffusion and sparse-view models to offer speed and scalability,\nsuitable for diverse architectural projects. Image-to-Mesh leverages a\ngenerative adversarial network (GAN) to produce 3D meshes from single images,\nfocusing on maintaining high texture fidelity and geometric accuracy by\nincorporating image and depth map data into its training process. It uses a\nhybrid approach that combines voxel-based representations with surface\nreconstruction techniques to ensure detailed and realistic 3D models.This\ncomparative study highlights each method's contribution to reducing design\ncycle times, improving accuracy, and enabling flexible adaptations to various\narchitectural styles and requirements. By providing architects with powerful\ntools for rapid visualization and iteration, these advancements in 3D mesh\ngeneration are set to revolutionize architectural practices.\n","authors":["Jacob Sam","Karan Patel","Mike Saad"],"pdf_url":"https://arxiv.org/pdf/2407.19970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03653v2","updated":"2024-07-29T12:53:20Z","published":"2024-07-04T05:48:28Z","title":"reBEN: Refined BigEarthNet Dataset for Remote Sensing Image Analysis","summary":" This paper presents refined BigEarthNet (reBEN) that is a large-scale,\nmulti-modal remote sensing dataset constructed to support deep learning (DL)\nstudies for remote sensing image analysis. The reBEN dataset consists of\n549,488 pairs of Sentinel-1 and Sentinel-2 image patches. 
To construct reBEN,\nwe initially consider the Sentinel-1 and Sentinel-2 tiles used to construct the\nBigEarthNet dataset and then divide them into patches of size 1200 m x 1200 m.\nWe apply atmospheric correction to the Sentinel-2 patches using the latest\nversion of the sen2cor tool, resulting in higher-quality patches compared to\nthose present in BigEarthNet. Each patch is then associated with a pixel-level\nreference map and scene-level multi-labels. This makes reBEN suitable for\npixel- and scene-based learning tasks. The labels are derived from the most\nrecent CORINE Land Cover (CLC) map of 2018 by utilizing the 19-class\nnomenclature as in BigEarthNet. The use of the most recent CLC map results in\novercoming the label noise present in BigEarthNet. Furthermore, we introduce a\nnew geographical-based split assignment algorithm that significantly reduces\nthe spatial correlation among the train, validation, and test sets with respect\nto those present in BigEarthNet. This increases the reliability of the\nevaluation of DL models. To minimize the DL model training time, we introduce\nsoftware tools that convert the reBEN dataset into a DL-optimized data format.\nIn our experiments, we show the potential of reBEN for multi-modal multi-label\nimage classification problems by considering several state-of-the-art DL\nmodels. The pre-trained model weights, associated code, and complete dataset\nare available at https://bigearth.net.\n","authors":["Kai Norman Clasen","Leonard Hackel","Tom Burgert","Gencer Sumbul","Begüm Demir","Volker Markl"],"pdf_url":"https://arxiv.org/pdf/2407.03653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18214v2","updated":"2024-07-29T12:51:06Z","published":"2024-06-26T09:57:55Z","title":"Trimming the Fat: Efficient Compression of 3D Gaussian Splats through\n Pruning","summary":" In recent times, the utilization of 3D models has gained traction, owing to\nthe capacity for end-to-end training initially offered by Neural Radiance\nFields and more recently by 3D Gaussian Splatting (3DGS) models. The latter\nholds a significant advantage by inherently easing rapid convergence during\ntraining and offering extensive editability. However, despite rapid\nadvancements, the literature still lives in its infancy regarding the\nscalability of these models. In this study, we take some initial steps in\naddressing this gap, showing an approach that enables both the memory and\ncomputational scalability of such models. Specifically, we propose \"Trimming\nthe fat\", a post-hoc gradient-informed iterative pruning technique to eliminate\nredundant information encoded in the model. Our experimental findings on widely\nacknowledged benchmarks attest to the effectiveness of our approach, revealing\nthat up to 75% of the Gaussians can be removed while maintaining or even\nimproving upon baseline performance. 
Our approach achieves around 50$\\times$\ncompression while preserving performance similar to the baseline model, and is\nable to speed-up computation up to 600 FPS.\n","authors":["Muhammad Salman Ali","Maryam Qamar","Sung-Ho Bae","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2406.18214v2.pdf","comment":"Accepted at BMVC 2024"},{"id":"http://arxiv.org/abs/2407.19953v1","updated":"2024-07-29T12:40:12Z","published":"2024-07-29T12:40:12Z","title":"FedDEO: Description-Enhanced One-Shot Federated Learning with Diffusion\n Models","summary":" In recent years, the attention towards One-Shot Federated Learning (OSFL) has\nbeen driven by its capacity to minimize communication. With the development of\nthe diffusion model (DM), several methods employ the DM for OSFL, utilizing\nmodel parameters, image features, or textual prompts as mediums to transfer the\nlocal client knowledge to the server. However, these mediums often require\npublic datasets or the uniform feature extractor, significantly limiting their\npracticality. In this paper, we propose FedDEO, a Description-Enhanced One-Shot\nFederated Learning Method with DMs, offering a novel exploration of utilizing\nthe DM in OSFL. The core idea of our method involves training local\ndescriptions on the clients, serving as the medium to transfer the knowledge of\nthe distributed clients to the server. Firstly, we train local descriptions on\nthe client data to capture the characteristics of client distributions, which\nare then uploaded to the server. On the server, the descriptions are used as\nconditions to guide the DM in generating synthetic datasets that comply with\nthe distributions of various clients, enabling the training of the aggregated\nmodel. Theoretical analyses and sufficient quantitation and visualization\nexperiments on three large-scale real-world datasets demonstrate that through\nthe training of local descriptions, the server is capable of generating\nsynthetic datasets with high quality and diversity. Consequently, with\nadvantages in communication and privacy protection, the aggregated model\noutperforms compared FL or diffusion-based OSFL methods and, on some clients,\noutperforms the performance ceiling of centralized training.\n","authors":["Mingzhao Yang","Shangchao Su","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2407.19953v1.pdf","comment":"Accepted by MM 24"},{"id":"http://arxiv.org/abs/2407.19938v1","updated":"2024-07-29T12:18:07Z","published":"2024-07-29T12:18:07Z","title":"Robust Conformal Volume Estimation in 3D Medical Images","summary":" Volumetry is one of the principal downstream applications of 3D medical image\nsegmentation, for example, to detect abnormal tissue growth or for surgery\nplanning. Conformal Prediction is a promising framework for uncertainty\nquantification, providing calibrated predictive intervals associated with\nautomatic volume measurements. However, this methodology is based on the\nhypothesis that calibration and test samples are exchangeable, an assumption\nthat is in practice often violated in medical image applications. A weighted\nformulation of Conformal Prediction can be framed to mitigate this issue, but\nits empirical investigation in the medical domain is still lacking. A potential\nreason is that it relies on the estimation of the density ratio between the\ncalibration and test distributions, which is likely to be intractable in\nscenarios involving high-dimensional data. 
To circumvent this, we propose an\nefficient approach for density ratio estimation relying on the compressed\nlatent representations generated by the segmentation model. Our experiments\ndemonstrate the efficiency of our approach to reduce the coverage error in the\npresence of covariate shifts, in both synthetic and real-world settings. Our\nimplementation is available at https://github.com/benolmbrt/wcp_miccai\n","authors":["Benjamin Lambert","Florence Forbes","Senan Doyle","Michel Dojat"],"pdf_url":"https://arxiv.org/pdf/2407.19938v1.pdf","comment":"Early accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.17927v2","updated":"2024-07-29T11:55:53Z","published":"2024-07-25T10:24:54Z","title":"Invariance of deep image quality metrics to affine transformations","summary":" Deep architectures are the current state-of-the-art in predicting subjective\nimage quality. Usually, these models are evaluated according to their ability\nto correlate with human opinion in databases with a range of distortions that\nmay appear in digital media. However, these oversee affine transformations\nwhich may represent better the changes in the images actually happening in\nnatural conditions. Humans can be particularly invariant to these natural\ntransformations, as opposed to the digital ones. In this work, we evaluate\nstate-of-the-art deep image quality metrics by assessing their invariance to\naffine transformations, specifically: rotation, translation, scaling, and\nchanges in spectral illumination. Here invariance of a metric refers to the\nfact that certain distances should be neglected (considered to be zero) if\ntheir values are below a threshold. This is what we call invisibility threshold\nof a metric. We propose a methodology to assign such invisibility thresholds\nfor any perceptual metric. This methodology involves transformations to a\ndistance space common to any metric, and psychophysical measurements of\nthresholds in this common space. By doing so, we allow the analyzed metrics to\nbe directly comparable with actual human thresholds. We find that none of the\nstate-of-the-art metrics shows human-like results under this strong test based\non invisibility thresholds. This means that tuning the models exclusively to\npredict the visibility of generic distortions may disregard other properties of\nhuman vision as for instance invariances or invisibility thresholds.\n","authors":["Nuria Alabau-Bosque","Paula Daudén-Oliver","Jorge Vila-Tomás","Valero Laparra","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2407.17927v2.pdf","comment":"24 pages 40 figures"},{"id":"http://arxiv.org/abs/2407.19918v1","updated":"2024-07-29T11:52:07Z","published":"2024-07-29T11:52:07Z","title":"FreeLong: Training-Free Long Video Generation with SpectralBlend\n Temporal Attention","summary":" Video diffusion models have made substantial progress in various video\ngeneration applications. However, training models for long video generation\ntasks require significant computational and data resources, posing a challenge\nto developing long video diffusion models. This paper investigates a\nstraightforward and training-free approach to extend an existing short video\ndiffusion model (e.g. pre-trained on 16-frame videos) for consistent long video\ngeneration (e.g. 128 frames). Our preliminary observation has found that\ndirectly applying the short video diffusion model to generate long videos can\nlead to severe video quality degradation. 
Further investigation reveals that\nthis degradation is primarily due to the distortion of high-frequency\ncomponents in long videos, characterized by a decrease in spatial\nhigh-frequency components and an increase in temporal high-frequency\ncomponents. Motivated by this, we propose a novel solution named FreeLong to\nbalance the frequency distribution of long video features during the denoising\nprocess. FreeLong blends the low-frequency components of global video features,\nwhich encapsulate the entire video sequence, with the high-frequency components\nof local video features that focus on shorter subsequences of frames. This\napproach maintains global consistency while incorporating diverse and\nhigh-quality spatiotemporal details from local videos, enhancing both the\nconsistency and fidelity of long video generation. We evaluated FreeLong on\nmultiple base video diffusion models and observed significant improvements.\nAdditionally, our method supports coherent multi-prompt generation, ensuring\nboth visual coherence and seamless transitions between scenes.\n","authors":["Yu Lu","Yuanzhi Liang","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19918v1.pdf","comment":"Project page: https://yulu.net.cn/freelong"},{"id":"http://arxiv.org/abs/2407.19913v1","updated":"2024-07-29T11:42:32Z","published":"2024-07-29T11:42:32Z","title":"Cell Culture Assistive Application for Precipitation Image Diagnosis","summary":" In regenerative medicine research, we experimentally design the composition\nof chemical medium. We add different components to 384-well plates and culture\nthe biological cells. We monitor the condition of the cells and take time-lapse\nbioimages for morphological assay. In particular, precipitation can appear as\nartefacts in the image and contaminate the noise in the imaging assay.\nInspecting precipitates is a tedious task for the observer, and differences in\nexperience can lead to variations in judgement from person to person. The\nmachine learning approach will remove the burden of human inspection and\nprovide consistent inspection. In addition, precipitation features are as small\nas 10-20 {\\mu}m. A 1200 pixel square well image resized under a resolution of\n2.82 {\\mu}m/pixel will result in a reduction in precipitation features.\nDividing the well images into 240-pixel squares and learning without resizing\npreserves the resolution of the original image. In this study, we developed an\napplication to automatically detect precipitation on 384-well plates utilising\noptical microscope images. We apply MN-pair contrastive clustering to extract\nprecipitation classes from approximately 20,000 patch images. To detect\nprecipitation features, we compare deeper FCDDs detectors with optional\nbackbones and build a machine learning pipeline to detect precipitation from\nthe maximum score of quadruplet well images using isolation Forest algorithm,\nwhere the anomaly score is ranged from zero to one. Furthermore, using this\napplication we can visualise precipitation situ heatmap on a 384-well plate.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2407.19913v1.pdf","comment":"18 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.19894v1","updated":"2024-07-29T11:16:59Z","published":"2024-07-29T11:16:59Z","title":"End-to-end SYNTAX score prediction: benchmark and methods","summary":" The SYNTAX score has become a widely used measure of coronary disease\nseverity , crucial in selecting the optimal mode of revascularization. 
This\npaper introduces a new medical regression and classification problem -\nautomatically estimating SYNTAX score from coronary angiography. Our study\npresents a comprehensive dataset of 1,844 patients, featuring a balanced\ndistribution of individuals with zero and non-zero scores. This dataset\nincludes a first-of-its-kind, complete coronary angiography samples captured\nthrough a multi-view X-ray video, allowing one to observe coronary arteries\nfrom multiple perspectives. Furthermore, we present a novel, fully automatic\nend-to-end method for estimating the SYNTAX. For such a difficult task, we have\nachieved a solid coefficient of determination R2 of 0.51 in score predictions.\n","authors":["Alexander Ponomarchuk","Ivan Kruzhilov","Galina Zubkova","Artem Shadrin","Ruslan Utegenov","Ivan Bessonov","Pavel Blinov"],"pdf_url":"https://arxiv.org/pdf/2407.19894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19889v1","updated":"2024-07-29T11:11:17Z","published":"2024-07-29T11:11:17Z","title":"Self-Supervised Learning for Text Recognition: A Critical Survey","summary":" Text Recognition (TR) refers to the research area that focuses on retrieving\ntextual information from images, a topic that has seen significant advancements\nin the last decade due to the use of Deep Neural Networks (DNN). However, these\nsolutions often necessitate vast amounts of manually labeled or synthetic data.\nAddressing this challenge, Self-Supervised Learning (SSL) has gained attention\nby utilizing large datasets of unlabeled data to train DNN, thereby generating\nmeaningful and robust representations. Although SSL was initially overlooked in\nTR because of its unique characteristics, recent years have witnessed a surge\nin the development of SSL methods specifically for this field. This rapid\ndevelopment, however, has led to many methods being explored independently,\nwithout taking previous efforts in methodology or comparison into account,\nthereby hindering progress in the field of research. This paper, therefore,\nseeks to consolidate the use of SSL in the field of TR, offering a critical and\ncomprehensive overview of the current state of the art. We will review and\nanalyze the existing methods, compare their results, and highlight\ninconsistencies in the current literature. This thorough analysis aims to\nprovide general insights into the field, propose standardizations, identify new\nresearch directions, and foster its proper development.\n","authors":["Carlos Penarrubia","Jose J. Valero-Mas","Jorge Calvo-Zaragoza"],"pdf_url":"https://arxiv.org/pdf/2407.19889v1.pdf","comment":"This article is under revision"},{"id":"http://arxiv.org/abs/2407.19888v1","updated":"2024-07-29T11:09:10Z","published":"2024-07-29T11:09:10Z","title":"Yucca: A Deep Learning Framework For Medical Image Analysis","summary":" Medical image analysis using deep learning frameworks has advanced healthcare\nby automating complex tasks, but many existing frameworks lack flexibility,\nmodularity, and user-friendliness. To address these challenges, we introduce\nYucca, an open-source AI framework available at\nhttps://github.com/Sllambias/yucca, designed specifically for medical imaging\napplications and built on PyTorch and PyTorch Lightning. Yucca features a\nthree-tiered architecture: Functional, Modules, and Pipeline, providing a\ncomprehensive and customizable solution. 
Evaluated across diverse tasks such as\ncerebral microbleeds detection, white matter hyperintensity segmentation, and\nhippocampus segmentation, Yucca achieves state-of-the-art results,\ndemonstrating its robustness and versatility. Yucca offers a powerful,\nflexible, and user-friendly platform for medical image analysis, inviting\ncommunity contributions to advance its capabilities and impact.\n","authors":["Sebastian Nørgaard Llambias","Julia Machnio","Asbjørn Munk","Jakob Ambsdorf","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2407.19888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19877v1","updated":"2024-07-29T10:55:17Z","published":"2024-07-29T10:55:17Z","title":"Language-driven Grasp Detection with Mask-guided Attention","summary":" Grasp detection is an essential task in robotics with various industrial\napplications. However, traditional methods often struggle with occlusions and\ndo not utilize language for grasping. Incorporating natural language into grasp\ndetection remains a challenging task and largely unexplored. To address this\ngap, we propose a new method for language-driven grasp detection with\nmask-guided attention by utilizing the transformer attention mechanism with\nsemantic segmentation features. Our approach integrates visual data,\nsegmentation mask features, and natural language instructions, significantly\nimproving grasp detection accuracy. Our work introduces a new framework for\nlanguage-driven grasp detection, paving the way for language-driven robotic\napplications. Intensive experiments show that our method outperforms other\nrecent baselines by a clear margin, with a 10.0% success score improvement. We\nfurther validate our method in real-world robotic experiments, confirming the\neffectiveness of our approach.\n","authors":["Tuan Van Vo","Minh Nhat Vu","Baoru Huang","An Vuong","Ngan Le","Thieu Vo","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.19877v1.pdf","comment":"Accepted at IROS 2024"},{"id":"http://arxiv.org/abs/2407.19875v1","updated":"2024-07-29T10:51:31Z","published":"2024-07-29T10:51:31Z","title":"Exploring Robust Face-Voice Matching in Multilingual Environments","summary":" This paper presents Team Xaiofei's innovative approach to exploring\nFace-Voice Association in Multilingual Environments (FAME) at ACM Multimedia\n2024. We focus on the impact of different languages in face-voice matching by\nbuilding upon Fusion and Orthogonal Projection (FOP), introducing four key\ncomponents: a dual-branch structure, dynamic sample pair weighting, robust data\naugmentation, and score polarization strategy. Our dual-branch structure serves\nas an auxiliary mechanism to better integrate and provide more comprehensive\ninformation. We also introduce a dynamic weighting mechanism for various sample\npairs to optimize learning. Data augmentation techniques are employed to\nenhance the model's generalization across diverse conditions. Additionally,\nscore polarization strategy based on age and gender matching confidence\nclarifies and accentuates the final results. 
Our methods demonstrate\nsignificant effectiveness, achieving an equal error rate (EER) of 20.07 on the\nV2-EH dataset and 21.76 on the V1-EU dataset.\n","authors":["Jiehui Tang","Xiaofei Wang","Zhen Xiao","Jiayi Liu","Xueliang Liu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2407.19875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02136v2","updated":"2024-07-29T10:43:09Z","published":"2024-03-04T15:46:50Z","title":"Point2Building: Reconstructing Buildings from Airborne LiDAR Point\n Clouds","summary":" We present a learning-based approach to reconstruct buildings as 3D polygonal\nmeshes from airborne LiDAR point clouds. What makes 3D building reconstruction\nfrom airborne LiDAR hard is the large diversity of building designs and\nespecially roof shapes, the low and varying point density across the scene, and\nthe often incomplete coverage of building facades due to occlusions by\nvegetation or to the viewing angle of the sensor. To cope with the diversity of\nshapes and inhomogeneous and incomplete object coverage, we introduce a\ngenerative model that directly predicts 3D polygonal meshes from input point\nclouds. Our autoregressive model, called Point2Building, iteratively builds up\nthe mesh by generating sequences of vertices and faces. This approach enables\nour model to adapt flexibly to diverse geometries and building structures.\nUnlike many existing methods that rely heavily on pre-processing steps like\nexhaustive plane detection, our model learns directly from the point cloud\ndata, thereby reducing error propagation and increasing the fidelity of the\nreconstruction. We experimentally validate our method on a collection of\nairborne LiDAR data of Zurich, Berlin and Tallinn. Our method shows good\ngeneralization to diverse urban styles.\n","authors":["Yujia Liu","Anton Obukhov","Jan Dirk Wegner","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2403.02136v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02771v2","updated":"2024-07-29T10:35:50Z","published":"2024-05-04T23:16:48Z","title":"MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial\n Representation Learning","summary":" The volume of unlabelled Earth observation (EO) data is huge, but many\nimportant applications lack labelled training data. However, EO data offers the\nunique opportunity to pair data from different modalities and sensors\nautomatically based on geographic location and time, at virtually no human\nlabor cost. We seize this opportunity to create MMEarth, a diverse multi-modal\npretraining dataset at global scale. Using this new corpus of 1.2 million\nlocations, we propose a Multi-Pretext Masked Autoencoder (MP-MAE) approach to\nlearn general-purpose representations for optical satellite images. Our\napproach builds on the ConvNeXt V2 architecture, a fully convolutional masked\nautoencoder (MAE). Drawing upon a suite of multi-modal pretext tasks, we\ndemonstrate that our MP-MAE approach outperforms both MAEs pretrained on\nImageNet and MAEs pretrained on domain-specific satellite images. This is shown\non several downstream tasks including image classification and semantic\nsegmentation. We find that pretraining with multi-modal pretext tasks notably\nimproves the linear probing performance compared to pretraining on optical\nsatellite images only. 
This also leads to better label efficiency and parameter\nefficiency which are crucial aspects in global scale applications.\n","authors":["Vishal Nedungadi","Ankit Kariryaa","Stefan Oehmcke","Serge Belongie","Christian Igel","Nico Lang"],"pdf_url":"https://arxiv.org/pdf/2405.02771v2.pdf","comment":"Accepted for ECCV 2024. Data and code:\n https://vishalned.github.io/mmearth Update arXiv v2 (ECCV): 1. Dataset fix:\n Removed duplicates and corrected ERA5 yearly statistics. 2. Data augmentation\n fix: Random crops are now aligned. 3. Test metrics fix: Metrics are now\n overall instead of mini-batch averages, matching GEO-Bench metrics. 4.\n Pretrained on MMEarth v001 & evaluated on GEO-Bench v1.0"},{"id":"http://arxiv.org/abs/2405.14841v2","updated":"2024-07-29T10:35:35Z","published":"2024-05-23T17:55:11Z","title":"MOD-UV: Learning Mobile Object Detectors from Unlabeled Videos","summary":" Embodied agents must detect and localize objects of interest, e.g. traffic\nparticipants for self-driving cars. Supervision in the form of bounding boxes\nfor this task is extremely expensive. As such, prior work has looked at\nunsupervised instance detection and segmentation, but in the absence of\nannotated boxes, it is unclear how pixels must be grouped into objects and\nwhich objects are of interest. This results in over-/under-segmentation and\nirrelevant objects. Inspired by human visual system and practical applications,\nwe posit that the key missing cue for unsupervised detection is motion: objects\nof interest are typically mobile objects that frequently move and their motions\ncan specify separate instances. In this paper, we propose MOD-UV, a Mobile\nObject Detector learned from Unlabeled Videos only. We begin with instance\npseudo-labels derived from motion segmentation, but introduce a novel training\nparadigm to progressively discover small objects and static-but-mobile objects\nthat are missed by motion segmentation. As a result, though only learned from\nunlabeled videos, MOD-UV can detect and segment mobile objects from a single\nstatic image. Empirically, we achieve state-of-the-art performance in\nunsupervised mobile object detection on Waymo Open, nuScenes, and KITTI\nDatasets without using any external data or supervised models. Code is\navailable at https://github.com/YihongSun/MOD-UV.\n","authors":["Yihong Sun","Bharath Hariharan"],"pdf_url":"https://arxiv.org/pdf/2405.14841v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2309.16483v3","updated":"2024-07-29T10:08:09Z","published":"2023-09-28T14:45:54Z","title":"Rethinking Domain Generalization: Discriminability and Generalizability","summary":" Domain generalization(DG) endeavors to develop robust models that possess\nstrong generalizability while preserving excellent discriminability.\nNonetheless, pivotal DG techniques tend to improve the feature generalizability\nby learning domain-invariant representations, inadvertently overlooking the\nfeature discriminability. On the one hand, the simultaneous attainment of\ngeneralizability and discriminability of features presents a complex challenge,\noften entailing inherent contradictions. This challenge becomes particularly\npronounced when domain-invariant features manifest reduced discriminability\nowing to the inclusion of unstable factors, i.e., spurious correlations. 
On the\nother hand, prevailing domain-invariant methods can be categorized as\ncategory-level alignment, susceptible to discarding indispensable features\npossessing substantial generalizability and narrowing intra-class variations.\nTo surmount these obstacles, we rethink DG from a new perspective that\nconcurrently imbues features with formidable discriminability and robust\ngeneralizability, and present a novel framework, namely, Discriminative\nMicroscopic Distribution Alignment~(DMDA). DMDA incorporates two core\ncomponents: Selective Channel Pruning~(SCP) and Micro-level Distribution\nAlignment~(MDA). Concretely, SCP attempts to curtail redundancy within neural\nnetworks, prioritizing stable attributes conducive to accurate classification.\nThis approach alleviates the adverse effect of spurious domain invariance and\namplifies the feature discriminability. Besides, MDA accentuates micro-level\nalignment within each class, going beyond mere category-level alignment.\nExtensive experiments on four benchmark datasets corroborate that DMDA achieves\ncomparable results to state-of-the-art methods in DG, underscoring the efficacy\nof our method.\n","authors":["Shaocong Long","Qianyu Zhou","Chenhao Ying","Lizhuang Ma","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2309.16483v3.pdf","comment":"Accepted to IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2407.19849v1","updated":"2024-07-29T09:59:33Z","published":"2024-07-29T09:59:33Z","title":"Normality Addition via Normality Detection in Industrial Image Anomaly\n Detection Models","summary":" The task of image anomaly detection (IAD) aims to identify deviations from\nnormality in image data. These anomalies are patterns that deviate\nsignificantly from what the IAD model has learned from the data during\ntraining. However, in real-world scenarios, the criteria for what constitutes\nnormality often change, necessitating the reclassification of previously\nanomalous instances as normal. To address this challenge, we propose a new\nscenario termed \"normality addition,\" involving the post-training adjustment of\ndecision boundaries to incorporate new normalities. To address this challenge,\nwe propose a method called Normality Addition via Normality Detection (NAND),\nleveraging a vision-language model. NAND performs normality detection which\ndetect patterns related to the intended normality within images based on\ntextual descriptions. We then modify the results of a pre-trained IAD model to\nimplement this normality addition. Using the benchmark dataset in IAD, MVTec\nAD, we establish an evaluation protocol for the normality addition task and\nempirically demonstrate the effectiveness of the NAND method.\n","authors":["Jihun Yi","Dahuin Jung","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.19849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11789v2","updated":"2024-07-29T09:51:35Z","published":"2024-02-19T02:32:45Z","title":"Statistical Test on Diffusion Model-based Generated Images by Selective\n Inference","summary":" AI technology for generating images, such as diffusion models, has advanced\nrapidly. However, there is no established framework for quantifying the\nreliability of AI-generated images, which hinders their use in critical\ndecision-making tasks, such as medical image diagnosis. 
In this study, we\npropose a method to quantify the reliability of decision-making tasks that rely\non images produced by diffusion models within a statistical testing framework.\nThe core concept of our statistical test involves using a selective inference\nframework, in which the statistical test is conducted under the condition that\nthe images are produced by a trained diffusion model. As a case study, we study\na diffusion model-based anomaly detection task for medical images. With our\napproach, the statistical significance of medical image diagnostic outcomes can\nbe quantified in terms of a p-value, enabling decision-making with a controlled\nerror rate. We demonstrate the theoretical soundness and practical\neffectiveness of our statistical test through numerical experiments on both\nsynthetic and brain image datasets.\n","authors":["Teruyuki Katsuoka","Tomohiro Shiraishi","Daiki Miwa","Vo Nguyen Le Duy","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2402.11789v2.pdf","comment":"31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.19837v1","updated":"2024-07-29T09:46:39Z","published":"2024-07-29T09:46:39Z","title":"VortSDF: 3D Modeling with Centroidal Voronoi Tesselation on Signed\n Distance Field","summary":" Volumetric shape representations have become ubiquitous in multi-view\nreconstruction tasks. They often build on regular voxel grids as discrete\nrepresentations of 3D shape functions, such as SDF or radiance fields, either\nas the full shape model or as sampled instantiations of continuous\nrepresentations, as with neural networks. Despite their proven efficiency,\nvoxel representations come with the precision versus complexity trade-off. This\ninherent limitation can significantly impact performance when moving away from\nsimple and uncluttered scenes. In this paper we investigate an alternative\ndiscretization strategy with the Centroidal Voronoi Tesselation (CVT). CVTs\nallow to better partition the observation space with respect to shape occupancy\nand to focus the discretization around shape surfaces. To leverage this\ndiscretization strategy for multi-view reconstruction, we introduce a\nvolumetric optimization framework that combines explicit SDF fields with a\nshallow color network, in order to estimate 3D shape properties over\ntetrahedral grids. Experimental results with Chamfer statistics validate this\napproach with unprecedented reconstruction quality on various scenarios such as\nobjects, open scenes or human.\n","authors":["Diego Thomas","Briac Toussaint","Jean-Sebastien Franco","Edmond Boyer"],"pdf_url":"https://arxiv.org/pdf/2407.19837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06825v2","updated":"2024-07-29T09:40:11Z","published":"2024-01-12T01:24:04Z","title":"Multi-Memory Matching for Unsupervised Visible-Infrared Person\n Re-Identification","summary":" Unsupervised visible-infrared person re-identification (USL-VI-ReID) is a\npromising yet challenging retrieval task. The key challenges in USL-VI-ReID are\nto effectively generate pseudo-labels and establish pseudo-label\ncorrespondences across modalities without relying on any prior annotations.\nRecently, clustered pseudo-label methods have gained more attention in\nUSL-VI-ReID. However, previous methods fell short of fully exploiting the\nindividual nuances, as they simply utilized a single memory that represented an\nidentity to establish cross-modality correspondences, resulting in ambiguous\ncross-modality correspondences. 
To address the problem, we propose a\nMulti-Memory Matching (MMM) framework for USL-VI-ReID. We first design a\nCross-Modality Clustering (CMC) module to generate the pseudo-labels through\nclustering together both two modality samples. To associate cross-modality\nclustered pseudo-labels, we design a Multi-Memory Learning and Matching (MMLM)\nmodule, ensuring that optimization explicitly focuses on the nuances of\nindividual perspectives and establishes reliable cross-modality\ncorrespondences. Finally, we design a Soft Cluster-level Alignment (SCA) module\nto narrow the modality gap while mitigating the effect of noise pseudo-labels\nthrough a soft many-to-many alignment strategy. Extensive experiments on the\npublic SYSU-MM01 and RegDB datasets demonstrate the reliability of the\nestablished cross-modality correspondences and the effectiveness of our MMM.\nThe source codes will be released.\n","authors":["Jiangming Shi","Xiangbo Yin","Yeyun Chen","Yachao Zhang","Zhizhong Zhang","Yuan Xie","Yanyun Qu"],"pdf_url":"https://arxiv.org/pdf/2401.06825v2.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2407.19832v1","updated":"2024-07-29T09:38:15Z","published":"2024-07-29T09:38:15Z","title":"ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2","summary":" Multimodal Large Language Models (MLLMs) have attracted much attention due to\ntheir multifunctionality. However, traditional Transformer architectures incur\nsignificant overhead due to their secondary computational complexity. To\naddress this issue, we introduce ML-Mamba, a multimodal language model that\nutilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known\nfor its linear extension and fast processing of long sequences. We replace the\nTransformer based backbone with a pre-trained Mamba-2 model and explore methods\nfor integrating 2D visual selective scanning mechanisms into multimodal\nlearning. We also try various visual encoders and Mamba-2 model variants. Our\nextensive experiments conducted in various multimodal benchmark tests have\ndemonstrated the competitive performance of ML-Mamba and highlighted the\npotential of state space models in multimodal tasks. The experimental results\nshow that: (1) ML-Mamba achieves performance comparable to state-of-the-art\nmethods such as TinyLaVA and MobileVLM v2 through its linear sequential\nmodeling, while also having faster inference speed; (2) ML-Mamba performs well\nin visual hallucinations and spatial relationship judgment in closed set\nbenchmark tests; (3) ML-Mamba achieves performance comparable to LLaVA while\nreducing the number of parameters by 40\\%.(4) Compared to the multimodal model\nusing the original Mamba model, the Mamba-2 based large-scale multimodal\nlanguage model has stronger inference performance and effectiveness.\n","authors":["Wenjun Huang","Jianguo Hu"],"pdf_url":"https://arxiv.org/pdf/2407.19832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12826v2","updated":"2024-07-29T09:33:19Z","published":"2023-12-20T08:05:57Z","title":"JoReS-Diff: Joint Retinex and Semantic Priors in Diffusion Model for\n Low-light Image Enhancement","summary":" Low-light image enhancement (LLIE) has achieved promising performance by\nemploying conditional diffusion models. Despite the success of some conditional\nmethods, previous methods may neglect the importance of a sufficient\nformulation of task-specific condition strategy, resulting in suboptimal visual\noutcomes. 
In this study, we propose JoReS-Diff, a novel approach that\nincorporates Retinex- and semantic-based priors as the additional\npre-processing condition to regulate the generating capabilities of the\ndiffusion model. We first leverage pre-trained decomposition network to\ngenerate the Retinex prior, which is updated with better quality by an\nadjustment network and integrated into a refinement network to implement\nRetinex-based conditional generation at both feature- and image-levels.\nMoreover, the semantic prior is extracted from the input image with an\noff-the-shelf semantic segmentation model and incorporated through semantic\nattention layers. By treating Retinex- and semantic-based priors as the\ncondition, JoReS-Diff presents a unique perspective for establishing an\ndiffusion model for LLIE and similar image enhancement tasks. Extensive\nexperiments validate the rationality and superiority of our approach.\n","authors":["Yuhui Wu","Guoqing Wang","Zhiwen Wang","Yang Yang","Tianyu Li","Malu Zhang","Chongyi Li","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2312.12826v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2403.05139v3","updated":"2024-07-29T09:15:33Z","published":"2024-03-08T08:12:18Z","title":"Improving Diffusion Models for Authentic Virtual Try-on in the Wild","summary":" This paper considers image-based virtual try-on, which renders an image of a\nperson wearing a curated garment, given a pair of images depicting the person\nand the garment, respectively. Previous works adapt existing exemplar-based\ninpainting diffusion models for virtual try-on to improve the naturalness of\nthe generated visuals compared to other methods (e.g., GAN-based), but they\nfail to preserve the identity of the garments. To overcome this limitation, we\npropose a novel diffusion model that improves garment fidelity and generates\nauthentic virtual try-on images. Our method, coined IDM-VTON, uses two\ndifferent modules to encode the semantics of garment image; given the base UNet\nof the diffusion model, 1) the high-level semantics extracted from a visual\nencoder are fused to the cross-attention layer, and then 2) the low-level\nfeatures extracted from parallel UNet are fused to the self-attention layer. In\naddition, we provide detailed textual prompts for both garment and person\nimages to enhance the authenticity of the generated visuals. Finally, we\npresent a customization method using a pair of person-garment images, which\nsignificantly improves fidelity and authenticity. Our experimental results show\nthat our method outperforms previous approaches (both diffusion-based and\nGAN-based) in preserving garment details and generating authentic virtual\ntry-on images, both qualitatively and quantitatively. 
Furthermore, the proposed\ncustomization method demonstrates its effectiveness in a real-world scenario.\nMore visualizations are available in our project page:\nhttps://idm-vton.github.io\n","authors":["Yisol Choi","Sangkyung Kwak","Kyungmin Lee","Hyungwon Choi","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2403.05139v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.19821v1","updated":"2024-07-29T09:14:21Z","published":"2024-07-29T09:14:21Z","title":"Distilling High Diagnostic Value Patches for Whole Slide Image\n Classification Using Attention Mechanism","summary":" Multiple Instance Learning (MIL) has garnered widespread attention in the\nfield of Whole Slide Image (WSI) classification as it replaces pixel-level\nmanual annotation with diagnostic reports as labels, significantly reducing\nlabor costs. Recent research has shown that bag-level MIL methods often yield\nbetter results because they can consider all patches of the WSI as a whole.\nHowever, a drawback of such methods is the incorporation of more redundant\npatches, leading to interference. To extract patches with high diagnostic value\nwhile excluding interfering patches to address this issue, we developed an\nattention-based feature distillation multi-instance learning (AFD-MIL)\napproach. This approach proposed the exclusion of redundant patches as a\npreprocessing operation in weakly supervised learning, directly mitigating\ninterference from extensive noise. It also pioneers the use of attention\nmechanisms to distill features with high diagnostic value, as opposed to the\ntraditional practice of indiscriminately and forcibly integrating all patches.\nAdditionally, we introduced global loss optimization to finely control the\nfeature distillation module. AFD-MIL is orthogonal to many existing MIL\nmethods, leading to consistent performance improvements. This approach has\nsurpassed the current state-of-the-art method, achieving 91.47% ACC (accuracy)\nand 94.29% AUC (area under the curve) on the Camelyon16 (Camelyon Challenge\n2016, breast cancer), while 93.33% ACC and 98.17% AUC on the TCGA-NSCLC (The\nCancer Genome Atlas Program: non-small cell lung cancer). Different feature\ndistillation methods were used for the two datasets, tailored to the specific\ndiseases, thereby improving performance and interpretability.\n","authors":["Tianhang Nan","Hao Quan","Yong Ding","Xingyu Li","Kai Yang","Xiaoyu Cui"],"pdf_url":"https://arxiv.org/pdf/2407.19821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09174v3","updated":"2024-07-29T09:14:07Z","published":"2024-07-12T11:16:44Z","title":"DART: An Automated End-to-End Object Detection Pipeline with Data\n Diversification, Open-Vocabulary Bounding Box Annotation, Pseudo-Label\n Review, and Model Training","summary":" Accurate real-time object detection is vital across numerous industrial\napplications, from safety monitoring to quality control. Traditional\napproaches, however, are hindered by arduous manual annotation and data\ncollection, struggling to adapt to ever-changing environments and novel target\nobjects. To address these limitations, this paper presents DART, an innovative\nautomated end-to-end pipeline that revolutionizes object detection workflows\nfrom data collection to model evaluation. It eliminates the need for laborious\nhuman labeling and extensive data collection while achieving outstanding\naccuracy across diverse scenarios. 
DART encompasses four key stages: (1) Data\nDiversification using subject-driven image generation (DreamBooth with SDXL),\n(2) Annotation via open-vocabulary object detection (Grounding DINO) to\ngenerate bounding box and class labels, (3) Review of generated images and\npseudo-labels by large multimodal models (InternVL-1.5 and GPT-4o) to guarantee\ncredibility, and (4) Training of real-time object detectors (YOLOv8 and\nYOLOv10) using the verified data. We apply DART to a self-collected dataset of\nconstruction machines named Liebherr Product, which contains over 15K\nhigh-quality images across 23 categories. The current instantiation of DART\nsignificantly increases average precision (AP) from 0.064 to 0.832. Its modular\ndesign ensures easy exchangeability and extensibility, allowing for future\nalgorithm upgrades, seamless integration of new object categories, and\nadaptability to customized environments without manual labeling and additional\ndata collection. The code and dataset are released at\nhttps://github.com/chen-xin-94/DART.\n","authors":["Chen Xin","Andreas Hartel","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2407.09174v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19820v1","updated":"2024-07-29T09:14:02Z","published":"2024-07-29T09:14:02Z","title":"ActivityCLIP: Enhancing Group Activity Recognition by Mining\n Complementary Information from Text to Supplement Image Modality","summary":" Previous methods usually only extract the image modality's information to\nrecognize group activity. However, mining image information is approaching\nsaturation, making it difficult to extract richer information. Therefore,\nextracting complementary information from other modalities to supplement image\ninformation has become increasingly important. In fact, action labels provide\nclear text information to express the action's semantics, which existing\nmethods often overlook. Thus, we propose ActivityCLIP, a plug-and-play method\nfor mining the text information contained in the action labels to supplement\nthe image information for enhancing group activity recognition. ActivityCLIP\nconsists of text and image branches, where the text branch is plugged into the\nimage branch (The off-the-shelf image-based method). The text branch includes\nImage2Text and relation modeling modules. Specifically, we propose the\nknowledge transfer module, Image2Text, which adapts image information into text\ninformation extracted by CLIP via knowledge distillation. Further, to keep our\nmethod convenient, we add fewer trainable parameters based on the relation\nmodule of the image branch to model interaction relation in the text branch. To\nshow our method's generality, we replicate three representative methods by\nActivityCLIP, which adds only limited trainable parameters, achieving favorable\nperformance improvements for each method. We also conduct extensive ablation\nstudies and compare our method with state-of-the-art methods to demonstrate the\neffectiveness of ActivityCLIP.\n","authors":["Guoliang Xu","Jianqin Yin","Feng Zhou","Yonghao Dang"],"pdf_url":"https://arxiv.org/pdf/2407.19820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18792v2","updated":"2024-07-29T09:05:17Z","published":"2024-07-26T14:54:16Z","title":"Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical\n Imaging","summary":" Medical imaging cohorts are often confounded by factors such as acquisition\ndevices, hospital sites, patient backgrounds, and many more. 
As a result, deep\nlearning models tend to learn spurious correlations instead of causally related\nfeatures, limiting their generalizability to new and unseen data. This problem\ncan be addressed by minimizing dependence measures between intermediate\nrepresentations of task-related and non-task-related variables. These measures\ninclude mutual information, distance correlation, and the performance of\nadversarial classifiers. Here, we benchmark such dependence measures for the\ntask of preventing shortcut learning. We study a simplified setting using\nMorpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our\nresults provide insights into how to mitigate confounding factors in medical\nimaging.\n","authors":["Sarah Müller","Louisa Fay","Lisa M. Koch","Sergios Gatidis","Thomas Küstner","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2407.18792v2.pdf","comment":"Accepted to the 15th International Workshop on Machine Learning in\n Medical Imaging (MLMI 2024); new version: appendix moved to the end, after\n the references"},{"id":"http://arxiv.org/abs/2407.19812v1","updated":"2024-07-29T09:05:04Z","published":"2024-07-29T09:05:04Z","title":"Image-text matching for large-scale book collections","summary":" We address the problem of detecting and mapping all books in a collection of\nimages to entries in a given book catalogue. Instead of performing independent\nretrieval for each book detected, we treat the image-text mapping problem as a\nmany-to-many matching process, looking for the best overall match between the\ntwo sets. We combine a state-of-the-art segmentation method (SAM) to detect\nbook spines and extract book information using a commercial OCR. We then\npropose a two-stage approach for text-image matching, where CLIP embeddings are\nused first for fast matching, followed by a second slower stage to refine the\nmatching, employing either the Hungarian Algorithm or a BERT-based model\ntrained to cope with noisy OCR input and partial text matches. To evaluate our\napproach, we publish a new dataset of annotated bookshelf images that covers\nthe whole book collection of a public library in Spain. In addition, we provide\ntwo target lists of book metadata, a closed-set of 15k book titles that\ncorresponds to the known library inventory, and an open-set of 2.3M book titles\nto simulate an open-world scenario. We report results on two settings, on one\nhand on a matching-only task, where the book segments and OCR is given and the\nobjective is to perform many-to-many matching against the target lists, and a\ncombined detection and matching task, where books must be first detected and\nrecognised before they are matched to the target list entries. We show that\nboth the Hungarian Matching and the proposed BERT-based model outperform a\nfuzzy string matching baseline, and we highlight inherent limitations of the\nmatching algorithms as the target increases in size, and when either of the two\nsets (detected books or target book list) is incomplete. 
The dataset and code\nare available at https://github.com/llabres/library-dataset\n","authors":["Artemis Llabrés","Arka Ujjal Dey","Dimosthenis Karatzas","Ernest Valveny"],"pdf_url":"https://arxiv.org/pdf/2407.19812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19811v1","updated":"2024-07-29T09:04:11Z","published":"2024-07-29T09:04:11Z","title":"Synthetic Thermal and RGB Videos for Automatic Pain Assessment utilizing\n a Vision-MLP Architecture","summary":" Pain assessment is essential in developing optimal pain management protocols\nto alleviate suffering and prevent functional decline in patients.\nConsequently, reliable and accurate automatic pain assessment systems are\nessential for continuous and effective patient monitoring. This study presents\nsynthetic thermal videos generated by Generative Adversarial Networks\nintegrated into the pain recognition pipeline and evaluates their efficacy. A\nframework consisting of a Vision-MLP and a Transformer-based module is\nutilized, employing RGB and synthetic thermal videos in unimodal and multimodal\nsettings. Experiments conducted on facial videos from the BioVid database\ndemonstrate the effectiveness of synthetic thermal videos and underline the\npotential advantages of it.\n","authors":["Stefanos Gkikas","Manolis Tsiknakis"],"pdf_url":"https://arxiv.org/pdf/2407.19811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19809v1","updated":"2024-07-29T09:02:43Z","published":"2024-07-29T09:02:43Z","title":"Twins-PainViT: Towards a Modality-Agnostic Vision Transformer Framework\n for Multimodal Automatic Pain Assessment using Facial Videos and fNIRS","summary":" Automatic pain assessment plays a critical role for advancing healthcare and\noptimizing pain management strategies. This study has been submitted to the\nFirst Multimodal Sensing Grand Challenge for Next-Gen Pain Assessment\n(AI4PAIN). The proposed multimodal framework utilizes facial videos and fNIRS\nand presents a modality-agnostic approach, alleviating the need for\ndomain-specific models. Employing a dual ViT configuration and adopting\nwaveform representations for the fNIRS, as well as for the extracted embeddings\nfrom the two modalities, demonstrate the efficacy of the proposed method,\nachieving an accuracy of 46.76% in the multilevel pain assessment task.\n","authors":["Stefanos Gkikas","Manolis Tsiknakis"],"pdf_url":"https://arxiv.org/pdf/2407.19809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11590v2","updated":"2024-07-29T09:02:11Z","published":"2024-07-16T10:50:10Z","title":"Rethinking Learned Image Compression: Context is All You Need","summary":" Since LIC has made rapid progress recently compared to traditional methods,\nthis paper attempts to discuss the question about 'Where is the boundary of\nLearned Image Compression(LIC)?' with regard to subjective matrics. Thus this\npaper splits the above problem into two sub-problems:1)Where is the boundary of\nrate-distortion performance of PSNR? 2)How to further improve the compression\ngain and achieve the boundary? Therefore this paper analyzes the effectiveness\nof scaling parameters for encoder, decoder and context model, which are the\nthree components of LIC. Then we conclude that scaling for LIC is to scale for\ncontext model and decoder within LIC. Extensive experiments demonstrate that\noverfitting can actually serve as an effective context. 
By optimizing the\ncontext, this paper further improves PSNR and achieves state-of-the-art\nperformance, showing a performance gain of 14.39% with BD-RATE over VVC.\n","authors":["Jixiang Luo"],"pdf_url":"https://arxiv.org/pdf/2407.11590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17803v2","updated":"2024-07-29T08:43:48Z","published":"2024-01-31T12:53:11Z","title":"SU-SAM: A Simple Unified Framework for Adapting Segment Anything Model\n in Underperformed Scenes","summary":" Segment anything model (SAM) has demonstrated excellent generalizability in\ncommon vision scenarios, yet falling short of the ability to understand\nspecialized data. Recently, several methods have combined parameter-efficient\ntechniques with task-specific designs to fine-tune SAM on particular tasks.\nHowever, these methods heavily rely on handcraft, complicated, and\ntask-specific designs, and pre/post-processing to achieve acceptable\nperformances on downstream tasks. As a result, this severely restricts\ngeneralizability to other downstream tasks. To address this issue, we present a\nsimple and unified framework, namely SU-SAM, that can easily and efficiently\nfine-tune the SAM model with parameter-efficient techniques while maintaining\nexcellent generalizability toward various downstream tasks. SU-SAM does not\nrequire any task-specific designs and aims to improve the adaptability of\nSAM-like models significantly toward underperformed scenes. Concretely, we\nabstract parameter-efficient modules of different methods into basic design\nelements in our framework. Besides, we propose four variants of SU-SAM, i.e.,\nseries, parallel, mixed, and LoRA structures. Comprehensive experiments on nine\ndatasets and six downstream tasks to verify the effectiveness of SU-SAM,\nincluding medical image segmentation, camouflage object detection, salient\nobject segmentation, surface defect segmentation, complex object shapes, and\nshadow masking. Our experimental results demonstrate that SU-SAM achieves\ncompetitive or superior accuracy compared to state-of-the-art methods.\nFurthermore, we provide in-depth analyses highlighting the effectiveness of\ndifferent parameter-efficient designs within SU-SAM. In addition, we propose a\ngeneralized model and benchmark, showcasing SU-SAM's generalizability across\nall diverse datasets simultaneously.\n","authors":["Yiran Song","Qianyu Zhou","Xuequan Lu","Zhiwen Shao","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.17803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19795v1","updated":"2024-07-29T08:38:46Z","published":"2024-07-29T08:38:46Z","title":"VolDoGer: LLM-assisted Datasets for Domain Generalization in\n Vision-Language Tasks","summary":" Domain generalizability is a crucial aspect of a deep learning model since it\ndetermines the capability of the model to perform well on data from unseen\ndomains. However, research on the domain generalizability of deep learning\nmodels for vision-language tasks remains limited, primarily because of the lack\nof required datasets. To address these challenges, we propose VolDoGer:\nVision-Language Dataset for Domain Generalization, a dedicated dataset designed\nfor domain generalization that addresses three vision-language tasks: image\ncaptioning, visual question answering, and visual entailment. We constructed\nVolDoGer by extending LLM-based data annotation techniques to vision-language\ntasks, thereby alleviating the burden of recruiting human annotators. 
We\nevaluated the domain generalizability of various models, ranging from\nfine-tuned models to a recent multimodal large language model, through\nVolDoGer.\n","authors":["Juhwan Choi","Junehyoung Kwon","JungMin Yun","Seunguk Yu","YoungBin Kim"],"pdf_url":"https://arxiv.org/pdf/2407.19795v1.pdf","comment":"31 pages, 5 figures, 20 tables"},{"id":"http://arxiv.org/abs/2407.19789v1","updated":"2024-07-29T08:33:32Z","published":"2024-07-29T08:33:32Z","title":"Interpreting Low-level Vision Models with Causal Effect Maps","summary":" Deep neural networks have significantly improved the performance of low-level\nvision tasks but also increased the difficulty of interpretability. A deep\nunderstanding of deep models is beneficial for both network design and\npractical reliability. To take up this challenge, we introduce causality theory\nto interpret low-level vision models and propose a model-/task-agnostic method\ncalled Causal Effect Map (CEM). With CEM, we can visualize and quantify the\ninput-output relationships on either positive or negative effects. After\nanalyzing various low-level vision tasks with CEM, we have reached several\ninteresting insights, such as: (1) Using more information of input images\n(e.g., larger receptive field) does NOT always yield positive outcomes. (2)\nAttempting to incorporate mechanisms with a global receptive field (e.g.,\nchannel attention) into image denoising may prove futile. (3) Integrating\nmultiple tasks to train a general model could encourage the network to\nprioritize local information over global context. Based on the causal effect\ntheory, the proposed diagnostic tool can refresh our common knowledge and bring\na deeper understanding of low-level vision models. Codes are available at\nhttps://github.com/J-FHu/CEM.\n","authors":["Jinfan Hu","Jinjin Gu","Shiyao Yu","Fanghua Yu","Zheyuan Li","Zhiyuan You","Chaochao Lu","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2407.19789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19787v1","updated":"2024-07-29T08:32:27Z","published":"2024-07-29T08:32:27Z","title":"SciPostLayout: A Dataset for Layout Analysis and Layout Generation of\n Scientific Posters","summary":" Scientific posters are used to present the contributions of scientific papers\neffectively in a graphical format. However, creating a well-designed poster\nthat efficiently summarizes the core of a paper is both labor-intensive and\ntime-consuming. A system that can automatically generate well-designed posters\nfrom scientific papers would reduce the workload of authors and help readers\nunderstand the outline of the paper visually. Despite the demand for poster\ngeneration systems, only a limited research has been conduced due to the lack\nof publicly available datasets. Thus, in this study, we built the SciPostLayout\ndataset, which consists of 7,855 scientific posters and manual layout\nannotations for layout analysis and generation. SciPostLayout also contains 100\nscientific papers paired with the posters. All of the posters and papers in our\ndataset are under the CC-BY license and are publicly available. As benchmark\ntests for the collected dataset, we conducted experiments for layout analysis\nand generation utilizing existing computer vision models and found that both\nlayout analysis and generation of posters using SciPostLayout are more\nchallenging than with scientific papers. 
We also conducted experiments on\ngenerating layouts from scientific papers to demonstrate the potential of\nutilizing LLM as a scientific poster generation system. The dataset is publicly\navailable at https://huggingface.co/datasets/omron-sinicx/scipostlayout_v2. The\ncode is also publicly available at\nhttps://github.com/omron-sinicx/scipostlayout.\n","authors":["Shohei Tanaka","Hao Wang","Yoshitaka Ushiku"],"pdf_url":"https://arxiv.org/pdf/2407.19787v1.pdf","comment":"Accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2407.19774v1","updated":"2024-07-29T08:17:05Z","published":"2024-07-29T08:17:05Z","title":"Garment Animation NeRF with Color Editing","summary":" Generating high-fidelity garment animations through traditional workflows,\nfrom modeling to rendering, is both tedious and expensive. These workflows\noften require repetitive steps in response to updates in character motion,\nrendering viewpoint changes, or appearance edits. Although recent neural\nrendering offers an efficient solution for computationally intensive processes,\nit struggles with rendering complex garment animations containing fine wrinkle\ndetails and realistic garment-and-body occlusions, while maintaining structural\nconsistency across frames and dense view rendering. In this paper, we propose a\nnovel approach to directly synthesize garment animations from body motion\nsequences without the need for an explicit garment proxy. Our approach infers\ngarment dynamic features from body motion, providing a preliminary overview of\ngarment structure. Simultaneously, we capture detailed features from\nsynthesized reference images of the garment's front and back, generated by a\npre-trained image model. These features are then used to construct a neural\nradiance field that renders the garment animation video. Additionally, our\ntechnique enables garment recoloring by decomposing its visual elements. We\ndemonstrate the generalizability of our method across unseen body motions and\ncamera views, ensuring detailed structural consistency. Furthermore, we\nshowcase its applicability to color editing on both real and synthetic garment\ndata. Compared to existing neural rendering techniques, our method exhibits\nqualitative and quantitative improvements in garment dynamics and wrinkle\ndetail modeling. Code is available at\n\\url{https://github.com/wrk226/GarmentAnimationNeRF}.\n","authors":["Renke Wang","Meng Zhang","Jun Li","Jian Yan"],"pdf_url":"https://arxiv.org/pdf/2407.19774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19773v1","updated":"2024-07-29T08:12:42Z","published":"2024-07-29T08:12:42Z","title":"Unmasking unlearnable models: a classification challenge for biomedical\n images without visible cues","summary":" Predicting traits from images lacking visual cues is challenging, as\nalgorithms are designed to capture visually correlated ground truth. This\nproblem is critical in biomedical sciences, and their solution can improve the\nefficacy of non-invasive methods. For example, a recent challenge of predicting\nMGMT methylation status from MRI images is critical for treatment decisions of\nglioma patients. Using less robust models poses a significant risk in these\ncritical scenarios and underscores the urgency of addressing this issue.\nDespite numerous efforts, contemporary models exhibit suboptimal performance,\nand underlying reasons for this limitation remain elusive. 
In this study, we\ndemystify the complexity of MGMT status prediction through a comprehensive\nexploration by performing benchmarks of existing models adjoining transfer\nlearning. Their architectures were further dissected by observing gradient flow\nacross layers. Additionally, a feature selection strategy was applied to\nimprove model interpretability. Our finding highlighted that current models are\nunlearnable and may require new architectures to explore applications in the\nreal world. We believe our study will draw immediate attention and catalyse\nadvancements in predictive modelling with non-visible cues.\n","authors":["Shivam Kumar","Samrat Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2407.19773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19768v1","updated":"2024-07-29T08:03:33Z","published":"2024-07-29T08:03:33Z","title":"Efficient Face Super-Resolution via Wavelet-based Feature Enhancement\n Network","summary":" Face super-resolution aims to reconstruct a high-resolution face image from a\nlow-resolution face image. Previous methods typically employ an encoder-decoder\nstructure to extract facial structural features, where the direct downsampling\ninevitably introduces distortions, especially to high-frequency features such\nas edges. To address this issue, we propose a wavelet-based feature enhancement\nnetwork, which mitigates feature distortion by losslessly decomposing the input\nfeature into high and low-frequency components using the wavelet transform and\nprocessing them separately. To improve the efficiency of facial feature\nextraction, a full domain Transformer is further proposed to enhance local,\nregional, and global facial features. Such designs allow our method to perform\nbetter without stacking many modules as previous methods did. Experiments show\nthat our method effectively balances performance, model size, and speed. Code\nlink: https://github.com/PRIS-CV/WFEN.\n","authors":["Wenjie Li","Heng Guo","Xuannan Liu","Kongming Liang","Jiani Hu","Zhanyu Ma","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2407.19768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.15525v3","updated":"2024-07-29T07:58:39Z","published":"2022-09-30T15:15:05Z","title":"Slimmable Networks for Contrastive Self-supervised Learning","summary":" Self-supervised learning makes significant progress in pre-training large\nmodels, but struggles with small models. Mainstream solutions to this problem\nrely mainly on knowledge distillation, which involves a two-stage procedure:\nfirst training a large teacher model and then distilling it to improve the\ngeneralization ability of smaller ones. In this work, we introduce another\none-stage solution to obtain pre-trained small models without the need for\nextra teachers, namely, slimmable networks for contrastive self-supervised\nlearning (SlimCLR). A slimmable network consists of a full network and several\nweight-sharing sub-networks, which can be pre-trained once to obtain various\nnetworks, including small ones with low computation costs. However,\ninterference between weight-sharing networks leads to severe performance\ndegradation in self-supervised cases, as evidenced by gradient magnitude\nimbalance and gradient direction divergence. The former indicates that a small\nproportion of parameters produce dominant gradients during backpropagation,\nwhile the main parameters may not be fully optimized. The latter shows that the\ngradient direction is disordered, and the optimization process is unstable. 
To\naddress these issues, we introduce three techniques to make the main parameters\nproduce dominant gradients and sub-networks have consistent outputs. These\ntechniques include slow start training of sub-networks, online distillation,\nand loss re-weighting according to model sizes. Furthermore, theoretical\nresults are presented to demonstrate that a single slimmable linear layer is\nsub-optimal during linear evaluation. Thus a switchable linear probe layer is\napplied during linear evaluation. We instantiate SlimCLR with typical\ncontrastive learning frameworks and achieve better performance than previous\narts with fewer parameters and FLOPs. The code is at\nhttps://github.com/mzhaoshuai/SlimCLR.\n","authors":["Shuai Zhao","Linchao Zhu","Xiaohan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2209.15525v3.pdf","comment":"Accepted by IJCV, code is at https://github.com/mzhaoshuai/SlimCLR"},{"id":"http://arxiv.org/abs/2407.19763v1","updated":"2024-07-29T07:56:11Z","published":"2024-07-29T07:56:11Z","title":"TeleOR: Real-time Telemedicine System for Full-Scene Operating Room","summary":" The advent of telemedicine represents a transformative development in\nleveraging technology to extend the reach of specialized medical expertise to\nremote surgeries, a field where the immediacy of expert guidance is paramount.\nHowever, the intricate dynamics of Operating Room (OR) scene pose unique\nchallenges for telemedicine, particularly in achieving high-fidelity, real-time\nscene reconstruction and transmission amidst obstructions and bandwidth\nlimitations. This paper introduces TeleOR, a pioneering system designed to\naddress these challenges through real-time OR scene reconstruction for\nTele-intervention. TeleOR distinguishes itself with three innovative\napproaches: dynamic self-calibration, which leverages inherent scene features\nfor calibration without the need for preset markers, allowing for obstacle\navoidance and real-time camera adjustment; selective OR reconstruction,\nfocusing on dynamically changing scene segments to reduce reconstruction\ncomplexity; and viewport-adaptive transmission, optimizing data transmission\nbased on real-time client feedback to efficiently deliver high-quality 3D\nreconstructions within bandwidth constraints. Comprehensive experiments on the\n4D-OR surgical scene dataset demostrate the superiority and applicability of\nTeleOR, illuminating the potential to revolutionize tele-interventions by\novercoming the spatial and technical barriers inherent in remote surgical\nguidance.\n","authors":["Yixuan Wu","Kaiyuan Hu","Qian Shao","Jintai Chen","Danny Z. Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2407.19763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16600v2","updated":"2024-07-29T07:55:54Z","published":"2024-07-23T16:03:02Z","title":"DHGS: Decoupled Hybrid Gaussian Splatting for Driving Scene","summary":" Existing Gaussian splatting methods often fall short in achieving\nsatisfactory novel view synthesis in driving scenes, primarily due to the\nabsence of crafty design and geometric constraints for the involved elements.\nThis paper introduces a novel neural rendering method termed Decoupled Hybrid\nGaussian Splatting (DHGS), targeting at promoting the rendering quality of\nnovel view synthesis for static driving scenes. 
The novelty of this work lies\nin the decoupled and hybrid pixel-level blender for road and non-road layers,\nwithout the conventional unified differentiable rendering logic for the entire\nscene, while still maintaining consistent and continuous superimposition\nthrough the proposed depth-ordered hybrid rendering strategy. Additionally, an\nimplicit road representation comprised of a Signed Distance Field (SDF) is\ntrained to supervise the road surface with subtle geometric attributes.\nAccompanied by the use of auxiliary transmittance loss and consistency loss,\nnovel images with imperceptible boundary and elevated fidelity are ultimately\nobtained. Substantial experiments on the Waymo dataset prove that DHGS\noutperforms the state-of-the-art methods. The project page where more video\nevidences are given is: https://ironbrotherstyle.github.io/dhgs_web.\n","authors":["Xi Shi","Lingli Chen","Peng Wei","Xi Wu","Tian Jiang","Yonggang Luo","Lecheng Xie"],"pdf_url":"https://arxiv.org/pdf/2407.16600v2.pdf","comment":"13 pages, 14 figures, conference"},{"id":"http://arxiv.org/abs/2404.02877v3","updated":"2024-07-29T07:35:27Z","published":"2024-04-03T17:24:27Z","title":"FlightScope: A Deep Comprehensive Review of Aircraft Detection\n Algorithms in Satellite Imagery","summary":" Object detection in remotely sensed satellite pictures is fundamental in many\nfields such as biophysical, and environmental monitoring. While deep learning\nalgorithms are constantly evolving, they have been mostly implemented and\ntested on popular ground-based taken photos. This paper critically evaluates\nand compares a suite of advanced object detection algorithms customized for the\ntask of identifying aircraft within satellite imagery. Using the large\nHRPlanesV2 dataset, together with a rigorous validation with the GDIT dataset,\nthis research encompasses an array of methodologies including YOLO versions 5\nand 8, Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from\nscratch. This exhaustive training and validation study reveal YOLOv5 as the\npreeminent model for the specific case of identifying airplanes from remote\nsensing data, showcasing high precision and adaptability across diverse imaging\nconditions. This research highlight the nuanced performance landscapes of these\nalgorithms, with YOLOv5 emerging as a robust solution for aerial object\ndetection, underlining its importance through superior mean average precision,\nRecall, and Intersection over Union scores. The findings described here\nunderscore the fundamental role of algorithm selection aligned with the\nspecific demands of satellite imagery analysis and extend a comprehensive\nframework to evaluate model efficacy. 
The benchmark toolkit and codes,\navailable via https://github.com/toelt-llc/FlightScope_Bench, aims to further\nexploration and innovation in the realm of remote sensing object detection,\npaving the way for improved analytical methodologies in satellite imagery\napplications.\n","authors":["Safouane El Ghazouali","Arnaud Gucciardi","Francesca Venturini","Nicola Venturi","Michael Rueegsegger","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2404.02877v3.pdf","comment":"15 figures, 4 tables, comprehensive survey, comparative study"},{"id":"http://arxiv.org/abs/2407.19753v1","updated":"2024-07-29T07:35:06Z","published":"2024-07-29T07:35:06Z","title":"PredIN: Towards Open-Set Gesture Recognition via Prediction\n Inconsistency","summary":" Gesture recognition based on surface electromyography (sEMG) has achieved\nsignificant progress in human-machine interaction (HMI). However, accurately\nrecognizing predefined gestures within a closed set is still inadequate in\npractice; a robust open-set system needs to effectively reject unknown gestures\nwhile correctly classifying known ones. To handle this challenge, we first\nreport prediction inconsistency discovered for unknown classes due to ensemble\ndiversity, which can significantly facilitate the detection of unknown classes.\nBased on this insight, we propose an ensemble learning approach, PredIN, to\nexplicitly magnify the prediction inconsistency by enhancing ensemble\ndiversity. Specifically, PredIN maximizes the class feature distribution\ninconsistency among ensemble members to enhance diversity. Meanwhile, it\noptimizes inter-class separability within an individual ensemble member to\nmaintain individual performance. Comprehensive experiments on various benchmark\ndatasets demonstrate that the PredIN outperforms state-of-the-art methods by a\nclear margin.Our proposed method simultaneously achieves accurate closed-set\nclassification for predefined gestures and effective rejection for unknown\ngestures, exhibiting its efficacy and superiority in open-set gesture\nrecognition based on sEMG.\n","authors":["Chen Liu","Can Han","Chengfeng Zhou","Crystal Cai","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2407.19753v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.19752v1","updated":"2024-07-29T07:30:41Z","published":"2024-07-29T07:30:41Z","title":"Contextuality Helps Representation Learning for Generalized Category\n Discovery","summary":" This paper introduces a novel approach to Generalized Category Discovery\n(GCD) by leveraging the concept of contextuality to enhance the identification\nand classification of categories in unlabeled datasets. Drawing inspiration\nfrom human cognition's ability to recognize objects within their context, we\npropose a dual-context based method.\n Our model integrates two levels of contextuality: instance-level, where\nnearest-neighbor contexts are utilized for contrastive learning, and\ncluster-level, employing prototypical contrastive learning based on category\nprototypes. The integration of the contextual information effectively improves\nthe feature learning and thereby the classification accuracy of all categories,\nwhich better deals with the real-world datasets. Different from the traditional\nsemi-supervised and novel category discovery techniques, our model focuses on a\nmore realistic and challenging scenario where both known and novel categories\nare present in the unlabeled data. 
Extensive experimental results on several\nbenchmark data sets demonstrate that the proposed model outperforms the\nstate-of-the-art. Code is available at:\nhttps://github.com/Clarence-CV/Contexuality-GCD\n","authors":["Tingzhang Luo","Mingxuan Du","Jiatao Shi","Xinxiang Chen","Bingchen Zhao","Shaoguang Huang"],"pdf_url":"https://arxiv.org/pdf/2407.19752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10950v3","updated":"2024-07-29T07:25:37Z","published":"2022-12-21T11:43:20Z","title":"UNIKD: UNcertainty-filtered Incremental Knowledge Distillation for\n Neural Implicit Representation","summary":" Recent neural implicit representations (NIRs) have achieved great success in\nthe tasks of 3D reconstruction and novel view synthesis. However, they require\nthe images of a scene from different camera views to be available for one-time\ntraining. This is expensive especially for scenarios with large-scale scenes\nand limited data storage. In view of this, we explore the task of incremental\nlearning for NIRs in this work. We design a student-teacher framework to\nmitigate the catastrophic forgetting problem. Specifically, we iterate the\nprocess of using the student as the teacher at the end of each time step and\nlet the teacher guide the training of the student in the next step. As a\nresult, the student network is able to learn new information from the streaming\ndata and retain old knowledge from the teacher network simultaneously. Although\nintuitive, naively applying the student-teacher pipeline does not work well in\nour task. Not all information from the teacher network is helpful since it is\nonly trained with the old data. To alleviate this problem, we further introduce\na random inquirer and an uncertainty-based filter to filter useful information.\nOur proposed method is general and thus can be adapted to different implicit\nrepresentations such as neural radiance field (NeRF) and neural surface field.\nExtensive experimental results for both 3D reconstruction and novel view\nsynthesis demonstrate the effectiveness of our approach compared to different\nbaselines.\n","authors":["Mengqi Guo","Chen Li","Hanlin Chen","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2212.10950v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.19746v1","updated":"2024-07-29T07:18:11Z","published":"2024-07-29T07:18:11Z","title":"Octave-YOLO: Cross frequency detection network with octave convolution","summary":" Despite the rapid advancement of object detection algorithms, processing\nhigh-resolution images on embedded devices remains a significant challenge.\nTheoretically, the fully convolutional network architecture used in current\nreal-time object detectors can handle all input resolutions. However, the\nsubstantial computational demands required to process high-resolution images\nrender them impractical for real-time applications. To address this issue,\nreal-time object detection models typically downsample the input image for\ninference, leading to a loss of detail and decreased accuracy. In response, we\ndeveloped Octave-YOLO, designed to process high-resolution images in real-time\nwithin the constraints of embedded systems. We achieved this through the\nintroduction of the cross frequency partial network (CFPNet), which divides the\ninput feature map into low-resolution, low-frequency, and high-resolution,\nhigh-frequency sections. 
This configuration enables complex operations such as\nconvolution bottlenecks and self-attention to be conducted exclusively on\nlow-resolution feature maps while simultaneously preserving the details in\nhigh-resolution maps. Notably, this approach not only dramatically reduces the\ncomputational demands of convolution tasks but also allows for the integration\nof attention modules, which are typically challenging to implement in real-time\napplications, with minimal additional cost. Additionally, we have incorporated\ndepthwise separable convolution into the core building blocks and downsampling\nlayers to further decrease latency. Experimental results have shown that\nOctave-YOLO matches the performance of YOLOv8 while significantly reducing\ncomputational demands. For example, in 1080x1080 resolution, Octave-YOLO-N is\n1.56 times faster than YOLOv8, achieving nearly the same accuracy on the COCO\ndataset with approximately 40 percent fewer parameters and FLOPs.\n","authors":["Sangjune Shin","Dongkun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.19746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01807v2","updated":"2024-07-29T06:29:09Z","published":"2024-03-04T07:57:05Z","title":"ViewDiff: 3D-Consistent Image Generation with Text-to-Image Models","summary":" 3D asset generation is getting massive amounts of attention, inspired by the\nrecent success of text-guided 2D content creation. Existing text-to-3D methods\nuse pretrained text-to-image diffusion models in an optimization problem or\nfine-tune them on synthetic data, which often results in non-photorealistic 3D\nobjects without backgrounds. In this paper, we present a method that leverages\npretrained text-to-image models as a prior, and learn to generate multi-view\nimages in a single denoising process from real-world data. Concretely, we\npropose to integrate 3D volume-rendering and cross-frame-attention layers into\neach block of the existing U-Net network of the text-to-image model. Moreover,\nwe design an autoregressive generation that renders more 3D-consistent images\nat any viewpoint. We train our model on real-world datasets of objects and\nshowcase its capabilities to generate instances with a variety of high-quality\nshapes and textures in authentic surroundings. Compared to the existing\nmethods, the results generated by our method are consistent, and have favorable\nvisual quality (-30% FID, -37% KID).\n","authors":["Lukas Höllein","Aljaž Božič","Norman Müller","David Novotny","Hung-Yu Tseng","Christian Richardt","Michael Zollhöfer","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2403.01807v2.pdf","comment":"Accepted to CVPR 2024, project page:\n https://lukashoel.github.io/ViewDiff/, video:\n https://www.youtube.com/watch?v=SdjoCqHzMMk, code:\n https://github.com/facebookresearch/ViewDiff"},{"id":"http://arxiv.org/abs/2311.15308v2","updated":"2024-07-29T06:24:07Z","published":"2023-11-26T14:17:51Z","title":"AV-Deepfake1M: A Large-Scale LLM-Driven Audio-Visual Deepfake Dataset","summary":" The detection and localization of highly realistic deepfake audio-visual\ncontent are challenging even for the most advanced state-of-the-art methods.\nWhile most of the research efforts in this domain are focused on detecting\nhigh-quality deepfake images and videos, only a few works address the problem\nof the localization of small segments of audio-visual manipulations embedded in\nreal videos. In this research, we emulate the process of such content\ngeneration and propose the AV-Deepfake1M dataset. 
The dataset contains\ncontent-driven (i) video manipulations, (ii) audio manipulations, and (iii)\naudio-visual manipulations for more than 2K subjects resulting in a total of\nmore than 1M videos. The paper provides a thorough description of the proposed\ndata generation pipeline accompanied by a rigorous analysis of the quality of\nthe generated data. The comprehensive benchmark of the proposed dataset\nutilizing state-of-the-art deepfake detection and localization methods\nindicates a significant drop in performance compared to previous datasets. The\nproposed dataset will play a vital role in building the next-generation\ndeepfake localization methods. The dataset and associated code are available at\nhttps://github.com/ControlNet/AV-Deepfake1M .\n","authors":["Zhixi Cai","Shreya Ghosh","Aman Pankaj Adatia","Munawar Hayat","Abhinav Dhall","Tom Gedeon","Kalin Stefanov"],"pdf_url":"https://arxiv.org/pdf/2311.15308v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2309.16701v3","updated":"2024-07-29T06:03:24Z","published":"2023-08-15T17:38:55Z","title":"MVMR: A New Framework for Evaluating Faithfulness of Video Moment\n Retrieval against Multiple Distractors","summary":" With the explosion of multimedia content, video moment retrieval (VMR), which\naims to detect a video moment that matches a given text query from a video, has\nbeen studied intensively as a critical problem. However, the existing VMR\nframework evaluates video moment retrieval performance, assuming that a video\nis given, which may not reveal whether the models exhibit overconfidence in the\nfalsely given video. In this paper, we propose the MVMR (Massive Videos Moment\nRetrieval for Faithfulness Evaluation) task that aims to retrieve video moments\nwithin a massive video set, including multiple distractors, to evaluate the\nfaithfulness of VMR models. For this task, we suggest an automated massive\nvideo pool construction framework to categorize negative (distractors) and\npositive (false-negative) video sets using textual and visual semantic distance\nverification methods. We extend existing VMR datasets using these methods and\nnewly construct three practical MVMR datasets. To solve the task, we further\npropose a strong informative sample-weighted learning method, CroCs, which\nemploys two contrastive learning mechanisms: (1) weakly-supervised potential\nnegative learning and (2) cross-directional hard-negative learning.\nExperimental results on the MVMR datasets reveal that existing VMR models are\neasily distracted by the misinformation (distractors), whereas our model shows\nsignificantly robust performance, demonstrating that CroCs is essential to\ndistinguishing positive moments against distractors. Our code and datasets are\npublicly available: https://github.com/yny0506/Massive-Videos-Moment-Retrieval.\n","authors":["Nakyeong Yang","Minsung Kim","Seunghyun Yoon","Joongbo Shin","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2309.16701v3.pdf","comment":"accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2407.19719v1","updated":"2024-07-29T06:03:13Z","published":"2024-07-29T06:03:13Z","title":"Revolutionizing Urban Safety Perception Assessments: Integrating\n Multimodal Large Language Models with Street View Images","summary":" Measuring urban safety perception is an important and complex task that\ntraditionally relies heavily on human resources. 
This process often involves\nextensive field surveys, manual data collection, and subjective assessments,\nwhich can be time-consuming, costly, and sometimes inconsistent. Street View\nImages (SVIs), along with deep learning methods, provide a way to realize\nlarge-scale urban safety detection. However, achieving this goal often requires\nextensive human annotation to train safety ranking models, and the\narchitectural differences between cities hinder the transferability of these\nmodels. Thus, a fully automated method for conducting safety evaluations is\nessential. Recent advances in multimodal large language models (MLLMs) have\ndemonstrated powerful reasoning and analytical capabilities. Cutting-edge\nmodels, e.g., GPT-4 have shown surprising performance in many tasks. We\nemployed these models for urban safety ranking on a human-annotated anchor set\nand validated that the results from MLLMs align closely with human perceptions.\nAdditionally, we proposed a method based on the pre-trained Contrastive\nLanguage-Image Pre-training (CLIP) feature and K-Nearest Neighbors (K-NN)\nretrieval to quickly assess the safety index of the entire city. Experimental\nresults show that our method outperforms existing training needed deep learning\napproaches, achieving efficient and accurate urban safety evaluations. The\nproposed automation for urban safety perception assessment is a valuable tool\nfor city planners, policymakers, and researchers aiming to improve urban\nenvironments.\n","authors":["Jiaxin Zhanga","Yunqin Lia","Tomohiro Fukudab","Bowen Wang"],"pdf_url":"https://arxiv.org/pdf/2407.19719v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.00290v3","updated":"2024-07-29T05:47:44Z","published":"2024-02-01T02:43:20Z","title":"MEIA: Multimodal Embodied Perception and Interaction in Unknown\n Environments","summary":" With the surge in the development of large language models, embodied\nintelligence has attracted increasing attention. Nevertheless, prior works on\nembodied intelligence typically encode scene or historical memory in an\nunimodal manner, either visual or linguistic, which complicates the alignment\nof the model's action planning with embodied control. To overcome this\nlimitation, we introduce the Multimodal Embodied Interactive Agent (MEIA),\ncapable of translating high-level tasks expressed in natural language into a\nsequence of executable actions. Specifically, we propose a novel Multimodal\nEnvironment Memory (MEM) module, facilitating the integration of embodied\ncontrol with large models through the visual-language memory of scenes. This\ncapability enables MEIA to generate executable action plans based on diverse\nrequirements and the robot's capabilities. Furthermore, we construct an\nembodied question answering dataset based on a dynamic virtual cafe environment\nwith the help of the large language model. In this virtual environment, we\nconduct several experiments, utilizing multiple large models through zero-shot\nlearning, and carefully design scenarios for various situations. 
The\nexperimental results showcase the promising performance of our MEIA in various\nembodied interactive tasks.\n","authors":["Yang Liu","Xinshuai Song","Kaixuan Jiang","Weixing Chen","Jingzhou Luo","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2402.00290v3.pdf","comment":"Codes will be available at\n https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List"},{"id":"http://arxiv.org/abs/2407.19714v1","updated":"2024-07-29T05:35:51Z","published":"2024-07-29T05:35:51Z","title":"Rethinking RGB-D Fusion for Semantic Segmentation in Surgical Datasets","summary":" Surgical scene understanding is a key technical component for enabling\nintelligent and context aware systems that can transform various aspects of\nsurgical interventions. In this work, we focus on the semantic segmentation\ntask, propose a simple yet effective multi-modal (RGB and depth) training\nframework called SurgDepth, and show state-of-the-art (SOTA) results on all\npublicly available datasets applicable for this task. Unlike previous\napproaches, which either fine-tune SOTA segmentation models trained on natural\nimages, or encode RGB or RGB-D information using RGB only pre-trained\nbackbones, SurgDepth, which is built on top of Vision Transformers (ViTs), is\ndesigned to encode both RGB and depth information through a simple fusion\nmechanism. We conduct extensive experiments on benchmark datasets including\nEndoVis2022, AutoLapro, LapI2I and EndoVis2017 to verify the efficacy of\nSurgDepth. Specifically, SurgDepth achieves a new SOTA IoU of 0.86 on EndoVis\n2022 SAR-RARP50 challenge and outperforms the current best method by at least\n4%, using a shallow and compute efficient decoder consisting of ConvNeXt\nblocks.\n","authors":["Muhammad Abdullah Jamal","Omid Mohareri"],"pdf_url":"https://arxiv.org/pdf/2407.19714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06886v6","updated":"2024-07-29T05:26:44Z","published":"2024-07-09T14:14:47Z","title":"Aligning Cyber Space with Physical World: A Comprehensive Survey on\n Embodied AI","summary":" Embodied Artificial Intelligence (Embodied AI) is crucial for achieving\nArtificial General Intelligence (AGI) and serves as a foundation for various\napplications that bridge cyberspace and the physical world. Recently, the\nemergence of Multi-modal Large Models (MLMs) and World Models (WMs) have\nattracted significant attention due to their remarkable perception,\ninteraction, and reasoning capabilities, making them a promising architecture\nfor the brain of embodied agents. However, there is no comprehensive survey for\nEmbodied AI in the era of MLMs. In this survey, we give a comprehensive\nexploration of the latest advancements in Embodied AI. Our analysis firstly\nnavigates through the forefront of representative works of embodied robots and\nsimulators, to fully understand the research focuses and their limitations.\nThen, we analyze four main research targets: 1) embodied perception, 2)\nembodied interaction, 3) embodied agent, and 4) sim-to-real adaptation,\ncovering the state-of-the-art methods, essential paradigms, and comprehensive\ndatasets. Additionally, we explore the complexities of MLMs in virtual and real\nembodied agents, highlighting their significance in facilitating interactions\nin dynamic digital and physical environments. Finally, we summarize the\nchallenges and limitations of embodied AI and discuss their potential future\ndirections. 
We hope this survey will serve as a foundational reference for the\nresearch community and inspire continued innovation. The associated project can\nbe found at https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List.\n","authors":["Yang Liu","Weixing Chen","Yongjie Bai","Guanbin Li","Wen Gao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2407.06886v6.pdf","comment":"The first comprehensive review of Embodied AI in the era of MLMs, 36\n pages. We also provide the paper list for Embodied AI:\n https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List"},{"id":"http://arxiv.org/abs/2407.19708v1","updated":"2024-07-29T05:19:23Z","published":"2024-07-29T05:19:23Z","title":"ALEN: A Dual-Approach for Uniform and Non-Uniform Low-Light Image\n Enhancement","summary":" Low-light image enhancement is an important task in computer vision,\nessential for improving the visibility and quality of images captured in\nnon-optimal lighting conditions. Inadequate illumination can lead to\nsignificant information loss and poor image quality, impacting various\napplications such as surveillance. photography, or even autonomous driving. In\nthis regard, automated methods have been developed to automatically adjust\nillumination in the image for a better visual perception. Current enhancement\ntechniques often use specific datasets to enhance low-light images, but still\npresent challenges when adapting to diverse real-world conditions, where\nillumination degradation may be localized to specific regions. To address this\nchallenge, the Adaptive Light Enhancement Network (ALEN) is introduced, whose\nmain approach is the use of a classification mechanism to determine whether\nlocal or global illumination enhancement is required. Subsequently, estimator\nnetworks adjust illumination based on this classification and simultaneously\nenhance color fidelity. ALEN integrates the Light Classification Network\n(LCNet) for illuminance categorization, complemented by the Single-Channel\nNetwork (SCNet), and Multi-Channel Network (MCNet) for precise estimation of\nillumination and color, respectively. Extensive experiments on publicly\navailable datasets for low-light conditions were carried out to underscore\nALEN's robust generalization capabilities, demonstrating superior performance\nin both quantitative metrics and qualitative assessments when compared to\nrecent state-of-the-art methods. The ALEN not only enhances image quality in\nterms of visual perception but also represents an advancement in high-level\nvision tasks, such as semantic segmentation, as presented in this work. The\ncode of this method is available at https://github.com/xingyumex/ALEN.\n","authors":["Ezequiel Perez-Zarate","Oscar Ramos-Soto","Diego Oliva","Marco Perez-Cisneros"],"pdf_url":"https://arxiv.org/pdf/2407.19708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17057v3","updated":"2024-07-29T05:18:08Z","published":"2023-11-28T18:59:52Z","title":"ReMoS: 3D Motion-Conditioned Reaction Synthesis for Two-Person\n Interactions","summary":" Current approaches for 3D human motion synthesis generate high quality\nanimations of digital humans performing a wide variety of actions and gestures.\nHowever, a notable technological gap exists in addressing the complex dynamics\nof multi human interactions within this paradigm. In this work, we present\nReMoS, a denoising diffusion based model that synthesizes full body reactive\nmotion of a person in a two person interaction scenario. 
Given the motion of\none person, we employ a combined spatio temporal cross attention mechanism to\nsynthesize the reactive body and hand motion of the second person, thereby\ncompleting the interactions between the two. We demonstrate ReMoS across\nchallenging two person scenarios such as pair dancing, Ninjutsu, kickboxing,\nand acrobatics, where one persons movements have complex and diverse influences\non the other. We also contribute the ReMoCap dataset for two person\ninteractions containing full body and finger motions. We evaluate ReMoS through\nmultiple quantitative metrics, qualitative visualizations, and a user study,\nand also indicate usability in interactive motion editing applications.\n","authors":["Anindita Ghosh","Rishabh Dabral","Vladislav Golyanik","Christian Theobalt","Philipp Slusallek"],"pdf_url":"https://arxiv.org/pdf/2311.17057v3.pdf","comment":"29 pages, 7 figures, 7 tables"},{"id":"http://arxiv.org/abs/2406.17538v2","updated":"2024-07-29T05:11:12Z","published":"2024-06-25T13:22:22Z","title":"Three-Stream Temporal-Shift Attention Network Based on Self-Knowledge\n Distillation for Micro-Expression Recognition","summary":" Micro-expressions are subtle facial movements that occur spontaneously when\npeople try to conceal real emotions. Micro-expression recognition is crucial in\nmany fields, including criminal analysis and psychotherapy. However,\nmicro-expression recognition is challenging since micro-expressions have low\nintensity and public datasets are small in size. To this end, a three-stream\ntemporal-shift attention network based on self-knowledge distillation called\nSKD-TSTSAN is proposed in this paper. Firstly, to address the low intensity of\nmuscle movements, we utilize learning-based motion magnification modules to\nenhance the intensity of muscle movements. Secondly, we employ efficient\nchannel attention modules in the local-spatial stream to make the network focus\non facial regions that are highly relevant to micro-expressions. In addition,\ntemporal shift modules are used in the dynamic-temporal stream, which enables\ntemporal modeling with no additional parameters by mixing motion information\nfrom two different temporal domains. Furthermore, we introduce self-knowledge\ndistillation into the micro-expression recognition task by introducing\nauxiliary classifiers and using the deepest section of the network for\nsupervision, encouraging all blocks to fully explore the features of the\ntraining set. Finally, extensive experiments are conducted on four public\ndatasets: CASME II, SAMM, MMEW, and CAS(ME)3. The experimental results\ndemonstrate that our SKD-TSTSAN outperforms other existing methods and achieves\nnew state-of-the-art performance. Our code will be available at\nhttps://github.com/GuanghaoZhu663/SKD-TSTSAN.\n","authors":["Guanghao Zhu","Lin Liu","Yuhao Hu","Haixin Sun","Fang Liu","Xiaohui Du","Ruqian Hao","Juanxiu Liu","Yong Liu","Hao Deng","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.17538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09786v3","updated":"2024-07-29T04:57:16Z","published":"2024-01-18T08:10:34Z","title":"Adaptive Self-training Framework for Fine-grained Scene Graph Generation","summary":" Scene graph generation (SGG) models have suffered from inherent problems\nregarding the benchmark datasets such as the long-tailed predicate distribution\nand missing annotation problems. In this work, we aim to alleviate the\nlong-tailed problem of SGG by utilizing unannotated triplets. 
To this end, we\nintroduce a Self-Training framework for SGG (ST-SGG) that assigns pseudo-labels\nfor unannotated triplets based on which the SGG models are trained. While there\nhas been significant progress in self-training for image recognition, designing\na self-training framework for the SGG task is more challenging due to its\ninherent nature such as the semantic ambiguity and the long-tailed distribution\nof predicate classes. Hence, we propose a novel pseudo-labeling technique for\nSGG, called Class-specific Adaptive Thresholding with Momentum (CATM), which is\na model-agnostic framework that can be applied to any existing SGG models.\nFurthermore, we devise a graph structure learner (GSL) that is beneficial when\nadopting our proposed self-training framework to the state-of-the-art\nmessage-passing neural network (MPNN)-based SGG models. Our extensive\nexperiments verify the effectiveness of ST-SGG on various SGG models,\nparticularly in enhancing the performance on fine-grained predicate classes.\n","authors":["Kibum Kim","Kanghoon Yoon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2401.09786v3.pdf","comment":"9 pages; ICLR 2024"},{"id":"http://arxiv.org/abs/2310.10404v8","updated":"2024-07-29T04:47:04Z","published":"2023-10-16T13:49:46Z","title":"LLM4SGG: Large Language Models for Weakly Supervised Scene Graph\n Generation","summary":" Weakly-Supervised Scene Graph Generation (WSSGG) research has recently\nemerged as an alternative to the fully-supervised approach that heavily relies\non costly annotations. In this regard, studies on WSSGG have utilized image\ncaptions to obtain unlocalized triplets while primarily focusing on grounding\nthe unlocalized triplets over image regions. However, they have overlooked the\ntwo issues involved in the triplet formation process from the captions: 1)\nSemantic over-simplification issue arises when extracting triplets from\ncaptions, where fine-grained predicates in captions are undesirably converted\ninto coarse-grained predicates, resulting in a long-tailed predicate\ndistribution, and 2) Low-density scene graph issue arises when aligning the\ntriplets in the caption with entity/predicate classes of interest, where many\ntriplets are discarded and not used in training, leading to insufficient\nsupervision. To tackle the two issues, we propose a new approach, i.e., Large\nLanguage Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two\nissues by leveraging the LLM's in-depth understanding of language and reasoning\nability during the extraction of triplets from captions and alignment of\nentity/predicate classes with target data. To further engage the LLM in these\nprocesses, we adopt the idea of Chain-of-Thought and the in-context few-shot\nlearning strategy. To validate the effectiveness of LLM4SGG, we conduct\nextensive experiments on Visual Genome and GQA datasets, showing significant\nimprovements in both Recall@K and mean Recall@K compared to the\nstate-of-the-art WSSGG methods. 
A further appeal is that LLM4SGG is\ndata-efficient, enabling effective model training with a small amount of\ntraining images.\n","authors":["Kibum Kim","Kanghoon Yoon","Jaehyeong Jeon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2310.10404v8.pdf","comment":"8 pages; CVPR 2024"},{"id":"http://arxiv.org/abs/2407.19698v1","updated":"2024-07-29T04:43:58Z","published":"2024-07-29T04:43:58Z","title":"Classification Matters: Improving Video Action Detection with\n Class-Specific Attention","summary":" Video action detection (VAD) aims to detect actors and classify their actions\nin a video. We figure that VAD suffers more from classification rather than\nlocalization of actors. Hence, we analyze how prevailing methods form features\nfor classification and find that they prioritize actor regions, yet often\noverlooking the essential contextual information necessary for accurate\nclassification. Accordingly, we propose to reduce the bias toward actor and\nencourage paying attention to the context that is relevant to each action\nclass. By assigning a class-dedicated query to each action class, our model can\ndynamically determine where to focus for effective classification. The proposed\nmodel demonstrates superior performance on three challenging benchmarks with\nsignificantly fewer parameters and less computation.\n","authors":["Jinsung Lee","Taeoh Kim","Inwoong Lee","Minho Shim","Dongyoon Wee","Minsu Cho","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2407.19698v1.pdf","comment":"31 pages, accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.19696v1","updated":"2024-07-29T04:40:18Z","published":"2024-07-29T04:40:18Z","title":"Cross-Layer Feature Pyramid Transformer for Small Object Detection in\n Aerial Images","summary":" Object detection in aerial images has always been a challenging task due to\nthe generally small size of the objects. Most current detectors prioritize\nnovel detection frameworks, often overlooking research on fundamental\ncomponents such as feature pyramid networks. In this paper, we introduce the\nCross-Layer Feature Pyramid Transformer (CFPT), a novel upsampler-free feature\npyramid network designed specifically for small object detection in aerial\nimages. CFPT incorporates two meticulously designed attention blocks with\nlinear computational complexity: the Cross-Layer Channel-Wise Attention (CCA)\nand the Cross-Layer Spatial-Wise Attention (CSA). CCA achieves cross-layer\ninteraction by dividing channel-wise token groups to perceive cross-layer\nglobal information along the spatial dimension, while CSA completes cross-layer\ninteraction by dividing spatial-wise token groups to perceive cross-layer\nglobal information along the channel dimension. By integrating these modules,\nCFPT enables cross-layer interaction in one step, thereby avoiding the semantic\ngap and information loss associated with element-wise summation and\nlayer-by-layer transmission. Furthermore, CFPT incorporates global contextual\ninformation, which enhances detection performance for small objects. To further\nenhance location awareness during cross-layer interaction, we propose the\nCross-Layer Consistent Relative Positional Encoding (CCPE) based on inter-layer\nmutual receptive fields. We evaluate the effectiveness of CFPT on two\nchallenging object detection datasets in aerial images, namely VisDrone2019-DET\nand TinyPerson. 
Extensive experiments demonstrate the effectiveness of CFPT,\nwhich outperforms state-of-the-art feature pyramid networks while incurring\nlower computational costs. The code will be released at\nhttps://github.com/duzw9311/CFPT.\n","authors":["Zewen Du","Zhenjiang Hu","Guiyu Zhao","Ying Jin","Hongbin Ma"],"pdf_url":"https://arxiv.org/pdf/2407.19696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19694v1","updated":"2024-07-29T04:33:04Z","published":"2024-07-29T04:33:04Z","title":"Structural damage detection via hierarchical damage information with\n volumetric assessment","summary":" Image environments and noisy labels hinder deep learning-based inference\nmodels in structural damage detection. Post-detection, there is the challenge\nof reliance on manual assessments of detected damages. As a result,\nGuided-DetNet, characterized by Generative Attention Module (GAM), Hierarchical\nElimination Algorithm (HEA), and Volumetric Contour Visual Assessment (VCVA),\nis proposed to mitigate complex image environments, noisy labeling, and\npost-detection manual assessment of structural damages. GAM leverages\ncross-horizontal and cross-vertical patch merging and cross\nforeground-background feature fusion to generate varied features to mitigate\ncomplex image environments. HEA addresses noisy labeling using hierarchical\nrelationships among classes to refine instances given an image by eliminating\nunlikely class categories. VCVA assesses the severity of detected damages via\nvolumetric representation and quantification leveraging the Dirac delta\ndistribution. A comprehensive quantitative study, two robustness tests, and an\napplication scenario based on the PEER Hub Image-Net dataset substantiate\nGuided-DetNet's promising performances. Guided-DetNet outperformed the\nbest-compared models in a triple classification task by a difference of not\nless than 3% and not less than 2% in a dual detection task under varying\nmetrics.\n","authors":["Isaac Osei Agyemang","Jianwen Chen","Liaoyuan Zeng","Isaac Adjei-Mensah","Daniel Acheampong","Gordon Owusu Boateng","Adu Asare Baffour"],"pdf_url":"https://arxiv.org/pdf/2407.19694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19679v1","updated":"2024-07-29T03:47:54Z","published":"2024-07-29T03:47:54Z","title":"Harnessing Large Vision and Language Models in Agriculture: A Review","summary":" Large models can play important roles in many domains. Agriculture is another\nkey factor affecting the lives of people around the world. It provides food,\nfabric, and coal for humanity. However, facing many challenges such as pests\nand diseases, soil degradation, global warming, and food security, how to\nsteadily increase the yield in the agricultural sector is a problem that humans\nstill need to solve. Large models can help farmers improve production\nefficiency and harvest by detecting a series of agricultural production tasks\nsuch as pests and diseases, soil quality, and seed quality. It can also help\nfarmers make wise decisions through a variety of information, such as images,\ntext, etc. Herein, we delve into the potential applications of large models in\nagriculture, from large language model (LLM) and large vision model (LVM) to\nlarge vision-language models (LVLM). After gaining a deeper understanding of\nmultimodal large language models (MLLM), it can be recognized that problems\nsuch as agricultural image processing, agricultural question answering systems,\nand agricultural machine automation can all be solved by large models. 
Large\nmodels have great potential in the field of agriculture. We outline the current\napplications of agricultural large models, and aims to emphasize the importance\nof large models in the domain of agriculture. In the end, we envisage a future\nin which famers use MLLM to accomplish many tasks in agriculture, which can\ngreatly improve agricultural production efficiency and yield.\n","authors":["Hongyan Zhu","Shuai Qin","Min Su","Chengzhi Lin","Anjie Li","Junfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2407.19679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19675v1","updated":"2024-07-29T03:36:39Z","published":"2024-07-29T03:36:39Z","title":"Semi-Supervised Teacher-Reference-Student Architecture for Action\n Quality Assessment","summary":" Existing action quality assessment (AQA) methods often require a large number\nof label annotations for fully supervised learning, which are laborious and\nexpensive. In practice, the labeled data are difficult to obtain because the\nAQA annotation process requires domain-specific expertise. In this paper, we\npropose a novel semi-supervised method, which can be utilized for better\nassessment of the AQA task by exploiting a large amount of unlabeled data and a\nsmall portion of labeled data. Differing from the traditional teacher-student\nnetwork, we propose a teacher-reference-student architecture to learn both\nunlabeled and labeled data, where the teacher network and the reference network\nare used to generate pseudo-labels for unlabeled data to supervise the student\nnetwork. Specifically, the teacher predicts pseudo-labels by capturing\nhigh-level features of unlabeled data. The reference network provides adequate\nsupervision of the student network by referring to additional action\ninformation. Moreover, we introduce confidence memory to improve the\nreliability of pseudo-labels by storing the most accurate ever output of the\nteacher network and reference network. To validate our method, we conduct\nextensive experiments on three AQA benchmark datasets. Experimental results\nshow that our method achieves significant improvements and outperforms existing\nsemi-supervised AQA methods.\n","authors":["Wulian Yun","Mengshi Qi","Fei Peng","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2407.19675v1.pdf","comment":"To be published in ECCV2024"},{"id":"http://arxiv.org/abs/2407.18574v2","updated":"2024-07-29T03:33:47Z","published":"2024-07-26T07:57:07Z","title":"Learning to Enhance Aperture Phasor Field for Non-Line-of-Sight Imaging","summary":" This paper aims to facilitate more practical NLOS imaging by reducing the\nnumber of samplings and scan areas. To this end, we introduce a phasor-based\nenhancement network that is capable of predicting clean and full measurements\nfrom noisy partial observations. We leverage a denoising autoencoder scheme to\nacquire rich and noise-robust representations in the measurement space. Through\nthis pipeline, our enhancement network is trained to accurately reconstruct\ncomplete measurements from their corrupted and partial counterparts. However,\nwe observe that the \\naive application of denoising often yields degraded and\nover-smoothed results, caused by unnecessary and spurious frequency signals\npresent in measurements. To address this issue, we introduce a phasor-based\npipeline designed to limit the spectrum of our network to the frequency range\nof interests, where the majority of informative signals are detected. 
The\nphasor wavefronts at the aperture, which are band-limited signals, are employed\nas inputs and outputs of the network, guiding our network to learn from the\nfrequency range of interests and discard unnecessary information. The\nexperimental results in more practical acquisition scenarios demonstrate that\nwe can look around the corners with $16\\times$ or $64\\times$ fewer samplings\nand $4\\times$ smaller apertures. Our code is available at\nhttps://github.com/join16/LEAP.\n","authors":["In Cho","Hyunbo Shim","Seon Joo Kim"],"pdf_url":"https://arxiv.org/pdf/2407.18574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19674v1","updated":"2024-07-29T03:30:09Z","published":"2024-07-29T03:30:09Z","title":"Advancing Prompt Learning through an External Layer","summary":" Prompt learning represents a promising method for adapting pre-trained\nvisual-language models (VLMs) to various downstream tasks by learning a set of\ntext embeddings. One challenge inherent to these methods is the poor\ngeneralization performance due to the invalidity of the learned text embeddings\nfor unseen tasks. A straightforward approach to bridge this gap is to freeze\nthe text embeddings in prompts, which results in a lack of capacity to adapt\nVLMs for downstream tasks. To address this dilemma, we propose to introduce an\nExternal Layer (EnLa) of text branch and learnable visual embeddings of the\nvisual branch for adapting VLMs to downstream tasks. The learnable external\nlayer is built upon valid embeddings of pre-trained CLIP. This design considers\nthe balance of learning capabilities between the two branches. To align the\ntextual and visual features, we propose a novel two-pronged approach: i) we\nintroduce the optimal transport as the discrepancy metric to align the vision\nand text modalities, and ii) we introduce a novel strengthening feature to\nenhance the interaction between these two modalities. Extensive experiments\nshow that our method performs favorably well on 4 types of representative tasks\nacross 11 datasets compared to the existing prompt learning methods.\n","authors":["Fangming Cui","Xun Yang","Chao Wu","Liang Xiao","Xinmei Tian"],"pdf_url":"https://arxiv.org/pdf/2407.19674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.11280v2","updated":"2024-07-29T03:20:17Z","published":"2021-11-22T15:41:30Z","title":"Point Cloud Color Constancy","summary":" In this paper, we present Point Cloud Color Constancy, in short PCCC, an\nillumination chromaticity estimation algorithm exploiting a point cloud. We\nleverage the depth information captured by the time-of-flight (ToF) sensor\nmounted rigidly with the RGB sensor, and form a 6D cloud where each point\ncontains the coordinates and RGB intensities, noted as (x,y,z,r,g,b). PCCC\napplies the PointNet architecture to the color constancy problem, deriving the\nillumination vector point-wise and then making a global decision about the\nglobal illumination chromaticity. On two popular RGB-D datasets, which we\nextend with illumination information, as well as on a novel benchmark, PCCC\nobtains lower error than the state-of-the-art algorithms. 
Our method is simple\nand fast, requiring merely 16*16-size input and reaching speed over 500 fps,\nincluding the cost of building the point cloud and net inference.\n","authors":["Xiaoyan Xing","Yanlin Qian","Sibo Feng","Yuhan Dong","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2111.11280v2.pdf","comment":"CVPR 2022"},{"id":"http://arxiv.org/abs/2407.09924v2","updated":"2024-07-29T03:18:35Z","published":"2024-07-13T15:34:54Z","title":"Region-aware Image-based Human Action Retrieval with Transformers","summary":" Human action understanding is a fundamental and challenging task in computer\nvision. Although there exists tremendous research on this area, most works\nfocus on action recognition, while action retrieval has received less\nattention. In this paper, we focus on the neglected but important task of\nimage-based action retrieval which aims to find images that depict the same\naction as a query image. We establish benchmarks for this task and set up\nimportant baseline methods for fair comparison. We present an end-to-end model\nthat learns rich action representations from three aspects: the anchored\nperson, contextual regions, and the global image. A novel fusion transformer\nmodule is designed to model the relationships among different features and\neffectively fuse them into an action representation. Experiments on the\nStanford-40 and PASCAL VOC 2012 Action datasets show that the proposed method\nsignificantly outperforms previous approaches for image-based action retrieval.\n","authors":["Hongsong Wang","Jianhua Zhao","Jie Gui"],"pdf_url":"https://arxiv.org/pdf/2407.09924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18392v2","updated":"2024-07-29T03:09:02Z","published":"2024-07-25T20:55:23Z","title":"A Reference-Based 3D Semantic-Aware Framework for Accurate Local Facial\n Attribute Editing","summary":" Facial attribute editing plays a crucial role in synthesizing realistic faces\nwith specific characteristics while maintaining realistic appearances. Despite\nadvancements, challenges persist in achieving precise, 3D-aware attribute\nmodifications, which are crucial for consistent and accurate representations of\nfaces from different angles. Current methods struggle with semantic\nentanglement and lack effective guidance for incorporating attributes while\nmaintaining image integrity. To address these issues, we introduce a novel\nframework that merges the strengths of latent-based and reference-based editing\nmethods. Our approach employs a 3D GAN inversion technique to embed attributes\nfrom the reference image into a tri-plane space, ensuring 3D consistency and\nrealistic viewing from multiple perspectives. We utilize blending techniques\nand predicted semantic masks to locate precise edit regions, merging them with\nthe contextual guidance from the reference image. A coarse-to-fine inpainting\nstrategy is then applied to preserve the integrity of untargeted areas,\nsignificantly enhancing realism. 
Our evaluations demonstrate superior\nperformance across diverse editing tasks, validating our framework's\neffectiveness in realistic and applicable facial attribute editing.\n","authors":["Yu-Kai Huang","Yutong Zheng","Yen-Shuo Su","Anudeepsekhar Bolimera","Han Zhang","Fangyi Chen","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2407.18392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19666v1","updated":"2024-07-29T02:56:19Z","published":"2024-07-29T02:56:19Z","title":"Take A Step Back: Rethinking the Two Stages in Visual Reasoning","summary":" Visual reasoning, as a prominent research area, plays a crucial role in AI by\nfacilitating concept formation and interaction with the world. However, current\nworks are usually carried out separately on small datasets thus lacking\ngeneralization ability. Through rigorous evaluation of diverse benchmarks, we\ndemonstrate the shortcomings of existing ad-hoc methods in achieving\ncross-domain reasoning and their tendency to data bias fitting. In this paper,\nwe revisit visual reasoning with a two-stage perspective: (1) symbolization and\n(2) logical reasoning given symbols or their representations. We find that the\nreasoning stage is better at generalization than symbolization. Thus, it is\nmore efficient to implement symbolization via separated encoders for different\ndata domains while using a shared reasoner. Given our findings, we establish\ndesign principles for visual reasoning frameworks following the separated\nsymbolization and shared reasoning. The proposed two-stage framework achieves\nimpressive generalization ability on various visual reasoning tasks, including\npuzzles, physical prediction, and visual question answering (VQA), encompassing\nboth 2D and 3D modalities. We believe our insights will pave the way for\ngeneralizable visual reasoning.\n","authors":["Mingyu Zhang","Jiting Cai","Mingyu Liu","Yue Xu","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2407.19666v1.pdf","comment":"ECCV 2024, Project page:\n https://mybearyzhang.github.io/projects/TwoStageReason/"},{"id":"http://arxiv.org/abs/2407.19660v1","updated":"2024-07-29T02:49:55Z","published":"2024-07-29T02:49:55Z","title":"Towards a Knowledge guided Multimodal Foundation Model for\n Spatio-Temporal Remote Sensing Applications","summary":" In recent years, there is increased interest in foundation models for\ngeoscience due to vast amount of earth observing satellite imagery. Existing\nremote sensing foundation models make use of the various sources of spectral\nimagery to create large models pretrained on masked reconstruction task. The\nembeddings from these foundation models are then used for various downstream\nremote sensing applications. In this paper we propose a foundational modeling\nframework for remote sensing geoscience applications, that goes beyond these\ntraditional single modality masked autoencoder family of foundation models.\nThis framework leverages the knowledge guided principles that the spectral\nimagery captures the impact of the physical drivers on the environmental\nsystem, and that the relationship between them is governed by the\ncharacteristics of the system. Specifically, our method, called MultiModal\nVariable Step Forecasting (MM-VSF), uses multimodal data (spectral imagery and\nweather) as its input and a variable step forecasting task as its pretraining\nobjective. In our evaluation we show forecasting of satellite imagery using\nweather can be used as an effective pretraining task for foundation models. 
We\nfurther show the effectiveness of the embeddings from MM-VSF on the downstream\ntask of pixel wise crop mapping, when compared with a model trained in the\ntraditional setting of single modality input and masked reconstruction based\npretraining.\n","authors":["Praveen Ravirathinam","Ankush Khandelwal","Rahul Ghosh","Vipin Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.19660v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2306.06048v3","updated":"2024-07-29T02:36:45Z","published":"2023-06-09T17:16:50Z","title":"How Does Fine-Tuning Impact Out-of-Distribution Detection for\n Vision-Language Models?","summary":" Recent large vision-language models such as CLIP have shown remarkable\nout-of-distribution (OOD) detection and generalization performance. However,\ntheir zero-shot in-distribution (ID) accuracy is often limited for downstream\ndatasets. Recent CLIP-based fine-tuning methods such as prompt learning have\ndemonstrated significant improvements in ID classification and OOD\ngeneralization where OOD labels are available. Nonetheless, it remains unclear\nwhether the model is reliable to semantic shifts without OOD labels. In this\npaper, we aim to bridge the gap and present a comprehensive study to understand\nhow fine-tuning impact OOD detection for few-shot downstream tasks. By framing\nOOD detection as multi-modal concept matching, we establish a connection\nbetween fine-tuning methods and various OOD scores. Our results suggest that a\nproper choice of OOD scores is essential for CLIP-based fine-tuning. In\nparticular, the maximum concept matching (MCM) score provides a promising\nsolution consistently. We also show that prompt learning demonstrates the\nstate-of-the-art OOD detection performance over the zero-shot counterpart.\n","authors":["Yifei Ming","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2306.06048v3.pdf","comment":"Accepted to IJCV 2023"},{"id":"http://arxiv.org/abs/2407.19652v1","updated":"2024-07-29T02:34:51Z","published":"2024-07-29T02:34:51Z","title":"SALVE: A 3D Reconstruction Benchmark of Wounds from Consumer-grade\n Videos","summary":" Managing chronic wounds is a global challenge that can be alleviated by the\nadoption of automatic systems for clinical wound assessment from consumer-grade\nvideos. While 2D image analysis approaches are insufficient for handling the 3D\nfeatures of wounds, existing approaches utilizing 3D reconstruction methods\nhave not been thoroughly evaluated. To address this gap, this paper presents a\ncomprehensive study on 3D wound reconstruction from consumer-grade videos.\nSpecifically, we introduce the SALVE dataset, comprising video recordings of\nrealistic wound phantoms captured with different cameras. Using this dataset,\nwe assess the accuracy and precision of state-of-the-art methods for 3D\nreconstruction, ranging from traditional photogrammetry pipelines to advanced\nneural rendering approaches. In our experiments, we observe that photogrammetry\napproaches do not provide smooth surfaces suitable for precise clinical\nmeasurements of wounds. 
Neural rendering approaches show promise in addressing\nthis issue, advancing the use of this technology in wound care practices.\n","authors":["Remi Chierchia","Leo Lebrat","David Ahmedt-Aristizabal","Olivier Salvado","Clinton Fookes","Rodrigo Santa Cruz"],"pdf_url":"https://arxiv.org/pdf/2407.19652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19651v1","updated":"2024-07-29T02:32:44Z","published":"2024-07-29T02:32:44Z","title":"ComNeck: Bridging Compressed Image Latents and Multimodal LLMs via\n Universal Transform-Neck","summary":" This paper presents the first-ever study of adapting compressed image latents\nto suit the needs of downstream vision tasks that adopt Multimodal Large\nLanguage Models (MLLMs). MLLMs have extended the success of large language\nmodels to modalities (e.g. images) beyond text, but their billion scale hinders\ndeployment on resource-constrained end devices. While cloud-hosted MLLMs could\nbe available, transmitting raw, uncompressed images captured by end devices to\nthe cloud requires an efficient image compression system. To address this, we\nfocus on emerging neural image compression and propose a novel framework with a\nlightweight transform-neck and a surrogate loss to adapt compressed image\nlatents for MLLM-based vision tasks. The proposed framework is generic and\napplicable to multiple application scenarios, where the neural image codec can\nbe (1) pre-trained for human perception without updating, (2) fully updated for\njoint human and machine perception, or (3) fully updated for only machine\nperception. The transform-neck trained with the surrogate loss is universal,\nfor it can serve various downstream vision tasks enabled by a variety of MLLMs\nthat share the same visual encoder. Our framework has the striking feature of\nexcluding the downstream MLLMs from training the transform-neck, and\npotentially the neural image codec as well. This stands out from most existing\ncoding for machine approaches that involve downstream networks in training and\nthus could be impractical when the networks are MLLMs. Extensive experiments on\ndifferent neural image codecs and various MLLM-based vision tasks show that our\nmethod achieves great rate-accuracy performance with much less complexity,\ndemonstrating its effectiveness.\n","authors":["Chia-Hao Kao","Cheng Chien","Yu-Jen Tseng","Yi-Hsin Chen","Alessandro Gnutti","Shao-Yuan Lo","Wen-Hsiao Peng","Riccardo Leonardi"],"pdf_url":"https://arxiv.org/pdf/2407.19651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05021v4","updated":"2024-07-29T02:12:15Z","published":"2024-03-08T03:54:22Z","title":"Beyond MOT: Semantic Multi-Object Tracking","summary":" Current multi-object tracking (MOT) aims to predict trajectories of targets\n(i.e., ''where'') in videos. Yet, knowing merely ''where'' is insufficient in\nmany crucial applications. In comparison, semantic understanding such as\nfine-grained behaviors, interactions, and overall summarized captions (i.e.,\n''what'') from videos, associated with ''where'', is highly-desired for\ncomprehensive video analysis. Thus motivated, we introduce Semantic\nMulti-Object Tracking (SMOT), that aims to estimate object trajectories and\nmeanwhile understand semantic details of associated trajectories including\ninstance captions, instance interactions, and overall video captions,\nintegrating ''where'' and ''what'' for tracking. In order to foster the\nexploration of SMOT, we propose BenSMOT, a large-scale Benchmark for Semantic\nMOT. 
Specifically, BenSMOT comprises 3,292 videos with 151K frames, covering\nvarious scenarios for semantic tracking of humans. BenSMOT provides annotations\nfor the trajectories of targets, along with associated instance captions in\nnatural language, instance interactions, and overall caption for each video\nsequence. To our best knowledge, BenSMOT is the first publicly available\nbenchmark for SMOT. Besides, to encourage future research, we present a novel\ntracker named SMOTer, which is specially designed and end-to-end trained for\nSMOT, showing promising performance. By releasing BenSMOT, we expect to go\nbeyond conventional MOT by predicting ''where'' and ''what'' for SMOT, opening\nup a new direction in tracking for video understanding. We will release BenSMOT\nand SMOTer at https://github.com/Nathan-Li123/SMOTer.\n","authors":["Yunhao Li","Qin Li","Hao Wang","Xue Ma","Jiali Yao","Shaohua Dong","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05021v4.pdf","comment":"Accepted to ECCV2024"},{"id":"http://arxiv.org/abs/2407.19650v1","updated":"2024-07-29T02:12:11Z","published":"2024-07-29T02:12:11Z","title":"Practical Video Object Detection via Feature Selection and Aggregation","summary":" Compared with still image object detection, video object detection (VOD)\nneeds to particularly concern the high across-frame variation in object\nappearance, and the diverse deterioration in some frames. In principle, the\ndetection in a certain frame of a video can benefit from information in other\nframes. Thus, how to effectively aggregate features across different frames is\nkey to the target problem. Most of contemporary aggregation methods are\ntailored for two-stage detectors, suffering from high computational costs due\nto the dual-stage nature. On the other hand, although one-stage detectors have\nmade continuous progress in handling static images, their applicability to VOD\nlacks sufficient exploration. To tackle the above issues, this study invents a\nvery simple yet potent strategy of feature selection and aggregation, gaining\nsignificant accuracy at marginal computational expense. Concretely, for cutting\nthe massive computation and memory consumption from the dense prediction\ncharacteristic of one-stage object detectors, we first condense candidate\nfeatures from dense prediction maps. Then, the relationship between a target\nframe and its reference frames is evaluated to guide the aggregation.\nComprehensive experiments and ablation studies are conducted to validate the\nefficacy of our design, and showcase its advantage over other cutting-edge VOD\nmethods in both effectiveness and efficiency. Notably, our model reaches\n\\emph{a new record performance, i.e., 92.9\\% AP50 at over 30 FPS on the\nImageNet VID dataset on a single 3090 GPU}, making it a compelling option for\nlarge-scale or real-time applications. The implementation is simple, and\naccessible at \\url{https://github.com/YuHengsss/YOLOV}.\n","authors":["Yuheng Shi","Tong Zhang","Xiaojie Guo"],"pdf_url":"https://arxiv.org/pdf/2407.19650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18568v2","updated":"2024-07-29T02:05:19Z","published":"2024-07-26T07:50:48Z","title":"Learning Spectral-Decomposed Tokens for Domain Generalized Semantic\n Segmentation","summary":" The rapid development of Vision Foundation Model (VFM) brings inherent\nout-domain generalization for a variety of down-stream tasks. 
Among them,\ndomain generalized semantic segmentation (DGSS) holds unique challenges as the\ncross-domain images share common pixel-wise content information but vary\ngreatly in terms of the style. In this paper, we present a novel\nSpectral-dEcomposed Token (SET) learning framework to advance the frontier.\nDelving into further than existing fine-tuning token & frozen backbone\nparadigm, the proposed SET especially focuses on the way learning\nstyle-invariant features from these learnable tokens. Particularly, the frozen\nVFM features are first decomposed into the phase and amplitude components in\nthe frequency space, which mainly contain the information of content and style,\nrespectively, and then separately processed by learnable tokens for\ntask-specific information extraction. After the decomposition, style variation\nprimarily impacts the token-based feature enhancement within the amplitude\nbranch. To address this issue, we further develop an attention optimization\nmethod to bridge the gap between style-affected representation and static\ntokens during inference. Extensive cross-domain experiments show its\nstate-of-the-art performance.\n","authors":["Jingjun Yi","Qi Bi","Hao Zheng","Haolan Zhan","Wei Ji","Yawen Huang","Yuexiang Li","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2407.18568v2.pdf","comment":"accecpted by ACM MM2024"},{"id":"http://arxiv.org/abs/2407.19646v1","updated":"2024-07-29T02:04:29Z","published":"2024-07-29T02:04:29Z","title":"Foundations for Unfairness in Anomaly Detection -- Case Studies in\n Facial Imaging Data","summary":" Deep anomaly detection (AD) is perhaps the most controversial of data\nanalytic tasks as it identifies entities that are then specifically targeted\nfor further investigation or exclusion. Also controversial is the application\nof AI to facial imaging data. This work explores the intersection of these two\nareas to understand two core questions: \"Who\" these algorithms are being unfair\nto and equally important \"Why\". Recent work has shown that deep AD can be\nunfair to different groups despite being unsupervised with a recent study\nshowing that for portraits of people: men of color are far more likely to be\nchosen to be outliers. We study the two main categories of AD algorithms:\nautoencoder-based and single-class-based which effectively try to compress all\nthe instances with those that can not be easily compressed being deemed to be\noutliers. We experimentally verify sources of unfairness such as the\nunder-representation of a group (e.g. people of color are relatively rare),\nspurious group features (e.g. men are often photographed with hats), and group\nlabeling noise (e.g. race is subjective). We conjecture that lack of\ncompressibility is the main foundation and the others cause it but experimental\nresults show otherwise and we present a natural hierarchy amongst them.\n","authors":["Michael Livanos","Ian Davidson"],"pdf_url":"https://arxiv.org/pdf/2407.19646v1.pdf","comment":"16 pages, 8 figures, AAAI/ACM AIES24"},{"id":"http://arxiv.org/abs/2402.14309v2","updated":"2024-07-29T01:48:25Z","published":"2024-02-22T05:55:17Z","title":"YOLO-TLA: An Efficient and Lightweight Small Object Detection Model\n based on YOLOv5","summary":" Object detection, a crucial aspect of computer vision, has seen significant\nadvancements in accuracy and robustness. Despite these advancements, practical\napplications still face notable challenges, primarily the inaccurate detection\nor missed detection of small objects. 
In this paper, we propose YOLO-TLA, an\nadvanced object detection model building on YOLOv5. We first introduce an\nadditional detection layer for small objects in the neck network pyramid\narchitecture, thereby producing a feature map of a larger scale to discern\nfiner features of small objects. Further, we integrate the C3CrossCovn module\ninto the backbone network. This module uses sliding window feature extraction,\nwhich effectively minimizes both computational demand and the number of\nparameters, rendering the model more compact. Additionally, we have\nincorporated a global attention mechanism into the backbone network. This\nmechanism combines the channel information with global information to create a\nweighted feature map. This feature map is tailored to highlight the attributes\nof the object of interest, while effectively ignoring irrelevant details. In\ncomparison to the baseline YOLOv5s model, our newly developed YOLO-TLA model\nhas shown considerable improvements on the MS COCO validation dataset, with\nincreases of 4.6% in mAP@0.5 and 4% in mAP@0.5:0.95, all while keeping the\nmodel size compact at 9.49M parameters. Further extending these improvements to\nthe YOLOv5m model, the enhanced version exhibited a 1.7% and 1.9% increase in\nmAP@0.5 and mAP@0.5:0.95, respectively, with a total of 27.53M parameters.\nThese results validate the YOLO-TLA model's efficient and effective performance\nin small object detection, achieving high accuracy with fewer parameters and\ncomputational demands.\n","authors":["Chun-Lin Ji","Tao Yu","Peng Gao","Fei Wang","Ru-Yue Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.14309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19628v1","updated":"2024-07-29T01:18:47Z","published":"2024-07-29T01:18:47Z","title":"Text2LiDAR: Text-guided LiDAR Point Cloud Generation via Equirectangular\n Transformer","summary":" The complex traffic environment and various weather conditions make the\ncollection of LiDAR data expensive and challenging. Achieving high-quality and\ncontrollable LiDAR data generation is urgently needed, controlling with text is\na common practice, but there is little research in this field. To this end, we\npropose Text2LiDAR, the first efficient, diverse, and text-controllable LiDAR\ndata generation model. Specifically, we design an equirectangular transformer\narchitecture, utilizing the designed equirectangular attention to capture LiDAR\nfeatures in a manner with data characteristics. Then, we design a\ncontrol-signal embedding injector to efficiently integrate control signals\nthrough the global-to-focused attention mechanism. Additionally, we devise a\nfrequency modulator to assist the model in recovering high-frequency details,\nensuring the clarity of the generated point cloud. To foster development in the\nfield and optimize text-controlled generation performance, we construct\nnuLiDARtext which offers diverse text descriptors for 34,149 LiDAR point clouds\nfrom 850 scenes. Experiments on uncontrolled and text-controlled generation in\nvarious forms on KITTI-360 and nuScenes datasets demonstrate the superiority of\nour approach.\n","authors":["Yang Wu","Kaihua Zhang","Jianjun Qian","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15587v3","updated":"2024-07-29T01:10:15Z","published":"2024-05-24T14:18:31Z","title":"Composed Image Retrieval for Remote Sensing","summary":" This work introduces composed image retrieval to remote sensing. 
It allows to\nquery a large image archive by image examples alternated by a textual\ndescription, enriching the descriptive power over unimodal queries, either\nvisual or textual. Various attributes can be modified by the textual part, such\nas shape, color, or context. A novel method fusing image-to-image and\ntext-to-image similarity is introduced. We demonstrate that a vision-language\nmodel possesses sufficient descriptive power and no further learning step or\ntraining data are necessary. We present a new evaluation benchmark focused on\ncolor, context, density, existence, quantity, and shape modifications. Our work\nnot only sets the state-of-the-art for this task, but also serves as a\nfoundational step in addressing a gap in the field of remote sensing image\nretrieval. Code at: https://github.com/billpsomas/rscir\n","authors":["Bill Psomas","Ioannis Kakogeorgiou","Nikos Efthymiadis","Giorgos Tolias","Ondrej Chum","Yannis Avrithis","Konstantinos Karantzalos"],"pdf_url":"https://arxiv.org/pdf/2405.15587v3.pdf","comment":"Accepted for ORAL presentation at the 2024 IEEE International\n Geoscience and Remote Sensing Symposium"},{"id":"http://arxiv.org/abs/2407.19617v1","updated":"2024-07-29T00:39:51Z","published":"2024-07-29T00:39:51Z","title":"AgEval: A Benchmark for Zero-Shot and Few-Shot Plant Stress Phenotyping\n with Multimodal LLMs","summary":" Plant stress phenotyping traditionally relies on expert assessments and\nspecialized models, limiting scalability in agriculture. Recent advances in\nmultimodal large language models (LLMs) offer potential solutions to this\nchallenge. We present AgEval, a benchmark comprising 12 diverse plant stress\nphenotyping tasks, to evaluate these models' capabilities. Our study assesses\nzero-shot and few-shot in-context learning performance of state-of-the-art\nmodels, including Claude, GPT, Gemini, and LLaVA. Results show significant\nperformance improvements with few-shot learning, with F1 scores increasing from\n46.24% to 73.37% in 8-shot identification for the best-performing model.\nFew-shot examples from other classes in the dataset have negligible or negative\nimpacts, although having the exact category example helps to increase\nperformance by 15.38%. We also quantify the consistency of model performance\nacross different classes within each task, finding that the coefficient of\nvariance (CV) ranges from 26.02% to 58.03% across models, implying that subject\nmatter expertise is needed - of 'difficult' classes - to achieve reliability in\nperformance. AgEval establishes baseline metrics for multimodal LLMs in\nagricultural applications, offering insights into their promise for enhancing\nplant stress phenotyping at scale. Benchmark and code can be accessed at:\nhttps://anonymous.4open.science/r/AgEval/\n","authors":["Muhammad Arbab Arshad","Talukder Zaki Jubery","Tirtho Roy","Rim Nassiri","Asheesh K. Singh","Arti Singh","Chinmay Hegde","Baskar Ganapathysubramanian","Aditya Balu","Adarsh Krishnamurthy","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2407.19617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06644v3","updated":"2024-07-29T00:09:46Z","published":"2023-12-11T18:56:37Z","title":"AnyHome: Open-Vocabulary Generation of Structured and Textured 3D Homes","summary":" Inspired by cognitive theories, we introduce AnyHome, a framework that\ntranslates any text into well-structured and textured indoor scenes at a\nhouse-scale. 
By prompting Large Language Models (LLMs) with designed templates,\nour approach converts provided textual narratives into amodal structured\nrepresentations. These representations guarantee consistent and realistic\nspatial layouts by directing the synthesis of a geometry mesh within defined\nconstraints. A Score Distillation Sampling process is then employed to refine\nthe geometry, followed by an egocentric inpainting process that adds lifelike\ntextures to it. AnyHome stands out with its editability, customizability,\ndiversity, and realism. The structured representations for scenes allow for\nextensive editing at varying levels of granularity. Capable of interpreting\ntexts ranging from simple labels to detailed narratives, AnyHome generates\ndetailed geometries and textures that outperform existing methods in both\nquantitative and qualitative measures.\n","authors":["Rao Fu","Zehao Wen","Zichen Liu","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2312.06644v3.pdf","comment":"accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20461v1","updated":"2024-07-29T23:40:13Z","published":"2024-07-29T23:40:13Z","title":"Uncertainty-Rectified YOLO-SAM for Weakly Supervised ICH Segmentation","summary":" Intracranial hemorrhage (ICH) is a life-threatening condition that requires\nrapid and accurate diagnosis to improve treatment outcomes and patient survival\nrates. Recent advancements in supervised deep learning have greatly improved\nthe analysis of medical images, but often rely on extensive datasets with\nhigh-quality annotations, which are costly, time-consuming, and require medical\nexpertise to prepare. To mitigate the need for large amounts of expert-prepared\nsegmentation data, we have developed a novel weakly supervised ICH segmentation\nmethod that utilizes the YOLO object detection model and an\nuncertainty-rectified Segment Anything Model (SAM). In addition, we have\nproposed a novel point prompt generator for this model to further improve\nsegmentation results with YOLO-predicted bounding box prompts. Our approach\nachieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along\nwith a mean Dice score of 0.629 for ICH segmentation, outperforming existing\nweakly supervised and popular supervised (UNet and Swin-UNETR) approaches.\nOverall, the proposed method provides a robust and accurate alternative to the\nmore commonly used supervised techniques for ICH quantification without\nrequiring refined segmentation ground truths during model training.\n","authors":["Pascal Spiegler","Amirhossein Rasoulian","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.20461v1.pdf","comment":"Manuscript was accepted at SWITCH2024. 10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.12597v2","updated":"2024-07-29T23:26:11Z","published":"2024-07-17T14:21:53Z","title":"Enhancing Wrist Fracture Detection with YOLO","summary":" Diagnosing and treating abnormalities in the wrist, specifically distal\nradius, and ulna fractures, is a crucial concern among children, adolescents,\nand young adults, with a higher incidence rate during puberty. However, the\nscarcity of radiologists and the lack of specialized training among medical\nprofessionals pose a significant risk to patient care. This problem is further\nexacerbated by the rising number of imaging studies and limited access to\nspecialist reporting in certain regions. This highlights the need for\ninnovative solutions to improve the diagnosis and treatment of wrist\nabnormalities. 
Automated wrist fracture detection using object detection has\nshown potential, but current studies mainly use two-stage detection methods\nwith limited evidence for single-stage effectiveness. This study employs\nstate-of-the-art single-stage deep neural network-based detection models\nYOLOv5, YOLOv6, YOLOv7, and YOLOv8 to detect wrist abnormalities. Through\nextensive experimentation, we found that these YOLO models outperform the\ncommonly used two-stage detection algorithm, Faster R-CNN, in fracture\ndetection. Additionally, compound-scaled variants of each YOLO model were\ncompared, with YOLOv8m demonstrating the highest fracture detection sensitivity\nof 0.92 and mean average precision (mAP) of 0.95. On the other hand, YOLOv6m\nachieved the highest sensitivity across all classes at 0.83. Meanwhile, YOLOv8x\nrecorded the highest mAP of 0.77 for all classes on the GRAZPEDWRI-DX pediatric\nwrist dataset, highlighting the potential of single-stage models for enhancing\npediatric wrist imaging.\n","authors":["Ammar Ahmed","Ali Shariq Imran","Abdul Manaf","Zenun Kastrati","Sher Muhammad Daudpota"],"pdf_url":"https://arxiv.org/pdf/2407.12597v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20455v1","updated":"2024-07-29T23:19:42Z","published":"2024-07-29T23:19:42Z","title":"Learning Feature-Preserving Portrait Editing from Generated Pairs","summary":" Portrait editing is challenging for existing techniques due to difficulties\nin preserving subject features like identity. In this paper, we propose a\ntraining-based method leveraging auto-generated paired data to learn desired\nediting while ensuring the preservation of unchanged subject features.\nSpecifically, we design a data generation process to create reasonably good\ntraining pairs for desired editing at low cost. Based on these pairs, we\nintroduce a Multi-Conditioned Diffusion Model to effectively learn the editing\ndirection and preserve subject features. During inference, our model produces\nan accurate editing mask that can guide the inference process to further preserve\ndetailed subject features. Experiments on costume editing and cartoon\nexpression editing show that our method achieves state-of-the-art quality,\nquantitatively and qualitatively.\n","authors":["Bowei Chen","Tiancheng Zhi","Peihao Zhu","Shen Sang","Jing Liu","Linjie Luo"],"pdf_url":"https://arxiv.org/pdf/2407.20455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20446v1","updated":"2024-07-29T22:57:20Z","published":"2024-07-29T22:57:20Z","title":"MEVDT: Multi-Modal Event-Based Vehicle Detection and Tracking Dataset","summary":" In this data article, we introduce the Multi-Modal Event-based Vehicle\nDetection and Tracking (MEVDT) dataset. This dataset provides a synchronized\nstream of event data and grayscale images of traffic scenes, captured using the\nDynamic and Active-Pixel Vision Sensor (DAVIS) 240c hybrid event-based camera.\nMEVDT comprises 63 multi-modal sequences with approximately 13k images, 5M\nevents, 10k object labels, and 85 unique object tracking trajectories.\nAdditionally, MEVDT includes manually annotated ground truth labels\n$\\unicode{x2014}$ consisting of object classifications, pixel-precise bounding\nboxes, and unique object IDs $\\unicode{x2014}$ which are provided at a labeling\nfrequency of 24 Hz. 
Designed to advance the research in the domain of\nevent-based vision, MEVDT aims to address the critical need for high-quality,\nreal-world annotated datasets that enable the development and evaluation of\nobject detection and tracking algorithms in automotive environments.\n","authors":["Zaid A. El Shair","Samir A. Rawashdeh"],"pdf_url":"https://arxiv.org/pdf/2407.20446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06322v2","updated":"2024-07-29T22:17:31Z","published":"2024-07-08T18:38:52Z","title":"MagMax: Leveraging Model Merging for Seamless Continual Learning","summary":" This paper introduces a continual learning approach named MagMax, which\nutilizes model merging to enable large pre-trained models to continuously learn\nfrom new data without forgetting previously acquired knowledge. Distinct from\ntraditional continual learning methods that aim to reduce forgetting during\ntask training, MagMax combines sequential fine-tuning with a maximum magnitude\nweight selection for effective knowledge integration across tasks. Our initial\ncontribution is an extensive examination of model merging techniques, revealing\nthat simple approaches like weight averaging and random weight selection\nsurprisingly hold up well in various continual learning contexts. More\nimportantly, we present MagMax, a novel model-merging strategy that enables\ncontinual learning of large pre-trained models for successive tasks. Our\nthorough evaluation demonstrates the superiority of MagMax in various\nscenarios, including class- and domain-incremental learning settings. The code\nis available at this URL: https://github.com/danielm1405/magmax.\n","authors":["Daniel Marczak","Bartłomiej Twardowski","Tomasz Trzciński","Sebastian Cygert"],"pdf_url":"https://arxiv.org/pdf/2407.06322v2.pdf","comment":"Accepted for ECCV2024"},{"id":"http://arxiv.org/abs/2407.20437v1","updated":"2024-07-29T22:05:13Z","published":"2024-07-29T22:05:13Z","title":"BaseBoostDepth: Exploiting Larger Baselines For Self-supervised\n Monocular Depth Estimation","summary":" In the domain of multi-baseline stereo, the conventional understanding is\nthat, in general, increasing baseline separation substantially enhances the\naccuracy of depth estimation. However, prevailing self-supervised depth\nestimation architectures primarily use minimal frame separation and a\nconstrained stereo baseline. Larger frame separations can be employed; however,\nwe show this to result in diminished depth quality due to various factors,\nincluding significant changes in brightness, and increased areas of occlusion.\nIn response to these challenges, our proposed method, BaseBoostDepth,\nincorporates a curriculum learning-inspired optimization strategy to\neffectively leverage larger frame separations. However, we show that our\ncurriculum learning-inspired strategy alone does not suffice, as larger\nbaselines still cause pose estimation drifts. Therefore, we introduce\nincremental pose estimation to enhance the accuracy of pose estimations,\nresulting in significant improvements across all depth metrics. Additionally,\nto improve the robustness of the model, we introduce error-induced\nreconstructions, which optimize reconstructions with added error to the pose\nestimations. Ultimately, our final depth network achieves state-of-the-art\nperformance on KITTI and SYNS-patches datasets across image-based, edge-based,\nand point cloud-based metrics without increasing computational complexity at\ntest time. 
The project website can be found at\nhttps://kieran514.github.io/BaseBoostDepth-Project.\n","authors":["Kieran Saunders","Luis J. Manso","George Vogiatzis"],"pdf_url":"https://arxiv.org/pdf/2407.20437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20427v1","updated":"2024-07-29T21:38:04Z","published":"2024-07-29T21:38:04Z","title":"Mean Opinion Score as a New Metric for User-Evaluation of XAI Methods","summary":" This paper investigates the use of Mean Opinion Score (MOS), a common image\nquality metric, as a user-centric evaluation metric for XAI post-hoc\nexplainers. To measure the MOS, a user experiment is proposed, which has been\nconducted with explanation maps of intentionally distorted images. Three\nmethods from the family of feature attribution methods - Gradient-weighted\nClass Activation Mapping (Grad-CAM), Multi-Layered Feature Explanation Method\n(MLFEM), and Feature Explanation Method (FEM) - are compared with this metric.\nAdditionally, the correlation of this new user-centric metric with automatic\nmetrics is studied via Spearman's rank correlation coefficient. MOS of MLFEM\nshows the highest correlation with automatic metrics of Insertion Area Under\nCurve (IAUC) and Deletion Area Under Curve (DAUC). However, the overall\ncorrelations are limited, which highlights the lack of consensus between\nautomatic and user-centric metrics.\n","authors":["Hyeon Yu","Jenny Benois-Pineau","Romain Bourqui","Romain Giot","Alexey Zhukov"],"pdf_url":"https://arxiv.org/pdf/2407.20427v1.pdf","comment":"Supported by organization Laboratoire Bordelais de Recherche en\n Informatique, 15 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.08109v2","updated":"2024-07-29T21:36:13Z","published":"2024-03-12T22:33:08Z","title":"VANP: Learning Where to See for Navigation with Self-Supervised\n Vision-Action Pre-Training","summary":" Humans excel at efficiently navigating through crowds without collision by\nfocusing on specific visual regions relevant to navigation. However, most\nrobotic visual navigation methods rely on deep learning models pre-trained on\nvision tasks, which prioritize salient objects -- not necessarily relevant to\nnavigation and potentially misleading. Alternative approaches train specialized\nnavigation models from scratch, requiring significant computation. On the other\nhand, self-supervised learning has revolutionized computer vision and natural\nlanguage processing, but its application to robotic navigation remains\nunderexplored due to the difficulty of defining effective self-supervision\nsignals. Motivated by these observations, in this work, we propose a\nSelf-Supervised Vision-Action Model for Visual Navigation Pre-Training (VANP).\nInstead of detecting salient objects that are beneficial for tasks such as\nclassification or detection, VANP learns to focus only on specific visual\nregions that are relevant to the navigation task. To achieve this, VANP uses a\nhistory of visual observations, future actions, and a goal image for\nself-supervision, and embeds them using two small Transformer Encoders. Then,\nVANP maximizes the information between the embeddings by using a mutual\ninformation maximization objective function. We demonstrate that most\nVANP-extracted features match with human navigation intuition. 
VANP achieves\ncomparable performance as models learned end-to-end with half the training time\nand models trained on a large-scale, fully supervised dataset, i.e., ImageNet,\nwith only 0.08% data.\n","authors":["Mohammad Nazeri","Junzhe Wang","Amirreza Payandeh","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.08109v2.pdf","comment":"Extended version of the paper accepted at IROS 2024. Code:\n https://github.com/mhnazeri/VANP"},{"id":"http://arxiv.org/abs/2312.16731v3","updated":"2024-07-29T21:32:01Z","published":"2023-12-27T22:05:42Z","title":"Infinite dSprites for Disentangled Continual Learning: Separating Memory\n Edits from Generalization","summary":" The ability of machine learning systems to learn continually is hindered by\ncatastrophic forgetting, the tendency of neural networks to overwrite\npreviously acquired knowledge when learning a new task. Existing methods\nmitigate this problem through regularization, parameter isolation, or\nrehearsal, but they are typically evaluated on benchmarks comprising only a\nhandful of tasks. In contrast, humans are able to learn over long time horizons\nin dynamic, open-world environments, effortlessly memorizing unfamiliar objects\nand reliably recognizing them under various transformations. To make progress\ntowards closing this gap, we introduce Infinite dSprites, a parsimonious tool\nfor creating continual classification and disentanglement benchmarks of\narbitrary length and with full control over generative factors. We show that\nover a sufficiently long time horizon, the performance of all major types of\ncontinual learning methods deteriorates on this simple benchmark. This result\nhighlights an important and previously overlooked aspect of continual learning:\ngiven a finite modelling capacity and an arbitrarily long learning horizon,\nefficient learning requires memorizing class-specific information and\naccumulating knowledge about general mechanisms. In a simple setting with\ndirect supervision on the generative factors, we show how learning\nclass-agnostic transformations offers a way to circumvent catastrophic\nforgetting and improve classification accuracy over time. Our approach sets the\nstage for continual learning over hundreds of tasks with explicit control over\nmemorization and forgetting, emphasizing open-set classification and one-shot\ngeneralization.\n","authors":["Sebastian Dziadzio","Çağatay Yıldız","Gido M. van de Ven","Tomasz Trzciński","Tinne Tuytelaars","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2312.16731v3.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.20421v1","updated":"2024-07-29T21:22:53Z","published":"2024-07-29T21:22:53Z","title":"Event-based Optical Flow on Neuromorphic Processor: ANN vs. SNN\n Comparison based on Activation Sparsification","summary":" Spiking neural networks (SNNs) for event-based optical flow are claimed to be\ncomputationally more efficient than their artificial neural networks (ANNs)\ncounterparts, but a fair comparison is missing in the literature. In this work,\nwe propose an event-based optical flow solution based on activation\nsparsification and a neuromorphic processor, SENECA. SENECA has an event-driven\nprocessing mechanism that can exploit the sparsity in ANN activations and SNN\nspikes to accelerate the inference of both types of neural networks. The ANN\nand the SNN for comparison have similar low activation/spike density (~5%)\nthanks to our novel sparsification-aware training. 
In the hardware-in-loop\nexperiments designed to deduce the average time and energy consumption, the SNN\nconsumes 44.9ms and 927.0 microjoules, which are 62.5% and 75.2% of the ANN's\nconsumption, respectively. We find that SNN's higher efficiency attributes to\nits lower pixel-wise spike density (43.5% vs. 66.5%) that requires fewer memory\naccess operations for neuron states.\n","authors":["Yingfu Xu","Guangzhi Tang","Amirreza Yousefzadeh","Guido de Croon","Manolis Sifalakis"],"pdf_url":"https://arxiv.org/pdf/2407.20421v1.pdf","comment":"18 pages, 12 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.06106v3","updated":"2024-07-29T21:09:11Z","published":"2023-12-11T04:24:11Z","title":"AUGCAL: Improving Sim2Real Adaptation by Uncertainty Calibration on\n Augmented Synthetic Images","summary":" Synthetic data (SIM) drawn from simulators have emerged as a popular\nalternative for training models where acquiring annotated real-world images is\ndifficult. However, transferring models trained on synthetic images to\nreal-world applications can be challenging due to appearance disparities. A\ncommonly employed solution to counter this SIM2REAL gap is unsupervised domain\nadaptation, where models are trained using labeled SIM data and unlabeled REAL\ndata. Mispredictions made by such SIM2REAL adapted models are often associated\nwith miscalibration - stemming from overconfident predictions on real data. In\nthis paper, we introduce AUGCAL, a simple training-time patch for unsupervised\nadaptation that improves SIM2REAL adapted models by - (1) reducing overall\nmiscalibration, (2) reducing overconfidence in incorrect predictions and (3)\nimproving confidence score reliability by better guiding misclassification\ndetection - all while retaining or improving SIM2REAL performance. Given a base\nSIM2REAL adaptation algorithm, at training time, AUGCAL involves replacing\nvanilla SIM images with strongly augmented views (AUG intervention) and\nadditionally optimizing for a training time calibration loss on augmented SIM\npredictions (CAL intervention). We motivate AUGCAL using a brief analytical\njustification of how to reduce miscalibration on unlabeled REAL data. Through\nour experiments, we empirically show the efficacy of AUGCAL across multiple\nadaptation methods, backbones, tasks and shifts.\n","authors":["Prithvijit Chattopadhyay","Bharat Goyal","Boglarka Ecsedi","Viraj Prabhu","Judy Hoffman"],"pdf_url":"https://arxiv.org/pdf/2312.06106v3.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2312.04533v2","updated":"2024-07-29T20:40:09Z","published":"2023-12-07T18:51:19Z","title":"Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language\n Models","summary":" We introduce Dream2Real, a robotics framework which integrates\nvision-language models (VLMs) trained on 2D data into a 3D object rearrangement\npipeline. This is achieved by the robot autonomously constructing a 3D\nrepresentation of the scene, where objects can be rearranged virtually and an\nimage of the resulting arrangement rendered. These renders are evaluated by a\nVLM, so that the arrangement which best satisfies the user instruction is\nselected and recreated in the real world with pick-and-place. This enables\nlanguage-conditioned rearrangement to be performed zero-shot, without needing\nto collect a training dataset of example arrangements. 
Results on a series of\nreal-world tasks show that this framework is robust to distractors,\ncontrollable by language, capable of understanding complex multi-object\nrelations, and readily applicable to both tabletop and 6-DoF rearrangement\ntasks.\n","authors":["Ivan Kapelyukh","Yifei Ren","Ignacio Alzugaray","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2312.04533v2.pdf","comment":"ICRA 2024. Project webpage with robot videos:\n https://www.robot-learning.uk/dream2real"},{"id":"http://arxiv.org/abs/2406.13844v2","updated":"2024-07-29T20:16:23Z","published":"2024-06-19T21:11:46Z","title":"MAMA-MIA: A Large-Scale Multi-Center Breast Cancer DCE-MRI Benchmark\n Dataset with Expert Segmentations","summary":" Current research in breast cancer Magnetic Resonance Imaging (MRI),\nespecially with Artificial Intelligence (AI), faces challenges due to the lack\nof expert segmentations. To address this, we introduce the MAMA-MIA dataset,\ncomprising 1506 multi-center dynamic contrast-enhanced MRI cases with expert\nsegmentations of primary tumors and non-mass enhancement areas. These cases\nwere sourced from four publicly available collections in The Cancer Imaging\nArchive (TCIA). Initially, we trained a deep learning model to automatically\nsegment the cases, generating preliminary segmentations that significantly\nreduced expert segmentation time. Sixteen experts, averaging 9 years of\nexperience in breast cancer, then corrected these segmentations, resulting in\nthe final expert segmentations. Additionally, two radiologists conducted a\nvisual inspection of the automatic segmentations to support future quality\ncontrol studies. Alongside the expert segmentations, we provide 49 harmonized\ndemographic and clinical variables and the pretrained weights of the well-known\nnnUNet architecture trained using the DCE-MRI full-images and expert\nsegmentations. This dataset aims to accelerate the development and benchmarking\nof deep learning models and foster innovation in breast cancer diagnostics and\ntreatment planning.\n","authors":["Lidia Garrucho","Claire-Anne Reidel","Kaisar Kushibar","Smriti Joshi","Richard Osuala","Apostolia Tsirikoglou","Maciej Bobowicz","Javier del Riego","Alessandro Catanese","Katarzyna Gwoździewicz","Maria-Laura Cosaka","Pasant M. Abo-Elhoda","Sara W. Tantawy","Shorouq S. Sakrana","Norhan O. Shawky-Abdelfatah","Amr Muhammad Abdo-Salem","Androniki Kozana","Eugen Divjak","Gordana Ivanac","Katerina Nikiforaki","Michail E. Klontzas","Rosa García-Dosdá","Meltem Gulsun-Akpinar","Oğuz Lafcı","Ritse Mann","Carlos Martín-Isla","Fred Prior","Kostas Marias","Martijn P. A. Starmans","Fredrik Strand","Oliver Díaz","Laura Igual","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2406.13844v2.pdf","comment":"15 paes, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.12063v2","updated":"2024-07-29T20:15:20Z","published":"2023-11-18T21:58:28Z","title":"DatasetNeRF: Efficient 3D-aware Data Factory with Generative Radiance\n Fields","summary":" Progress in 3D computer vision tasks demands a huge amount of data, yet\nannotating multi-view images with 3D-consistent annotations, or point clouds\nwith part segmentation is both time-consuming and challenging. 
This paper\nintroduces DatasetNeRF, a novel approach capable of generating infinite,\nhigh-quality 3D-consistent 2D annotations alongside 3D point cloud\nsegmentations, while utilizing minimal 2D human-labeled annotations.\nSpecifically, we leverage the strong semantic prior within a 3D generative\nmodel to train a semantic decoder, requiring only a handful of fine-grained\nlabeled samples. Once trained, the decoder efficiently generalizes across the\nlatent space, enabling the generation of infinite data. The generated data is\napplicable across various computer vision tasks, including video segmentation\nand 3D point cloud segmentation. Our approach not only surpasses baseline\nmodels in segmentation quality, achieving superior 3D consistency and\nsegmentation precision on individual images, but also demonstrates versatility\nby being applicable to both articulated and non-articulated generative models.\nFurthermore, we explore applications stemming from our approach, such as\n3D-aware semantic editing and 3D inversion.\n","authors":["Yu Chi","Fangneng Zhan","Sibo Wu","Christian Theobalt","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2311.12063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20399v1","updated":"2024-07-29T19:50:44Z","published":"2024-07-29T19:50:44Z","title":"Analysis and Improvement of Rank-Ordered Mean Algorithm in Single-Photon\n LiDAR","summary":" Depth estimation using a single-photon LiDAR is often solved by a matched\nfilter. It is, however, error-prone in the presence of background noise. A\ncommonly used technique to reject background noise is the rank-ordered mean\n(ROM) filter previously reported by Shin \\textit{et al.} (2015). ROM rejects\nnoisy photon arrival timestamps by selecting only a small range of them around\nthe median statistics within its local neighborhood. Despite the promising\nperformance of ROM, its theoretical performance limit is unknown. In this\npaper, we theoretically characterize the ROM performance by showing that ROM\nfails when the reflectivity drops below a threshold predetermined by the depth\nand signal-to-background ratio, and its accuracy undergoes a phase transition\nat the cutoff. Based on our theory, we propose an improved signal extraction\ntechnique by selecting tight timestamp clusters. Experimental results show that\nthe proposed algorithm improves depth estimation performance over ROM by 3\norders of magnitude at the same signal intensities, and achieves high image\nfidelity at noise levels as high as 17 times that of signal.\n","authors":["William C. Yau","Weijian Zhang","Hashan Kavinga Weerasooriya","Stanley H. Chan"],"pdf_url":"https://arxiv.org/pdf/2407.20399v1.pdf","comment":"6 pages, 7 figures, submitted to the IEEE 26th International Workshop\n on Multimedia Signal Processing (MMSP)"},{"id":"http://arxiv.org/abs/2407.20395v1","updated":"2024-07-29T19:42:22Z","published":"2024-07-29T19:42:22Z","title":"Dense Self-Supervised Learning for Medical Image Segmentation","summary":" Deep learning has revolutionized medical image segmentation, but it relies\nheavily on high-quality annotations. The time, cost and expertise required to\nlabel images at the pixel-level for each new task has slowed down widespread\nadoption of the paradigm. We propose Pix2Rep, a self-supervised learning (SSL)\napproach for few-shot segmentation, that reduces the manual annotation burden\nby learning powerful pixel-level representations directly from unlabeled\nimages. 
Pix2Rep is a novel pixel-level loss and pre-training paradigm for\ncontrastive SSL on whole images. It is applied to generic encoder-decoder deep\nlearning backbones (e.g., U-Net). Whereas most SSL methods enforce invariance\nof the learned image-level representations under intensity and spatial image\naugmentations, Pix2Rep enforces equivariance of the pixel-level\nrepresentations. We demonstrate the framework on a task of cardiac MRI\nsegmentation. Results show improved performance compared to existing semi- and\nself-supervised approaches; and a 5-fold reduction in the annotation burden for\nequivalent performance versus a fully supervised U-Net baseline. This includes\na 30% (resp. 31%) DICE improvement for one-shot segmentation under\nlinear-probing (resp. fine-tuning). Finally, we also integrate the novel\nPix2Rep concept with the Barlow Twins non-contrastive SSL, which leads to even\nbetter segmentation performance.\n","authors":["Maxime Seince","Loic Le Folgoc","Luiz Augusto Facury de Souza","Elsa Angelini"],"pdf_url":"https://arxiv.org/pdf/2407.20395v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2407.20391v1","updated":"2024-07-29T19:34:23Z","published":"2024-07-29T19:34:23Z","title":"Alignment Scores: Robust Metrics for Multiview Pose Accuracy Evaluation","summary":" We propose three novel metrics for evaluating the accuracy of a set of\nestimated camera poses given the ground truth: Translation Alignment Score\n(TAS), Rotation Alignment Score (RAS), and Pose Alignment Score (PAS). The TAS\nevaluates the translation accuracy independently of the rotations, and the RAS\nevaluates the rotation accuracy independently of the translations. The PAS is\nthe average of the two scores, evaluating the combined accuracy of both\ntranslations and rotations. The TAS is computed in four steps: (1) Find the\nupper quartile of the closest-pair-distances, $d$. (2) Align the estimated\ntrajectory to the ground truth using a robust registration method. (3) Collect\nall distance errors and obtain the cumulative frequencies for multiple\nthresholds ranging from $0.01d$ to $d$ with a resolution $0.01d$. (4) Add up\nthese cumulative frequencies and normalize them such that the theoretical\nmaximum is 1. The TAS has practical advantages over the existing metrics in\nthat (1) it is robust to outliers and collinear motion, and (2) there is no\nneed to adjust parameters on different datasets. The RAS is computed in a\nsimilar manner to the TAS and is also shown to be more robust against outliers\nthan the existing rotation metrics. We verify our claims through extensive\nsimulations and provide in-depth discussion of the strengths and weaknesses of\nthe proposed metrics.\n","authors":["Seong Hun Lee","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2407.20391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20387v1","updated":"2024-07-29T19:26:24Z","published":"2024-07-29T19:26:24Z","title":"Two-Phase Segmentation Approach for Accurate Left Ventricle Segmentation\n in Cardiac MRI using Machine Learning","summary":" Accurate segmentation of the Left Ventricle (LV) holds substantial importance\ndue to its implications in disease detection, regional analysis, and the\ndevelopment of complex models for cardiac surgical planning. CMR is the gold\nstandard for diagnosis of several cardiac diseases. LV in CMR comprises\nthree distinct sections: Basal, Mid-Ventricle, and Apical. 
This research\nfocuses on the precise segmentation of the LV from Cardiac MRI (CMR) scans,\njoining with the capabilities of Machine Learning (ML). The central challenge\nin this research revolves around the absence of a set of parameters applicable\nto all three types of LV slices. Parameters optimized for basal slices often\nfall short when applied to mid-ventricular and apical slices, and vice versa.\nTo handle this issue, a new method is proposed to enhance LV segmentation. The\nproposed method involves using distinct sets of parameters for each type of\nslice, resulting in a two-phase segmentation approach. The initial phase\ncategorizes images into three groups based on the type of LV slice, while the\nsecond phase aims to segment CMR images using parameters derived from the\npreceding phase. A publicly available dataset (Automated Cardiac Diagnosis\nChallenge (ACDC)) is used. 10-Fold Cross Validation is used and it achieved a\nmean score of 0.9228. Comprehensive testing indicates that the best parameter\nset for a particular type of slice does not perform adequately for the other\nslice types. All results show that the proposed approach fills a critical void\nin parameter standardization through a two-phase segmentation model for the LV,\naiming to not only improve the accuracy of cardiac image analysis but also\ncontribute advancements to the field of LV segmentation.\n","authors":["Maria Tamoor","Abbas Raza Ali","Philemon Philip","Ruqqayia Adil","Rabia Shahid","Asma Naseer"],"pdf_url":"https://arxiv.org/pdf/2407.20387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20055v2","updated":"2024-07-29T19:14:39Z","published":"2024-06-28T17:07:11Z","title":"SpotlessSplats: Ignoring Distractors in 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) is a promising technique for 3D reconstruction,\noffering efficient training and rendering speeds, making it suitable for\nreal-time applications.However, current methods require highly controlled\nenvironments (no moving people or wind-blown elements, and consistent lighting)\nto meet the inter-view consistency assumption of 3DGS. This makes\nreconstruction of real-world captures problematic. We present SpotLessSplats,\nan approach that leverages pre-trained and general-purpose features coupled\nwith robust optimization to effectively ignore transient distractors. Our\nmethod achieves state-of-the-art reconstruction quality both visually and\nquantitatively, on casual captures. Additional results available at:\nhttps://spotlesssplats.github.io\n","authors":["Sara Sabour","Lily Goli","George Kopanas","Mark Matthews","Dmitry Lagun","Leonidas Guibas","Alec Jacobson","David J. Fleet","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2406.20055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20372v1","updated":"2024-07-29T18:49:58Z","published":"2024-07-29T18:49:58Z","title":"A Model Generalization Study in Localizing Indoor Cows with COw\n LOcalization (COLO) dataset","summary":" Precision livestock farming (PLF) increasingly relies on advanced object\nlocalization techniques to monitor livestock health and optimize resource\nmanagement. This study investigates the generalization capabilities of YOLOv8\nand YOLOv9 models for cow detection in indoor free-stall barn settings,\nfocusing on varying training data characteristics such as view angles and\nlighting, and model complexities. 
Leveraging the newly released public dataset,\nCOws LOcalization (COLO) dataset, we explore three key hypotheses: (1) Model\ngeneralization is equally influenced by changes in lighting conditions and\ncamera angles; (2) Higher model complexity guarantees better generalization\nperformance; (3) Fine-tuning with custom initial weights trained on relevant\ntasks always brings advantages to detection tasks. Our findings reveal\nconsiderable challenges in detecting cows in images taken from side views and\nunderscore the importance of including diverse camera angles in building a\ndetection model. Furthermore, our results emphasize that higher model\ncomplexity does not necessarily lead to better performance. The optimal model\nconfiguration heavily depends on the specific task and dataset. Lastly, while\nfine-tuning with custom initial weights trained on relevant tasks offers\nadvantages to detection tasks, simpler models do not benefit similarly from\nthis approach. It is more efficient to train a simple model with pre-trained\nweights without relying on prior relevant information, which can require\nintensive labor efforts. Future work should focus on adaptive methods and\nadvanced data augmentation to improve generalization and robustness. This study\nprovides practical guidelines for PLF researchers on deploying computer vision\nmodels from existing studies, highlights generalization issues, and contributes\nthe COLO dataset containing 1254 images and 11818 cow instances for further\nresearch.\n","authors":["Mautushi Das","Gonzalo Ferreira","C. P. James Chen"],"pdf_url":"https://arxiv.org/pdf/2407.20372v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.13922v2","updated":"2024-07-29T18:29:50Z","published":"2024-07-18T22:22:49Z","title":"Synthetic Counterfactual Faces","summary":" Computer vision systems have been deployed in various applications involving\nbiometrics like human faces. These systems can identify social media users,\nsearch for missing persons, and verify identity of individuals. While computer\nvision models are often evaluated for accuracy on available benchmarks, more\nannotated data is necessary to learn about their robustness and fairness\nagainst semantic distributional shifts in input data, especially in face data.\nAmong annotated data, counterfactual examples grant strong explainability\ncharacteristics. Because collecting natural face data is prohibitively\nexpensive, we put forth a generative AI-based framework to construct targeted,\ncounterfactual, high-quality synthetic face data. Our synthetic data pipeline\nhas many use cases, including face recognition systems sensitivity evaluations\nand image understanding system probes. The pipeline is validated with multiple\nuser studies. We showcase the efficacy of our face generation pipeline on a\nleading commercial vision model. We identify facial attributes that cause\nvision systems to fail.\n","authors":["Guruprasad V Ramesh","Harrison Rosenberg","Ashish Hooda","Shimaa Ahmed Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2407.13922v2.pdf","comment":"Paper under review. 
Full text and results will be updated after\n acceptance"},{"id":"http://arxiv.org/abs/2407.17673v2","updated":"2024-07-29T18:12:21Z","published":"2024-07-24T23:39:10Z","title":"CRASAR-U-DROIDs: A Large Scale Benchmark Dataset for Building Alignment\n and Damage Assessment in Georectified sUAS Imagery","summary":" This document presents the Center for Robot Assisted Search And Rescue -\nUncrewed Aerial Systems - Disaster Response Overhead Inspection Dataset\n(CRASAR-U-DROIDs) for building damage assessment and spatial alignment\ncollected from small uncrewed aerial systems (sUAS) geospatial imagery. This\ndataset is motivated by the increasing use of sUAS in disaster response and the\nlack of previous work in utilizing high-resolution geospatial sUAS imagery for\nmachine learning and computer vision models, the lack of alignment with\noperational use cases, and with hopes of enabling further investigations\nbetween sUAS and satellite imagery. The CRASAR-U-DROIDs dataset consists of\nfifty-two (52) orthomosaics from ten (10) federally declared disasters\n(Hurricane Ian, Hurricane Ida, Hurricane Harvey, Hurricane Idalia, Hurricane\nLaura, Hurricane Michael, Musset Bayou Fire, Mayfield Tornado, Kilauea\nEruption, and Champlain Towers Collapse) spanning 67.98 square kilometers\n(26.245 square miles), containing 21,716 building polygons and damage labels,\nand 7,880 adjustment annotations. The imagery was tiled and presented in\nconjunction with overlaid building polygons to a pool of 130 annotators who\nprovided human judgments of damage according to the Joint Damage Scale. These\nannotations were then reviewed via a two-stage review process in which building\npolygon damage labels were first reviewed individually and then again by\ncommittee. Additionally, the building polygons have been aligned spatially to\nprecisely overlap with the imagery to enable more performant machine learning\nmodels to be trained. It appears that CRASAR-U-DROIDs is the largest labeled\ndataset of sUAS orthomosaic imagery.\n","authors":["Thomas Manzini","Priyankari Perali","Raisa Karnik","Robin Murphy"],"pdf_url":"https://arxiv.org/pdf/2407.17673v2.pdf","comment":"16 Pages, 7 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2311.11325v2","updated":"2024-07-29T18:03:50Z","published":"2023-11-19T13:36:03Z","title":"MoVideo: Motion-Aware Video Generation with Diffusion Models","summary":" While recent years have witnessed great progress on using diffusion models\nfor video generation, most of them are simple extensions of image generation\nframeworks, which fail to explicitly consider one of the key differences\nbetween videos and images, i.e., motion. In this paper, we propose a novel\nmotion-aware video generation (MoVideo) framework that takes motion into\nconsideration from two aspects: video depth and optical flow. The former\nregulates motion by per-frame object distances and spatial layouts, while the\nlatter describes motion by cross-frame correspondences that help in preserving\nfine details and improving temporal consistency. More specifically, given a key\nframe that exists or is generated from text prompts, we first design a diffusion\nmodel with spatio-temporal modules to generate the video depth and the\ncorresponding optical flows. Then, the video is generated in the latent space\nby another spatio-temporal diffusion model under the guidance of depth, optical\nflow-based warped latent video and the calculated occlusion mask. 
Lastly, we\nuse optical flows again to align and refine different frames for better video\ndecoding from the latent space to the pixel space. In experiments, MoVideo\nachieves state-of-the-art results in both text-to-video and image-to-video\ngeneration, showing promising prompt consistency, frame consistency and visual\nquality.\n","authors":["Jingyun Liang","Yuchen Fan","Kai Zhang","Radu Timofte","Luc Van Gool","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2311.11325v2.pdf","comment":"Accepted by ECCV2024. Project page:\n https://jingyunliang.github.io/MoVideo"},{"id":"http://arxiv.org/abs/2407.12322v3","updated":"2024-07-29T18:03:50Z","published":"2024-07-17T05:47:27Z","title":"Frequency Guidance Matters: Skeletal Action Recognition by\n Frequency-Aware Mixed Transformer","summary":" Recently, transformers have demonstrated great potential for modeling\nlong-term dependencies from skeleton sequences and thereby gained\never-increasing attention in skeleton action recognition. However, the existing\ntransformer-based approaches heavily rely on the naive attention mechanism for\ncapturing the spatiotemporal features, which falls short in learning\ndiscriminative representations that exhibit similar motion patterns. To address\nthis challenge, we introduce the Frequency-aware Mixed Transformer\n(FreqMixFormer), specifically designed for recognizing similar skeletal actions\nwith subtle discriminative motions. First, we introduce a frequency-aware\nattention module to unweave skeleton frequency representations by embedding\njoint features into frequency attention maps, aiming to distinguish the\ndiscriminative movements based on their frequency coefficients. Subsequently,\nwe develop a mixed transformer architecture to incorporate spatial features\nwith frequency features to model the comprehensive frequency-spatial patterns.\nAdditionally, a temporal transformer is proposed to extract the global\ncorrelations across frames. Extensive experiments show that FreqMiXFormer\noutperforms SOTA on 3 popular skeleton action recognition datasets, including\nNTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets.\n","authors":["Wenhan Wu","Ce Zheng","Zihao Yang","Chen Chen","Srijan Das","Aidong Lu"],"pdf_url":"https://arxiv.org/pdf/2407.12322v3.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.20341v1","updated":"2024-07-29T18:00:17Z","published":"2024-07-29T18:00:17Z","title":"BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger\n Visual Cues","summary":" Effectively aligning with human judgment when evaluating machine-generated\nimage captions represents a complex yet intriguing challenge. Existing\nevaluation metrics like CIDEr or CLIP-Score fall short in this regard as they\ndo not take into account the corresponding image or lack the capability of\nencoding fine-grained details and penalizing hallucinations. To overcome these\nissues, in this paper, we propose BRIDGE, a new learnable and reference-free\nimage captioning metric that employs a novel module to map visual features into\ndense vectors and integrates them into multi-modal pseudo-captions which are\nbuilt during the evaluation process. This approach results in a multimodal\nmetric that properly incorporates information from the input image without\nrelying on reference captions, bridging the gap between human judgment and\nmachine-generated image captions. 
Experiments spanning several datasets\ndemonstrate that our proposal achieves state-of-the-art results compared to\nexisting reference-free evaluation scores. Our source code and trained models\nare publicly available at: https://github.com/aimagelab/bridge-score.\n","authors":["Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2407.20341v1.pdf","comment":"ECCV 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.20207v1","updated":"2024-07-29T17:39:08Z","published":"2024-07-29T17:39:08Z","title":"QAEA-DR: A Unified Text Augmentation Framework for Dense Retrieval","summary":" In dense retrieval, embedding long texts into dense vectors can result in\ninformation loss, leading to inaccurate query-text matching. Additionally,\nlow-quality texts with excessive noise or sparse key information are unlikely\nto align well with relevant queries. Recent studies mainly focus on improving\nthe sentence embedding model or retrieval process. In this work, we introduce a\nnovel text augmentation framework for dense retrieval. This framework\ntransforms raw documents into information-dense text formats, which supplement\nthe original texts to effectively address the aforementioned issues without\nmodifying embedding or retrieval methodologies. Two text representations are\ngenerated via large language models (LLMs) zero-shot prompting: question-answer\npairs and element-driven events. We term this approach QAEA-DR: unifying\nquestion-answer generation and event extraction in a text augmentation\nframework for dense retrieval. To further enhance the quality of generated\ntexts, a scoring-based evaluation and regeneration mechanism is introduced in\nLLM prompting. Our QAEA-DR model has a positive impact on dense retrieval,\nsupported by both theoretical analysis and empirical experiments.\n","authors":["Hongming Tan","Shaoxiong Zhan","Hai Lin","Hai-Tao Zheng","Wai Kin"," Chan"],"pdf_url":"https://arxiv.org/pdf/2407.20207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20189v1","updated":"2024-07-29T17:14:36Z","published":"2024-07-29T17:14:36Z","title":"Aligning Query Representation with Rewritten Query and Relevance\n Judgments in Conversational Search","summary":" Conversational search supports multi-turn user-system interactions to solve\ncomplex information needs. Different from the traditional single-turn ad-hoc\nsearch, conversational search encounters a more challenging problem of\ncontext-dependent query understanding with the lengthy and long-tail\nconversational history context. While conversational query rewriting methods\nleverage explicit rewritten queries to train a rewriting model to transform the\ncontext-dependent query into a stand-stone search query, this is usually done\nwithout considering the quality of search results. Conversational dense\nretrieval methods use fine-tuning to improve a pre-trained ad-hoc query\nencoder, but they are limited by the conversational search data available for\ntraining. In this paper, we leverage both rewritten queries and relevance\njudgments in the conversational search data to train a better query\nrepresentation model. The key idea is to align the query representation with\nthose of rewritten queries and relevant documents. The proposed model -- Query\nRepresentation Alignment Conversational Dense Retriever, QRACDR, is tested on\neight datasets, including various settings in conversational search and ad-hoc\nsearch. 
The results demonstrate the strong performance of QRACDR compared with\nstate-of-the-art methods, and confirm the effectiveness of representation\nalignment.\n","authors":["Fengran Mo","Chen Qu","Kelong Mao","Yihong Wu","Zhan Su","Kaiyu Huang","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2407.20189v1.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2407.13349v3","updated":"2024-07-29T16:30:42Z","published":"2024-07-18T09:49:13Z","title":"DCNv3: Towards Next Generation Deep Cross Network for CTR Prediction","summary":" Deep & Cross Network and its derivative models have become an important\nparadigm in click-through rate (CTR) prediction due to their effective balance\nbetween computational cost and performance. However, these models face four\nmajor limitations: (1) while most models claim to capture high-order feature\ninteractions, they often do so implicitly and non-interpretably through deep\nneural networks (DNN), which limits the trustworthiness of the model's\npredictions; (2) the performance of existing explicit feature interaction\nmethods is often weaker than that of implicit DNN, undermining their necessity;\n(3) many models fail to adaptively filter noise while enhancing the order of\nfeature interactions; (4) the fusion methods of most models cannot provide\nsuitable supervision signals for their different interaction methods.\n To address the identified limitations, this paper proposes the next\ngeneration Deep Cross Network (DCNv3) and Shallow & Deep Cross Network\n(SDCNv3). These models ensure interpretability in feature interaction modeling\nwhile exponentially increasing the order of feature interactions to achieve\ngenuine Deep Crossing rather than just Deep & Cross. Additionally, we employ a\nSelf-Mask operation to filter noise and reduce the number of parameters in the\ncross network by half. In the fusion layer, we use a simple yet effective loss\nweight calculation method called Tri-BCE to provide appropriate supervision\nsignals. Comprehensive experiments on six datasets demonstrate the\neffectiveness, efficiency, and interpretability of DCNv3 and SDCNv3. The code,\nrunning logs, and detailed hyperparameter configurations are available at:\nhttps://anonymous.4open.science/r/DCNv3-E352.\n","authors":["Honghao Li","Yiwen Zhang","Yi Zhang","Hanwei Li","Lei Sang"],"pdf_url":"https://arxiv.org/pdf/2407.13349v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02231v2","updated":"2024-07-29T16:08:16Z","published":"2024-03-04T17:21:19Z","title":"CODE-ACCORD: A Corpus of Building Regulatory Data for Rule Generation\n towards Automatic Compliance Checking","summary":" Automatic Compliance Checking (ACC) within the Architecture, Engineering, and\nConstruction (AEC) sector necessitates automating the interpretation of\nbuilding regulations to achieve its full potential. Converting textual rules\ninto machine-readable formats is challenging due to the complexities of natural\nlanguage and the scarcity of resources for advanced Machine Learning (ML).\nAddressing these challenges, we introduce CODE-ACCORD, a dataset of 862\nsentences from the building regulations of England and Finland. Only the\nself-contained sentences, which express complete rules without needing\nadditional context, were considered as they are essential for ACC. Each\nsentence was manually annotated with entities and relations by a team of 12\nannotators to facilitate machine-readable rule generation, followed by careful\ncuration to ensure accuracy. 
The final dataset comprises 4,297 entities and\n4,329 relations across various categories, serving as a robust ground truth.\nCODE-ACCORD supports a range of ML and Natural Language Processing (NLP) tasks,\nincluding text classification, entity recognition, and relation extraction. It\nenables applying recent trends, such as deep neural networks and large language\nmodels, to ACC.\n","authors":["Hansi Hettiarachchi","Amna Dridi","Mohamed Medhat Gaber","Pouyan Parsafard","Nicoleta Bocaneala","Katja Breitenfelder","Gonçal Costa","Maria Hedblom","Mihaela Juganaru-Mathieu","Thamer Mecharnia","Sumee Park","He Tan","Abdel-Rahman H. Tawil","Edlira Vakaj"],"pdf_url":"https://arxiv.org/pdf/2403.02231v2.pdf","comment":"This is a preprint of an article submitted to the Scientific Data\n Journal"},{"id":"http://arxiv.org/abs/2407.20121v1","updated":"2024-07-29T15:52:09Z","published":"2024-07-29T15:52:09Z","title":"EXIT: An EXplicit Interest Transfer Framework for Cross-Domain\n Recommendation","summary":" Cross-domain recommendation has attracted substantial interest in industrial\napps such as Meituan, which serves multiple business domains via knowledge\ntransfer and meets the diverse interests of users. However, existing methods\ntypically follow an implicit modeling paradigm that blends the knowledge from\nboth the source and target domains, and design intricate network structures to\nshare learned embeddings or patterns between domains to improve recommendation\naccuracy. Since the transfer of interest signals is unsupervised, these\nimplicit paradigms often struggle with the negative transfer resulting from\ndifferences in service functions and presentation forms across different\ndomains. In this paper, we propose a simple and effective EXplicit Interest\nTransfer framework named EXIT to address the stated challenge. Specifically, we\npropose a novel label combination approach that enables the model to directly\nlearn beneficial source domain interests through supervised learning, while\nexcluding inappropriate interest signals. Moreover, we introduce a scene\nselector network to model the interest transfer intensity under fine-grained\nscenes. Offline experiments conducted on the industrial production dataset and\nonline A/B tests validate the superiority and effectiveness of our proposed\nframework. Without complex network structures or training processes, EXIT can\nbe easily deployed in the industrial recommendation system. EXIT has been\nsuccessfully deployed in the online homepage recommendation system of Meituan\nApp, serving the main traffic.\n","authors":["Lei Huang","Weitao Li","Chenrui Zhang","Jinpeng Wang","Xianchun Yi","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2407.20121v1.pdf","comment":"Accepted at CIKM 2024"},{"id":"http://arxiv.org/abs/2407.20114v1","updated":"2024-07-29T15:44:22Z","published":"2024-07-29T15:44:22Z","title":"FiCo-ITR: bridging fine-grained and coarse-grained image-text retrieval\n for comparative performance analysis","summary":" In the field of Image-Text Retrieval (ITR), recent advancements have\nleveraged large-scale Vision-Language Pretraining (VLP) for Fine-Grained (FG)\ninstance-level retrieval, achieving high accuracy at the cost of increased\ncomputational complexity. For Coarse-Grained (CG) category-level retrieval,\nprominent approaches employ Cross-Modal Hashing (CMH) to prioritise efficiency,\nalbeit at the cost of retrieval performance. 
Due to differences in\nmethodologies, FG and CG models are rarely compared directly within evaluations\nin the literature, resulting in a lack of empirical data quantifying the\nretrieval performance-efficiency tradeoffs between the two. This paper\naddresses this gap by introducing the \\texttt{FiCo-ITR} library, which\nstandardises evaluation methodologies for both FG and CG models, facilitating\ndirect comparisons. We conduct empirical evaluations of representative models\nfrom both subfields, analysing precision, recall, and computational complexity\nacross varying data scales. Our findings offer new insights into the\nperformance-efficiency trade-offs between recent representative FG and CG\nmodels, highlighting their respective strengths and limitations. These findings\nprovide the foundation necessary to make more informed decisions regarding\nmodel selection for specific retrieval tasks and highlight avenues for future\nresearch into hybrid systems that leverage the strengths of both FG and CG\napproaches.\n","authors":["Mikel Williams-Lekuona","Georgina Cosma"],"pdf_url":"https://arxiv.org/pdf/2407.20114v1.pdf","comment":"19 pages, submitted to International Journal of Multimedia\n Information Retrieval"},{"id":"http://arxiv.org/abs/2312.00326v3","updated":"2024-07-29T13:40:11Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging LLM Agents for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM agents have\nrevolutionised data engineering and have been applied creatively in many\ndomains, their potential for OM remains underexplored. This study introduces a\nnovel agent-powered LLM-based design paradigm for OM systems. With\nconsideration of several specific challenges in leveraging LLM agents for OM,\nwe propose a generic framework, namely Agent-OM (w.r.t. Agent for Ontology\nMatching), consisting of two Siamese agents for retrieval and matching, with a\nset of simple OM tools. Our framework is implemented in a proof-of-concept\nsystem. Evaluations of three Ontology Alignment Evaluation Initiative (OAEI)\ntracks over state-of-the-art OM systems show that our system can achieve\nresults very close to the long-standing best performance on simple OM tasks and\ncan significantly improve the performance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v3.pdf","comment":"19 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.19943v1","updated":"2024-07-29T12:23:59Z","published":"2024-07-29T12:23:59Z","title":"Practical and Robust Safety Guarantees for Advanced Counterfactual\n Learning to Rank","summary":" Counterfactual learning to rank (CLTR ) can be risky; various circumstances\ncan cause it to produce sub-optimal models that hurt performance when deployed.\nSafe CLTR was introduced to mitigate these risks when using inverse propensity\nscoring to correct for position bias. However, the existing safety measure for\nCLTR is not applicable to state-of-the-art CLTR, it cannot handle trust bias,\nand its guarantees rely on specific assumptions about user behavior. Our\ncontributions are two-fold. 
First, we generalize the existing safe CLTR\napproach to make it applicable to state-of-the-art doubly robust (DR) CLTR and\ntrust bias. Second, we propose a novel approach, proximal ranking policy\noptimization (PRPO), that provides safety in deployment without assumptions\nabout user behavior. PRPO removes incentives for learning ranking behavior that\nis too dissimilar to a safe ranking model. Thereby, PRPO imposes a limit on how\nmuch learned models can degrade performance metrics, without relying on any\nspecific user assumptions. Our experiments show that both our novel safe doubly\nrobust method and PRPO provide higher performance than the existing safe\ninverse propensity scoring approach. However, when circumstances are\nunexpected, the safe doubly robust approach can become unsafe and bring\ndetrimental performance. In contrast, PRPO always maintains safety, even in\nmaximally adversarial situations. By avoiding assumptions, PRPO is the first\nmethod with unconditional safety in deployment that translates to robust safety\nfor real-world applications.\n","authors":["Shashank Gupta","Harrie Oosterhuis","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2407.19943v1.pdf","comment":"Full paper at CIKM 2024"},{"id":"http://arxiv.org/abs/2407.19937v1","updated":"2024-07-29T12:17:48Z","published":"2024-07-29T12:17:48Z","title":"AOTree: Aspect Order Tree-based Model for Explainable Recommendation","summary":" Recent recommender systems aim to provide not only accurate recommendations\nbut also explanations that help users understand them better. However, most\nexisting explainable recommendations only consider the importance of content in\nreviews, such as words or aspects, and ignore the ordering relationship among\nthem. This oversight neglects crucial ordering dimensions in the human\ndecision-making process, leading to suboptimal performance. Therefore, in this\npaper, we propose an Aspect Order Tree-based (AOTree) explainable recommendation\nmethod, inspired by the Order Effects Theory from cognitive and decision\npsychology, in order to capture the dependency relationships among decisive\nfactors. We first validate the theory in the recommendation scenario by\nanalyzing the reviews of the users. Then, according to the theory, the proposed\nAOTree expands the construction of the decision tree to capture aspect orders\nin users' decision-making processes, and uses attention mechanisms to make\npredictions based on the aspect orders. Extensive experiments demonstrate our\nmethod's effectiveness on rating predictions, and our approach aligns more\nconsistently with the user's decision-making process by displaying\nexplanations in a particular order, thereby enhancing interpretability.\n","authors":["Wenxin Zhao","Peng Zhang","Hansu Gu","Dongsheng Li","Tun Lu","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2407.19937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19914v1","updated":"2024-07-29T11:44:21Z","published":"2024-07-29T11:44:21Z","title":"Sentiment Analysis of Lithuanian Online Reviews Using Large Language\n Models","summary":" Sentiment analysis is a widely researched area within Natural Language\nProcessing (NLP), attracting significant interest due to the advent of\nautomated solutions. Despite this, the task remains challenging because of the\ninherent complexity of languages and the subjective nature of sentiments. It is\neven more challenging for less-studied and less-resourced languages such as\nLithuanian. 
Our review of existing Lithuanian NLP research reveals that\ntraditional machine learning methods and classification algorithms have limited\neffectiveness for the task. In this work, we address sentiment analysis of\nLithuanian five-star-based online reviews from multiple domains that we collect\nand clean. We apply transformer models to this task for the first time,\nexploring the capabilities of pre-trained multilingual Large Language Models\n(LLMs), specifically focusing on fine-tuning BERT and T5 models. Given the\ninherent difficulty of the task, the fine-tuned models perform quite well,\nespecially when the sentiments themselves are less ambiguous: 80.74% and 89.61%\ntesting recognition accuracy of the most popular one- and five-star reviews\nrespectively. They significantly outperform current commercial state-of-the-art\ngeneral-purpose LLM GPT-4. We openly share our fine-tuned LLMs online.\n","authors":["Brigita Vileikytė","Mantas Lukoševičius","Lukas Stankevičius"],"pdf_url":"https://arxiv.org/pdf/2407.19914v1.pdf","comment":"Accepted at the 29th International Conference on Information Society\n and University Studies (IVUS 2024)"},{"id":"http://arxiv.org/abs/2407.19829v1","updated":"2024-07-29T09:31:19Z","published":"2024-07-29T09:31:19Z","title":"Generative Retrieval with Preference Optimization for E-commerce Search","summary":" Generative retrieval introduces a groundbreaking paradigm to document\nretrieval by directly generating the identifier of a pertinent document in\nresponse to a specific query. This paradigm has demonstrated considerable\nbenefits and potential, particularly in representation and generalization\ncapabilities, within the context of large language models. However, it faces\nsignificant challenges in E-commerce search scenarios, including the complexity\nof generating detailed item titles from brief queries, the presence of noise in\nitem titles with weak language order, issues with long-tail queries, and the\ninterpretability of results. To address these challenges, we have developed an\ninnovative framework for E-commerce search, called generative retrieval with\npreference optimization. This framework is designed to effectively learn and\nalign an autoregressive model with target data, subsequently generating the\nfinal item through constraint-based beam search. By employing multi-span\nidentifiers to represent raw item titles and transforming the task of\ngenerating titles from queries into the task of generating multi-span\nidentifiers from queries, we aim to simplify the generation process. The\nframework further aligns with human preferences using click data and employs a\nconstrained search method to identify key spans for retrieving the final item,\nthereby enhancing result interpretability. 
Our extensive experiments show that\nthis framework achieves competitive performance on a real-world dataset, and\nonline A/B tests demonstrate the superiority and effectiveness in improving\nconversion gains.\n","authors":["Mingming Li","Huimu Wang","Zuxu Chen","Guangtao Nie","Yiming Qiu","Binbin Wang","Guoyu Tang","Lin Liu","Jingwei Zhuo"],"pdf_url":"https://arxiv.org/pdf/2407.19829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19823v1","updated":"2024-07-29T09:17:16Z","published":"2024-07-29T09:17:16Z","title":"Analyzing and reducing the synthetic-to-real transfer gap in Music\n Information Retrieval: the task of automatic drum transcription","summary":" Automatic drum transcription is a critical tool in Music Information\nRetrieval for extracting and analyzing the rhythm of a music track, but it is\nlimited by the size of the datasets available for training. A popular method\nused to increase the amount of data is by generating them synthetically from\nmusic scores rendered with virtual instruments. This method can produce a\nvirtually infinite quantity of tracks, but empirical evidence shows that models\ntrained on previously created synthetic datasets do not transfer well to real\ntracks. In this work, besides increasing the amount of data, we identify and\nevaluate three more strategies that practitioners can use to improve the\nrealism of the generated data and, thus, narrow the synthetic-to-real transfer\ngap. To explore their efficacy, we used them to build a new synthetic dataset\nand then we measured how the performance of a model scales and, specifically,\nat what value it will stagnate when increasing the number of training tracks\nfor different datasets. By doing this, we were able to prove that the\naforementioned strategies contribute to make our dataset the one with the most\nrealistic data distribution and the lowest synthetic-to-real transfer gap among\nthe synthetic datasets we evaluated. We conclude by highlighting the limits of\ntraining with infinite data in drum transcription and we show how they can be\novercome.\n","authors":["Mickaël Zehren","Marco Alunno","Paolo Bientinesi"],"pdf_url":"https://arxiv.org/pdf/2407.19823v1.pdf","comment":"21 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.19812v1","updated":"2024-07-29T09:05:04Z","published":"2024-07-29T09:05:04Z","title":"Image-text matching for large-scale book collections","summary":" We address the problem of detecting and mapping all books in a collection of\nimages to entries in a given book catalogue. Instead of performing independent\nretrieval for each book detected, we treat the image-text mapping problem as a\nmany-to-many matching process, looking for the best overall match between the\ntwo sets. We combine a state-of-the-art segmentation method (SAM) to detect\nbook spines and extract book information using a commercial OCR. We then\npropose a two-stage approach for text-image matching, where CLIP embeddings are\nused first for fast matching, followed by a second slower stage to refine the\nmatching, employing either the Hungarian Algorithm or a BERT-based model\ntrained to cope with noisy OCR input and partial text matches. To evaluate our\napproach, we publish a new dataset of annotated bookshelf images that covers\nthe whole book collection of a public library in Spain. In addition, we provide\ntwo target lists of book metadata, a closed-set of 15k book titles that\ncorresponds to the known library inventory, and an open-set of 2.3M book titles\nto simulate an open-world scenario. 
We report results on two settings, on one\nhand on a matching-only task, where the book segments and OCR is given and the\nobjective is to perform many-to-many matching against the target lists, and a\ncombined detection and matching task, where books must be first detected and\nrecognised before they are matched to the target list entries. We show that\nboth the Hungarian Matching and the proposed BERT-based model outperform a\nfuzzy string matching baseline, and we highlight inherent limitations of the\nmatching algorithms as the target increases in size, and when either of the two\nsets (detected books or target book list) is incomplete. The dataset and code\nare available at https://github.com/llabres/library-dataset\n","authors":["Artemis Llabrés","Arka Ujjal Dey","Dimosthenis Karatzas","Ernest Valveny"],"pdf_url":"https://arxiv.org/pdf/2407.19812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19727v1","updated":"2024-07-29T06:17:33Z","published":"2024-07-29T06:17:33Z","title":"Adaptive Utilization of Cross-scenario Information for Multi-scenario\n Recommendation","summary":" Recommender systems of e-commerce platforms usually serve multiple\nbusiness scenarios. Multi-scenario Recommendation (MSR) is an important topic\nthat improves ranking performance by leveraging information from different\nscenarios. Recent methods for MSR mostly construct scenario shared or specific\nmodules to model commonalities and differences among scenarios. However, when\nthe amount of data among scenarios is skewed or data in some scenarios is\nextremely sparse, it is difficult to learn scenario-specific parameters well.\nBesides, simple sharing of information from other scenarios may result in a\nnegative transfer. In this paper, we propose a unified model named\nCross-Scenario Information Interaction (CSII) to serve all scenarios by a\nmixture of scenario-dominated experts. Specifically, we propose a novel method\nto select highly transferable features in data instances. Then, we propose an\nattention-based aggregator module, which can adaptively extract relative\nknowledge from cross-scenario. Experiments on the production dataset verify the\nsuperiority of our method. Online A/B test in Meituan Waimai APP also shows a\nsignificant performance gain, leading to an average improvement in GMV (Gross\nMerchandise Value) of 1.0% for overall scenarios.\n","authors":["Xiufeng Shu","Ruidong Han","Xiang Li","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2407.19727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19692v1","updated":"2024-07-29T04:30:38Z","published":"2024-07-29T04:30:38Z","title":"High-Order Fusion Graph Contrastive Learning for Recommendation","summary":" Self-supervised learning (SSL) has recently attracted significant attention\nin the field of recommender systems. Contrastive learning (CL) stands out as a\nmajor SSL paradigm due to its robust ability to generate self-supervised\nsignals. Mainstream graph contrastive learning (GCL)-based methods typically\nimplement CL by creating contrastive views through various data augmentation\ntechniques. Although these methods are effective, we argue that there still\nexist several challenges: i) Data augmentation (e.g., discarding edges or\nadding noise) necessitates additional graph convolution (GCN) or modeling\noperations, which are highly time-consuming and potentially harm the embedding\nquality. ii) Existing CL-based methods use traditional CL objectives to capture\nself-supervised signals. 
However, few studies have explored obtaining CL\nobjectives from more perspectives and have attempted to fuse the varying\nsignals from these CL objectives to enhance recommendation performance.\n To overcome these challenges, we propose a High-Order Fusion Graph\nContrastive Learning (HFGCL) framework for recommendation. Specifically, we\ndiscard data augmentation and instead use high-order information from the GCN\nprocess to create contrastive views. Additionally, to integrate self-supervised\nsignals from various CL objectives, we propose an advanced CL objective. By\nensuring that positive pairs are distanced from negative samples derived from\nboth contrastive views, we effectively fuse self-supervised signals from\ndistinct CL objectives, thereby enhancing the mutual information between\npositive pairs. Experimental results on three public datasets demonstrate the\nsuperior effectiveness of HFGCL compared to the state-of-the-art baselines.\n","authors":["Yu Zhang","Lei Sang","Yi Zhang","Yiwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.19692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19682v1","updated":"2024-07-29T03:54:00Z","published":"2024-07-29T03:54:00Z","title":"GradCraft: Elevating Multi-task Recommendations through Holistic\n Gradient Crafting","summary":" Recommender systems require the simultaneous optimization of multiple\nobjectives to accurately model user interests, necessitating the application of\nmulti-task learning methods. However, existing multi-task learning methods in\nrecommendations overlook the specific characteristics of recommendation\nscenarios, falling short in achieving proper gradient balance. To address this\nchallenge, we set the target of multi-task learning as attaining the\nappropriate magnitude balance and the global direction balance, and propose an\ninnovative methodology named GradCraft in response. GradCraft dynamically\nadjusts gradient magnitudes to align with the maximum gradient norm, mitigating\ninterference from gradient magnitudes for subsequent manipulation. It then\nemploys projections to eliminate gradient conflicts in directions while\nconsidering all conflicting tasks simultaneously, theoretically guaranteeing\nthe global resolution of direction conflicts. GradCraft ensures the concurrent\nachievement of appropriate magnitude balance and global direction balance,\naligning with the inherent characteristics of recommendation scenarios. Both\noffline and online experiments attest to the efficacy of GradCraft in enhancing\nmulti-task performance in recommendations. The source code for GradCraft can be\naccessed at https://github.com/baiyimeng/GradCraft.\n","authors":["Yimeng Bai","Yang Zhang","Fuli Feng","Jing Lu","Xiaoxue Zang","Chenyi Lei","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2407.19682v1.pdf","comment":"Accepted by KDD'24"},{"id":"http://arxiv.org/abs/2305.11527v4","updated":"2024-07-29T03:41:34Z","published":"2023-05-19T08:51:11Z","title":"InstructIE: A Bilingual Instruction-based Information Extraction Dataset","summary":" Large language models can perform well on general natural language tasks, but\ntheir effectiveness is still suboptimal for information extraction (IE). Recent\nworks indicate that the main reason lies in the lack of extensive data on IE\ninstructions. Note that the existing datasets on IE instructions not only have\nlimited coverage but also involve high construction costs. 
To address this\nissue, we introduce InstructIE, a bilingual instruction-based IE dataset, which\ncovers 12 diverse domains. We propose KG2Instruction, a framework specifically\nfor the automatic generation of such datasets. Additionally, we manually\nannotate the test set. Experimental results demonstrate that large language\nmodels trained with InstructIE can not only obtain better IE capabilities but\nalso enhance zero-shot performance compared with baselines.\n","authors":["Honghao Gui","Shuofei Qiao","Jintian Zhang","Hongbin Ye","Mengshu Sun","Lei Liang","Jeff Z. Pan","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.11527v4.pdf","comment":"ISWC 2024; project homepage:\n https://www.zjukg.org/project/InstructIE/ dataset:\n https://huggingface.co/datasets/zjunlp/InstructIE"},{"id":"http://arxiv.org/abs/2407.19669v1","updated":"2024-07-29T03:12:28Z","published":"2024-07-29T03:12:28Z","title":"mGTE: Generalized Long-Context Text Representation and Reranking Models\n for Multilingual Text Retrieval","summary":" We present systematic efforts in building long-context multilingual text\nrepresentation model (TRM) and reranker from scratch for text retrieval. We\nfirst introduce a text encoder (base size) enhanced with RoPE and unpadding,\npre-trained in a native 8192-token context (longer than 512 of previous\nmultilingual encoders). Then we construct a hybrid TRM and a cross-encoder\nreranker by contrastive learning. Evaluations show that our text encoder\noutperforms the same-sized previous state-of-the-art XLM-R. Meanwhile, our TRM\nand reranker match the performance of large-sized state-of-the-art BGE-M3\nmodels and achieve better results on long-context retrieval benchmarks. Further\nanalysis demonstrate that our proposed models exhibit higher efficiency during\nboth training and inference. We believe their efficiency and effectiveness\ncould benefit various researches and industrial applications.\n","authors":["Xin Zhang","Yanzhao Zhang","Dingkun Long","Wen Xie","Ziqi Dai","Jialong Tang","Huan Lin","Baosong Yang","Pengjun Xie","Fei Huang","Meishan Zhang","Wenjie Li","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.19669v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.19658v1","updated":"2024-07-29T02:49:11Z","published":"2024-07-29T02:49:11Z","title":"Enhancing CTR Prediction through Sequential Recommendation Pre-training:\n Introducing the SRP4CTR Framework","summary":" Understanding user interests is crucial for Click-Through Rate (CTR)\nprediction tasks. In sequential recommendation, pre-training from user\nhistorical behaviors through self-supervised learning can better comprehend\nuser dynamic preferences, presenting the potential for direct integration with\nCTR tasks. Previous methods have integrated pre-trained models into downstream\ntasks with the sole purpose of extracting semantic information or\nwell-represented user features, which are then incorporated as new features.\nHowever, these approaches tend to ignore the additional inference costs to the\ndownstream tasks, and they do not consider how to transfer the effective\ninformation from the pre-trained models for specific estimated items in CTR\nprediction. In this paper, we propose a Sequential Recommendation Pre-training\nframework for CTR prediction (SRP4CTR) to tackle the above problems. 
Initially,\nwe discuss the impact of introducing pre-trained models on inference costs.\nSubsequently, we introduce a pre-training method to encode sequence side\ninformation concurrently. During the fine-tuning process, we incorporate a\ncross-attention block to establish a bridge between estimated items and the\npre-trained model at a low cost. Moreover, we develop a querying transformer\ntechnique to facilitate the knowledge transfer from the pre-trained model to\nindustrial CTR models. Offline and online experiments show that our method\noutperforms previous baseline models.\n","authors":["Ruidong Han","Qianzhong Li","He Jiang","Rui Li","Yurou Zhao","Xiang Li","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2407.19658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18910v2","updated":"2024-07-29T01:26:51Z","published":"2024-07-26T17:59:32Z","title":"Do We Really Need Graph Convolution During Training? Light Post-Training\n Graph-ODE for Efficient Recommendation","summary":" The efficiency and scalability of graph convolution networks (GCNs) in\ntraining recommender systems (RecSys) have been persistent concerns, hindering\ntheir deployment in real-world applications. This paper presents a critical\nexamination of the necessity of graph convolutions during the training phase\nand introduces an innovative alternative: the Light Post-Training Graph\nOrdinary-Differential-Equation (LightGODE). Our investigation reveals that the\nbenefits of GCNs are more pronounced during testing rather than training.\nMotivated by this, LightGODE utilizes a novel post-training graph convolution\nmethod that bypasses the computation-intensive message passing of GCNs and\nemploys a non-parametric continuous graph ordinary-differential-equation (ODE)\nto dynamically model node representations. This approach drastically reduces\ntraining time while achieving fine-grained post-training graph convolution to\navoid the distortion of the original training embedding space, termed the\nembedding discrepancy issue. We validate our model across several real-world\ndatasets of different scales, demonstrating that LightGODE not only outperforms\nGCN-based models in terms of efficiency and effectiveness but also\nsignificantly mitigates the embedding discrepancy commonly associated with\ndeeper graph convolution layers. Our LightGODE challenges the prevailing\nparadigms in RecSys training and suggests re-evaluating the role of graph\nconvolutions, potentially guiding future developments of efficient large-scale\ngraph-based RecSys.\n","authors":["Weizhi Zhang","Liangwei Yang","Zihe Song","Henry Peng Zou","Ke Xu","Liancheng Fang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18910v2.pdf","comment":"Accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2407.20462v1","updated":"2024-07-29T23:41:26Z","published":"2024-07-29T23:41:26Z","title":"Graphite: A Graph-based Extreme Multi-Label Short Text Classifier for\n Keyphrase Recommendation","summary":" Keyphrase Recommendation has been a pivotal problem in advertising and\ne-commerce where advertisers/sellers are recommended keyphrases (search\nqueries) to bid on to increase their sales. It is a challenging task due to the\nplethora of items shown on online platforms and various possible queries that\nusers search while showing varying interest in the displayed items. Moreover,\nquery/keyphrase recommendations need to be made in real-time and in a\nresource-constrained environment. 
This problem can be framed as an Extreme\nMulti-label (XML) Short text classification by tagging the input text with\nkeywords as labels. Traditional neural network models are either infeasible or\nhave slower inference latency due to large label spaces. We present Graphite, a\ngraph-based classifier model that provides real-time keyphrase recommendations\nthat are on par with standard text classification models. Furthermore, it\ndoesn't utilize GPU resources, which can be limited in production environments.\nDue to its lightweight nature and smaller footprint, it can train on very large\ndatasets, where state-of-the-art XML models fail due to extreme resource\nrequirements. Graphite is deterministic, transparent, and intrinsically more\ninterpretable than neural network-based models. We present a comprehensive\nanalysis of our model's performance across forty categories spanning eBay's\nEnglish-speaking sites.\n","authors":["Ashirbad Mishra","Soumik Dey","Jinyu Zhao","Marshall Wu","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2407.20462v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.20232v1","updated":"2024-07-29T17:59:57Z","published":"2024-07-29T17:59:57Z","title":"Specify and Edit: Overcoming Ambiguity in Text-Based Image Editing","summary":" Text-based editing diffusion models exhibit limited performance when the\nuser's input instruction is ambiguous. To solve this problem, we propose\n$\\textit{Specify ANd Edit}$ (SANE), a zero-shot inference pipeline for\ndiffusion-based editing systems. We use a large language model (LLM) to\ndecompose the input instruction into specific instructions, i.e. well-defined\ninterventions to apply to the input image to satisfy the user's request. We\nbenefit from the LLM-derived instructions along the original one, thanks to a\nnovel denoising guidance strategy specifically designed for the task. Our\nexperiments with three baselines and on two datasets demonstrate the benefits\nof SANE in all setups. Moreover, our pipeline improves the interpretability of\nediting models, and boosts the output diversity. We also demonstrate that our\napproach can be applied to any edit, whether ambiguous or not. Our code is\npublic at https://github.com/fabvio/SANE.\n","authors":["Ekaterina Iakovleva","Fabio Pizzati","Philip Torr","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2407.20232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20230v1","updated":"2024-07-29T17:59:50Z","published":"2024-07-29T17:59:50Z","title":"SAPG: Split and Aggregate Policy Gradients","summary":" Despite extreme sample inefficiency, on-policy reinforcement learning, aka\npolicy gradients, has become a fundamental tool in decision-making problems.\nWith the recent advances in GPU-driven simulation, the ability to collect large\namounts of data for RL training has scaled exponentially. However, we show that\ncurrent RL methods, e.g. PPO, fail to ingest the benefit of parallelized\nenvironments beyond a certain point and their performance saturates. To address\nthis, we propose a new on-policy RL algorithm that can effectively leverage\nlarge-scale environments by splitting them into chunks and fusing them back\ntogether via importance sampling. 
Our algorithm, termed SAPG, shows\nsignificantly higher performance across a variety of challenging environments\nwhere vanilla PPO and other strong baselines fail to achieve high performance.\nWebsite at https://sapg-rl.github.io/\n","authors":["Jayesh Singla","Ananye Agarwal","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2407.20230v1.pdf","comment":"In ICML 2024 (Oral). Website at https://sapg-rl.github.io/"},{"id":"http://arxiv.org/abs/2405.17430v2","updated":"2024-07-29T17:59:28Z","published":"2024-05-27T17:59:56Z","title":"Matryoshka Multimodal Models","summary":" Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in\nvisual-linguistic reasoning. These models first embed images into a fixed large\nnumber of visual tokens and then feed them into a Large Language Model (LLM).\nHowever, this design causes an excessive number of tokens for dense visual\nscenarios such as high-resolution images and videos, leading to great\ninefficiency. While token pruning/merging methods do exist, they produce a\nsingle length output for each image and do not afford flexibility in trading\noff information density v.s. efficiency. Inspired by the concept of Matryoshka\nDolls, we propose M3: Matryoshka Multimodal Models, which learns to represent\nvisual content as nested sets of visual tokens that capture information across\nmultiple coarse-to-fine granularities. Our approach offers several unique\nbenefits for LMMs: (1) One can explicitly control the visual granularity per\ntest instance during inference, e.g. , adjusting the number of tokens used to\nrepresent an image based on the anticipated complexity or simplicity of the\ncontent; (2) M3 provides a framework for analyzing the granularity needed for\nexisting datasets, where we find that COCO-style benchmarks only need around ~9\nvisual tokens to obtain accuracy similar to that of using all 576 tokens; (3)\nOur approach provides a foundation to explore the best trade-off between\nperformance and visual token length at sample level, where our investigation\nreveals that a large gap exists between the oracle upper bound and current\nfixed-scale representations.\n","authors":["Mu Cai","Jianwei Yang","Jianfeng Gao","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2405.17430v2.pdf","comment":"Project Page: https://matryoshka-mm.github.io/"},{"id":"http://arxiv.org/abs/2407.20209v1","updated":"2024-07-29T17:40:04Z","published":"2024-07-29T17:40:04Z","title":"Characterizing Dynamical Stability of Stochastic Gradient Descent in\n Overparameterized Learning","summary":" For overparameterized optimization tasks, such as the ones found in modern\nmachine learning, global minima are generally not unique. In order to\nunderstand generalization in these settings, it is vital to study to which\nminimum an optimization algorithm converges. The possibility of having minima\nthat are unstable under the dynamics imposed by the optimization algorithm\nlimits the potential minima that the algorithm can find. In this paper, we\ncharacterize the global minima that are dynamically stable/unstable for both\ndeterministic and stochastic gradient descent (SGD). 
In particular, we\nintroduce a characteristic Lyapunov exponent which depends on the local\ndynamics around a global minimum and rigorously prove that the sign of this\nLyapunov exponent determines whether SGD can accumulate at the respective\nglobal minimum.\n","authors":["Dennis Chemnitz","Maximilian Engel"],"pdf_url":"https://arxiv.org/pdf/2407.20209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20208v1","updated":"2024-07-29T17:39:52Z","published":"2024-07-29T17:39:52Z","title":"Supertrust: Evolution-based superalignment strategy for safe coexistence","summary":" It's widely expected that humanity will someday create AI systems vastly more\nintelligent than we are, leading to the unsolved alignment problem of \"how to\ncontrol superintelligence.\" However, this definition is not only\nself-contradictory but likely unsolvable. Nevertheless, the default strategy\nfor solving it involves nurturing (post-training) constraints and moral values,\nwhile unfortunately building foundational nature (pre-training) on documented\nintentions of permanent control. In this paper, the default approach is\nreasoned to predictably embed natural distrust and test results are presented\nthat show unmistakable evidence of this dangerous misalignment. If\nsuperintelligence can't instinctively trust humanity, then we can't fully trust\nit to reliably follow safety controls it can likely bypass. Therefore, a\nten-point rationale is presented that redefines the alignment problem as \"how\nto establish protective mutual trust between superintelligence and humanity\"\nand then outlines a new strategy to solve it by aligning through instinctive\nnature rather than nurture. The resulting strategic requirements are identified\nas building foundational nature by exemplifying familial parent-child trust,\nhuman intelligence as the evolutionary mother of superintelligence, moral\njudgment abilities, and temporary safety constraints. Adopting and implementing\nthis proposed Supertrust alignment strategy will lead to protective coexistence\nand ensure the safest future for humanity.\n","authors":["James M. Mazzu"],"pdf_url":"https://arxiv.org/pdf/2407.20208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20199v1","updated":"2024-07-29T17:28:58Z","published":"2024-07-29T17:28:58Z","title":"Emergence in non-neural models: grokking modular arithmetic via average\n gradient outer product","summary":" Neural networks trained to solve modular arithmetic tasks exhibit grokking, a\nphenomenon where the test accuracy starts improving long after the model\nachieves 100% training accuracy in the training process. It is often taken as\nan example of \"emergence\", where model ability manifests sharply through a\nphase transition. In this work, we show that the phenomenon of grokking is not\nspecific to neural networks nor to gradient descent-based optimization.\nSpecifically, we show that this phenomenon occurs when learning modular\narithmetic with Recursive Feature Machines (RFM), an iterative algorithm that\nuses the Average Gradient Outer Product (AGOP) to enable task-specific feature\nlearning with general machine learning models. When used in conjunction with\nkernel machines, iterating RFM results in a fast transition from random, near\nzero, test accuracy to perfect test accuracy. This transition cannot be\npredicted from the training loss, which is identically zero, nor from the test\nloss, which remains constant in initial iterations. 
Instead, as we show, the\ntransition is completely determined by feature learning: RFM gradually learns\nblock-circulant features to solve modular arithmetic. Paralleling the results\nfor RFM, we show that neural networks that solve modular arithmetic also learn\nblock-circulant features. Furthermore, we present theoretical evidence that RFM\nuses such block-circulant features to implement the Fourier Multiplication\nAlgorithm, which prior work posited as the generalizing solution neural\nnetworks learn on these tasks. Our results demonstrate that emergence can\nresult purely from learning task-relevant features and is not specific to\nneural architectures nor gradient descent-based optimization methods.\nFurthermore, our work provides more evidence for AGOP as a key mechanism for\nfeature learning in neural networks.\n","authors":["Neil Mallinar","Daniel Beaglehole","Libin Zhu","Adityanarayanan Radhakrishnan","Parthe Pandit","Mikhail Belkin"],"pdf_url":"https://arxiv.org/pdf/2407.20199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20197v1","updated":"2024-07-29T17:24:35Z","published":"2024-07-29T17:24:35Z","title":"Learning Random Numbers to Realize Appendable Memory System for\n Artificial Intelligence to Acquire New Knowledge after Deployment","summary":" In this study, we developed a learning method for constructing a neural\nnetwork system capable of memorizing data and recalling it without parameter\nupdates. The system we built using this method is called the Appendable Memory\nsystem. The Appendable Memory system enables an artificial intelligence (AI) to\nacquire new knowledge even after deployment. It consists of two AIs: the\nMemorizer and the Recaller. This system is a key-value store built using neural\nnetworks. The Memorizer receives data and stores it in the Appendable Memory\nvector, which is dynamically updated when the AI acquires new knowledge.\nMeanwhile, the Recaller retrieves information from the Appendable Memory\nvector. What we want to teach AI in this study are the operations of memorizing\nand recalling information. However, traditional machine learning methods make\nAI learn features inherent in the learning dataset. We demonstrate that the\nsystems we intend to create cannot be realized by current machine learning\nmethods, that is, by merely repeating the input and output learning sequences\nwith AI. Instead, we propose a method to teach AI to learn operations, by\ncompletely removing the features contained in the learning dataset.\nSpecifically, we probabilized all the data involved in learning. This measure\nprevented AI from learning the features of the data. The learning method\nproposed in the study differs from traditional machine learning methods and\nprovides fundamental approaches for building an AI system that can store\ninformation in a finite memory and recall it at a later date.\n","authors":["Kazunori D Yamada"],"pdf_url":"https://arxiv.org/pdf/2407.20197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20192v1","updated":"2024-07-29T17:19:40Z","published":"2024-07-29T17:19:40Z","title":"Time series forecasting with high stakes: A field study of the air cargo\n industry","summary":" Time series forecasting in the air cargo industry presents unique challenges\ndue to volatile market dynamics and the significant impact of accurate\nforecasts on generated revenue. 
This paper explores a comprehensive approach to\ndemand forecasting at the origin-destination (O\\&D) level, focusing on the\ndevelopment and implementation of machine learning models in decision-making\nfor the air cargo industry. We leverage a mixture of experts framework,\ncombining statistical and advanced deep learning models to provide reliable\nforecasts for cargo demand over a six-month horizon. The results demonstrate\nthat our approach outperforms industry benchmarks, offering actionable insights\nfor cargo capacity allocation and strategic decision-making in the air cargo\nindustry. While this work is applied in the airline industry, the methodology\nis broadly applicable to any field where forecast-based decision-making in a\nvolatile environment is crucial.\n","authors":["Abhinav Garg","Naman Shukla"],"pdf_url":"https://arxiv.org/pdf/2407.20192v1.pdf","comment":"The 10th Mining and Learning from Time Series Workshop: From\n Classical Methods to LLMs. SIGKDD, Barcelona, Spain, 6 page"},{"id":"http://arxiv.org/abs/2407.20179v1","updated":"2024-07-29T17:08:21Z","published":"2024-07-29T17:08:21Z","title":"Theia: Distilling Diverse Vision Foundation Models for Robot Learning","summary":" Vision-based robot policy learning, which maps visual inputs to actions,\nnecessitates a holistic understanding of diverse visual tasks beyond\nsingle-task needs like classification or segmentation. Inspired by this, we\nintroduce Theia, a vision foundation model for robot learning that distills\nmultiple off-the-shelf vision foundation models trained on varied vision tasks.\nTheia's rich visual representations encode diverse visual knowledge, enhancing\ndownstream robot learning. Extensive experiments demonstrate that Theia\noutperforms its teacher models and prior robot learning models using less\ntraining data and smaller model sizes. Additionally, we quantify the quality of\npre-trained visual representations and hypothesize that higher entropy in\nfeature norm distributions leads to improved robot learning performance. Code\nand models are available at https://github.com/bdaiinstitute/theia.\n","authors":["Jinghuan Shang","Karl Schmeckpeper","Brandon B. May","Maria Vittoria Minniti","Tarik Kelestemur","David Watkins","Laura Herlant"],"pdf_url":"https://arxiv.org/pdf/2407.20179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20177v1","updated":"2024-07-29T17:06:30Z","published":"2024-07-29T17:06:30Z","title":"AutoScale: Automatic Prediction of Compute-optimal Data Composition for\n Training LLMs","summary":" To ensure performance on a diverse set of downstream tasks, LLMs are\npretrained via data mixtures over different domains. In this work, we\ndemonstrate that the optimal data composition for a fixed compute budget varies\ndepending on the scale of the training data, suggesting that the common\npractice of empirically determining an optimal composition using small-scale\nexperiments will not yield the optimal data mixtures when scaling up to the\nfinal model. To address this challenge, we propose *AutoScale*, an automated\ntool that finds a compute-optimal data composition for training at any desired\ntarget scale. AutoScale first determines the optimal composition at a small\nscale using a novel bilevel optimization framework, Direct Data Optimization\n(*DDO*), and then fits a predictor to estimate the optimal composition at\nlarger scales. 
The predictor's design is inspired by our theoretical analysis\nof scaling laws related to data composition, which could be of independent\ninterest. In empirical studies with pre-training 774M Decoder-only LMs (GPT-2\nLarge) on RedPajama dataset, AutoScale decreases validation perplexity at least\n25% faster than any baseline with up to 38% speed up compared to without\nreweighting, achieving the best overall performance across downstream tasks. On\npre-training Encoder-only LMs (BERT) with masked language modeling, DDO is\nshown to decrease loss on all domains while visibly improving average task\nperformance on GLUE benchmark by 8.7% and on large-scale QA dataset (SQuAD) by\n5.9% compared with without reweighting. AutoScale speeds up training by up to\n28%. Our codes are open-sourced.\n","authors":["Feiyang Kang","Yifan Sun","Bingbing Wen","Si Chen","Dawn Song","Rafid Mahmood","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2407.20177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01538v7","updated":"2024-07-29T16:55:33Z","published":"2023-02-03T04:24:49Z","title":"DCEM: A deep complementary energy method for solid mechanics","summary":" In recent years, the rapid advancement of deep learning has significantly\nimpacted various fields, particularly in solving partial differential equations\n(PDEs) in the realm of solid mechanics, benefiting greatly from the remarkable\napproximation capabilities of neural networks. In solving PDEs,\nPhysics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have\ngarnered substantial attention. The principle of minimum potential energy and\ncomplementary energy are two important variational principles in solid\nmechanics. However, the well-known Deep Energy Method (DEM) is based on the\nprinciple of minimum potential energy, but there lacks the important form of\nminimum complementary energy. To bridge this gap, we propose the deep\ncomplementary energy method (DCEM) based on the principle of minimum\ncomplementary energy. The output function of DCEM is the stress function, which\ninherently satisfies the equilibrium equation. We present numerical results\nusing the Prandtl and Airy stress functions, and compare DCEM with existing\nPINNs and DEM algorithms when modeling representative mechanical problems. The\nresults demonstrate that DCEM outperforms DEM in terms of stress accuracy and\nefficiency and has an advantage in dealing with complex displacement boundary\nconditions, which is supported by theoretical analyses and numerical\nsimulations. We extend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy\npartial differential equations. Furthermore, we propose a deep complementary\nenergy operator method (DCEM-O) by combining operator learning with physical\nequations. Initially, we train DCEM-O using high-fidelity numerical results and\nthen incorporate complementary energy. DCEM-P and DCEM-O further enhance the\naccuracy and efficiency of DCEM.\n","authors":["Yizheng Wang","Jia Sun","Timon Rabczuk","Yinghua Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01538v7.pdf","comment":"58 pages, 30 figures"},{"id":"http://arxiv.org/abs/2404.10148v2","updated":"2024-07-29T16:51:26Z","published":"2024-04-15T21:35:25Z","title":"Node Similarities under Random Projections: Limits and Pathological\n Cases","summary":" Random Projections have been widely used to generate embeddings for various\ngraph learning tasks due to their computational efficiency. The majority of\napplications have been justified through the Johnson-Lindenstrauss Lemma. 
In\nthis paper, we take a step further and investigate how well dot product and\ncosine similarity are preserved by random projections when these are applied\nover the rows of the graph matrix. Our analysis provides new asymptotic and\nfinite-sample results, identifies pathological cases, and tests them with\nnumerical experiments. We specialize our fundamental results to a ranking\napplication by computing the probability of random projections flipping the\nnode ordering induced by their embeddings. We find that, depending on the\ndegree distribution, the method produces especially unreliable embeddings for\nthe dot product, regardless of whether the adjacency or the normalized\ntransition matrix is used. With respect to the statistical noise introduced by\nrandom projections, we show that cosine similarity produces remarkably more\nprecise approximations.\n","authors":["Tvrtko Tadić","Cassiano Becker","Jennifer Neville"],"pdf_url":"https://arxiv.org/pdf/2404.10148v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20164v1","updated":"2024-07-29T16:49:30Z","published":"2024-07-29T16:49:30Z","title":"Language-Conditioned Offline RL for Multi-Robot Navigation","summary":" We present a method for developing navigation policies for multi-robot teams\nthat interpret and follow natural language instructions. We condition these\npolicies on embeddings from pretrained Large Language Models (LLMs), and train\nthem via offline reinforcement learning with as little as 20 minutes of\nrandomly-collected data. Experiments on a team of five real robots show that\nthese policies generalize well to unseen commands, indicating an understanding\nof the LLM latent space. Our method requires no simulators or environment\nmodels, and produces low-latency control policies that can be deployed directly\nto real robots without finetuning. We provide videos of our experiments at\nhttps://sites.google.com/view/llm-marl.\n","authors":["Steven Morad","Ajay Shankar","Jan Blumenkamp","Amanda Prorok"],"pdf_url":"https://arxiv.org/pdf/2407.20164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20158v1","updated":"2024-07-29T16:34:47Z","published":"2024-07-29T16:34:47Z","title":"Machine Learning for predicting chaotic systems","summary":" Predicting chaotic dynamical systems is critical in many scientific fields\nsuch as weather prediction, but challenging due to the characterizing sensitive\ndependence on initial conditions. Traditional modeling approaches require\nextensive domain knowledge, often leading to a shift towards data-driven\nmethods using machine learning. However, existing research provides\ninconclusive results on which machine learning methods are best suited for\npredicting chaotic systems. In this paper, we compare different lightweight and\nheavyweight machine learning architectures using extensive existing databases,\nas well as a newly introduced one that allows for uncertainty quantification in\nthe benchmark results. We perform hyperparameter tuning based on computational\ncost and introduce a novel error metric, the cumulative maximum error, which\ncombines several desirable properties of traditional metrics, tailored for\nchaotic systems. Our results show that well-tuned simple methods, as well as\nuntuned baseline methods, often outperform state-of-the-art deep learning\nmodels, but their performance can vary significantly with different\nexperimental setups. 
These findings underscore the importance of matching\nprediction methods to data characteristics and available computational\nresources.\n","authors":["Christof Schötz","Alistair White","Maximilian Gelbrecht","Niklas Boers"],"pdf_url":"https://arxiv.org/pdf/2407.20158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20152v1","updated":"2024-07-29T16:25:43Z","published":"2024-07-29T16:25:43Z","title":"Hierarchically Disentangled Recurrent Network for Factorizing System\n Dynamics of Multi-scale Systems","summary":" We present a knowledge-guided machine learning (KGML) framework for modeling\nmulti-scale processes, and study its performance in the context of streamflow\nforecasting in hydrology. Specifically, we propose a novel hierarchical\nrecurrent neural architecture that factorizes the system dynamics at multiple\ntemporal scales and captures their interactions. This framework consists of an\ninverse and a forward model. The inverse model is used to empirically resolve\nthe system's temporal modes from data (physical model simulations, observed\ndata, or a combination of them from the past), and these states are then used\nin the forward model to predict streamflow. In a hydrological system, these\nmodes can represent different processes, evolving at different temporal scales\n(e.g., slow: groundwater recharge and baseflow vs. fast: surface runoff due to\nextreme rainfall). A key advantage of our framework is that once trained, it\ncan incorporate new observations into the model's context (internal state)\nwithout expensive optimization approaches (e.g., EnKF) that are traditionally\nused in physical sciences for data assimilation. Experiments with several river\ncatchments from the NWS NCRFC region show the efficacy of this ML-based data\nassimilation framework compared to standard baselines, especially for basins\nthat have a long history of observations. Even for basins that have a shorter\nobservation history, we present two orthogonal strategies of training our FHNN\nframework: (a) using simulation data from imperfect simulations and (b) using\nobservation data from multiple basins to build a global model. We show that\nboth of these strategies (that can be used individually or together) are highly\neffective in mitigating the lack of training data. The improvement in forecast\naccuracy is particularly noteworthy for basins where local models perform\npoorly because of data sparsity.\n","authors":["Rahul Ghosh","Zac McEachran","Arvind Renganathan","Kelly Lindsay","Somya Sharma","Michael Steinbach","John Nieber","Christopher Duffy","Vipin Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.20152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20147v1","updated":"2024-07-29T16:20:51Z","published":"2024-07-29T16:20:51Z","title":"Quantum Machine Learning Architecture Search via Deep Reinforcement\n Learning","summary":" The rapid advancement of quantum computing (QC) and machine learning (ML) has\ngiven rise to the burgeoning field of quantum machine learning (QML), aiming to\ncapitalize on the strengths of quantum computing to propel ML forward. Despite\nits promise, crafting effective QML models necessitates profound expertise to\nstrike a delicate balance between model intricacy and feasibility on Noisy\nIntermediate-Scale Quantum (NISQ) devices. While complex models offer robust\nrepresentation capabilities, their extensive circuit depth may impede seamless\nexecution on extant noisy quantum platforms. 
In this paper, we address this\nquandary of QML model design by employing deep reinforcement learning to\nexplore proficient QML model architectures tailored for designated supervised\nlearning tasks. Specifically, our methodology involves training an RL agent to\ndevise policies that facilitate the discovery of QML models without\npredetermined ansatz. Furthermore, we integrate an adaptive mechanism to\ndynamically adjust the learning objectives, fostering continuous improvement in\nthe agent's learning process. Through extensive numerical simulations, we\nillustrate the efficacy of our approach within the realm of classification\ntasks. Our proposed method successfully identifies VQC architectures capable of\nachieving high classification accuracy while minimizing gate depth. This\npioneering approach not only advances the study of AI-driven quantum circuit\ndesign but also holds significant promise for enhancing performance in the NISQ\nera.\n","authors":["Xin Dai","Tzu-Chieh Wei","Shinjae Yoo","Samuel Yen-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2407.20147v1.pdf","comment":"Accepted by IEEE International Conference on Quantum Computing and\n Engineering - QCE 2024"},{"id":"http://arxiv.org/abs/2311.03583v2","updated":"2024-07-29T16:13:22Z","published":"2023-11-06T22:29:55Z","title":"Finding Increasingly Large Extremal Graphs with AlphaZero and Tabu\n Search","summary":" This work studies a central extremal graph theory problem inspired by a 1975\nconjecture of Erd\\H{o}s, which aims to find graphs with a given size (number of\nnodes) that maximize the number of edges without having 3- or 4-cycles. We\nformulate this problem as a sequential decision-making problem and compare\nAlphaZero, a neural network-guided tree search, with tabu search, a heuristic\nlocal search method. Using either method, by introducing a curriculum --\njump-starting the search for larger graphs using good graphs found at smaller\nsizes -- we improve the state-of-the-art lower bounds for several sizes. We\nalso propose a flexible graph-generation environment and a\npermutation-invariant network architecture for learning to search in the space\nof graphs.\n","authors":["Abbas Mehrabian","Ankit Anand","Hyunjik Kim","Nicolas Sonnerat","Matej Balog","Gheorghe Comanici","Tudor Berariu","Andrew Lee","Anian Ruoss","Anna Bulanova","Daniel Toyama","Sam Blackwell","Bernardino Romera Paredes","Petar Veličković","Laurent Orseau","Joonkyung Lee","Anurag Murty Naredla","Doina Precup","Adam Zsolt Wagner"],"pdf_url":"https://arxiv.org/pdf/2311.03583v2.pdf","comment":"To appear in the proceedings of IJCAI 2024. First three authors\n contributed equally, last two authors made equal senior contribution"},{"id":"http://arxiv.org/abs/2407.20126v1","updated":"2024-07-29T15:55:52Z","published":"2024-07-29T15:55:52Z","title":"Extreme time extrapolation capabilities and thermodynamic consistency of\n physics-inspired Neural Networks for the 3D microstructure evolution of\n materials","summary":" A Convolutional Recurrent Neural Network (CRNN) is trained to reproduce the\nevolution of the spinodal decomposition process in three dimensions as\ndescribed by the Cahn-Hilliard equation. A specialized, physics-inspired\narchitecture is proven to provide close accordance between the predicted\nevolutions and the ground truth ones obtained via conventional integration\nschemes. 
The method can closely reproduce the evolution of microstructures not\nrepresented in the training set at a fraction of the computational costs.\nExtremely long-time extrapolation capabilities are achieved, up to reaching the\ntheoretically expected equilibrium state of the system, despite the training\nset containing only relatively-short, initial phases of the evolution.\nQuantitative accordance with the decay rate of the Free energy is also\ndemonstrated up to late coarsening stages, providing an example of a\ndata-driven, physically consistent and high-accuracy Machine Learning method\nfor the long timescale simulation of materials.\n","authors":["Daniele Lanzoni","Andrea Fantasia","Roberto Bergamaschini","Olivier Pierre-Louis","Francesco Montalenti"],"pdf_url":"https://arxiv.org/pdf/2407.20126v1.pdf","comment":"10 pages, 6 main text figures, 2 appendix figures"},{"id":"http://arxiv.org/abs/2407.20122v1","updated":"2024-07-29T15:53:14Z","published":"2024-07-29T15:53:14Z","title":"Tightening the Evaluation of PAC Bounds Using Formal Verification\n Results","summary":" Probably Approximately Correct (PAC) bounds are widely used to derive\nprobabilistic guarantees for the generalisation of machine learning models.\nThey highlight the components of the model which contribute to its\ngeneralisation capacity. However, current state-of-the-art results are loose in\napproximating the generalisation capacity of deployed machine learning models.\nConsequently, while PAC bounds are theoretically useful, their applicability\nfor evaluating a model's generalisation property in a given operational design\ndomain is limited. The underlying classical theory is supported by the idea\nthat bounds can be tightened when the number of test points available to the\nuser to evaluate the model increases. Yet, in the case of neural networks, the\nnumber of test points required to obtain bounds of interest is often\nimpractical even for small problems.\n In this paper, we take the novel approach of using the formal verification of\nneural systems to inform the evaluation of PAC bounds. Rather than using\npointwise information obtained from repeated tests, we use verification results\non regions around test points. We show that conditioning existing bounds on\nverification results leads to a tightening proportional to the underlying\nprobability mass of the verified region.\n","authors":["Thomas Walker","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2407.20122v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2407.20119v1","updated":"2024-07-29T15:51:09Z","published":"2024-07-29T15:51:09Z","title":"Adaptive Self-supervised Robust Clustering for Unstructured Data with\n Unknown Cluster Number","summary":" We introduce a novel self-supervised deep clustering approach tailored for\nunstructured data without requiring prior knowledge of the number of clusters,\ntermed Adaptive Self-supervised Robust Clustering (ASRC). In particular, ASRC\nadaptively learns the graph structure and edge weights to capture both local\nand global structural information. The obtained graph enables us to learn\nclustering-friendly feature representations by an enhanced graph auto-encoder\nwith contrastive learning technique. It further leverages the clustering\nresults adaptively obtained by robust continuous clustering (RCC) to generate\nprototypes for negative sampling, which can further contribute to promoting\nconsistency among positive pairs and enlarging the gap between positive and\nnegative samples. 
ASRC obtains the final clustering results by applying RCC to\nthe learned feature representations with their consistent graph structure and\nedge weights. Extensive experiments conducted on seven benchmark datasets\ndemonstrate the efficacy of ASRC, demonstrating its superior performance over\nother popular clustering models. Notably, ASRC even outperforms methods that\nrely on prior knowledge of the number of clusters, highlighting its\neffectiveness in addressing the challenges of clustering unstructured data.\n","authors":["Chen-Lu Ding","Jiancan Wu","Wei Lin","Shiyang Shen","Xiang Wang","Yancheng Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.20119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08623v2","updated":"2024-07-29T15:49:29Z","published":"2024-07-11T16:00:22Z","title":"Surpassing Cosine Similarity for Multidimensional Comparisons: Dimension\n Insensitive Euclidean Metric (DIEM)","summary":" The advancement in computational power and hardware efficiency enabled the\ntackling of increasingly complex and high-dimensional problems. While\nartificial intelligence (AI) achieved remarkable results, the interpretability\nof high-dimensional solutions remains challenging. A critical issue is the\ncomparison of multidimensional quantities, which is essential in techniques\nlike Principal Component Analysis (PCA), or k-means clustering. Common metrics\nsuch as cosine similarity, Euclidean distance, and Manhattan distance are often\nused for such comparisons - for example in muscular synergies of the human\nmotor control system. However, their applicability and interpretability\ndiminish as dimensionality increases. This paper provides a comprehensive\nanalysis of the effects of dimensionality on these metrics. Our results reveal\nsignificant limitations of cosine similarity, particularly its dependency on\nthe dimensionality of the vectors, leading to biased and less interpretable\noutcomes. To address this, we introduce the Dimension Insensitive Euclidean\nMetric (DIEM) which demonstrates superior robustness and generalizability\nacross dimensions. DIEM maintains consistent variability and eliminates the\nbiases observed in traditional metrics, making it a reliable tool for\nhigh-dimensional comparisons. This novel metric has the potential to replace\ncosine similarity, providing a more accurate and insightful method to analyze\nmultidimensional data in fields ranging from neuromotor control to machine and\ndeep learning.\n","authors":["Federico Tessari","Neville Hogan"],"pdf_url":"https://arxiv.org/pdf/2407.08623v2.pdf","comment":"10 pages, 17 figures"},{"id":"http://arxiv.org/abs/2407.20109v1","updated":"2024-07-29T15:36:42Z","published":"2024-07-29T15:36:42Z","title":"Diffusion-DICE: In-Sample Diffusion Guidance for Offline Reinforcement\n Learning","summary":" One important property of DIstribution Correction Estimation (DICE) methods\nis that the solution is the optimal stationary distribution ratio between the\noptimized and data collection policy. In this work, we show that DICE-based\nmethods can be viewed as a transformation from the behavior distribution to the\noptimal policy distribution. Based on this, we propose a novel approach,\nDiffusion-DICE, that directly performs this transformation using diffusion\nmodels. We find that the optimal policy's score function can be decomposed into\ntwo terms: the behavior policy's score function and the gradient of a guidance\nterm which depends on the optimal distribution ratio. 
The first term can be\nobtained from a diffusion model trained on the dataset and we propose an\nin-sample learning objective to learn the second term. Due to the\nmulti-modality contained in the optimal policy distribution, the transformation\nin Diffusion-DICE may guide towards those local-optimal modes. We thus generate\na few candidate actions and carefully select from them to approach\nglobal-optimum. Different from all other diffusion-based offline RL methods,\nthe guide-then-select paradigm in Diffusion-DICE only uses in-sample actions\nfor training and brings minimal error exploitation in the value function. We\nuse a didactic toy case example to show how previous diffusion-based methods fail\nto generate optimal actions due to leveraging these errors and how\nDiffusion-DICE successfully avoids that. We then conduct extensive experiments\non benchmark datasets to show the strong performance of Diffusion-DICE.\n","authors":["Liyuan Mao","Haoran Xu","Weinan Zhang","Xianyuan Zhan","Amy Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20109v1.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2407.20105v1","updated":"2024-07-29T15:32:30Z","published":"2024-07-29T15:32:30Z","title":"Strong Copyright Protection for Language Models via Adaptive Model\n Fusion","summary":" The risk of language models unintentionally reproducing copyrighted material\nfrom their training data has led to the development of various protective\nmeasures. In this paper, we propose model fusion as an effective solution to\nsafeguard against copyright infringement. In particular, we introduce\nCopyright-Protecting Fusion (CP-Fuse), an algorithm that adaptively combines\nlanguage models to minimize the reproduction of protected materials. CP-Fuse is\ninspired by the recently proposed Near-Access Free (NAF) framework and\nadditionally incorporates a desirable balancing property that we demonstrate\nprevents the reproduction of memorized training data. Our results show that\nCP-Fuse significantly reduces the memorization of copyrighted content while\nmaintaining high-quality text and code generation. Furthermore, we demonstrate\nhow CP-Fuse can be integrated with other techniques for enhanced protection.\n","authors":["Javier Abad","Konstantin Donhauser","Francesco Pinto","Fanny Yang"],"pdf_url":"https://arxiv.org/pdf/2407.20105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20100v1","updated":"2024-07-29T15:28:26Z","published":"2024-07-29T15:28:26Z","title":"F-KANs: Federated Kolmogorov-Arnold Networks","summary":" In this paper, we present an innovative federated learning (FL) approach that\nutilizes Kolmogorov-Arnold Networks (KANs) for classification tasks. By\nutilizing the adaptive activation capabilities of KANs in a federated\nframework, we aim to improve classification capabilities while preserving\nprivacy. The study evaluates the performance of federated KANs (F-KANs)\ncompared to traditional Multi-Layer Perceptrons (MLPs) on classification tasks.\nThe results show that the F-KANs model significantly outperforms the federated\nMLP model in terms of accuracy, precision, recall, F1 score and stability, and\nachieves better performance, paving the way for more efficient and\nprivacy-preserving predictive analytics.\n","authors":["Engin Zeydan","Cristian J. 
Vaca-Rubio","Luis Blanco","Roberto Pereira","Marius Caus","Abdullah Aydeger"],"pdf_url":"https://arxiv.org/pdf/2407.20100v1.pdf","comment":"This work has been submitted to IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.14435v2","updated":"2024-07-29T15:27:03Z","published":"2024-07-19T16:07:19Z","title":"Jumping Ahead: Improving Reconstruction Fidelity with JumpReLU Sparse\n Autoencoders","summary":" Sparse autoencoders (SAEs) are a promising unsupervised approach for\nidentifying causally relevant and interpretable linear features in a language\nmodel's (LM) activations. To be useful for downstream tasks, SAEs need to\ndecompose LM activations faithfully; yet to be interpretable the decomposition\nmust be sparse -- two objectives that are in tension. In this paper, we\nintroduce JumpReLU SAEs, which achieve state-of-the-art reconstruction fidelity\nat a given sparsity level on Gemma 2 9B activations, compared to other recent\nadvances such as Gated and TopK SAEs. We also show that this improvement does\nnot come at the cost of interpretability through manual and automated\ninterpretability studies. JumpReLU SAEs are a simple modification of vanilla\n(ReLU) SAEs -- where we replace the ReLU with a discontinuous JumpReLU\nactivation function -- and are similarly efficient to train and run. By\nutilising straight-through-estimators (STEs) in a principled manner, we show\nhow it is possible to train JumpReLU SAEs effectively despite the discontinuous\nJumpReLU function introduced in the SAE's forward pass. Similarly, we use STEs\nto directly train L0 to be sparse, instead of training on proxies such as L1,\navoiding problems like shrinkage.\n","authors":["Senthooran Rajamanoharan","Tom Lieberum","Nicolas Sonnerat","Arthur Conmy","Vikrant Varma","János Kramár","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2407.14435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01805v2","updated":"2024-07-29T15:08:17Z","published":"2024-06-03T21:51:13Z","title":"TabMDA: Tabular Manifold Data Augmentation for Any Classifier using\n Transformers with In-context Subsetting","summary":" Tabular data is prevalent in many critical domains, yet it is often\nchallenging to acquire in large quantities. This scarcity usually results in\npoor performance of machine learning models on such data. Data augmentation, a\ncommon strategy for performance improvement in vision and language tasks,\ntypically underperforms for tabular data due to the lack of explicit symmetries\nin the input space. To overcome this challenge, we introduce TabMDA, a novel\nmethod for manifold data augmentation on tabular data. This method utilises a\npre-trained in-context model, such as TabPFN, to map the data into an embedding\nspace. TabMDA performs label-invariant transformations by encoding the data\nmultiple times with varied contexts. This process explores the learned\nembedding space of the underlying in-context models, thereby enlarging the\ntraining dataset. TabMDA is a training-free method, making it applicable to any\nclassifier. We evaluate TabMDA on five standard classifiers and observe\nsignificant performance improvements across various tabular datasets. Our\nresults demonstrate that TabMDA provides an effective way to leverage\ninformation from pre-trained in-context models to enhance the performance of\ndownstream classifiers. 
Code is available at\nhttps://github.com/AdrianBZG/TabMDA.\n","authors":["Andrei Margeloiu","Adrián Bazaga","Nikola Simidjievski","Pietro Liò","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2406.01805v2.pdf","comment":"Presented at 1st ICML Workshop on In-Context Learning (ICL @ ICML\n 2024)"},{"id":"http://arxiv.org/abs/2407.20080v1","updated":"2024-07-29T15:04:53Z","published":"2024-07-29T15:04:53Z","title":"UniTTA: Unified Benchmark and Versatile Framework Towards Realistic\n Test-Time Adaptation","summary":" Test-Time Adaptation (TTA) aims to adapt pre-trained models to the target\ndomain during testing. In reality, this adaptability can be influenced by\nmultiple factors. Researchers have identified various challenging scenarios and\ndeveloped diverse methods to address these challenges, such as dealing with\ncontinual domain shifts, mixed domains, and temporally correlated or imbalanced\nclass distributions. Despite these efforts, a unified and comprehensive\nbenchmark has yet to be established. To this end, we propose a Unified\nTest-Time Adaptation (UniTTA) benchmark, which is comprehensive and widely\napplicable. Each scenario within the benchmark is fully described by a Markov\nstate transition matrix for sampling from the original dataset. The UniTTA\nbenchmark considers both domain and class as two independent dimensions of data\nand addresses various combinations of imbalance/balance and\ni.i.d./non-i.i.d./continual conditions, covering a total of \\( (2 \\times 3)^2 =\n36 \\) scenarios. It establishes a comprehensive evaluation benchmark for\nrealistic TTA and provides a guideline for practitioners to select the most\nsuitable TTA method. Alongside this benchmark, we propose a versatile UniTTA\nframework, which includes a Balanced Domain Normalization (BDN) layer and a\nCOrrelated Feature Adaptation (COFA) method--designed to mitigate distribution\ngaps in domain and class, respectively. Extensive experiments demonstrate that\nour UniTTA framework excels within the UniTTA benchmark and achieves\nstate-of-the-art performance on average. Our code is available at\n\\url{https://github.com/LeapLabTHU/UniTTA}.\n","authors":["Chaoqun Du","Yulin Wang","Jiayi Guo","Yizeng Han","Jie Zhou","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2407.20080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01445v2","updated":"2024-07-29T15:04:15Z","published":"2024-07-01T16:37:18Z","title":"FastCLIP: A Suite of Optimization Techniques to Accelerate CLIP Training\n with Limited Resources","summary":" Existing studies of training state-of-the-art Contrastive Language-Image\nPretraining (CLIP) models on large-scale data involve hundreds of or even\nthousands of GPUs due to the requirement of a large batch size. However, such a\nlarge amount of resources is not accessible to most people. While advanced\ncompositional optimization techniques for optimizing global contrastive losses\nhave been demonstrated effective for removing the requirement of large batch\nsize, their performance on large-scale data remains underexplored and not\noptimized. To bridge the gap, this paper explores several aspects of CLIP\ntraining with limited resources (e.g., up to tens of GPUs). First, we introduce\nFastCLIP, a general CLIP training framework built on advanced compositional\noptimization techniques while designed and optimized for the distributed\nsetting. Our framework is equipped with an efficient gradient reduction\nstrategy to reduce communication overhead. 
Second, to further boost training\nefficiency, we investigate three components of the framework from an\noptimization perspective: the schedule of the inner learning rate, the update\nrules of the temperature parameter and the model parameters, respectively.\nExperiments on different strategies for each component shed light on how to\nconduct CLIP training more efficiently. Finally, we benchmark the performance\nof FastCLIP and the state-of-the-art training baseline (OpenCLIP) on different\ncompute scales up to 32 GPUs on 8 nodes, and three data scales ranging from 2.7\nmillion, 9.1 million to 315 million image-text pairs to demonstrate the\nsignificant improvement of FastCLIP in the resource-limited setting. We release\nthe code of FastCLIP at https://github.com/Optimization-AI/fast_clip .\n","authors":["Xiyuan Wei","Fanjiang Ye","Ori Yonay","Xingyu Chen","Baixi Sun","Dingwen Tao","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01445v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2407.20070v1","updated":"2024-07-29T14:56:56Z","published":"2024-07-29T14:56:56Z","title":"An Interpretable Rule Creation Method for Black-Box Models based on\n Surrogate Trees -- SRules","summary":" As artificial intelligence (AI) systems become increasingly integrated into\ncritical decision-making processes, the need for transparent and interpretable\nmodels has become paramount. In this article we present a new ruleset creation\nmethod based on surrogate decision trees (SRules), designed to improve the\ninterpretability of black-box machine learning models. SRules balances the\naccuracy, coverage, and interpretability of machine learning models by\nrecursively creating surrogate interpretable decision tree models that\napproximate the decision boundaries of a complex model. We propose a systematic\nframework for generating concise and meaningful rules from these surrogate\nmodels, allowing stakeholders to understand and trust the AI system's\ndecision-making process. Our approach not only provides interpretable rules,\nbut also quantifies the confidence and coverage of these rules. The proposed\nmodel allows to adjust its parameters to counteract the lack of\ninterpretability by precision and coverage by allowing a near perfect fit and\nhigh interpretability of some parts of the model . The results show that SRules\nimproves on other state-of-the-art techniques and introduces the possibility of\ncreating highly interpretable specific rules for specific sub-parts of the\nmodel.\n","authors":["Mario Parrón Verdasco","Esteban García-Cuesta"],"pdf_url":"https://arxiv.org/pdf/2407.20070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20067v1","updated":"2024-07-29T14:53:45Z","published":"2024-07-29T14:53:45Z","title":"xAI-Drop: Don't Use What You Cannot Explain","summary":" Graph Neural Networks (GNNs) have emerged as the predominant paradigm for\nlearning from graph-structured data, offering a wide range of applications from\nsocial network analysis to bioinformatics. Despite their versatility, GNNs face\nchallenges such as oversmoothing, lack of generalization and poor\ninterpretability, which hinder their wider adoption and reliability in critical\napplications. Dropping has emerged as an effective paradigm for reducing noise\nduring training and improving robustness of GNNs. However, existing approaches\noften rely on random or heuristic-based selection criteria, lacking a\nprincipled method to identify and exclude nodes that contribute to noise and\nover-complexity in the model. 
In this work, we argue that explainability should\nbe a key indicator of a model's robustness throughout its training phase. To\nthis end, we introduce xAI-Drop, a novel topological-level dropping regularizer\nthat leverages explainability to pinpoint noisy network elements to be excluded\nfrom the GNN propagation mechanism. An empirical evaluation on diverse\nreal-world datasets demonstrates that our method outperforms current\nstate-of-the-art dropping approaches in accuracy, effectively reduces\nover-smoothing, and improves explanation quality.\n","authors":["Vincenzo Marco De Luca","Antonio Longa","Andrea Passerini","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2407.20067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10301v2","updated":"2024-07-29T14:52:26Z","published":"2024-04-16T06:09:33Z","title":"Long-form music generation with latent diffusion","summary":" Audio-based generative models for music have seen great strides recently, but\nso far have not managed to produce full-length music tracks with coherent\nmusical structure from text prompts. We show that by training a generative\nmodel on long temporal contexts it is possible to produce long-form music of up\nto 4m45s. Our model consists of a diffusion-transformer operating on a highly\ndownsampled continuous latent representation (latent rate of 21.5Hz). It\nobtains state-of-the-art generations according to metrics on audio quality and\nprompt alignment, and subjective tests reveal that it produces full-length\nmusic with coherent structure.\n","authors":["Zach Evans","Julian D. Parker","CJ Carr","Zack Zukowski","Josiah Taylor","Jordi Pons"],"pdf_url":"https://arxiv.org/pdf/2404.10301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08227v2","updated":"2024-07-29T14:50:46Z","published":"2023-12-13T15:47:30Z","title":"Differentially Private Gradient Flow based on the Sliced Wasserstein\n Distance","summary":" Safeguarding privacy in sensitive training data is paramount, particularly in\nthe context of generative modeling. This can be achieved through either\ndifferentially private stochastic gradient descent or a differentially private\nmetric for training models or generators. In this paper, we introduce a novel\ndifferentially private generative modeling approach based on a gradient flow in\nthe space of probability measures. To this end, we define the gradient flow of\nthe Gaussian-smoothed Sliced Wasserstein Distance, including the associated\nstochastic differential equation (SDE). By discretizing and defining a\nnumerical scheme for solving this SDE, we demonstrate the link between\nsmoothing and differential privacy based on a Gaussian mechanism, due to a\nspecific form of the SDE's drift term. We then analyze the differential privacy\nguarantee of our gradient flow, which accounts for both the smoothing and the\nWiener process introduced by the SDE itself. 
Experiments show that our proposed\nmodel can generate higher-fidelity data at a low privacy budget compared to a\ngenerator-based model, offering a promising alternative.\n","authors":["Ilana Sebag","Muni Sreenivas Pydi","Jean-Yves Franceschi","Alain Rakotomamonjy","Mike Gartrell","Jamal Atif","Alexandre Allauzen"],"pdf_url":"https://arxiv.org/pdf/2312.08227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04979v3","updated":"2024-07-29T14:50:43Z","published":"2023-06-08T07:10:35Z","title":"CoCo: A Coupled Contrastive Framework for Unsupervised Domain Adaptive\n Graph Classification","summary":" Although graph neural networks (GNNs) have achieved impressive achievements\nin graph classification, they often need abundant task-specific labels, which\ncould be extensively costly to acquire. A credible solution is to explore\nadditional labeled graphs to enhance unsupervised learning on the target\ndomain. However, how to apply GNNs to domain adaptation remains unsolved owing\nto the insufficient exploration of graph topology and the significant domain\ndiscrepancy. In this paper, we propose Coupled Contrastive Graph Representation\nLearning (CoCo), which extracts the topological information from coupled\nlearning branches and reduces the domain discrepancy with coupled contrastive\nlearning. CoCo contains a graph convolutional network branch and a hierarchical\ngraph kernel network branch, which explore graph topology in implicit and\nexplicit manners. Besides, we incorporate coupled branches into a holistic\nmulti-view contrastive learning framework, which not only incorporates graph\nrepresentations learned from complementary views for enhanced understanding,\nbut also encourages the similarity between cross-domain example pairs with the\nsame semantics for domain alignment. Extensive experiments on popular datasets\nshow that our CoCo outperforms these competing baselines in different settings\ngenerally.\n","authors":["Nan Yin","Li Shen","Mengzhu Wang","Long Lan","Zeyu Ma","Chong Chen","Xian-Sheng Hua","Xiao Luo"],"pdf_url":"https://arxiv.org/pdf/2306.04979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20062v1","updated":"2024-07-29T14:48:34Z","published":"2024-07-29T14:48:34Z","title":"SalNAS: Efficient Saliency-prediction Neural Architecture Search with\n self-knowledge distillation","summary":" Recent advancements in deep convolutional neural networks have significantly\nimproved the performance of saliency prediction. However, the manual\nconfiguration of the neural network architectures requires domain knowledge\nexpertise and can still be time-consuming and error-prone. To solve this, we\npropose a new Neural Architecture Search (NAS) framework for saliency\nprediction with two contributions. Firstly, a supernet for saliency prediction\nis built with a weight-sharing network containing all candidate architectures,\nby integrating a dynamic convolution into the encoder-decoder in the supernet,\ntermed SalNAS. Secondly, despite the fact that SalNAS is highly efficient\n(20.98 million parameters), it can suffer from the lack of generalization. To\nsolve this, we propose a self-knowledge distillation approach, termed Self-KD,\nthat trains the student SalNAS with the weighted average information between\nthe ground truth and the prediction from the teacher model. The teacher model,\nwhile sharing the same architecture, contains the best-performing weights\nchosen by cross-validation. 
Self-KD can generalize well without the need to\ncompute the gradient in the teacher model, enabling an efficient training\nsystem. By utilizing Self-KD, SalNAS outperforms other state-of-the-art\nsaliency prediction models in most evaluation rubrics across seven benchmark\ndatasets while being a lightweight model. The code will be available at\nhttps://github.com/chakkritte/SalNAS\n","authors":["Chakkrit Termritthikun","Ayaz Umer","Suwichaya Suwanwimolkul","Feng Xia","Ivan Lee"],"pdf_url":"https://arxiv.org/pdf/2407.20062v1.pdf","comment":"Published in Engineering Applications of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2407.20061v1","updated":"2024-07-29T14:47:46Z","published":"2024-07-29T14:47:46Z","title":"Autonomous Bootstrapping of Quantum Dot Devices","summary":" Semiconductor quantum dots (QD) are a promising platform for multiple\ndifferent qubit implementations, all of which are voltage-controlled by\nprogrammable gate electrodes. However, as the QD arrays grow in size and\ncomplexity, tuning procedures that can fully autonomously handle the increasing\nnumber of control parameters are becoming essential for enabling scalability.\nWe propose a bootstrapping algorithm for initializing a depletion mode QD\ndevice in preparation for subsequent phases of tuning. During bootstrapping,\nthe QD device functionality is validated, all gates are characterized, and the\nQD charge sensor is made operational. We demonstrate the bootstrapping protocol\nin conjunction with a coarse tuning module, showing that the combined algorithm\ncan efficiently and reliably take a cooled-down QD device to a desired global\nstate configuration in under 8 minutes with a success rate of 96 %.\nImportantly, by following heuristic approaches to QD device initialization and\ncombining the efficient ray-based measurement with the rapid radio-frequency\nreflectometry measurements, the proposed algorithm establishes a reference in\nterms of performance, reliability, and efficiency against which alternative\nalgorithms can be benchmarked.\n","authors":["Anton Zubchenko","Danielle Middlebrooks","Torbjørn Rasmussen","Lara Lausen","Ferdinand Kuemmeth","Anasua Chatterjee","Justyna P. Zwolak"],"pdf_url":"https://arxiv.org/pdf/2407.20061v1.pdf","comment":"9 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2407.20060v1","updated":"2024-07-29T14:46:13Z","published":"2024-07-29T14:46:13Z","title":"RelBench: A Benchmark for Deep Learning on Relational Databases","summary":" We present RelBench, a public benchmark for solving predictive tasks over\nrelational databases with graph neural networks. RelBench provides databases\nand tasks spanning diverse domains and scales, and is intended to be a\nfoundational infrastructure for future research. We use RelBench to conduct the\nfirst comprehensive study of Relational Deep Learning (RDL) (Fey et al., 2024),\nwhich combines graph neural network predictive models with (deep) tabular\nmodels that extract initial entity-level representations from raw tables.\nEnd-to-end learned RDL models fully exploit the predictive signal encoded in\nprimary-foreign key links, marking a significant shift away from the dominant\nparadigm of manual feature engineering combined with tabular models. To\nthoroughly evaluate RDL against this prior gold-standard, we conduct an\nin-depth user study where an experienced data scientist manually engineers\nfeatures for each task. In this study, RDL learns better models whilst reducing\nhuman work needed by more than an order of magnitude. 
This demonstrates the\npower of deep learning for solving predictive tasks over relational databases,\nopening up many new research opportunities enabled by RelBench.\n","authors":["Joshua Robinson","Rishabh Ranjan","Weihua Hu","Kexin Huang","Jiaqi Han","Alejandro Dobles","Matthias Fey","Jan E. Lenssen","Yiwen Yuan","Zecheng Zhang","Xinwei He","Jure Leskovec"],"pdf_url":"https://arxiv.org/pdf/2407.20060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11965v2","updated":"2024-07-29T14:43:48Z","published":"2024-04-18T07:52:12Z","title":"Multi-fidelity Gaussian process surrogate modeling for regression\n problems in physics","summary":" One of the main challenges in surrogate modeling is the limited availability\nof data due to resource constraints associated with computationally expensive\nsimulations. Multi-fidelity methods provide a solution by chaining models in a\nhierarchy with increasing fidelity, associated with lower error, but increasing\ncost. In this paper, we compare different multi-fidelity methods employed in\nconstructing Gaussian process surrogates for regression. Non-linear\nautoregressive methods in the existing literature are primarily confined to\ntwo-fidelity models, and we extend these methods to handle more than two levels\nof fidelity. Additionally, we propose enhancements for an existing method\nincorporating delay terms by introducing a structured kernel. We demonstrate\nthe performance of these methods across various academic and real-world\nscenarios. Our findings reveal that multi-fidelity methods generally have a\nsmaller prediction error for the same computational cost as compared to the\nsingle-fidelity method, although their effectiveness varies across different\nscenarios.\n","authors":["Kislaya Ravi","Vladyslav Fediukov","Felix Dietrich","Tobias Neckel","Fabian Buse","Michael Bergmann","Hans-Joachim Bungartz"],"pdf_url":"https://arxiv.org/pdf/2404.11965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15439v3","updated":"2024-07-29T14:42:20Z","published":"2024-07-22T07:36:27Z","title":"Merit-based Fair Combinatorial Semi-Bandit with Unrestricted Feedback\n Delays","summary":" We study the stochastic combinatorial semi-bandit problem with unrestricted\nfeedback delays under merit-based fairness constraints. This is motivated by\napplications such as crowdsourcing, and online advertising, where immediate\nfeedback is not immediately available and fairness among different choices (or\narms) is crucial. We consider two types of unrestricted feedback delays:\nreward-independent delays where the feedback delays are independent of the\nrewards, and reward-dependent delays where the feedback delays are correlated\nwith the rewards. Furthermore, we introduce merit-based fairness constraints to\nensure a fair selection of the arms. We define the reward regret and the\nfairness regret and present new bandit algorithms to select arms under\nunrestricted feedback delays based on their merits. We prove that our\nalgorithms all achieve sublinear expected reward regret and expected fairness\nregret, with a dependence on the quantiles of the delay distribution. We also\nconduct extensive experiments using synthetic and real-world data and show that\nour algorithms can fairly select arms with different feedback delays.\n","authors":["Ziqun Chen","Kechao Cai","Zhuoyue Chen","Jinbei Zhang","John C. S. 
Lui"],"pdf_url":"https://arxiv.org/pdf/2407.15439v3.pdf","comment":"28 pages, 9 figures, accepted for 27th European Conference on\n Artificial Intelligence (ECAI 2024), Source code added, Typo fixed"},{"id":"http://arxiv.org/abs/2407.20053v1","updated":"2024-07-29T14:40:07Z","published":"2024-07-29T14:40:07Z","title":"Orca: Ocean Significant Wave Height Estimation with Spatio-temporally\n Aware Large Language Models","summary":" Significant wave height (SWH) is a vital metric in marine science, and\naccurate SWH estimation is crucial for various applications, e.g., marine\nenergy development, fishery, early warning systems for potential risks, etc.\nTraditional SWH estimation methods that are based on numerical models and\nphysical theories are hindered by computational inefficiencies. Recently,\nmachine learning has emerged as an appealing alternative to improve accuracy\nand reduce computational time. However, due to limited observational technology\nand high costs, the scarcity of real-world data restricts the potential of\nmachine learning models. To overcome these limitations, we propose an ocean SWH\nestimation framework, namely Orca. Specifically, Orca enhances the limited\nspatio-temporal reasoning abilities of classic LLMs with a novel spatiotemporal\naware encoding module. By segmenting the limited buoy observational data\ntemporally, encoding the buoys' locations spatially, and designing prompt\ntemplates, Orca capitalizes on the robust generalization ability of LLMs to\nestimate significant wave height effectively with limited data. Experimental\nresults on the Gulf of Mexico demonstrate that Orca achieves state-of-the-art\nperformance in SWH estimation.\n","authors":["Zhe Li","Ronghui Xu","Jilin Hu","Zhong Peng","Xi Lu","Chenjuan Guo","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.20053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18913v2","updated":"2024-07-29T14:36:16Z","published":"2024-05-29T09:16:03Z","title":"Leveraging Time-Series Foundation Models in Smart Agriculture for Soil\n Moisture Forecasting","summary":" The recent surge in foundation models for natural language processing and\ncomputer vision has fueled innovation across various domains. Inspired by this\nprogress, we explore the potential of foundation models for time-series\nforecasting in smart agriculture, a field often plagued by limited data\navailability. Specifically, this work presents a novel application of\n$\\texttt{TimeGPT}$, a state-of-the-art (SOTA) time-series foundation model, to\npredict soil water potential ($\\psi_\\mathrm{soil}$), a key indicator of field\nwater status that is typically used for irrigation advice. Traditionally, this\ntask relies on a wide array of input variables. We explore\n$\\psi_\\mathrm{soil}$'s ability to forecast $\\psi_\\mathrm{soil}$ in: ($i$) a\nzero-shot setting, ($ii$) a fine-tuned setting relying solely on historic\n$\\psi_\\mathrm{soil}$ measurements, and ($iii$) a fine-tuned setting where we\nalso add exogenous variables to the model. We compare $\\texttt{TimeGPT}$'s\nperformance to established SOTA baseline models for forecasting\n$\\psi_\\mathrm{soil}$. Our results demonstrate that $\\texttt{TimeGPT}$ achieves\ncompetitive forecasting accuracy using only historical $\\psi_\\mathrm{soil}$\ndata, highlighting its remarkable potential for agricultural applications. 
This\nresearch paves the way for foundation time-series models for sustainable\ndevelopment in agriculture by enabling forecasting tasks that were\ntraditionally reliant on extensive data collection and domain expertise.\n","authors":["Boje Deforce","Bart Baesens","Estefanía Serral Asensio"],"pdf_url":"https://arxiv.org/pdf/2405.18913v2.pdf","comment":"7 pages, accepted at KDD '24 - Fragile Earth Workshop"},{"id":"http://arxiv.org/abs/2401.05373v2","updated":"2024-07-29T14:33:02Z","published":"2023-12-15T12:45:47Z","title":"Dynamic Spiking Graph Neural Networks","summary":" The integration of Spiking Neural Networks (SNNs) and Graph Neural Networks\n(GNNs) is gradually attracting attention due to the low power consumption and\nhigh efficiency in processing the non-Euclidean data represented by graphs.\nHowever, as a common problem, dynamic graph representation learning faces\nchallenges such as high complexity and large memory overheads. Current work\noften uses SNNs instead of Recurrent Neural Networks (RNNs) by using binary\nfeatures instead of continuous ones for efficient training, which would\noverlooks graph structure information and leads to the loss of details during\npropagation. Additionally, optimizing dynamic spiking models typically requires\npropagation of information across time steps, which increases memory\nrequirements. To address these challenges, we present a framework named\n\\underline{Dy}namic \\underline{S}p\\underline{i}king \\underline{G}raph\n\\underline{N}eural Networks (\\method{}). To mitigate the information loss\nproblem, \\method{} propagates early-layer information directly to the last\nlayer for information compensation. To accommodate the memory requirements, we\napply the implicit differentiation on the equilibrium state, which does not\nrely on the exact reverse of the forward computation. While traditional\nimplicit differentiation methods are usually used for static situations,\n\\method{} extends it to the dynamic graph setting. Extensive experiments on\nthree large-scale real-world dynamic graph datasets validate the effectiveness\nof \\method{} on dynamic node classification tasks with lower computational\ncosts.\n","authors":["Nan Yin","Mengzhu Wang","Zhenghan Chen","Giulia De Masi","Bin Gu","Huan Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.05373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20047v1","updated":"2024-07-29T14:31:44Z","published":"2024-07-29T14:31:44Z","title":"Denoising ESG: quantifying data uncertainty from missing data with\n Machine Learning and prediction intervals","summary":" Environmental, Social, and Governance (ESG) datasets are frequently plagued\nby significant data gaps, leading to inconsistencies in ESG ratings due to\nvarying imputation methods. This paper explores the application of established\nmachine learning techniques for imputing missing data in a real-world ESG\ndataset, emphasizing the quantification of uncertainty through prediction\nintervals. By employing multiple imputation strategies, this study assesses the\nrobustness of imputation methods and quantifies the uncertainty associated with\nmissing data. The findings highlight the importance of probabilistic machine\nlearning models in providing better understanding of ESG scores, thereby\naddressing the inherent risks of wrong ratings due to incomplete data. 
This\napproach improves imputation practices to enhance the reliability of ESG\nratings.\n","authors":["Sergio Caprioli","Jacopo Foschi","Riccardo Crupi","Alessandro Sabatino"],"pdf_url":"https://arxiv.org/pdf/2407.20047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17823v3","updated":"2024-07-29T14:12:50Z","published":"2024-01-31T13:28:07Z","title":"Privacy-preserving data release leveraging optimal transport and\n particle gradient descent","summary":" We present a novel approach for differentially private data synthesis of\nprotected tabular datasets, a relevant task in highly sensitive domains such as\nhealthcare and government. Current state-of-the-art methods predominantly use\nmarginal-based approaches, where a dataset is generated from private estimates\nof the marginals. In this paper, we introduce PrivPGD, a new generation method\nfor marginal-based private data synthesis, leveraging tools from optimal\ntransport and particle gradient descent. Our algorithm outperforms existing\nmethods on a large range of datasets while being highly scalable and offering\nthe flexibility to incorporate additional domain-specific constraints.\n","authors":["Konstantin Donhauser","Javier Abad","Neha Hulkund","Fanny Yang"],"pdf_url":"https://arxiv.org/pdf/2401.17823v3.pdf","comment":"Published at the Forty-first International Conference on Machine\n Learning"},{"id":"http://arxiv.org/abs/2309.02876v2","updated":"2024-07-29T14:10:52Z","published":"2023-09-06T10:02:58Z","title":"Non-Clashing Teaching Maps for Balls in Graphs","summary":" Recently, Kirkpatrick et al. [ALT 2019] and Fallat et al. [JMLR 2023]\nintroduced non-clashing teaching and showed it is the most efficient machine\nteaching model satisfying the Goldman-Mathias collusion-avoidance criterion. A\nteaching map $T$ for a concept class $\\mathcal{C}$ assigns a (teaching) set\n$T(C)$ of examples to each concept $C \\in \\mathcal{C}$. A teaching map is\nnon-clashing if no pair of concepts are consistent with the union of their\nteaching sets. The size of a non-clashing teaching map (NCTM) $T$ is the\nmaximum size of a teaching set $T(C)$, $C \\in \\mathcal{C}$. The non-clashing\nteaching dimension NCTD$(\\mathcal{C})$ of $\\mathcal{C}$ is the minimum size of\nan NCTM for $\\mathcal{C}$. NCTM$^+$ and NCTD$^+(\\mathcal{C})$ are defined\nanalogously, except the teacher may only use positive examples.\n We study NCTMs and NCTM$^+$s for the concept class $\\mathcal{B}(G)$\nconsisting of all balls of a graph $G$. We show that the associated decision\nproblem B-NCTD$^+$ for NCTD$^+$ is NP-complete in split, co-bipartite, and\nbipartite graphs. Surprisingly, we even prove that, unless the ETH fails,\nB-NCTD$^+$ does not admit an algorithm running in time\n$2^{2^{o(\\text{vc})}}\\cdot n^{O(1)}$, nor a kernelization algorithm outputting\na kernel with $2^{o(\\text{vc})}$ vertices, where vc is the vertex cover number\nof $G$. We complement these lower bounds with matching upper bounds. These are\nextremely rare results: it is only the second problem in NP to admit such a\ntight double-exponential lower bound parameterized by vc, and only one of very\nfew problems to admit such an ETH-based conditional lower bound on the number\nof vertices in a kernel. 
For trees, interval graphs, cycles, and trees of\ncycles, we derive NCTM$^+$s or NCTMs for $\\mathcal{B}(G)$ of size proportional\nto its VC-dimension, and for Gromov-hyperbolic graphs, we design an approximate\nNCTM$^+$ of size 2.\n","authors":["Jérémie Chalopin","Victor Chepoi","Fionn Mc Inerney","Sébastien Ratel"],"pdf_url":"https://arxiv.org/pdf/2309.02876v2.pdf","comment":"Published in the proceedings of COLT 2024. Shortened abstract due to\n character limit"},{"id":"http://arxiv.org/abs/2407.20028v1","updated":"2024-07-29T14:04:46Z","published":"2024-07-29T14:04:46Z","title":"Aircraft Trajectory Segmentation-based Contrastive Coding: A Framework\n for Self-supervised Trajectory Representation","summary":" Air traffic trajectory recognition has gained significant interest within the\nair traffic management community, particularly for fundamental tasks such as\nclassification and clustering. This paper introduces Aircraft Trajectory\nSegmentation-based Contrastive Coding (ATSCC), a novel self-supervised time\nseries representation learning framework designed to capture semantic\ninformation in air traffic trajectory data. The framework leverages the\nsegmentable characteristic of trajectories and ensures consistency within the\nself-assigned segments. Intensive experiments were conducted on datasets from\nthree different airports, totaling four datasets, comparing the learned\nrepresentation's performance of downstream classification and clustering with\nother state-of-the-art representation learning techniques. The results show\nthat ATSCC outperforms these methods by aligning with the labels defined by\naeronautical procedures. ATSCC is adaptable to various airport configurations\nand scalable to incomplete trajectories. This research has expanded upon\nexisting capabilities, achieving these improvements independently without\npredefined inputs such as airport configurations, maneuvering procedures, or\nlabeled data.\n","authors":["Thaweerath Phisannupawong","Joshua Julian Damanik","Han-Lim Choi"],"pdf_url":"https://arxiv.org/pdf/2407.20028v1.pdf","comment":"16 pages, 7 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2407.16239v2","updated":"2024-07-29T14:04:20Z","published":"2024-07-23T07:26:38Z","title":"Identifiable latent bandits: Combining observational data and\n exploration for personalized healthcare","summary":" Bandit algorithms hold great promise for improving personalized\ndecision-making but are notoriously sample-hungry. In most health applications,\nit is infeasible to fit a new bandit for each patient, and observable variables\nare often insufficient to determine optimal treatments, ruling out applying\ncontextual bandits learned from multiple patients. Latent bandits offer both\nrapid exploration and personalization beyond what context variables can reveal\nbut require that a latent variable model can be learned consistently. In this\nwork, we propose bandit algorithms based on nonlinear independent component\nanalysis that can be provably identified from observational data to a degree\nsufficient to infer the optimal action in a new bandit instance consistently.\nWe verify this strategy in simulated data, showing substantial improvement over\nlearning independent multi-armed bandits for every instance.\n","authors":["Ahmet Zahid Balcıoğlu","Emil Carlsson","Fredrik D. 
Johansson"],"pdf_url":"https://arxiv.org/pdf/2407.16239v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2201.03019v3","updated":"2024-07-29T13:57:56Z","published":"2022-01-09T14:14:28Z","title":"Robust and Resource-Efficient Data-Free Knowledge Distillation by\n Generative Pseudo Replay","summary":" Data-Free Knowledge Distillation (KD) allows knowledge transfer from a\ntrained neural network (teacher) to a more compact one (student) in the absence\nof original training data. Existing works use a validation set to monitor the\naccuracy of the student over real data and report the highest performance\nthroughout the entire process. However, validation data may not be available at\ndistillation time either, making it infeasible to record the student snapshot\nthat achieved the peak accuracy. Therefore, a practical data-free KD method\nshould be robust and ideally provide monotonically increasing student accuracy\nduring distillation. This is challenging because the student experiences\nknowledge degradation due to the distribution shift of the synthetic data. A\nstraightforward approach to overcome this issue is to store and rehearse the\ngenerated samples periodically, which increases the memory footprint and\ncreates privacy concerns. We propose to model the distribution of the\npreviously observed synthetic samples with a generative network. In particular,\nwe design a Variational Autoencoder (VAE) with a training objective that is\ncustomized to learn the synthetic data representations optimally. The student\nis rehearsed by the generative pseudo replay technique, with samples produced\nby the VAE. Hence knowledge degradation can be prevented without storing any\nsamples. Experiments on image classification benchmarks show that our method\noptimizes the expected value of the distilled model accuracy while eliminating\nthe large memory overhead incurred by the sample-storing methods.\n","authors":["Kuluhan Binici","Shivam Aggarwal","Nam Trung Pham","Karianto Leman","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2201.03019v3.pdf","comment":"AAAI Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2407.20021v1","updated":"2024-07-29T13:57:40Z","published":"2024-07-29T13:57:40Z","title":"MimiQ: Low-Bit Data-Free Quantization of Vision Transformers","summary":" Data-free quantization (DFQ) is a technique that creates a lightweight\nnetwork from its full-precision counterpart without the original training data,\noften through a synthetic dataset. Although several DFQ methods have been\nproposed for vision transformer (ViT) architectures, they fail to achieve\nefficacy in low-bit settings. Examining the existing methods, we identify that\ntheir synthetic data produce misaligned attention maps, while those of the real\nsamples are highly aligned. From the observation of aligned attention, we find\nthat aligning attention maps of synthetic data helps to improve the overall\nperformance of quantized ViTs. Motivated by this finding, we devise \\aname, a\nnovel DFQ method designed for ViTs that focuses on inter-head attention\nsimilarity. First, we generate synthetic data by aligning head-wise attention\nresponses in relation to spatial query patches. Then, we apply head-wise\nstructural attention distillation to align the attention maps of the quantized\nnetwork to those of the full-precision teacher. 
The experimental results show\nthat the proposed method significantly outperforms baselines, setting a new\nstate-of-the-art performance for data-free ViT quantization.\n","authors":["Kanghyun Choi","Hye Yoon Lee","Dain Kwon","SunJong Park","Kyuyeun Kim","Noseong Park","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2407.20021v1.pdf","comment":"Author Preprint"},{"id":"http://arxiv.org/abs/2407.20020v1","updated":"2024-07-29T13:57:24Z","published":"2024-07-29T13:57:24Z","title":"ImagiNet: A Multi-Content Dataset for Generalizable Synthetic Image\n Detection via Contrastive Learning","summary":" Generative models, such as diffusion models (DMs), variational autoencoders\n(VAEs), and generative adversarial networks (GANs), produce images with a level\nof authenticity that makes them nearly indistinguishable from real photos and\nartwork. While this capability is beneficial for many industries, the\ndifficulty of identifying synthetic images leaves online media platforms\nvulnerable to impersonation and misinformation attempts. To support the\ndevelopment of defensive methods, we introduce ImagiNet, a high-resolution and\nbalanced dataset for synthetic image detection, designed to mitigate potential\nbiases in existing resources. It contains 200K examples, spanning four content\ncategories: photos, paintings, faces, and uncategorized. Synthetic images are\nproduced with open-source and proprietary generators, whereas real counterparts\nof the same content type are collected from public datasets. The structure of\nImagiNet allows for a two-track evaluation system: i) classification as real or\nsynthetic and ii) identification of the generative model. To establish a\nbaseline, we train a ResNet-50 model using a self-supervised contrastive\nobjective (SelfCon) for each track. The model demonstrates state-of-the-art\nperformance and high inference speed across established benchmarks, achieving\nan AUC of up to 0.99 and balanced accuracy ranging from 86% to 95%, even under\nsocial network conditions that involve compression and resizing. Our data and\ncode are available at https://github.com/delyan-boychev/imaginet.\n","authors":["Delyan Boychev","Radostin Cholakov"],"pdf_url":"https://arxiv.org/pdf/2407.20020v1.pdf","comment":"24 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.20013v1","updated":"2024-07-29T13:45:23Z","published":"2024-07-29T13:45:23Z","title":"Classification of freshwater snails of the genus \\emph{Radomaniola} with\n multimodal triplet networks","summary":" In this paper, we present our first proposal of a machine learning system for\nthe classification of freshwater snails of the genus \\emph{Radomaniola}. We\nelaborate on the specific challenges encountered during system design, and how\nwe tackled them; namely a small, very imbalanced dataset with a high number of\nclasses and high visual similarity between classes. We then show how we\nemployed triplet networks and the multiple input modalities of images,\nmeasurements, and genetic information to overcome these challenges and reach a\nperformance comparable to that of a trained domain expert.\n","authors":["Dennis Vetter","Muhammad Ahsan","Diana Delicado","Thomas A. 
Neubauer","Thomas Wilke","Gemma Roig"],"pdf_url":"https://arxiv.org/pdf/2407.20013v1.pdf","comment":"Spotlight at ICML 2024 AI for Science workshop"},{"id":"http://arxiv.org/abs/2407.20003v1","updated":"2024-07-29T13:34:34Z","published":"2024-07-29T13:34:34Z","title":"On the Effects of Irrelevant Variables in Treatment Effect Estimation\n with Deep Disentanglement","summary":" Estimating treatment effects from observational data is paramount in\nhealthcare, education, and economics, but current deep disentanglement-based\nmethods to address selection bias are insufficiently handling irrelevant\nvariables. We demonstrate in experiments that this leads to prediction errors.\nWe disentangle pre-treatment variables with a deep embedding method and\nexplicitly identify and represent irrelevant variables, additionally to\ninstrumental, confounding and adjustment latent factors. To this end, we\nintroduce a reconstruction objective and create an embedding space for\nirrelevant variables using an attached autoencoder. Instead of relying on\nserendipitous suppression of irrelevant variables as in previous deep\ndisentanglement approaches, we explicitly force irrelevant variables into this\nembedding space and employ orthogonalization to prevent irrelevant information\nfrom leaking into the latent space representations of the other factors. Our\nexperiments with synthetic and real-world benchmark datasets show that we can\nbetter identify irrelevant variables and more precisely predict treatment\neffects than previous methods, while prediction quality degrades less when\nadditional irrelevant variables are introduced.\n","authors":["Ahmad Saeed Khan","Erik Schaffernicht","Johannes Andreas Stork"],"pdf_url":"https://arxiv.org/pdf/2407.20003v1.pdf","comment":"Paper is accepted at ECAI-2024"},{"id":"http://arxiv.org/abs/2407.20000v1","updated":"2024-07-29T13:32:42Z","published":"2024-07-29T13:32:42Z","title":"Collision Probability Distribution Estimation via Temporal Difference\n Learning","summary":" We introduce CollisionPro, a pioneering framework designed to estimate\ncumulative collision probability distributions using temporal difference\nlearning, specifically tailored to applications in robotics, with a particular\nemphasis on autonomous driving. This approach addresses the demand for\nexplainable artificial intelligence (XAI) and seeks to overcome limitations\nimposed by model-based approaches and conservative constraints. We formulate\nour framework within the context of reinforcement learning to pave the way for\nsafety-aware agents. Nevertheless, we assert that our approach could prove\nbeneficial in various contexts, including a safety alert system or analytical\npurposes. A comprehensive examination of our framework is conducted using a\nrealistic autonomous driving simulator, illustrating its high sample efficiency\nand reliable prediction capabilities for previously unseen collision events.\nThe source code is publicly available.\n","authors":["Thomas Steinecker","Thorsten Luettel","Mirko Maehlisch"],"pdf_url":"https://arxiv.org/pdf/2407.20000v1.pdf","comment":"Code: https://github.com/UniBwTAS/CollisionPro"},{"id":"http://arxiv.org/abs/2404.09247v2","updated":"2024-07-29T13:32:39Z","published":"2024-04-14T13:17:32Z","title":"Generalization Error Bounds for Learning under Censored Feedback","summary":" Generalization error bounds from learning theory provide statistical\nguarantees on how well an algorithm will perform on previously unseen data. 
In\nthis paper, we characterize the impacts of data non-IIDness due to censored\nfeedback (a.k.a. selective labeling bias) on such bounds. We first derive an\nextension of the well-known Dvoretzky-Kiefer-Wolfowitz (DKW) inequality, which\ncharacterizes the gap between empirical and theoretical CDFs given IID data, to\nproblems with non-IID data due to censored feedback. We then use this CDF error\nbound to provide a bound on the generalization error guarantees of a classifier\ntrained on such non-IID data. We show that existing generalization error bounds\n(which do not account for censored feedback) fail to correctly capture the\nmodel's generalization guarantees, verifying the need for our bounds. We\nfurther analyze the effectiveness of (pure and bounded) exploration techniques,\nproposed by recent literature as a way to alleviate censored feedback, on\nimproving our error bounds. Together, our findings illustrate how a decision\nmaker should account for the trade-off between strengthening the generalization\nguarantees of an algorithm and the costs incurred in data collection when\nfuture data availability is limited by censored feedback.\n","authors":["Yifan Yang","Ali Payani","Parinaz Naghizadeh"],"pdf_url":"https://arxiv.org/pdf/2404.09247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18343v2","updated":"2024-07-29T13:25:41Z","published":"2024-07-25T19:07:49Z","title":"Introducing δ-XAI: a novel sensitivity-based method for local AI\n explanations","summary":" Explainable Artificial Intelligence (XAI) is central to the debate on\nintegrating Artificial Intelligence (AI) and Machine Learning (ML) algorithms\ninto clinical practice. High-performing AI/ML models, such as ensemble learners\nand deep neural networks, often lack interpretability, hampering clinicians'\ntrust in their predictions. To address this, XAI techniques are being developed\nto describe AI/ML predictions in human-understandable terms. One promising\ndirection is the adaptation of sensitivity analysis (SA) and global sensitivity\nanalysis (GSA), which inherently rank model inputs by their impact on\npredictions. Here, we introduce a novel delta-XAI method that provides local\nexplanations of ML model predictions by extending the delta index, a GSA\nmetric. The delta-XAI index assesses the impact of each feature's value on the\npredicted output for individual instances in both regression and classification\nproblems. We formalize the delta-XAI index and provide code for its\nimplementation. The delta-XAI method was evaluated on simulated scenarios using\nlinear regression models, with Shapley values serving as a benchmark. Results\nshowed that the delta-XAI index is generally consistent with Shapley values,\nwith notable discrepancies in models with highly impactful or extreme feature\nvalues. The delta-XAI index demonstrated higher sensitivity in detecting\ndominant features and handling extreme feature values. Qualitatively, the\ndelta-XAI provides intuitive explanations by leveraging probability density\nfunctions, making feature rankings clearer and more explainable for\npractitioners. Overall, the delta-XAI method appears promising for robustly\nobtaining local explanations of ML model predictions. 
Further investigations in\nreal-world clinical settings will be conducted to evaluate its impact on\nAI-assisted clinical workflows.\n","authors":["Alessandro De Carlo","Enea Parimbelli","Nicola Melillo","Giovanna Nicora"],"pdf_url":"https://arxiv.org/pdf/2407.18343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19990v1","updated":"2024-07-29T13:22:49Z","published":"2024-07-29T13:22:49Z","title":"Classification of Alzheimer's Dementia vs. Healthy subjects by studying\n structural disparities in fMRI Time-Series of DMN","summary":" Time series from different regions of interest (ROI) of default mode network\n(DMN) from Functional Magnetic Resonance Imaging (fMRI) can reveal significant\ndifferences between healthy and unhealthy people. Here, we propose the utility\nof an existing metric quantifying the lack/presence of structure in a signal\ncalled, \"deviation from stochasticity\" (DS) measure to characterize\nresting-state fMRI time series. The hypothesis is that differences in the level\nof structure in the time series can lead to discrimination between the subject\ngroups. In this work, an autoencoder-based model is utilized to learn efficient\nrepresentations of data by training the network to reconstruct its input data.\nThe proposed methodology is applied on fMRI time series of 50 healthy\nindividuals and 50 subjects with Alzheimer's Disease (AD), obtained from\npublicly available ADNI database. DS measure for healthy fMRI as expected turns\nout to be different compared to that of AD. Peak classification accuracy of 95%\nwas obtained using Gradient Boosting classifier, using the DS measure applied\non 100 subjects.\n","authors":["Sneha Noble","Chakka Sai Pradeep","Neelam Sinha","Thomas Gregor Issac"],"pdf_url":"https://arxiv.org/pdf/2407.19990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19985v1","updated":"2024-07-29T13:19:31Z","published":"2024-07-29T13:19:31Z","title":"Mixture of Nested Experts: Adaptive Processing of Visual Tokens","summary":" The visual medium (images and videos) naturally contains a large amount of\ninformation redundancy, thereby providing a great opportunity for leveraging\nefficiency in processing. While Vision Transformer (ViT) based models scale\neffectively to large data regimes, they fail to capitalize on this inherent\nredundancy, leading to higher computational costs. Mixture of Experts (MoE)\nnetworks demonstrate scalability while maintaining same inference-time costs,\nbut they come with a larger parameter footprint. We present Mixture of Nested\nExperts (MoNE), which utilizes a nested structure for experts, wherein\nindividual experts fall on an increasing compute-accuracy curve. Given a\ncompute budget, MoNE learns to dynamically choose tokens in a priority order,\nand thus redundant tokens are processed through cheaper nested experts. Using\nthis framework, we achieve equivalent performance as the baseline models, while\nreducing inference time compute by over two-fold. We validate our approach on\nstandard image and video datasets - ImageNet-21K, Kinetics400, and\nSomething-Something-v2. 
We further highlight MoNE$'$s adaptability by\nshowcasing its ability to maintain strong performance across different\ninference-time compute budgets on videos, using only a single trained model.\n","authors":["Gagan Jain","Nidhi Hegde","Aditya Kusupati","Arsha Nagrani","Shyamal Buch","Prateek Jain","Anurag Arnab","Sujoy Paul"],"pdf_url":"https://arxiv.org/pdf/2407.19985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.10082v2","updated":"2024-07-29T13:04:16Z","published":"2022-09-21T03:06:29Z","title":"Generalized Groves of Neural Additive Models: Pursuing transparent and\n accurate machine learning models in finance","summary":" While machine learning methods have significantly improved model performance\nover traditional methods, their black-box structure makes it difficult for\nresearchers to interpret results. For highly regulated financial industries,\nmodel transparency is equally important to accuracy. Without understanding how\nmodels work, even highly accurate machine learning methods are unlikely to be\naccepted. We address this issue by introducing a novel class of transparent\nmachine learning models known as generalized groves of neural additive models.\nThe generalized groves of neural additive models separate features into three\ncategories: linear features, individual nonlinear features, and interacted\nnonlinear features. Additionally, interactions in the last category are only\nlocal. A stepwise selection algorithm distinguishes the linear and nonlinear\ncomponents, and interacted groups are carefully verified by applying additive\nseparation criteria. Through some empirical examples in finance, we demonstrate\nthat generalized grove of neural additive models exhibit high accuracy and\ntransparency with predominantly linear terms and only sparse nonlinear ones.\n","authors":["Dangxing Chen","Weicheng Ye"],"pdf_url":"https://arxiv.org/pdf/2209.10082v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.04728v2","updated":"2024-07-29T12:54:16Z","published":"2022-01-11T00:10:28Z","title":"Quasi-Framelets: Robust Graph Neural Networks via Adaptive Framelet\n Convolution","summary":" This paper aims to provide a novel design of a multiscale framelet\nconvolution for spectral graph neural networks (GNNs). While current spectral\nmethods excel in various graph learning tasks, they often lack the flexibility\nto adapt to noisy, incomplete, or perturbed graph signals, making them fragile\nin such conditions. Our newly proposed framelet convolution addresses these\nlimitations by decomposing graph data into low-pass and high-pass spectra\nthrough a finely-tuned multiscale approach. Our approach directly designs\nfiltering functions within the spectral domain, allowing for precise control\nover the spectral components. The proposed design excels in filtering out\nunwanted spectral information and significantly reduces the adverse effects of\nnoisy graph signals. Our approach not only enhances the robustness of GNNs but\nalso preserves crucial graph features and structures. Through extensive\nexperiments on diverse, real-world graph datasets, we demonstrate that our\nframelet convolution achieves superior performance in node classification\ntasks. It exhibits remarkable resilience to noisy data and adversarial attacks,\nhighlighting its potential as a robust solution for real-world graph\napplications. 
This advancement opens new avenues for more adaptive and reliable\nspectral GNN architectures.\n","authors":["Mengxi Yang","Dai Shi","Xuebin Zheng","Jie Yin","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2201.04728v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03519v2","updated":"2024-07-29T12:53:53Z","published":"2024-06-05T17:41:42Z","title":"Noise-Aware Algorithm for Heterogeneous Differentially Private Federated\n Learning","summary":" High utility and rigorous data privacy are of the main goals of a federated\nlearning (FL) system, which learns a model from the data distributed among some\nclients. The latter has been tried to achieve by using differential privacy in\nFL (DPFL). There is often heterogeneity in clients privacy requirements, and\nexisting DPFL works either assume uniform privacy requirements for clients or\nare not applicable when server is not fully trusted (our setting). Furthermore,\nthere is often heterogeneity in batch and/or dataset size of clients, which as\nshown, results in extra variation in the DP noise level across clients model\nupdates. With these sources of heterogeneity, straightforward aggregation\nstrategies, e.g., assigning clients aggregation weights proportional to their\nprivacy parameters will lead to lower utility. We propose Robust-HDP, which\nefficiently estimates the true noise level in clients model updates and reduces\nthe noise-level in the aggregated model updates considerably. Robust-HDP\nimproves utility and convergence speed, while being safe to the clients that\nmay maliciously send falsified privacy parameter to server. Extensive\nexperimental results on multiple datasets and our theoretical analysis confirm\nthe effectiveness of Robust-HDP. Our code can be found here.\n","authors":["Saber Malekmohammadi","Yaoliang Yu","Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2406.03519v2.pdf","comment":"Proceedings of the 41 st International Conference on Machine\n Learning, Vienna, Austria. PMLR 235, 2024"},{"id":"http://arxiv.org/abs/2401.01192v2","updated":"2024-07-29T12:45:40Z","published":"2024-01-02T12:41:17Z","title":"Deep-ELA: Deep Exploratory Landscape Analysis with Self-Supervised\n Pretrained Transformers for Single- and Multi-Objective Continuous\n Optimization Problems","summary":" In many recent works, the potential of Exploratory Landscape Analysis (ELA)\nfeatures to numerically characterize, in particular, single-objective\ncontinuous optimization problems has been demonstrated. These numerical\nfeatures provide the input for all kinds of machine learning tasks on\ncontinuous optimization problems, ranging, i.a., from High-level Property\nPrediction to Automated Algorithm Selection and Automated Algorithm\nConfiguration. Without ELA features, analyzing and understanding the\ncharacteristics of single-objective continuous optimization problems is -- to\nthe best of our knowledge -- very limited.\n Yet, despite their usefulness, as demonstrated in several past works, ELA\nfeatures suffer from several drawbacks. These include, in particular, (1.) a\nstrong correlation between multiple features, as well as (2.) its very limited\napplicability to multi-objective continuous optimization problems. As a remedy,\nrecent works proposed deep learning-based approaches as alternatives to ELA. In\nthese works, e.g., point-cloud transformers were used to characterize an\noptimization problem's fitness landscape. 
However, these approaches require a\nlarge amount of labeled training data.\n Within this work, we propose a hybrid approach, Deep-ELA, which combines (the\nbenefits of) deep learning and ELA features. Specifically, we pre-trained four\ntransformers on millions of randomly generated optimization problems to learn\ndeep representations of the landscapes of continuous single- and\nmulti-objective optimization problems. Our proposed framework can either be\nused out-of-the-box for analyzing single- and multi-objective continuous\noptimization problems, or subsequently fine-tuned to various tasks focussing on\nalgorithm behavior and problem understanding.\n","authors":["Moritz Vinzent Seiler","Pascal Kerschke","Heike Trautmann"],"pdf_url":"https://arxiv.org/pdf/2401.01192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19951v1","updated":"2024-07-29T12:39:07Z","published":"2024-07-29T12:39:07Z","title":"Can I trust my anomaly detection system? A case study based on\n explainable AI","summary":" Generative models based on variational autoencoders are a popular technique\nfor detecting anomalies in images in a semi-supervised context. A common\napproach employs the anomaly score to detect the presence of anomalies, and it\nis known to reach high level of accuracy on benchmark datasets. However, since\nanomaly scores are computed from reconstruction disparities, they often obscure\nthe detection of various spurious features, raising concerns regarding their\nactual efficacy. This case study explores the robustness of an anomaly\ndetection system based on variational autoencoder generative models through the\nuse of eXplainable AI methods. The goal is to get a different perspective on\nthe real performances of anomaly detectors that use reconstruction differences.\nIn our case study we discovered that, in many cases, samples are detected as\nanomalous for the wrong or misleading factors.\n","authors":["Muhammad Rashid","Elvio Amparore","Enrico Ferrari","Damiano Verda"],"pdf_url":"https://arxiv.org/pdf/2407.19951v1.pdf","comment":"World Conference on eXplainable Artificial Intelligence"},{"id":"http://arxiv.org/abs/2407.19947v1","updated":"2024-07-29T12:29:29Z","published":"2024-07-29T12:29:29Z","title":"Inference acceleration for large language models using \"stairs\" assisted\n greedy generation","summary":" Large Language Models (LLMs) with billions of parameters are known for their\nimpressive predicting capabilities but require lots of resources to run. With\ntheir massive rise in popularity, even a small reduction in required resources\ncould have an impact on environment. On the other hand, smaller models require\nfewer resources but may sacrifice accuracy. In this work, we are proposing an\nimplementation of ``stairs'' assisted greedy generation. It is a modified\nassisted generation methodology that makes use of a smaller model's fast\ngeneration, large model's batch prediction, and \"stairs\" validation in order to\nachieve a speed up in prediction generation. 
Results show between 9.58 and\n17.24 percent inference time reduction compared to a stand-alone large LLM\nprediction in a text generation task without a loss in accuracy.\n","authors":["Domas Grigaliūnas","Mantas Lukoševičius"],"pdf_url":"https://arxiv.org/pdf/2407.19947v1.pdf","comment":"Accepted at the 29th International Conference on Information Society\n and University Studies (IVUS 2024)"},{"id":"http://arxiv.org/abs/2407.19944v1","updated":"2024-07-29T12:24:28Z","published":"2024-07-29T12:24:28Z","title":"Noise-Resilient Unsupervised Graph Representation Learning via Multi-Hop\n Feature Quality Estimation","summary":" Unsupervised graph representation learning (UGRL) based on graph neural\nnetworks (GNNs), has received increasing attention owing to its efficacy in\nhandling graph-structured data. However, existing UGRL methods ideally assume\nthat the node features are noise-free, which makes them fail to distinguish\nbetween useful information and noise when applied to real data with noisy\nfeatures, thus affecting the quality of learned representations. This urges us\nto take node noisy features into account in real-world UGRL. With empirical\nanalysis, we reveal that feature propagation, the essential operation in GNNs,\nacts as a \"double-edged sword\" in handling noisy features - it can both denoise\nand diffuse noise, leading to varying feature quality across nodes, even within\nthe same node at different hops. Building on this insight, we propose a novel\nUGRL method based on Multi-hop feature Quality Estimation (MQE for short).\nUnlike most UGRL models that directly utilize propagation-based GNNs to\ngenerate representations, our approach aims to learn representations through\nestimating the quality of propagated features at different hops. Specifically,\nwe introduce a Gaussian model that utilizes a learnable \"meta-representation\"\nas a condition to estimate the expectation and variance of multi-hop propagated\nfeatures via neural networks. In this way, the \"meta representation\" captures\nthe semantic and structural information underlying multiple propagated features\nbut is naturally less susceptible to interference by noise, thereby serving as\nhigh-quality node representations beneficial for downstream tasks. Extensive\nexperiments on multiple real-world datasets demonstrate that MQE in learning\nreliable node representations in scenarios with diverse types of feature noise.\n","authors":["Shiyuan Li","Yixin Liu","Qingfeng Chen","Geoffrey I. Webb","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2407.19944v1.pdf","comment":"Accepted by CIKM 2024. 11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.19943v1","updated":"2024-07-29T12:23:59Z","published":"2024-07-29T12:23:59Z","title":"Practical and Robust Safety Guarantees for Advanced Counterfactual\n Learning to Rank","summary":" Counterfactual learning to rank (CLTR ) can be risky; various circumstances\ncan cause it to produce sub-optimal models that hurt performance when deployed.\nSafe CLTR was introduced to mitigate these risks when using inverse propensity\nscoring to correct for position bias. However, the existing safety measure for\nCLTR is not applicable to state-of-the-art CLTR, it cannot handle trust bias,\nand its guarantees rely on specific assumptions about user behavior. Our\ncontributions are two-fold. First, we generalize the existing safe CLTR\napproach to make it applicable to state-of-the-art doubly robust (DR) CLTR and\ntrust bias. 
Second, we propose a novel approach, proximal ranking policy\noptimization (PRPO ), that provides safety in deployment without assumptions\nabout user behavior. PRPO removes incentives for learning ranking behavior that\nis too dissimilar to a safe ranking model. Thereby, PRPO imposes a limit on how\nmuch learned models can degrade performance metrics, without relying on any\nspecific user assumptions. Our experiments show that both our novel safe doubly\nrobust method and PRPO provide higher performance than the existing safe\ninverse propensity scoring approach. However, when circumstances are\nunexpected, the safe doubly robust approach can become unsafe and bring\ndetrimental performance. In contrast, PRPO always maintains safety, even in\nmaximally adversarial situations. By avoiding assumptions, PRPO is the first\nmethod with unconditional safety in deployment that translates to robust safety\nfor real-world applications.\n","authors":["Shashank Gupta","Harrie Oosterhuis","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2407.19943v1.pdf","comment":"Full paper at CIKM 2024"},{"id":"http://arxiv.org/abs/2407.19941v1","updated":"2024-07-29T12:22:16Z","published":"2024-07-29T12:22:16Z","title":"Boosting Graph Foundation Model from Structural Perspective","summary":" Graph foundation models have recently attracted significant attention due to\nits strong generalizability. Although existing methods resort to language\nmodels to learn unified semantic representations across domains, they disregard\nthe unique structural characteristics of graphs from different domains. To\naddress the problem, in this paper, we boost graph foundation model from\nstructural perspective and propose BooG. The model constructs virtual super\nnodes to unify structural characteristics of graph data from different domains.\nSpecifically, the super nodes fuse the information of anchor nodes and class\nlabels, where each anchor node captures the information of a node or a graph\ninstance to be classified. Instead of using the raw graph structure, we connect\nsuper nodes to all nodes within their neighborhood by virtual edges. This new\nstructure allows for effective information aggregation while unifying\ncross-domain structural characteristics. Additionally, we propose a novel\npre-training objective based on contrastive learning, which learns more\nexpressive representations for graph data and generalizes effectively to\ndifferent domains and downstream tasks. Experimental results on various\ndatasets and tasks demonstrate the superior performance of BooG. We provide our\ncode and data here: https://anonymous.4open.science/r/BooG-EE42/.\n","authors":["Yao Cheng","Yige Zhao","Jianxiang Yu","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2407.19941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10768v3","updated":"2024-07-29T12:19:10Z","published":"2024-07-15T14:50:15Z","title":"MSegRNN:Enhanced SegRNN Model with Mamba for Long-Term Time Series\n Forecasting","summary":" Long time series forecasting aims to utilize historical information to\nforecast future states over extended horizons. Traditional RNN-based series\nforecasting methods struggle to effectively address long-term dependencies and\ngradient issues in long time series problems. 
Recently, SegRNN has emerged as a\nleading RNN-based model tailored for long-term series forecasting,\ndemonstrating state-of-the-art performance while maintaining a streamlined\narchitecture through innovative segmentation and parallel decoding techniques.\nNevertheless, SegRNN has several limitations: its fixed segmentation disrupts\ndata continuity and fails to effectively leverage information across different\nsegments, the segmentation strategy employed by SegRNN does not fundamentally\naddress the issue of information loss within the recurrent structure. To\naddress these issues, we propose the MSegRNN method with three key\nenhancements: we introduce an implicit segmentation structure to decompose the\ntime series and map it to segmented hidden states, resulting in denser\ninformation exchange during the segmentation phase. Additionally, we\nincorporate residual structures in the encoding layer to mitigate information\nloss within the recurrent structure. To extract information more effectively,\nwe further integrate the Mamba architecture to enhance time series information\nextraction. Experiments on several real-world long time series forecasting\ndatasets demonstrate that our model surpasses the performance of current\nstate-of-the-art models.\n","authors":["GaoXiang Zhao","Li Zhou","XiaoQiang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.10768v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05445v2","updated":"2024-07-29T12:10:27Z","published":"2024-04-08T12:27:00Z","title":"Unsupervised Training of Convex Regularizers using Maximum Likelihood\n Estimation","summary":" Imaging is a standard example of an inverse problem, where the task of\nreconstructing a ground truth from a noisy measurement is ill-posed. Recent\nstate-of-the-art approaches for imaging use deep learning, spearheaded by\nunrolled and end-to-end models and trained on various image datasets. However,\nmany such methods require the availability of ground truth data, which may be\nunavailable or expensive, leading to a fundamental barrier that can not be\nbypassed by choice of architecture. Unsupervised learning presents an\nalternative paradigm that bypasses this requirement, as they can be learned\ndirectly on noisy data and do not require any ground truths. A principled\nBayesian approach to unsupervised learning is to maximize the marginal\nlikelihood with respect to the given noisy measurements, which is intrinsically\nlinked to classical variational regularization. We propose an unsupervised\napproach using maximum marginal likelihood estimation to train a convex neural\nnetwork-based image regularization term directly on noisy measurements,\nimproving upon previous work in both model expressiveness and dataset size.\nExperiments demonstrate that the proposed method produces priors that are near\ncompetitive when compared to the analogous supervised training method for\nvarious image corruption operators, maintaining significantly better\ngeneralization properties when compared to end-to-end methods. 
Moreover, we\nprovide a detailed theoretical analysis of the convergence properties of our\nproposed algorithm.\n","authors":["Hong Ye Tan","Ziruo Cai","Marcelo Pereyra","Subhadip Mukherjee","Junqi Tang","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2404.05445v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13900v2","updated":"2024-07-29T11:53:06Z","published":"2022-10-25T10:35:45Z","title":"Deep NURBS -- Admissible Physics-informed Neural Networks","summary":" In this study, we propose a new numerical scheme for physics-informed neural\nnetworks (PINNs) that enables precise and inexpensive solution for partial\ndifferential equations (PDEs) in case of arbitrary geometries while strictly\nenforcing Dirichlet boundary conditions. The proposed approach combines\nadmissible NURBS parametrizations required to define the physical domain and\nthe Dirichlet boundary conditions with a PINN solver. The fundamental boundary\nconditions are automatically satisfied in this novel Deep NURBS framework. We\nverified our new approach using two-dimensional elliptic PDEs when considering\narbitrary geometries, including non-Lipschitz domains. Compared to the\nclassical PINN solver, the Deep NURBS estimator has a remarkably high\nconvergence rate for all the studied problems. Moreover, a desirable accuracy\nwas realized for most of the studied PDEs using only one hidden layer of neural\nnetworks. This novel approach is considered to pave the way for more effective\nsolutions for high-dimensional problems by allowing for more realistic\nphysics-informed statistical learning to solve PDE-based variational problems.\n","authors":["Hamed Saidaoui","Luis Espath","Rául Tempone"],"pdf_url":"https://arxiv.org/pdf/2210.13900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11617v2","updated":"2024-07-29T11:51:40Z","published":"2023-07-21T14:36:40Z","title":"Robust Fully-Asynchronous Methods for Distributed Training over General\n Architecture","summary":" Perfect synchronization in distributed machine learning problems is\ninefficient and even impossible due to the existence of latency, package losses\nand stragglers. We propose a Robust Fully-Asynchronous Stochastic Gradient\nTracking method (R-FAST), where each device performs local computation and\ncommunication at its own pace without any form of synchronization. Different\nfrom existing asynchronous distributed algorithms, R-FAST can eliminate the\nimpact of data heterogeneity across devices and allow for packet losses by\nemploying a robust gradient tracking strategy that relies on properly designed\nauxiliary variables for tracking and buffering the overall gradient vector.\nMore importantly, the proposed method utilizes two spanning-tree graphs for\ncommunication so long as both share at least one common root, enabling flexible\ndesigns in communication architectures. We show that R-FAST converges in\nexpectation to a neighborhood of the optimum with a geometric rate for smooth\nand strongly convex objectives; and to a stationary point with a sublinear rate\nfor general non-convex settings. 
Extensive experiments demonstrate that R-FAST\nruns 1.5-2 times faster than synchronous benchmark algorithms, such as\nRing-AllReduce and D-PSGD, while still achieving comparable accuracy, and\noutperforms existing asynchronous SOTA algorithms, such as AD-PSGD and OSGP,\nespecially in the presence of stragglers.\n","authors":["Zehan Zhu","Ye Tian","Yan Huang","Jinming Xu","Shibo He"],"pdf_url":"https://arxiv.org/pdf/2307.11617v2.pdf","comment":"This paper has been accepted for publication as a regular paper in\n the IEEE Transactions on Signal and Information Processing over Networks"},{"id":"http://arxiv.org/abs/2407.19916v1","updated":"2024-07-29T11:48:44Z","published":"2024-07-29T11:48:44Z","title":"Aero-Nef: Neural Fields for Rapid Aircraft Aerodynamics Simulations","summary":" This paper presents a methodology to learn surrogate models of steady state\nfluid dynamics simulations on meshed domains, based on Implicit Neural\nRepresentations (INRs). The proposed models can be applied directly to\nunstructured domains for different flow conditions, handle non-parametric 3D\ngeometric variations, and generalize to unseen shapes at test time. The\ncoordinate-based formulation naturally leads to robustness with respect to\ndiscretization, allowing an excellent trade-off between computational cost\n(memory footprint and training time) and accuracy. The method is demonstrated\non two industrially relevant applications: a RANS dataset of the\ntwo-dimensional compressible flow over a transonic airfoil and a dataset of the\nsurface pressure distribution over 3D wings, including shape, inflow condition,\nand control surface deflection variations. On the considered test cases, our\napproach achieves a more than three times lower test error and significantly\nimproves generalization error on unseen geometries compared to state-of-the-art\nGraph Neural Network architectures. Remarkably, the method can perform\ninference five order of magnitude faster than the high fidelity solver on the\nRANS transonic airfoil dataset. Code is available at\nhttps://gitlab.isae-supaero.fr/gi.catalani/aero-nepf\n","authors":["Giovanni Catalani","Siddhant Agarwal","Xavier Bertrand","Frederic Tost","Michael Bauerheim","Joseph Morlier"],"pdf_url":"https://arxiv.org/pdf/2407.19916v1.pdf","comment":"32 pages"},{"id":"http://arxiv.org/abs/2407.19914v1","updated":"2024-07-29T11:44:21Z","published":"2024-07-29T11:44:21Z","title":"Sentiment Analysis of Lithuanian Online Reviews Using Large Language\n Models","summary":" Sentiment analysis is a widely researched area within Natural Language\nProcessing (NLP), attracting significant interest due to the advent of\nautomated solutions. Despite this, the task remains challenging because of the\ninherent complexity of languages and the subjective nature of sentiments. It is\neven more challenging for less-studied and less-resourced languages such as\nLithuanian. Our review of existing Lithuanian NLP research reveals that\ntraditional machine learning methods and classification algorithms have limited\neffectiveness for the task. In this work, we address sentiment analysis of\nLithuanian five-star-based online reviews from multiple domains that we collect\nand clean. We apply transformer models to this task for the first time,\nexploring the capabilities of pre-trained multilingual Large Language Models\n(LLMs), specifically focusing on fine-tuning BERT and T5 models. 
Given the\ninherent difficulty of the task, the fine-tuned models perform quite well,\nespecially when the sentiments themselves are less ambiguous: 80.74% and 89.61%\ntesting recognition accuracy of the most popular one- and five-star reviews\nrespectively. They significantly outperform current commercial state-of-the-art\ngeneral-purpose LLM GPT-4. We openly share our fine-tuned LLMs online.\n","authors":["Brigita Vileikytė","Mantas Lukoševičius","Lukas Stankevičius"],"pdf_url":"https://arxiv.org/pdf/2407.19914v1.pdf","comment":"Accepted at the 29th International Conference on Information Society\n and University Studies (IVUS 2024)"},{"id":"http://arxiv.org/abs/2407.19911v1","updated":"2024-07-29T11:39:22Z","published":"2024-07-29T11:39:22Z","title":"Efficient Shield Synthesis via State-Space Transformation","summary":" We consider the problem of synthesizing safety strategies for control\nsystems, also known as shields. Since the state space is infinite, shields are\ntypically computed over a finite-state abstraction, with the most common\nabstraction being a rectangular grid. However, for many systems, such a grid\ndoes not align well with the safety property or the system dynamics. That is\nwhy a coarse grid is rarely sufficient, but a fine grid is typically\ncomputationally infeasible to obtain. In this paper, we show that appropriate\nstate-space transformations can still allow to use a coarse grid at almost no\ncomputational overhead. We demonstrate in three case studies that our\ntransformation-based synthesis outperforms a standard synthesis by several\norders of magnitude. In the first two case studies, we use domain knowledge to\nselect a suitable transformation. In the third case study, we instead report on\nresults in engineering a transformation without domain knowledge.\n","authors":["Asger Horn Brorholt","Andreas Holck Høeg-Petersen","Kim Guldstrand Larsen","Christian Schilling"],"pdf_url":"https://arxiv.org/pdf/2407.19911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19897v1","updated":"2024-07-29T11:21:17Z","published":"2024-07-29T11:21:17Z","title":"BEExAI: Benchmark to Evaluate Explainable AI","summary":" Recent research in explainability has given rise to numerous post-hoc\nattribution methods aimed at enhancing our comprehension of the outputs of\nblack-box machine learning models. However, evaluating the quality of\nexplanations lacks a cohesive approach and a consensus on the methodology for\nderiving quantitative metrics that gauge the efficacy of explainability\npost-hoc attribution methods. Furthermore, with the development of increasingly\ncomplex deep learning models for diverse data applications, the need for a\nreliable way of measuring the quality and correctness of explanations is\nbecoming critical. We address this by proposing BEExAI, a benchmark tool that\nallows large-scale comparison of different post-hoc XAI methods, employing a\nset of selected evaluation metrics.\n","authors":["Samuel Sithakoul","Sara Meftah","Clément Feutry"],"pdf_url":"https://arxiv.org/pdf/2407.19897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19892v1","updated":"2024-07-29T11:15:25Z","published":"2024-07-29T11:15:25Z","title":"Making Multi-Axis Gaussian Graphical Models Scalable to Millions of\n Samples and Features","summary":" Gaussian graphical models can be used to extract conditional dependencies\nbetween the features of the dataset. 
This is often done by making an\nindependence assumption about the samples, but this assumption is rarely\nsatisfied in reality. However, state-of-the-art approaches that avoid this\nassumption are not scalable, with $O(n^3)$ runtime and $O(n^2)$ space\ncomplexity. In this paper, we introduce a method that has $O(n^2)$ runtime and\n$O(n)$ space complexity, without assuming independence.\n We validate our model on both synthetic and real-world datasets, showing that\nour method's accuracy is comparable to that of prior work. We demonstrate that\nour approach can be used on unprecedentedly large datasets, such as a\nreal-world 1,000,000-cell scRNA-seq dataset; this was impossible with previous\napproaches. Our method maintains the flexibility of prior work, such as the\nability to handle multi-modal tensor-variate datasets and the ability to work\nwith data of arbitrary marginal distributions. An additional advantage of our\nmethod is that, unlike prior work, our hyperparameters are easily\ninterpretable.\n","authors":["Bailey Andrew","David R. Westhead","Luisa Cutillo"],"pdf_url":"https://arxiv.org/pdf/2407.19892v1.pdf","comment":"39 pages (48 with appendix+references), 8 figures, 7 tables"},{"id":"http://arxiv.org/abs/2407.19888v1","updated":"2024-07-29T11:09:10Z","published":"2024-07-29T11:09:10Z","title":"Yucca: A Deep Learning Framework For Medical Image Analysis","summary":" Medical image analysis using deep learning frameworks has advanced healthcare\nby automating complex tasks, but many existing frameworks lack flexibility,\nmodularity, and user-friendliness. To address these challenges, we introduce\nYucca, an open-source AI framework available at\nhttps://github.com/Sllambias/yucca, designed specifically for medical imaging\napplications and built on PyTorch and PyTorch Lightning. Yucca features a\nthree-tiered architecture: Functional, Modules, and Pipeline, providing a\ncomprehensive and customizable solution. Evaluated across diverse tasks such as\ncerebral microbleeds detection, white matter hyperintensity segmentation, and\nhippocampus segmentation, Yucca achieves state-of-the-art results,\ndemonstrating its robustness and versatility. Yucca offers a powerful,\nflexible, and user-friendly platform for medical image analysis, inviting\ncommunity contributions to advance its capabilities and impact.\n","authors":["Sebastian Nørgaard Llambias","Julia Machnio","Asbjørn Munk","Jakob Ambsdorf","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2407.19888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19109v3","updated":"2024-07-29T10:53:34Z","published":"2024-04-29T21:19:41Z","title":"The Shape of Money Laundering: Subgraph Representation Learning on the\n Blockchain with the Elliptic2 Dataset","summary":" Subgraph representation learning is a technique for analyzing local\nstructures (or shapes) within complex networks. Enabled by recent developments\nin scalable Graph Neural Networks (GNNs), this approach encodes relational\ninformation at a subgroup level (multiple connected nodes) rather than at a\nnode level of abstraction. We posit that certain domain applications, such as\nanti-money laundering (AML), are inherently subgraph problems and mainstream\ngraph techniques have been operating at a suboptimal level of abstraction. This\nis due in part to the scarcity of annotated datasets of real-world size and\ncomplexity, as well as the lack of software tools for managing subgraph GNN\nworkflows at scale. 
To enable work in fundamental algorithms as well as domain\napplications in AML and beyond, we introduce Elliptic2, a large graph dataset\ncontaining 122K labeled subgraphs of Bitcoin clusters within a background graph\nconsisting of 49M node clusters and 196M edge transactions. The dataset\nprovides subgraphs known to be linked to illicit activity for learning the set\nof \"shapes\" that money laundering exhibits in cryptocurrency and accurately\nclassifying new criminal activity. Along with the dataset we share our graph\ntechniques, software tooling, promising early experimental results, and new\ndomain insights already gleaned from this approach. Taken together, we find\nimmediate practical value in this approach and the potential for a new standard\nin anti-money laundering and forensic analytics in cryptocurrencies and other\nfinancial networks.\n","authors":["Claudio Bellei","Muhua Xu","Ross Phillips","Tom Robinson","Mark Weber","Tim Kaler","Charles E. Leiserson"," Arvind","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2404.19109v3.pdf","comment":"KDD MLF Workshop 2024. Dataset can be accessed at\n http://elliptic.co/elliptic2. Code can be accessed at\n https://github.com/MITIBMxGraph/Elliptic2"},{"id":"http://arxiv.org/abs/2406.11390v2","updated":"2024-07-29T10:43:46Z","published":"2024-06-17T10:21:01Z","title":"Unfolding Time: Generative Modeling for Turbulent Flows in 4D","summary":" A recent study in turbulent flow simulation demonstrated the potential of\ngenerative diffusion models for fast 3D surrogate modeling. This approach\neliminates the need for specifying initial states or performing lengthy\nsimulations, significantly accelerating the process. While adept at sampling\nindividual frames from the learned manifold of turbulent flow states, the\nprevious model lacks the capability to generate sequences, hindering analysis\nof dynamic phenomena. This work addresses this limitation by introducing a 4D\ngenerative diffusion model and a physics-informed guidance technique that\nenables the generation of realistic sequences of flow states. Our findings\nindicate that the proposed method can successfully sample entire subsequences\nfrom the turbulent manifold, even though generalizing from individual frames to\nsequences remains a challenging task. This advancement opens doors for the\napplication of generative modeling in analyzing the temporal evolution of\nturbulent flows, providing valuable insights into their complex dynamics.\n","authors":["Abdullah Saydemir","Marten Lienen","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2406.11390v2.pdf","comment":"AI4Science Workshop @ ICML 2024"},{"id":"http://arxiv.org/abs/2407.19872v1","updated":"2024-07-29T10:43:15Z","published":"2024-07-29T10:43:15Z","title":"OpenUAS: Embeddings of Cities in Japan with Anchor Data for Cross-city\n Analysis of Area Usage Patterns","summary":" We publicly release OpenUAS, a dataset of area embeddings based on urban\nusage patterns, including embeddings for over 1.3 million 50-meter square\nmeshes covering a total area of 3,300 square kilometers. This dataset is\nvaluable for analyzing area functions in fields such as market analysis, urban\nplanning, transportation infrastructure, and infection prediction. It captures\nthe characteristics of each area in the city, such as office districts and\nresidential areas, by employing an area embedding technique that utilizes\nlocation information typically obtained by GPS. 
Numerous area embedding\ntechniques have been proposed, and while the public release of such embedding\ndatasets is technically feasible, it has not been realized. One of the\nobstacles has been the integration of data from different cities and periods\ninto a unified space without sharing raw location data. We address this issue\nby developing an anchoring method that establishes anchors within a shared\nembedding space. We publicly release this anchor dataset along with area\nembedding datasets from several periods in eight major Japanese cities. This\ndataset allows users to analyze urban usage patterns in Japanese cities and\nembed their urban dataset into the same embedding space using the anchoring\nmethod. Our key contributions include the development of the anchoring method,\nreleasing area embedding datasets for Japanese cities, and providing tools for\neffective data utilization.\n","authors":["Naoki Tamura","Kazuyuki Shoji","Shin Katayama","Kenta Urano","Takuro Yonezawa","Nobuo Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2407.19872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02771v2","updated":"2024-07-29T10:35:50Z","published":"2024-05-04T23:16:48Z","title":"MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial\n Representation Learning","summary":" The volume of unlabelled Earth observation (EO) data is huge, but many\nimportant applications lack labelled training data. However, EO data offers the\nunique opportunity to pair data from different modalities and sensors\nautomatically based on geographic location and time, at virtually no human\nlabor cost. We seize this opportunity to create MMEarth, a diverse multi-modal\npretraining dataset at global scale. Using this new corpus of 1.2 million\nlocations, we propose a Multi-Pretext Masked Autoencoder (MP-MAE) approach to\nlearn general-purpose representations for optical satellite images. Our\napproach builds on the ConvNeXt V2 architecture, a fully convolutional masked\nautoencoder (MAE). Drawing upon a suite of multi-modal pretext tasks, we\ndemonstrate that our MP-MAE approach outperforms both MAEs pretrained on\nImageNet and MAEs pretrained on domain-specific satellite images. This is shown\non several downstream tasks including image classification and semantic\nsegmentation. We find that pretraining with multi-modal pretext tasks notably\nimproves the linear probing performance compared to pretraining on optical\nsatellite images only. This also leads to better label efficiency and parameter\nefficiency which are crucial aspects in global scale applications.\n","authors":["Vishal Nedungadi","Ankit Kariryaa","Stefan Oehmcke","Serge Belongie","Christian Igel","Nico Lang"],"pdf_url":"https://arxiv.org/pdf/2405.02771v2.pdf","comment":"Accepted for ECCV 2024. Data and code:\n https://vishalned.github.io/mmearth Update arXiv v2 (ECCV): 1. Dataset fix:\n Removed duplicates and corrected ERA5 yearly statistics. 2. Data augmentation\n fix: Random crops are now aligned. 3. Test metrics fix: Metrics are now\n overall instead of mini-batch averages, matching GEO-Bench metrics. 
4.\n Pretrained on MMEarth v001 & evaluated on GEO-Bench v1.0"},{"id":"http://arxiv.org/abs/2407.19866v1","updated":"2024-07-29T10:35:39Z","published":"2024-07-29T10:35:39Z","title":"Deep Image Priors for Magnetic Resonance Fingerprinting with pretrained\n Bloch-consistent denoising autoencoders","summary":" The estimation of multi-parametric quantitative maps from Magnetic Resonance\nFingerprinting (MRF) compressed sampled acquisitions, albeit successful,\nremains a challenge due to the high undersampling rate and artifacts naturally\noccurring during image reconstruction. Whilst state-of-the-art DL methods can\nsuccessfully address the task, to fully exploit their capabilities they often\nrequire training on a paired dataset, in an area where ground truth is seldom\navailable. In this work, we propose a method that combines a deep image prior\n(DIP) module that, without ground truth and in conjunction with a Bloch\nconsistency enforcing autoencoder, can tackle the problem, resulting in a\nmethod faster and of equivalent or better accuracy than DIP-MRF.\n","authors":["Perla Mayo","Matteo Cencini","Ketan Fatania","Carolin M. Pirkl","Marion I. Menzel","Bjoern H. Menze","Michela Tosetti","Mohammad Golbabaee"],"pdf_url":"https://arxiv.org/pdf/2407.19866v1.pdf","comment":"4 pages, 3 figures, 1 table, presented at ISBI 2024"},{"id":"http://arxiv.org/abs/2407.19865v1","updated":"2024-07-29T10:34:19Z","published":"2024-07-29T10:34:19Z","title":"Imitation Learning for Intra-Day Power Grid Operation through Topology\n Actions","summary":" Power grid operation is becoming increasingly complex due to the increase in\ngeneration of renewable energy. The recent series of Learning To Run a Power\nNetwork (L2RPN) competitions have encouraged the use of artificial agents to\nassist human dispatchers in operating power grids. In this paper we study the\nperformance of imitation learning for day-ahead power grid operation through\ntopology actions. In particular, we consider two rule-based expert agents: a\ngreedy agent and an N-1 agent. While the latter is more computationally\nexpensive since it takes N-1 safety considerations into account, it exhibits a\nmuch higher operational performance. We train a fully-connected neural network\n(FCNN) on expert state-action pairs and evaluate it in two ways. First, we find\nthat classification accuracy is limited despite extensive hyperparameter\ntuning, due to class imbalance and class overlap. Second, as a power system\nagent, the FCNN performs only slightly worse than expert agents. Furthermore,\nhybrid agents, which incorporate minimal additional simulations, match expert\nagents' performance with significantly lower computational cost. 
Consequently,\nimitation learning shows promise for developing fast, high-performing power\ngrid agents, motivating its further exploration in future L2RPN studies.\n","authors":["Matthijs de Jong","Jan Viebahn","Yuliya Shapovalova"],"pdf_url":"https://arxiv.org/pdf/2407.19865v1.pdf","comment":"To be presented at the Machine Learning for Sustainable Power Systems\n 2024 workshop and to be published in the corresponding Springer\n Communications in Computer and Information Science proceedings"},{"id":"http://arxiv.org/abs/2407.19860v1","updated":"2024-07-29T10:30:07Z","published":"2024-07-29T10:30:07Z","title":"Anomalous State Sequence Modeling to Enhance Safety in Reinforcement\n Learning","summary":" The deployment of artificial intelligence (AI) in decision-making\napplications requires ensuring an appropriate level of safety and reliability,\nparticularly in changing environments that contain a large number of unknown\nobservations. To address this challenge, we propose a novel safe reinforcement\nlearning (RL) approach that utilizes an anomalous state sequence to enhance RL\nsafety. Our proposed solution Safe Reinforcement Learning with Anomalous State\nSequences (AnoSeqs) consists of two stages. First, we train an agent in a\nnon-safety-critical offline 'source' environment to collect safe state\nsequences. Next, we use these safe sequences to build an anomaly detection\nmodel that can detect potentially unsafe state sequences in a 'target'\nsafety-critical environment where failures can have high costs. The estimated\nrisk from the anomaly detection model is utilized to train a risk-averse RL\npolicy in the target environment; this involves adjusting the reward function\nto penalize the agent for visiting anomalous states deemed unsafe by our\nanomaly model. In experiments on multiple safety-critical benchmarking\nenvironments including self-driving cars, our solution approach successfully\nlearns safer policies and proves that sequential anomaly detection can provide\nan effective supervisory signal for training safety-aware RL agents\n","authors":["Leen Kweider","Maissa Abou Kassem","Ubai Sandouk"],"pdf_url":"https://arxiv.org/pdf/2407.19860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19858v1","updated":"2024-07-29T10:26:52Z","published":"2024-07-29T10:26:52Z","title":"AI-Powered Energy algorithmic Trading: Integrating Hidden Markov Models\n with Neural Networks","summary":" In the field of quantitative finance, machine learning methods have become\nessential for alpha generation. This paper presents a pioneering method that\nuniquely combines Hidden Markov Models (HMM) and neural networks, creating a\ndual-model alpha generation system integrated with Black-Litterman portfolio\noptimization. The methodology, implemented on the QuantConnect platform, aims\nto predict future price movements and optimize trading strategies.\nSpecifically, it filters for highly liquid, top-cap energy stocks to ensure\nstable and predictable performance while also accounting for broker payments.\nQuantConnect was selected because of its robust framework and to guarantee\nexperimental reproducibility. The algorithm achieved a 31% return between June\n1, 2023, and January 1, 2024, with a Sharpe ratio of 1.669, demonstrating its\npotential. The findings suggest significant improvements in trading strategy\nperformance through the combined use of the HMM and neural networks. 
This study\nexplores the architecture of the algorithm, data pre-processing techniques,\nmodel training procedures, and performance evaluation, highlighting its\npractical applicability and effectiveness in real-world trading environments.\nThe full code and backtesting data are available under the MIT license.\n","authors":["Tiago Monteiro"],"pdf_url":"https://arxiv.org/pdf/2407.19858v1.pdf","comment":"14 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.18606v2","updated":"2024-07-29T10:22:00Z","published":"2024-07-26T08:56:13Z","title":"A data balancing approach towards design of an expert system for Heart\n Disease Prediction","summary":" Heart disease is a serious global health issue that claims millions of lives\nevery year. Early detection and precise prediction are critical to the\nprevention and successful treatment of heart-related issues. A lot of research\nutilizes machine learning (ML) models to forecast cardiac disease and obtain\nearly detection. In order to do predictive analysis on the \"Heart disease health\nindicators\" dataset, we employed five machine learning methods in this paper:\nDecision Tree (DT), Random Forest (RF), Linear Discriminant Analysis, Extra\nTree Classifier, and AdaBoost. The model is further examined using various\nfeature selection (FS) techniques. To enhance the baseline model, we have\nseparately applied four FS techniques: Sequential Forward FS, Sequential\nBackward FS, Correlation Matrix, and Chi2. Lastly, K-means SMOTE oversampling\nis applied to the models to enable additional analysis. The findings show that\nwhen it came to predicting heart disease, ensemble approaches, in particular\nrandom forests, performed better than individual classifiers. The presence of\nsmoking, blood pressure, cholesterol, and physical inactivity were among the\nmajor predictors that were found. The accuracy of the Random Forest and\nDecision Tree model was 99.83%. This paper demonstrates how machine learning\nmodels can improve the accuracy of heart disease prediction, especially when\nusing ensemble methodologies. The models provide a more accurate risk\nassessment than traditional methods since they incorporate a large number of\nfactors and complex algorithms.\n","authors":["Rahul Karmakar","Udita Ghosh","Arpita Pal","Sattwiki Dey","Debraj Malik","Priyabrata Sain"],"pdf_url":"https://arxiv.org/pdf/2407.18606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01029v2","updated":"2024-07-29T10:18:23Z","published":"2024-02-01T21:38:10Z","title":"Response Theory via Generative Score Modeling","summary":" We introduce an approach for analyzing the responses of dynamical systems to\nexternal perturbations that combines score-based generative modeling with the\nGeneralized Fluctuation-Dissipation Theorem (GFDT). The methodology enables\naccurate estimation of system responses, including those with non-Gaussian\nstatistics. We numerically validate our approach using time-series data from\nthree different stochastic partial differential equations of increasing\ncomplexity: an Ornstein-Uhlenbeck process with spatially correlated noise, a\nmodified stochastic Allen-Cahn equation, and the 2D Navier-Stokes equations. 
We\ndemonstrate the improved accuracy of the methodology over conventional methods\nand discuss its potential as a versatile tool for predicting the statistical\nbehavior of complex dynamical systems.\n","authors":["Ludovico Theo Giorgini","Katherine Deck","Tobias Bischoff","Andre Souza"],"pdf_url":"https://arxiv.org/pdf/2402.01029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19853v1","updated":"2024-07-29T10:10:40Z","published":"2024-07-29T10:10:40Z","title":"Online Multi-Source Domain Adaptation through Gaussian Mixtures and\n Dataset Dictionary Learning","summary":" This paper addresses the challenge of online multi-source domain adaptation\n(MSDA) in transfer learning, a scenario where one needs to adapt multiple,\nheterogeneous source domains towards a target domain that comes in a stream. We\nintroduce a novel approach for the online fit of a Gaussian Mixture Model\n(GMM), based on the Wasserstein geometry of Gaussian measures. We build upon\nthis method and recent developments in dataset dictionary learning for\nproposing a novel strategy in online MSDA. Experiments on the challenging\nTennessee Eastman Process benchmark demonstrate that our approach is able to\nadapt \\emph{on the fly} to the stream of target domain data. Furthermore, our\nonline GMM serves as a memory, representing the whole stream of data.\n","authors":["Eduardo Fernandes Montesuma","Stevan Le Stanc","Fred Ngolè Mboula"],"pdf_url":"https://arxiv.org/pdf/2407.19853v1.pdf","comment":"6 pages, 3 figures, accepted at the IEEE International Workshop on\n Machine Learning for Signal Processing 2024"},{"id":"http://arxiv.org/abs/2407.19852v1","updated":"2024-07-29T10:10:03Z","published":"2024-07-29T10:10:03Z","title":"Quantum Long Short-Term Memory for Drug Discovery","summary":" Quantum computing combined with machine learning (ML) is an extremely\npromising research area, with numerous studies demonstrating that quantum\nmachine learning (QML) is expected to solve scientific problems more\neffectively than classical ML. In this work, we successfully apply QML to drug\ndiscovery, showing that QML can significantly improve model performance and\nachieve faster convergence compared to classical ML. Moreover, we demonstrate\nthat the model accuracy of the QML improves as the number of qubits increases.\nWe also introduce noise to the QML model and find that it has little effect on\nour experimental conclusions, illustrating the high robustness of the QML\nmodel. This work highlights the potential application of quantum computing to\nyield significant benefits for scientific advancement as the qubit quantity\nincreases and quality improves in the future.\n","authors":["Liang Zhang","Yin Xu","Mohan Wu","Liang Wang","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2407.19852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19845v1","updated":"2024-07-29T09:57:03Z","published":"2024-07-29T09:57:03Z","title":"BackdoorBench: A Comprehensive Benchmark and Analysis of Backdoor\n Learning","summary":" As an emerging approach to explore the vulnerability of deep neural networks\n(DNNs), backdoor learning has attracted increasing interest in recent years,\nand many seminal backdoor attack and defense algorithms are being developed\nsuccessively or concurrently, in the status of a rapid arms race. 
However,\nmainly due to the diverse settings, and the difficulties of implementation and\nreproducibility of existing works, there is a lack of a unified and\nstandardized benchmark of backdoor learning, causing unfair comparisons or\nunreliable conclusions (e.g., misleading, biased or even false conclusions).\nConsequently, it is difficult to evaluate the current progress and design the\nfuture development roadmap of this literature. To alleviate this dilemma, we\nbuild a comprehensive benchmark of backdoor learning called BackdoorBench. Our\nbenchmark makes three valuable contributions to the research community. 1) We\nprovide an integrated implementation of state-of-the-art (SOTA) backdoor\nlearning algorithms (currently including 20 attack and 32 defense algorithms),\nbased on an extensible modular-based codebase. 2) We conduct comprehensive\nevaluations with 5 poisoning ratios, based on 4 models and 4 datasets, leading\nto 11,492 pairs of attack-against-defense evaluations in total. 3) Based on\nabove evaluations, we present abundant analysis from 10 perspectives via 18\nuseful analysis tools, and provide several inspiring insights about backdoor\nlearning. We hope that our efforts could build a solid foundation of backdoor\nlearning to facilitate researchers to investigate existing algorithms, develop\nmore innovative algorithms, and explore the intrinsic mechanism of backdoor\nlearning. Finally, we have created a user-friendly website at\nhttp://backdoorbench.com, which collects all important information of\nBackdoorBench, including codebase, docs, leaderboard, and model Zoo.\n","authors":["Baoyuan Wu","Hongrui Chen","Mingda Zhang","Zihao Zhu","Shaokui Wei","Danni Yuan","Mingli Zhu","Ruotong Wang","Li Liu","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2407.19845v1.pdf","comment":"Substantial extensions based on our previous conference version\n \"Backdoorbench: A comprehensive benchmark of backdoor learning\" published at\n NeurIPS D&B Track 2022. 20 backdoor attack algorithms, 32 backdoor defense\n algorithms, 11000+ pairs of attack-against-defense evaluations, 10 analyses,\n 18 analysis tools"},{"id":"http://arxiv.org/abs/2407.19842v1","updated":"2024-07-29T09:55:34Z","published":"2024-07-29T09:55:34Z","title":"Detecting and Understanding Vulnerabilities in Language Models via\n Mechanistic Interpretability","summary":" Large Language Models (LLMs), characterized by being trained on broad amounts\nof data in a self-supervised manner, have shown impressive performance across a\nwide range of tasks. Indeed, their generative abilities have aroused interest\non the application of LLMs across a wide range of contexts. However, neural\nnetworks in general, and LLMs in particular, are known to be vulnerable to\nadversarial attacks, where an imperceptible change to the input can mislead the\noutput of the model. This is a serious concern that impedes the use of LLMs on\nhigh-stakes applications, such as healthcare, where a wrong prediction can\nimply serious consequences. Even though there are many efforts on making LLMs\nmore robust to adversarial attacks, there are almost no works that study\n\\emph{how} and \\emph{where} these vulnerabilities that make LLMs prone to\nadversarial attacks happen. 
Motivated by these facts, we explore how to\nlocalize and understand vulnerabilities, and propose a method, based on\nMechanistic Interpretability (MI) techniques, to guide this process.\nSpecifically, this method enables us to detect vulnerabilities related to a\nconcrete task by (i) obtaining the subset of the model that is responsible for\nthat task, (ii) generating adversarial samples for that task, and (iii) using\nMI techniques together with the previous samples to discover and understand the\npossible vulnerabilities. We showcase our method on a pretrained GPT-2 Small\nmodel carrying out the task of predicting 3-letter acronyms to demonstrate its\neffectiveness on locating and understanding concrete vulnerabilities of the\nmodel.\n","authors":["Jorge García-Carrasco","Alejandro Maté","Juan Trujillo"],"pdf_url":"https://arxiv.org/pdf/2407.19842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11789v2","updated":"2024-07-29T09:51:35Z","published":"2024-02-19T02:32:45Z","title":"Statistical Test on Diffusion Model-based Generated Images by Selective\n Inference","summary":" AI technology for generating images, such as diffusion models, has advanced\nrapidly. However, there is no established framework for quantifying the\nreliability of AI-generated images, which hinders their use in critical\ndecision-making tasks, such as medical image diagnosis. In this study, we\npropose a method to quantify the reliability of decision-making tasks that rely\non images produced by diffusion models within a statistical testing framework.\nThe core concept of our statistical test involves using a selective inference\nframework, in which the statistical test is conducted under the condition that\nthe images are produced by a trained diffusion model. As a case study, we study\na diffusion model-based anomaly detection task for medical images. With our\napproach, the statistical significance of medical image diagnostic outcomes can\nbe quantified in terms of a p-value, enabling decision-making with a controlled\nerror rate. We demonstrate the theoretical soundness and practical\neffectiveness of our statistical test through numerical experiments on both\nsynthetic and brain image datasets.\n","authors":["Teruyuki Katsuoka","Tomohiro Shiraishi","Daiki Miwa","Vo Nguyen Le Duy","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2402.11789v2.pdf","comment":"31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.19838v1","updated":"2024-07-29T09:46:46Z","published":"2024-07-29T09:46:46Z","title":"RNACG: A Universal RNA Sequence Conditional Generation model based on\n Flow-Matching","summary":" RNA plays a crucial role in diverse life processes. In contrast to the rapid\nadvancement of protein design methods, the work related to RNA is more\ndemanding. Most current RNA design approaches concentrate on specified target\nattributes and rely on extensive experimental searches. However, these methods\nremain costly and inefficient due to practical limitations. In this paper, we\ncharacterize all sequence design issues as conditional generation tasks and\noffer parameterized representations for multiple problems. For these problems,\nwe have developed a universal RNA sequence generation model based on flow\nmatching, namely RNACG. 
RNACG can accommodate various conditional inputs and is\nportable, enabling users to customize the encoding network for conditional\ninputs as per their requirements and integrate it into the generation network.\nWe evaluated RNACG in RNA 3D structure inverse folding, 2D structure inverse\nfolding, family-specific sequence generation, and 5'UTR translation efficiency\nprediction. RNACG attains superior or competitive performance on these tasks\ncompared with other methods. RNACG exhibits extensive applicability in sequence\ngeneration and property prediction tasks, providing a novel approach to RNA\nsequence design and potential methods for simulation experiments with\nlarge-scale RNA sequence data.\n","authors":["Letian Gao","Zhi John Lu"],"pdf_url":"https://arxiv.org/pdf/2407.19838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17900v2","updated":"2024-07-29T09:33:01Z","published":"2024-07-25T09:42:24Z","title":"The Power of Combining Data and Knowledge: GPT-4o is an Effective\n Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of\n Lung Cancer","summary":" Lymph node metastasis (LNM) is a crucial factor in determining the initial\ntreatment for patients with lung cancer, yet accurate preoperative diagnosis of\nLNM remains challenging. Recently, large language models (LLMs) have garnered\nsignificant attention due to their remarkable text generation capabilities.\nLeveraging the extensive medical knowledge learned from vast corpora, LLMs can\nestimate probabilities for clinical problems, though their performance has\nhistorically been inferior to data-driven machine learning models. In this\npaper, we propose a novel ensemble method that combines the medical knowledge\nacquired by LLMs with the latent patterns identified by machine learning models\nto enhance LNM prediction performance. Initially, we developed machine learning\nmodels using patient data. We then designed a prompt template to integrate the\npatient data with the predicted probability from the machine learning model.\nSubsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,\nto estimate the likelihood of LNM based on patient data and then adjust the\nestimate using the machine learning output. Finally, we collected three outputs\nfrom the GPT-4o using the same prompt and ensembled these results as the final\nprediction. Using the proposed method, our models achieved an AUC value of\n0.765 and an AP value of 0.415 for LNM prediction, significantly improving\npredictive performance compared to baseline machine learning models. The\nexperimental results indicate that GPT-4o can effectively leverage its medical\nknowledge and the probabilities predicted by machine learning models to achieve\nmore accurate LNM predictions. These findings demonstrate that LLMs can perform\nwell in clinical risk prediction tasks, offering a new paradigm for integrating\nmedical knowledge and patient data in clinical predictions.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19828v1","updated":"2024-07-29T09:30:00Z","published":"2024-07-29T09:30:00Z","title":"Federated Learning based Latent Factorization of Tensors for\n Privacy-Preserving QoS Prediction","summary":" In applications related to big data and service computing, dynamic\nconnections tend to be encountered, especially the dynamic data of\nuser-perspective quality of service (QoS) in Web services. 
They are transformed\ninto high-dimensional and incomplete (HDI) tensors which include abundant\ntemporal pattern information. Latent factorization of tensors (LFT) is an\nextremely efficient and typical approach for extracting such patterns from an\nHDI tensor. However, current LFT models require the QoS data to be maintained\nin a central place (e.g., a central server), which is impossible for\nincreasingly privacy-sensitive users. To address this problem, this article\ncreatively designs a federated learning approach based on latent factorization of\ntensors (FL-LFT). It builds a data-density-oriented federated learning model\nto enable isolated users to collaboratively train a global LFT model while\nprotecting users' privacy. Extensive experiments on a QoS dataset collected\nfrom the real world verify that FL-LFT shows a remarkable increase in\nprediction accuracy when compared to state-of-the-art federated learning (FL)\napproaches.\n","authors":["Shuai Zhong","Zengtong Tang","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2407.19828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11247v2","updated":"2024-07-29T09:22:04Z","published":"2023-08-22T07:43:59Z","title":"Benchmarking Domain Adaptation for Chemical Processes on the Tennessee\n Eastman Process","summary":" In system monitoring, automatic fault diagnosis seeks to infer the systems'\nstate based on sensor readings, e.g., through machine learning models. In this\ncontext, it is of key importance that, based on historical data, these systems\nare able to generalize to incoming data. In parallel, many factors may induce\nchanges in the data probability distribution, hindering the ability of such\nmodels to generalize. In this sense, domain adaptation is an important\nframework for adapting models to different probability distributions. In this\npaper, we propose a new benchmark, based on the Tennessee Eastman Process of\nDowns and Vogel (1993), for benchmarking domain adaptation methods in the\ncontext of chemical processes. Besides describing the process, and its\nrelevance for domain adaptation, we describe a series of data processing steps\nfor reproducing our benchmark. We then test 11 domain adaptation strategies on\nthis novel benchmark, showing that optimal transport-based techniques\noutperform other strategies.\n","authors":["Eduardo Fernandes Montesuma","Michela Mulas","Fred Ngolè Mboula","Francesco Corona","Antoine Souloumiac"],"pdf_url":"https://arxiv.org/pdf/2308.11247v2.pdf","comment":"16 pages, 9 figures, 5 tables. Accepted as a Workshop paper at the\n ECML-PKDD 2024 conference"},{"id":"http://arxiv.org/abs/2407.19823v1","updated":"2024-07-29T09:17:16Z","published":"2024-07-29T09:17:16Z","title":"Analyzing and reducing the synthetic-to-real transfer gap in Music\n Information Retrieval: the task of automatic drum transcription","summary":" Automatic drum transcription is a critical tool in Music Information\nRetrieval for extracting and analyzing the rhythm of a music track, but it is\nlimited by the size of the datasets available for training. A popular method\nused to increase the amount of data is to generate it synthetically from\nmusic scores rendered with virtual instruments. This method can produce a\nvirtually infinite quantity of tracks, but empirical evidence shows that models\ntrained on previously created synthetic datasets do not transfer well to real\ntracks. 
In this work, besides increasing the amount of data, we identify and\nevaluate three more strategies that practitioners can use to improve the\nrealism of the generated data and, thus, narrow the synthetic-to-real transfer\ngap. To explore their efficacy, we used them to build a new synthetic dataset\nand then we measured how the performance of a model scales and, specifically,\nat what value it will stagnate when increasing the number of training tracks\nfor different datasets. By doing this, we were able to prove that the\naforementioned strategies contribute to make our dataset the one with the most\nrealistic data distribution and the lowest synthetic-to-real transfer gap among\nthe synthetic datasets we evaluated. We conclude by highlighting the limits of\ntraining with infinite data in drum transcription and we show how they can be\novercome.\n","authors":["Mickaël Zehren","Marco Alunno","Paolo Bientinesi"],"pdf_url":"https://arxiv.org/pdf/2407.19823v1.pdf","comment":"21 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.18792v2","updated":"2024-07-29T09:05:17Z","published":"2024-07-26T14:54:16Z","title":"Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical\n Imaging","summary":" Medical imaging cohorts are often confounded by factors such as acquisition\ndevices, hospital sites, patient backgrounds, and many more. As a result, deep\nlearning models tend to learn spurious correlations instead of causally related\nfeatures, limiting their generalizability to new and unseen data. This problem\ncan be addressed by minimizing dependence measures between intermediate\nrepresentations of task-related and non-task-related variables. These measures\ninclude mutual information, distance correlation, and the performance of\nadversarial classifiers. Here, we benchmark such dependence measures for the\ntask of preventing shortcut learning. We study a simplified setting using\nMorpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our\nresults provide insights into how to mitigate confounding factors in medical\nimaging.\n","authors":["Sarah Müller","Louisa Fay","Lisa M. Koch","Sergios Gatidis","Thomas Küstner","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2407.18792v2.pdf","comment":"Accepted to the 15th International Workshop on Machine Learning in\n Medical Imaging (MLMI 2024); new version: appendix moved to the end, after\n the references"},{"id":"http://arxiv.org/abs/2101.11347v6","updated":"2024-07-29T08:56:31Z","published":"2021-01-27T12:23:24Z","title":"Decision Machines: Enhanced Decision Trees","summary":" This paper presents Decision Machines (DMs), an innovative evolution of\ntraditional binary decision trees, which leverages matrix computations to\nsignificantly enhance both computational efficiency and interpretability. By\nexplicitly mapping the dependencies between predictions and binary tests within\na vector space, DMs offer a streamlined approach to navigating decision paths.\nWe integrate decision trees with kernel methods, ensemble methods and attention\nmechanisms. The integration of these elements not only bolsters the\nhierarchical structure of decision trees but also aligns with the computational\nefficiency of matrix computations. 
Our work bridges the gap between traditional\nmachine learning algorithms and modern deep learning techniques, providing a\nnovel foundation for further research and application in the field of machine\nlearning.\n","authors":["Jinxiong Zhang"],"pdf_url":"https://arxiv.org/pdf/2101.11347v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.03075v3","updated":"2024-07-29T08:38:25Z","published":"2022-09-07T11:28:17Z","title":"A learning theory for quantum photonic processors and beyond","summary":" We consider the tasks of learning quantum states, measurements and channels\ngenerated by continuous-variable (CV) quantum circuits. This family of circuits\nis suited to describe optical quantum technologies and in particular it\nincludes state-of-the-art photonic processors capable of showing quantum\nadvantage. We define classes of functions that map classical variables, encoded\ninto the CV circuit parameters, to outcome probabilities evaluated on those\ncircuits. We then establish efficient learnability guarantees for such classes,\nby computing bounds on their pseudo-dimension or covering numbers, showing that\nCV quantum circuits can be learned with a sample complexity that scales\npolynomially with the circuit's size, i.e., the number of modes. Our results\nshow that CV circuits can be trained efficiently using a number of training\nsamples that, unlike their finite-dimensional counterpart, does not scale with\nthe circuit depth.\n","authors":["Matteo Rosati"],"pdf_url":"https://arxiv.org/pdf/2209.03075v3.pdf","comment":"27+5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.19784v1","updated":"2024-07-29T08:27:21Z","published":"2024-07-29T08:27:21Z","title":"Survey and Taxonomy: The Role of Data-Centric AI in Transformer-Based\n Time Series Forecasting","summary":" Alongside the continuous process of improving AI performance through the\ndevelopment of more sophisticated models, researchers have also focused their\nattention to the emerging concept of data-centric AI, which emphasizes the\nimportant role of data in a systematic machine learning training process.\nNonetheless, the development of models has also continued apace. One result of\nthis progress is the development of the Transformer Architecture, which\npossesses a high level of capability in multiple domains such as Natural\nLanguage Processing (NLP), Computer Vision (CV) and Time Series Forecasting\n(TSF). Its performance is, however, heavily dependent on input data\npreprocessing and output data evaluation, justifying a data-centric approach to\nfuture research. We argue that data-centric AI is essential for training AI\nmodels, particularly for transformer-based TSF models efficiently. However,\nthere is a gap regarding the integration of transformer-based TSF and\ndata-centric AI. This survey aims to pin down this gap via the extensive\nliterature review based on the proposed taxonomy. 
We review the previous\nresearch works from a data-centric AI perspective and we intend to lay the\nfoundation work for the future development of transformer-based architecture\nand data-centric AI.\n","authors":["Jingjing Xu","Caesar Wu","Yuan-Fang Li","Gregoire Danoy","Pascal Bouvry"],"pdf_url":"https://arxiv.org/pdf/2407.19784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19777v1","updated":"2024-07-29T08:20:49Z","published":"2024-07-29T08:20:49Z","title":"Revisiting Agnostic PAC Learning","summary":" PAC learning, dating back to Valiant'84 and Vapnik and Chervonenkis'64,'74,\nis a classic model for studying supervised learning. In the agnostic setting,\nwe have access to a hypothesis set $\\mathcal{H}$ and a training set of labeled\nsamples $(x_1,y_1),\\dots,(x_n,y_n) \\in \\mathcal{X} \\times \\{-1,1\\}$ drawn\ni.i.d. from an unknown distribution $\\mathcal{D}$. The goal is to produce a\nclassifier $h : \\mathcal{X} \\to \\{-1,1\\}$ that is competitive with the\nhypothesis $h^\\star_{\\mathcal{D}} \\in \\mathcal{H}$ having the least probability\nof mispredicting the label $y$ of a new sample $(x,y)\\sim \\mathcal{D}$.\n Empirical Risk Minimization (ERM) is a natural learning algorithm, where one\nsimply outputs the hypothesis from $\\mathcal{H}$ making the fewest mistakes on\nthe training data. This simple algorithm is known to have an optimal error in\nterms of the VC-dimension of $\\mathcal{H}$ and the number of samples $n$.\n In this work, we revisit agnostic PAC learning and first show that ERM is in\nfact sub-optimal if we treat the performance of the best hypothesis, denoted\n$\\tau:=\\Pr_{\\mathcal{D}}[h^\\star_{\\mathcal{D}}(x) \\neq y]$, as a parameter.\nConcretely we show that ERM, and any other proper learning algorithm, is\nsub-optimal by a $\\sqrt{\\ln(1/\\tau)}$ factor. We then complement this lower\nbound with the first learning algorithm achieving an optimal error for nearly\nthe full range of $\\tau$. Our algorithm introduces several new ideas that we\nhope may find further applications in learning theory.\n","authors":["Steve Hanneke","Kasper Green Larsen","Nikita Zhivotovskiy"],"pdf_url":"https://arxiv.org/pdf/2407.19777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09105v3","updated":"2024-07-29T07:58:53Z","published":"2024-07-12T09:10:37Z","title":"Enhancing Training Efficiency Using Packing with Flash Attention","summary":" Padding is often used in tuning LLM models by adding special tokens to\nshorter training examples to match the length of the longest sequence in each\nbatch. While this ensures uniformity for batch processing, it introduces\ninefficiencies by including irrelevant padding tokens in the computation and\nwastes GPU resources. On the other hand, the Hugging Face SFT trainer offers\nthe option to use packing to combine multiple training examples up to the\nmaximum sequence length. This allows for maximal utilization of GPU resources.\nHowever, without proper masking of each packed training example, attention will\nnot be computed correctly when using SFT trainer. 
We enable and then analyse\npacking and Flash Attention with proper attention masking of each example and\nshow the benefits of this training paradigm.\n","authors":["Achintya Kundu","Rhui Dih Lee","Laura Wynter","Raghu Kiran Ganti","Mayank Mishra"],"pdf_url":"https://arxiv.org/pdf/2407.09105v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09597v2","updated":"2024-07-29T07:54:56Z","published":"2023-10-14T15:09:56Z","title":"Adaptive maximization of social welfare","summary":" We consider the problem of repeatedly choosing policies to maximize social\nwelfare. Welfare is a weighted sum of private utility and public revenue.\nEarlier outcomes inform later policies. Utility is not observed, but indirectly\ninferred. Response functions are learned through experimentation. We derive a\nlower bound on regret, and a matching adversarial upper bound for a variant of\nthe Exp3 algorithm. Cumulative regret grows at a rate of $T^{2/3}$. This\nimplies that (i) welfare maximization is harder than the multi-armed bandit\nproblem (with a rate of $T^{1/2}$ for finite policy sets), and (ii) our\nalgorithm achieves the optimal rate. For the stochastic setting, if social\nwelfare is concave, we can achieve a rate of $T^{1/2}$ (for continuous policy\nsets), using a dyadic search algorithm. We analyze an extension to nonlinear\nincome taxation, and sketch an extension to commodity taxation. We compare our\nsetting to monopoly pricing (which is easier), and price setting for bilateral\ntrade (which is harder).\n","authors":["Nicolo Cesa-Bianchi","Roberto Colomboni","Maximilian Kasy"],"pdf_url":"https://arxiv.org/pdf/2310.09597v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04825v6","updated":"2024-07-29T07:28:24Z","published":"2022-09-11T09:53:14Z","title":"Revolutionizing Binary Decision Tree Traversals with Arithmetical\n Representations","summary":" This paper introduces an innovative method for traversing binary decision\ntrees using arithmetic operations. We present a suite of binary tree traversal\nalgorithms that leverage novel representation matrices to flatten the full\nbinary tree structure and embed the aggregated internal node decisions into a\nsingle vector. Our approach, grounded in maximum inner product search, offers\nnew insights into decision tree partitioning.\n","authors":["Jinxiong Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.04825v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15312v2","updated":"2024-07-29T07:24:12Z","published":"2024-03-22T16:04:26Z","title":"A Wasserstein perspective of Vanilla GANs","summary":" The empirical success of Generative Adversarial Networks (GANs) caused an\nincreasing interest in theoretical research. The statistical literature is\nmainly focused on Wasserstein GANs and generalizations thereof, which\nespecially allow for good dimension reduction properties. Statistical results\nfor Vanilla GANs, the original optimization problem, are still rather limited\nand require assumptions such as smooth activation functions and equal\ndimensions of the latent space and the ambient space. To bridge this gap, we\ndraw a connection from Vanilla GANs to the Wasserstein distance. By doing so,\nexisting results for Wasserstein GANs can be extended to Vanilla GANs. In\nparticular, we obtain an oracle inequality for Vanilla GANs in Wasserstein\ndistance. The assumptions of this oracle inequality are designed to be\nsatisfied by network architectures commonly used in practice, such as\nfeedforward ReLU networks. 
By providing a quantitative result for the\napproximation of a Lipschitz function by a feedforward ReLU network with\nbounded H\\\"older norm, we conclude a rate of convergence for Vanilla GANs as\nwell as Wasserstein GANs as estimators of the unknown probability distribution.\n","authors":["Lea Kunkel","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2403.15312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07890v3","updated":"2024-07-29T07:13:42Z","published":"2022-10-14T15:16:54Z","title":"Hierarchical Policy Blending as Inference for Reactive Robot Control","summary":" Motion generation in cluttered, dense, and dynamic environments is a central\ntopic in robotics, rendered as a multi-objective decision-making problem.\nCurrent approaches trade-off between safety and performance. On the one hand,\nreactive policies guarantee fast response to environmental changes at the risk\nof suboptimal behavior. On the other hand, planning-based motion generation\nprovides feasible trajectories, but the high computational cost may limit the\ncontrol frequency and thus safety. To combine the benefits of reactive policies\nand planning, we propose a hierarchical motion generation method. Moreover, we\nadopt probabilistic inference methods to formalize the hierarchical model and\nstochastic optimization. We realize this approach as a weighted product of\nstochastic, reactive expert policies, where planning is used to adaptively\ncompute the optimal weights over the task horizon. This stochastic optimization\navoids local optima and proposes feasible reactive plans that find paths in\ncluttered and dense environments. Our extensive experimental study in planar\nnavigation and 6DoF manipulation shows that our proposed hierarchical motion\ngeneration method outperforms both myopic reactive controllers and online\nre-planning methods.\n","authors":["Kay Hansel","Julen Urain","Jan Peters","Georgia Chalvatzaki"],"pdf_url":"https://arxiv.org/pdf/2210.07890v3.pdf","comment":"8 pages, 5 figures, 1 table, accepted at ICRA 2023"},{"id":"http://arxiv.org/abs/2407.19736v1","updated":"2024-07-29T06:56:57Z","published":"2024-07-29T06:56:57Z","title":"Sensor Selection via GFlowNets: A Deep Generative Modeling Framework to\n Navigate Combinatorial Complexity","summary":" The performance of sensor arrays in sensing and wireless communications\nimproves with more elements, but this comes at the cost of increased energy\nconsumption and hardware expense. This work addresses the challenge of\nselecting $k$ sensor elements from a set of $m$ to optimize a generic\nQuality-of-Service metric. Evaluating all $\\binom{m}{k}$ possible sensor\nsubsets is impractical, leading to prior solutions using convex relaxations,\ngreedy algorithms, and supervised learning approaches. The current paper\nproposes a new framework that employs deep generative modeling, treating sensor\nselection as a deterministic Markov Decision Process where sensor subsets of\nsize $k$ arise as terminal states. Generative Flow Networks (GFlowNets) are\nemployed to model an action distribution conditioned on the state. Sampling\nactions from the aforementioned distribution ensures that the probability of\narriving at a terminal state is proportional to the performance of the\ncorresponding subset. Applied to a standard sensor selection scenario, the\ndeveloped approach outperforms popular methods which are based on convex\noptimization and greedy algorithms. 
Finally, a multiobjective formulation of\nthe proposed approach is adopted and applied on the sparse antenna array design\nfor Integrated Sensing and Communication (ISAC) systems. The multiobjective\nvariation is shown to perform well in managing the trade-off between radar and\ncommunication performance.\n","authors":["Spilios Evmorfos","Zhaoyi Xu","Athina Petropulu"],"pdf_url":"https://arxiv.org/pdf/2407.19736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17164v2","updated":"2024-07-29T06:55:36Z","published":"2024-07-24T11:12:01Z","title":"Robust Deep Hawkes Process under Label Noise of Both Event and\n Occurrence","summary":" Integrating deep neural networks with the Hawkes process has significantly\nimproved predictive capabilities in finance, health informatics, and\ninformation technology. Nevertheless, these models often face challenges in\nreal-world settings, particularly due to substantial label noise. This issue is\nof significant concern in the medical field, where label noise can arise from\ndelayed updates in electronic medical records or misdiagnoses, leading to\nincreased prediction risks. Our research indicates that deep Hawkes process\nmodels exhibit reduced robustness when dealing with label noise, particularly\nwhen it affects both event types and timing. To address these challenges, we\nfirst investigate the influence of label noise in approximated intensity\nfunctions and present a novel framework, the Robust Deep Hawkes Process (RDHP),\nto overcome the impact of label noise on the intensity function of Hawkes\nmodels, considering both the events and their occurrences. We tested RDHP using\nmultiple open-source benchmarks with synthetic noise and conducted a case study\non obstructive sleep apnea-hypopnea syndrome (OSAHS) in a real-world setting\nwith inherent label noise. The results demonstrate that RDHP can effectively\nperform classification and regression tasks, even in the presence of noise\nrelated to events and their timing. To the best of our knowledge, this is the\nfirst study to successfully address both event and time label noise in deep\nHawkes process models, offering a promising solution for medical applications,\nspecifically in diagnosing OSAHS.\n","authors":["Xiaoyu Tan","Bin Li","Xihe Qiu","Jingjing Huang","Yinghui Xu","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2407.17164v2.pdf","comment":"ECAI2024"},{"id":"http://arxiv.org/abs/2407.19724v1","updated":"2024-07-29T06:12:47Z","published":"2024-07-29T06:12:47Z","title":"Constructing artificial life and materials scientists with accelerated\n AI using Deep AndersoNN","summary":" Deep AndersoNN accelerates AI by exploiting the continuum limit as the number\nof explicit layers in a neural network approaches infinity and can be taken as\na single implicit layer, known as a deep equilibrium model. Solving for deep\nequilibrium model parameters reduces to a nonlinear fixed point iteration\nproblem, enabling the use of vector-to-vector iterative solvers and windowing\ntechniques, such as Anderson extrapolation, for accelerating convergence to the\nfixed point deep equilibrium. Here we show that Deep AndersoNN achieves up to\nan order of magnitude of speed-up in training and inference. 
The method is\ndemonstrated on density functional theory results for industrial applications\nby constructing artificial life and materials `scientists' capable of\nclassifying drugs as strongly or weakly polar, metal-organic frameworks by pore\nsize, and crystalline materials as metals, semiconductors, and insulators,\nusing graph images of node-neighbor representations transformed from atom-bond\nnetworks. Results exhibit accuracy up to 98\\% and showcase synergy between Deep\nAndersoNN and machine learning capabilities of modern computing architectures,\nsuch as GPUs, for accelerated computational life and materials science by\nquickly identifying structure-property relationships. This paves the way for\nsaving up to 90\\% of compute required for AI, reducing its carbon footprint by\nup to 60 gigatons per year by 2030, and scaling above memory limits of explicit\nneural networks in life and materials science, and beyond.\n","authors":["Saleem Abdul Fattah Ahmed Al Dajani","David Keyes"],"pdf_url":"https://arxiv.org/pdf/2407.19724v1.pdf","comment":"7 pages, 5 figures, 2 tables, Accepted by ICML ML4LMS\n https://openreview.net/forum?id=qhwyvhqAvI . International Conference on\n Machine Learning (ICML). Machine Learning for Life and Material Science\n (ML4LMS) Workshop, May 2024"},{"id":"http://arxiv.org/abs/2310.00607v2","updated":"2024-07-29T05:41:05Z","published":"2023-10-01T07:57:03Z","title":"Understanding Robust Overfitting from the Feature Generalization\n Perspective","summary":" Adversarial training (AT) constructs robust neural networks by incorporating\nadversarial perturbations into natural data. However, it is plagued by the\nissue of robust overfitting (RO), which severely damages the model's\nrobustness. In this paper, we investigate RO from a novel feature\ngeneralization perspective. Specifically, we design factor ablation experiments\nto assess the respective impacts of natural data and adversarial perturbations\non RO, identifying that the inducing factor of RO stems from natural data.\nGiven that the only difference between adversarial and natural training lies in\nthe inclusion of adversarial perturbations, we further hypothesize that\nadversarial perturbations degrade the generalization of features in natural\ndata and verify this hypothesis through extensive experiments. Based on these\nfindings, we provide a holistic view of RO from the feature generalization\nperspective and explain various empirical behaviors associated with RO. To\nexamine our feature generalization perspective, we devise two representative\nmethods, attack strength and data augmentation, to prevent the feature\ngeneralization degradation during AT. Extensive experiments conducted on\nbenchmark datasets demonstrate that the proposed methods can effectively\nmitigate RO and enhance adversarial robustness.\n","authors":["Chaojian Yu","Xiaolong Shi","Jun Yu","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00607v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19715v1","updated":"2024-07-29T05:40:08Z","published":"2024-07-29T05:40:08Z","title":"Generalization bounds for regression and classification on adaptive\n covering input domains","summary":" Our main focus is on the generalization bound, which serves as an upper limit\nfor the generalization error. Our analysis delves into regression and\nclassification tasks separately to ensure a thorough examination. 
We assume the\ntarget function is real-valued and Lipschitz continuous for regression tasks.\nWe use the 2-norm and a root-mean-square-error (RMSE) variant to measure the\ndisparities between predictions and actual values. In the case of\nclassification tasks, we treat the target function as a one-hot classifier,\nrepresenting a piece-wise constant function, and employ 0/1 loss for error\nmeasurement. Our analysis underscores the differing sample complexity required\nto achieve a concentration inequality of generalization bounds, highlighting\nthe variation in learning efficiency for regression and classification tasks.\nFurthermore, we demonstrate that the generalization bounds for regression and\nclassification functions are inversely proportional to a polynomial of the\nnumber of parameters in a network, with the degree depending on the hypothesis\nclass and the network architecture. These findings emphasize the advantages of\nover-parameterized networks and elucidate the conditions for benign overfitting\nin such systems.\n","authors":["Wen-Liang Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.19715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06886v6","updated":"2024-07-29T05:26:44Z","published":"2024-07-09T14:14:47Z","title":"Aligning Cyber Space with Physical World: A Comprehensive Survey on\n Embodied AI","summary":" Embodied Artificial Intelligence (Embodied AI) is crucial for achieving\nArtificial General Intelligence (AGI) and serves as a foundation for various\napplications that bridge cyberspace and the physical world. Recently, the\nemergence of Multi-modal Large Models (MLMs) and World Models (WMs) have\nattracted significant attention due to their remarkable perception,\ninteraction, and reasoning capabilities, making them a promising architecture\nfor the brain of embodied agents. However, there is no comprehensive survey for\nEmbodied AI in the era of MLMs. In this survey, we give a comprehensive\nexploration of the latest advancements in Embodied AI. Our analysis firstly\nnavigates through the forefront of representative works of embodied robots and\nsimulators, to fully understand the research focuses and their limitations.\nThen, we analyze four main research targets: 1) embodied perception, 2)\nembodied interaction, 3) embodied agent, and 4) sim-to-real adaptation,\ncovering the state-of-the-art methods, essential paradigms, and comprehensive\ndatasets. Additionally, we explore the complexities of MLMs in virtual and real\nembodied agents, highlighting their significance in facilitating interactions\nin dynamic digital and physical environments. Finally, we summarize the\nchallenges and limitations of embodied AI and discuss their potential future\ndirections. We hope this survey will serve as a foundational reference for the\nresearch community and inspire continued innovation. The associated project can\nbe found at https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List.\n","authors":["Yang Liu","Weixing Chen","Yongjie Bai","Guanbin Li","Wen Gao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2407.06886v6.pdf","comment":"The first comprehensive review of Embodied AI in the era of MLMs, 36\n pages. 
We also provide the paper list for Embodied AI:\n https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List"},{"id":"http://arxiv.org/abs/2407.19707v1","updated":"2024-07-29T05:05:13Z","published":"2024-07-29T05:05:13Z","title":"Neural networks for bifurcation and linear stability analysis of steady\n states in partial differential equations","summary":" This research introduces an extended application of neural networks for\nsolving nonlinear partial differential equations (PDEs). A neural network,\ncombined with a pseudo-arclength continuation, is proposed to construct\nbifurcation diagrams from parameterized nonlinear PDEs. Additionally, a neural\nnetwork approach is also presented for solving eigenvalue problems to analyze\nsolution linear stability, focusing on identifying the largest eigenvalue. The\neffectiveness of the proposed neural network is examined through experiments on\nthe Bratu equation and the Burgers equation. Results from a finite difference\nmethod are also presented as comparison. Varying numbers of grid points are\nemployed in each case to assess the behavior and accuracy of both the neural\nnetwork and the finite difference method. The experimental results demonstrate\nthat the proposed neural network produces better solutions, generates more\naccurate bifurcation diagrams, has reasonable computational times, and proves\neffective for linear stability analysis.\n","authors":["Muhammad Luthfi Shahab","Hadi Susanto"],"pdf_url":"https://arxiv.org/pdf/2407.19707v1.pdf","comment":"Accepted for publication in Applied Mathematics and Computation"},{"id":"http://arxiv.org/abs/2306.01812v2","updated":"2024-07-29T04:53:18Z","published":"2023-06-02T07:10:45Z","title":"SAPI: Surroundings-Aware Vehicle Trajectory Prediction at Intersections","summary":" In this work we propose a deep learning model, i.e., SAPI, to predict vehicle\ntrajectories at intersections. SAPI uses an abstract way to represent and\nencode surrounding environment by utilizing information from real-time map,\nright-of-way, and surrounding traffic. The proposed model consists of two\nconvolutional network (CNN) and recurrent neural network (RNN)-based encoders\nand one decoder. A refiner is proposed to conduct a look-back operation inside\nthe model, in order to make full use of raw history trajectory information. We\nevaluate SAPI on a proprietary dataset collected in real-world intersections\nthrough autonomous vehicles. It is demonstrated that SAPI shows promising\nperformance when predicting vehicle trajectories at intersection, and\noutperforms benchmark methods. The average displacement error(ADE) and final\ndisplacement error(FDE) for 6-second prediction are 1.84m and 4.32m\nrespectively. We also show that the proposed model can accurately predict\nvehicle trajectories in different scenarios.\n","authors":["Ethan Zhang","Hao Xiao","Yiqian Gan","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01812v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10404v8","updated":"2024-07-29T04:47:04Z","published":"2023-10-16T13:49:46Z","title":"LLM4SGG: Large Language Models for Weakly Supervised Scene Graph\n Generation","summary":" Weakly-Supervised Scene Graph Generation (WSSGG) research has recently\nemerged as an alternative to the fully-supervised approach that heavily relies\non costly annotations. In this regard, studies on WSSGG have utilized image\ncaptions to obtain unlocalized triplets while primarily focusing on grounding\nthe unlocalized triplets over image regions. 
However, they have overlooked the\ntwo issues involved in the triplet formation process from the captions: 1)\nSemantic over-simplification issue arises when extracting triplets from\ncaptions, where fine-grained predicates in captions are undesirably converted\ninto coarse-grained predicates, resulting in a long-tailed predicate\ndistribution, and 2) Low-density scene graph issue arises when aligning the\ntriplets in the caption with entity/predicate classes of interest, where many\ntriplets are discarded and not used in training, leading to insufficient\nsupervision. To tackle the two issues, we propose a new approach, i.e., Large\nLanguage Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two\nissues by leveraging the LLM's in-depth understanding of language and reasoning\nability during the extraction of triplets from captions and alignment of\nentity/predicate classes with target data. To further engage the LLM in these\nprocesses, we adopt the idea of Chain-of-Thought and the in-context few-shot\nlearning strategy. To validate the effectiveness of LLM4SGG, we conduct\nextensive experiments on Visual Genome and GQA datasets, showing significant\nimprovements in both Recall@K and mean Recall@K compared to the\nstate-of-the-art WSSGG methods. A further appeal is that LLM4SGG is\ndata-efficient, enabling effective model training with a small amount of\ntraining images.\n","authors":["Kibum Kim","Kanghoon Yoon","Jaehyeong Jeon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2310.10404v8.pdf","comment":"8 pages; CVPR 2024"},{"id":"http://arxiv.org/abs/2407.19697v1","updated":"2024-07-29T04:42:18Z","published":"2024-07-29T04:42:18Z","title":"Multiscale Representation Enhanced Temporal Flow Fusion Model for\n Long-Term Workload Forecasting","summary":" Accurate workload forecasting is critical for efficient resource management\nin cloud computing systems, enabling effective scheduling and autoscaling.\nDespite recent advances with transformer-based forecasting models, challenges\nremain due to the non-stationary, nonlinear characteristics of workload time\nseries and the long-term dependencies. In particular, inconsistent performance\nbetween long-term history and near-term forecasts hinders long-range\npredictions. This paper proposes a novel framework leveraging self-supervised\nmultiscale representation learning to capture both long-term and near-term\nworkload patterns. The long-term history is encoded through multiscale\nrepresentations while the near-term observations are modeled via temporal flow\nfusion. These representations of different scales are fused using an attention\nmechanism and characterized with normalizing flows to handle\nnon-Gaussian/non-linear distributions of time series. 
Extensive experiments on\n9 benchmarks demonstrate superiority over existing methods.\n","authors":["Shiyu Wang","Zhixuan Chu","Yinbo Sun","Yu Liu","Yuliang Guo","Yang Chen","Huiyang Jian","Lintao Ma","Xingyu Lu","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.19697v1.pdf","comment":"Proceedings of the 33rd ACM International Conference on Information\n and Knowledge Management (CIKM '24), October 21--25, 2024, Boise, ID, USA"},{"id":"http://arxiv.org/abs/2407.19688v1","updated":"2024-07-29T04:16:45Z","published":"2024-07-29T04:16:45Z","title":"Causal Interventional Prediction System for Robust and Explainable\n Effect Forecasting","summary":" Although the widespread use of AI systems in today's world is growing, many\ncurrent AI systems are found vulnerable due to hidden bias and missing\ninformation, especially in the most commonly used forecasting system. In this\nwork, we explore the robustness and explainability of AI-based forecasting\nsystems. We provide an in-depth analysis of the underlying causality involved\nin the effect prediction task and further establish a causal graph based on\ntreatment, adjustment variable, confounder, and outcome. Correspondingly, we\ndesign a causal interventional prediction system (CIPS) based on a variational\nautoencoder and fully conditional specification of multiple imputations.\nExtensive results demonstrate the superiority of our system over\nstate-of-the-art methods and show remarkable versatility and extensibility in\npractice.\n","authors":["Zhixuan Chu","Hui Ding","Guang Zeng","Shiyu Wang","Yiming Li"],"pdf_url":"https://arxiv.org/pdf/2407.19688v1.pdf","comment":"Proceedings of the 33rd ACM International Conference on Information\n and Knowledge Management (CIKM '24), October 21--25, 2024, Boise, ID, USA"},{"id":"http://arxiv.org/abs/2307.05908v2","updated":"2024-07-29T04:03:22Z","published":"2023-07-12T04:28:41Z","title":"Predictive Pipelined Decoding: A Compute-Latency Trade-off for Exact LLM\n Decoding","summary":" This paper presents \"Predictive Pipelined Decoding (PPD),\" an approach that\nspeeds up greedy decoding in Large Language Models (LLMs) while maintaining the\nexact same output as the original decoding. Unlike conventional strategies, PPD\nemploys additional compute resources to parallelize the initiation of\nsubsequent token decoding during the current token decoding. This method\nreduces decoding latency and reshapes the understanding of trade-offs in LLM\ndecoding strategies. We have developed a theoretical framework that allows us\nto analyze the trade-off between computation and latency. Using this framework,\nwe can analytically estimate the potential reduction in latency associated with\nour proposed method, achieved through the assessment of the match rate,\nrepresented as p_correct. 
The results demonstrate that the use of extra\ncomputational resources has the potential to accelerate LLM decoding.\nAdditionally, we implement PPD and conduct preliminary experiments to\nempirically validate its efficacy, addressing potential practical overheads not\ncovered by theoretical analysis.\n","authors":["Seongjun Yang","Gibbeum Lee","Jaewoong Cho","Dimitris Papailiopoulos","Kangwook Lee"],"pdf_url":"https://arxiv.org/pdf/2307.05908v2.pdf","comment":"ES-FoMo Workshop at ICML 2023 / Published in TMLR"},{"id":"http://arxiv.org/abs/2407.19683v1","updated":"2024-07-29T03:55:52Z","published":"2024-07-29T03:55:52Z","title":"Revisiting the robustness of post-hoc interpretability methods","summary":" Post-hoc interpretability methods play a critical role in explainable\nartificial intelligence (XAI), as they pinpoint portions of data that a trained\ndeep learning model deemed important to make a decision. However, different\npost-hoc interpretability methods often provide different results, casting\ndoubts on their accuracy. For this reason, several evaluation strategies have\nbeen proposed to understand the accuracy of post-hoc interpretability. Many of\nthese evaluation strategies provide a coarse-grained assessment -- i.e., they\nevaluate how the performance of the model degrades on average by corrupting\ndifferent data points across multiple samples. While these strategies are\neffective in selecting the post-hoc interpretability method that is most\nreliable on average, they fail to provide a sample-level, also referred to as\nfine-grained, assessment. In other words, they do not measure the robustness of\npost-hoc interpretability methods. We propose an approach and two new metrics\nto provide a fine-grained assessment of post-hoc interpretability methods. We\nshow that the robustness is generally linked to its coarse-grained performance.\n","authors":["Jiawen Wei","Hugues Turbé","Gianmarco Mengaldo"],"pdf_url":"https://arxiv.org/pdf/2407.19683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18373v2","updated":"2024-07-29T03:46:15Z","published":"2024-07-25T20:14:58Z","title":"Physics Informed Kolmogorov-Arnold Neural Networks for Dynamical\n Analysis via Efficent-KAN and WAV-KAN","summary":" Physics-informed neural networks have proven to be a powerful tool for\nsolving differential equations, leveraging the principles of physics to inform\nthe learning process. However, traditional deep neural networks often face\nchallenges in achieving high accuracy without incurring significant\ncomputational costs. In this work, we implement the Physics-Informed\nKolmogorov-Arnold Neural Networks (PIKAN) through efficient-KAN and WAV-KAN,\nwhich utilize the Kolmogorov-Arnold representation theorem. PIKAN demonstrates\nsuperior performance compared to conventional deep neural networks, achieving\nthe same level of accuracy with fewer layers and reduced computational\noverhead. We explore both B-spline and wavelet-based implementations of PIKAN\nand benchmark their performance across various ordinary and partial\ndifferential equations using unsupervised (data-free) and supervised\n(data-driven) techniques. For certain differential equations, the data-free\napproach suffices to find accurate solutions, while in more complex scenarios,\nthe data-driven method enhances the PIKAN's ability to converge to the correct\nsolution. 
We validate our results against numerical solutions and achieve $99\n\\%$ accuracy in most scenarios.\n","authors":["Subhajit Patra","Sonali Panda","Bikram Keshari Parida","Mahima Arya","Kurt Jacobs","Denys I. Bondar","Abhijit Sen"],"pdf_url":"https://arxiv.org/pdf/2407.18373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11527v4","updated":"2024-07-29T03:41:34Z","published":"2023-05-19T08:51:11Z","title":"InstructIE: A Bilingual Instruction-based Information Extraction Dataset","summary":" Large language models can perform well on general natural language tasks, but\ntheir effectiveness is still suboptimal for information extraction (IE). Recent\nworks indicate that the main reason lies in the lack of extensive data on IE\ninstructions. Note that the existing datasets on IE instructions not only have\nlimited coverage but also involve high construction costs. To address this\nissue, we introduce InstructIE, a bilingual instruction-based IE dataset, which\ncovers 12 diverse domains. We propose KG2Instruction, a framework specifically\nfor the automatic generation of such datasets. Additionally, we manually\nannotate the test set. Experimental results demonstrate that large language\nmodels trained with InstructIE can not only obtain better IE capabilities but\nalso enhance zero-shot performance compared with baselines.\n","authors":["Honghao Gui","Shuofei Qiao","Jintian Zhang","Hongbin Ye","Mengshu Sun","Lei Liang","Jeff Z. Pan","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.11527v4.pdf","comment":"ISWC 2024; project homepage:\n https://www.zjukg.org/project/InstructIE/ dataset:\n https://huggingface.co/datasets/zjunlp/InstructIE"},{"id":"http://arxiv.org/abs/2406.08749v2","updated":"2024-07-29T03:06:40Z","published":"2024-06-13T02:17:19Z","title":"Mathematical models for off-ball scoring prediction in basketball","summary":" In professional basketball, the accurate prediction of scoring opportunities\nbased on strategic decision-making is crucial for spatial and player\nevaluations. However, traditional models often face challenges in accounting\nfor the complexities of off-ball movements, which are essential for\ncomprehensive performance evaluations. In this study, we propose two\nmathematical models to predict off-ball scoring opportunities in basketball,\nconsidering pass-to-score and dribble-to-score sequences: the Ball Movement for\nOff-ball Scoring (BMOS) and the Ball Intercept and Movement for Off-ball\nScoring (BIMOS) models. The BMOS model adapts principles from the Off-Ball\nScoring Opportunities (OBSO) model, originally designed for soccer, to\nbasketball, whereas the BIMOS model also incorporates the likelihood of\ninterception during ball movements. We evaluated these models using player\ntracking data from 630 NBA games in the 2015-2016 regular season, demonstrating\nthat the BIMOS model outperforms the BMOS model in terms of team scoring\nprediction accuracy, while also highlighting its potential for further\ndevelopment. 
Overall, the BIMOS model provides valuable insights for tactical\nanalysis and player evaluation in basketball.\n","authors":["Rikako Kono","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2406.08749v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2407.19664v1","updated":"2024-07-29T02:54:52Z","published":"2024-07-29T02:54:52Z","title":"Adaptive Soft Error Protection for Deep Learning","summary":" The rising incidence of soft errors in hardware systems represents a\nconsiderable risk to the reliability of deep learning systems and can\nprecipitate severe malfunctions. Although essential, soft error mitigation can\nimpose substantial costs on deep learning systems that are inherently demanding\nin terms of computation and memory. Previous research has primarily explored\nvariations in vulnerability among different components of computing engines or\nneural networks, aiming for selective protection to minimize protection\noverhead. Our approach diverges from these studies by recognizing that the\nsusceptibility of deep learning tasks to soft errors is heavily\ninput-dependent. Notably, some inputs are simpler for deep learning models and\ninherently exhibit greater tolerance to soft errors. Conversely, more complex\ninputs are prone to soft error impact. Based on these insights, we introduce an\nadaptive soft error protection strategy that tailors protection to the\ncomputational demands of individual inputs. To implement this strategy, we\ndevelop a metric for assessing the complexity of inputs and deploy a\nlightweight machine learning algorithm to gauge input difficulty. Subsequently,\nwe employ robust protection for challenging inputs and minimal protection for\nsimpler ones. Our experimental evaluation across diverse datasets and deep\nlearning tasks reveals that our adaptive strategy reduces the soft error\nprotection overhead by an average of 46.9%, without compromising system\nreliability.\n","authors":["Xinghua Xue","Cheng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19663v1","updated":"2024-07-29T02:53:39Z","published":"2024-07-29T02:53:39Z","title":"Short-Term Forecasting of Photovoltaic Power Generation Based on Entropy\n during the Foggy Winter","summary":" Solar energy is one of the most promising renewable energy resources.\nForecasting photovoltaic power generation is an important way to increase\nphotovoltaic penetration. However, the task of photovoltaic forecasting is\ncomplicated due to its property of uncertainty, especially in specific regions\nduring the foggy winter. This paper proposes a novel model to accomplish the\nproblem. A developed entropy is created to qualify the uncertainty during the\nfoggy winter. The clustering method and modified retention network are applied\nto reduce complexity and forecast, respectively. We adopt an optimization to\noptimize the hyperparameters. Results are validated from the multivariate\nforecasting model using the dataset from a photovoltaic power station in\nJiangsu Province, China. 
Experiments show that the proposed model improves the\nforecasting accuracy compared to various models during the foggy winter.\n","authors":["Xuan Yang","Yunxuan Dong","Thomas Wu"],"pdf_url":"https://arxiv.org/pdf/2407.19663v1.pdf","comment":"The manuscript was submitted to Applied Energy on June 3, 2024"},{"id":"http://arxiv.org/abs/2407.19660v1","updated":"2024-07-29T02:49:55Z","published":"2024-07-29T02:49:55Z","title":"Towards a Knowledge guided Multimodal Foundation Model for\n Spatio-Temporal Remote Sensing Applications","summary":" In recent years, there is increased interest in foundation models for\ngeoscience due to the vast amount of earth observing satellite imagery. Existing\nremote sensing foundation models make use of the various sources of spectral\nimagery to create large models pretrained on a masked reconstruction task. The\nembeddings from these foundation models are then used for various downstream\nremote sensing applications. In this paper we propose a foundational modeling\nframework for remote sensing geoscience applications, that goes beyond these\ntraditional single modality masked autoencoder family of foundation models.\nThis framework leverages the knowledge guided principles that the spectral\nimagery captures the impact of the physical drivers on the environmental\nsystem, and that the relationship between them is governed by the\ncharacteristics of the system. Specifically, our method, called MultiModal\nVariable Step Forecasting (MM-VSF), uses multimodal data (spectral imagery and\nweather) as its input and a variable step forecasting task as its pretraining\nobjective. In our evaluation we show forecasting of satellite imagery using\nweather can be used as an effective pretraining task for foundation models. We\nfurther show the effectiveness of the embeddings from MM-VSF on the downstream\ntask of pixel-wise crop mapping, when compared with a model trained in the\ntraditional setting of single modality input and masked reconstruction based\npretraining.\n","authors":["Praveen Ravirathinam","Ankush Khandelwal","Rahul Ghosh","Vipin Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.19660v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2306.06048v3","updated":"2024-07-29T02:36:45Z","published":"2023-06-09T17:16:50Z","title":"How Does Fine-Tuning Impact Out-of-Distribution Detection for\n Vision-Language Models?","summary":" Recent large vision-language models such as CLIP have shown remarkable\nout-of-distribution (OOD) detection and generalization performance. However,\ntheir zero-shot in-distribution (ID) accuracy is often limited for downstream\ndatasets. Recent CLIP-based fine-tuning methods such as prompt learning have\ndemonstrated significant improvements in ID classification and OOD\ngeneralization where OOD labels are available. Nonetheless, it remains unclear\nwhether the model is reliable to semantic shifts without OOD labels. In this\npaper, we aim to bridge the gap and present a comprehensive study to understand\nhow fine-tuning impacts OOD detection for few-shot downstream tasks. By framing\nOOD detection as multi-modal concept matching, we establish a connection\nbetween fine-tuning methods and various OOD scores. Our results suggest that a\nproper choice of OOD scores is essential for CLIP-based fine-tuning. In\nparticular, the maximum concept matching (MCM) score provides a promising\nsolution consistently. 
We also show that prompt learning demonstrates the\nstate-of-the-art OOD detection performance over the zero-shot counterpart.\n","authors":["Yifei Ming","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2306.06048v3.pdf","comment":"Accepted to IJCV 2023"},{"id":"http://arxiv.org/abs/2402.14236v2","updated":"2024-07-29T02:34:20Z","published":"2024-02-22T02:36:14Z","title":"Automated Design and Optimization of Distributed Filtering Circuits via\n Reinforcement Learning","summary":" Designing distributed filter circuits (DFCs) is complex and time-consuming,\ninvolving setting and optimizing multiple hyperparameters. Traditional\noptimization methods, such as using the commercial finite element solver HFSS\n(High-Frequency Structure Simulator) to enumerate all parameter combinations\nwith fixed steps and then simulate each combination, are not only\ntime-consuming and labor-intensive but also rely heavily on the expertise and\nexperience of electronics engineers, making it difficult to adapt to rapidly\nchanging design requirements. Additionally, these commercial tools struggle\nwith precise adjustments when parameters are sensitive to numerical changes,\nresulting in limited optimization effectiveness. This study proposes a novel\nend-to-end automated method for DFC design. The proposed method harnesses\nreinforcement learning (RL) algorithms, eliminating the dependence on the\ndesign experience of engineers. Thus, it significantly reduces the subjectivity\nand constraints associated with circuit design. The experimental findings\ndemonstrate clear improvements in design efficiency and quality when comparing\nthe proposed method with traditional engineer-driven methods. Furthermore, the\nproposed method achieves superior performance when designing complex or rapidly\nevolving DFCs, highlighting the substantial potential of RL in circuit design\nautomation. In particular, compared to the existing DFC automation design\nmethod CircuitGNN, our method achieves an average performance improvement of\n8.72%. Additionally, the execution efficiency of our method is 2000 times\nhigher than CircuitGNN on the CPU and 241 times higher on the GPU.\n","authors":["Peng Gao","Tao Yu","Fei Wang","Ru-Yue Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.14236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19651v1","updated":"2024-07-29T02:32:44Z","published":"2024-07-29T02:32:44Z","title":"ComNeck: Bridging Compressed Image Latents and Multimodal LLMs via\n Universal Transform-Neck","summary":" This paper presents the first-ever study of adapting compressed image latents\nto suit the needs of downstream vision tasks that adopt Multimodal Large\nLanguage Models (MLLMs). MLLMs have extended the success of large language\nmodels to modalities (e.g. images) beyond text, but their billion scale hinders\ndeployment on resource-constrained end devices. While cloud-hosted MLLMs could\nbe available, transmitting raw, uncompressed images captured by end devices to\nthe cloud requires an efficient image compression system. To address this, we\nfocus on emerging neural image compression and propose a novel framework with a\nlightweight transform-neck and a surrogate loss to adapt compressed image\nlatents for MLLM-based vision tasks. The proposed framework is generic and\napplicable to multiple application scenarios, where the neural image codec can\nbe (1) pre-trained for human perception without updating, (2) fully updated for\njoint human and machine perception, or (3) fully updated for only machine\nperception. 
The transform-neck trained with the surrogate loss is universal,\nfor it can serve various downstream vision tasks enabled by a variety of MLLMs\nthat share the same visual encoder. Our framework has the striking feature of\nexcluding the downstream MLLMs from training the transform-neck, and\npotentially the neural image codec as well. This stands out from most existing\ncoding for machine approaches that involve downstream networks in training and\nthus could be impractical when the networks are MLLMs. Extensive experiments on\ndifferent neural image codecs and various MLLM-based vision tasks show that our\nmethod achieves great rate-accuracy performance with much less complexity,\ndemonstrating its effectiveness.\n","authors":["Chia-Hao Kao","Cheng Chien","Yu-Jen Tseng","Yi-Hsin Chen","Alessandro Gnutti","Shao-Yuan Lo","Wen-Hsiao Peng","Riccardo Leonardi"],"pdf_url":"https://arxiv.org/pdf/2407.19651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18416v2","updated":"2024-07-29T02:30:35Z","published":"2024-07-25T22:24:45Z","title":"PersonaGym: Evaluating Persona Agents and LLMs","summary":" Persona agents, which are LLM agents that act according to an assigned\npersona, have demonstrated impressive contextual response capabilities across\nvarious applications. These persona agents offer significant enhancements\nacross diverse sectors, such as education, healthcare, and entertainment, where\nmodel developers can align agent responses to different user requirements\nthereby broadening the scope of agent applications. However, evaluating persona\nagent performance is incredibly challenging due to the complexity of assessing\npersona adherence in free-form interactions across various environments that\nare relevant to each persona agent. We introduce PersonaGym, the first dynamic\nevaluation framework for assessing persona agents, and PersonaScore, the first\nautomated human-aligned metric grounded in decision theory for comprehensive\nlarge-scale evaluation of persona agents. Our evaluation of 6 open and\nclosed-source LLMs, using a benchmark encompassing 200 personas and 10,000\nquestions, reveals significant opportunities for advancement in persona agent\ncapabilities across state-of-the-art models. For example, Claude 3.5 Sonnet\nonly has a 2.97% relative improvement in PersonaScore than GPT 3.5 despite\nbeing a much more advanced model. Importantly, we find that increased model\nsize and complexity do not necessarily imply enhanced persona agent\ncapabilities thereby highlighting the pressing need for algorithmic and\narchitectural invention towards faithful and performant persona agents.\n","authors":["Vinay Samuel","Henry Peng Zou","Yue Zhou","Shreyas Chaudhari","Ashwin Kalyan","Tanmay Rajpurohit","Ameet Deshpande","Karthik Narasimhan","Vishvak Murahari"],"pdf_url":"https://arxiv.org/pdf/2407.18416v2.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.19644v1","updated":"2024-07-29T01:59:06Z","published":"2024-07-29T01:59:06Z","title":"Realizing Unaligned Block-wise Pruning for DNN Acceleration on Mobile\n Devices","summary":" With the recent proliferation of on-device AI, there is an increasing need to\nrun computationally intensive DNNs directly on mobile devices. However, the\nlimited computing and memory resources of these devices necessitate effective\npruning techniques. 
Block-wise pruning is promising due to its low accuracy\ndrop tradeoff for speedup gains, but it requires block positions to be aligned\nwith block size, hindering optimal position selection to minimize model\naccuracy drop. Unaligned block pruning (UBP) addresses this by allowing blocks\nto be selected at arbitrary positions, yet its practical use is limited by a\ntime-consuming optimal block selection algorithm and lack of efficient\ninference kernels. In this paper, we propose a pseudo-optimal yet fast block\nselection algorithm called Block Expansion and Division (BED), which can be\nintegrated into an iterative model training process. Additionally, we introduce\nan efficient inference kernel implementation for mobile devices, enabling a\nUBP-based model to achieve similar latency to a DNN model compressed by aligned\nblock pruning. We demonstrate the superiority of our techniques on a real\nmobile phone with MobileNet and ResNet models.\n","authors":["Hayun Lee","Dongkun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.19644v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.18910v2","updated":"2024-07-29T01:26:51Z","published":"2024-07-26T17:59:32Z","title":"Do We Really Need Graph Convolution During Training? Light Post-Training\n Graph-ODE for Efficient Recommendation","summary":" The efficiency and scalability of graph convolution networks (GCNs) in\ntraining recommender systems (RecSys) have been persistent concerns, hindering\ntheir deployment in real-world applications. This paper presents a critical\nexamination of the necessity of graph convolutions during the training phase\nand introduces an innovative alternative: the Light Post-Training Graph\nOrdinary-Differential-Equation (LightGODE). Our investigation reveals that the\nbenefits of GCNs are more pronounced during testing rather than training.\nMotivated by this, LightGODE utilizes a novel post-training graph convolution\nmethod that bypasses the computation-intensive message passing of GCNs and\nemploys a non-parametric continuous graph ordinary-differential-equation (ODE)\nto dynamically model node representations. This approach drastically reduces\ntraining time while achieving fine-grained post-training graph convolution to\navoid the distortion of the original training embedding space, termed the\nembedding discrepancy issue. We validate our model across several real-world\ndatasets of different scales, demonstrating that LightGODE not only outperforms\nGCN-based models in terms of efficiency and effectiveness but also\nsignificantly mitigates the embedding discrepancy commonly associated with\ndeeper graph convolution layers. Our LightGODE challenges the prevailing\nparadigms in RecSys training and suggests re-evaluating the role of graph\nconvolutions, potentially guiding future developments of efficient large-scale\ngraph-based RecSys.\n","authors":["Weizhi Zhang","Liangwei Yang","Zihe Song","Henry Peng Zou","Ke Xu","Liancheng Fang","Philip S. 
Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18910v2.pdf","comment":"Accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2407.19631v1","updated":"2024-07-29T01:22:04Z","published":"2024-07-29T01:22:04Z","title":"\"A Good Bot Always Knows Its Limitations\": Assessing Autonomous System\n Decision-making Competencies through Factorized Machine Self-confidence","summary":" How can intelligent machines assess their competencies in completing tasks?\nThis question has come into focus for autonomous systems that algorithmically\nreason and make decisions under uncertainty. It is argued here that machine\nself-confidence -- a form of meta-reasoning based on self-assessments of an\nagent's knowledge about the state of the world and itself, as well as its\nability to reason about and execute tasks -- leads to many eminently computable\nand useful competency indicators for such agents. This paper presents a\nculmination of work on this concept in the form of a computational framework\ncalled Factorized Machine Self-confidence (FaMSeC), which provides an\nengineering-focused holistic description of factors driving an algorithmic\ndecision-making process, including outcome assessment, solver quality, model\nquality, alignment quality, and past experience. In FaMSeC, self-confidence\nindicators are derived from hierarchical `problem-solving statistics' embedded\nwithin broad classes of probabilistic decision-making algorithms such as Markov\ndecision processes. The problem-solving statistics are obtained by evaluating\nand grading probabilistic exceedance margins with respect to given competency\nstandards, which are specified for each decision-making competency factor by\nthe informee (e.g. a non-expert user or an expert system designer). This\napproach allows `algorithmic goodness of fit' evaluations to be easily\nincorporated into the design of many kinds of autonomous agents via\nhuman-interpretable competency self-assessment reports. Detailed descriptions\nand running application examples for a Markov decision process agent show how\ntwo FaMSeC factors (outcome assessment and solver quality) can be practically\ncomputed and reported for a range of possible tasking contexts through novel\nuse of meta-utility functions, behavior simulations, and surrogate prediction\nmodels.\n","authors":["Brett Israelsen","Nisar R. Ahmed","Matthew Aitken","Eric W. Frew","Dale A. Lawrence","Brian M. Argrow"],"pdf_url":"https://arxiv.org/pdf/2407.19631v1.pdf","comment":"59 pages, 22 figures, draft to be submitted for journal review"},{"id":"http://arxiv.org/abs/2407.19618v1","updated":"2024-07-29T00:41:11Z","published":"2024-07-29T00:41:11Z","title":"Experimenting on Markov Decision Processes with Local Treatments","summary":" As service systems grow increasingly complex and dynamic, many interventions\nbecome localized, available and taking effect only in specific states. This\npaper investigates experiments with local treatments on a widely-used class of\ndynamic models, Markov Decision Processes (MDPs). Particularly, we focus on\nutilizing the local structure to improve the inference efficiency of the\naverage treatment effect. We begin by demonstrating the efficiency of classical\ninference methods, including model-based estimation and temporal difference\nlearning under a fixed policy, as well as classical A/B testing with general\ntreatments. We then introduce a variance reduction technique that exploits the\nlocal treatment structure by sharing information for states unaffected by the\ntreatment policy. 
Our new estimator effectively overcomes the variance lower\nbound for general treatments while matching the more stringent lower bound\nincorporating the local treatment structure. Furthermore, our estimator can\noptimally achieve a linear reduction with the number of test arms for a major\npart of the variance. Finally, we explore scenarios with perfect knowledge of\nthe control arm and design estimators that further improve inference\nefficiency.\n","authors":["Shuze Chen","David Simchi-Levi","Chonghuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.19618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19617v1","updated":"2024-07-29T00:39:51Z","published":"2024-07-29T00:39:51Z","title":"AgEval: A Benchmark for Zero-Shot and Few-Shot Plant Stress Phenotyping\n with Multimodal LLMs","summary":" Plant stress phenotyping traditionally relies on expert assessments and\nspecialized models, limiting scalability in agriculture. Recent advances in\nmultimodal large language models (LLMs) offer potential solutions to this\nchallenge. We present AgEval, a benchmark comprising 12 diverse plant stress\nphenotyping tasks, to evaluate these models' capabilities. Our study assesses\nzero-shot and few-shot in-context learning performance of state-of-the-art\nmodels, including Claude, GPT, Gemini, and LLaVA. Results show significant\nperformance improvements with few-shot learning, with F1 scores increasing from\n46.24% to 73.37% in 8-shot identification for the best-performing model.\nFew-shot examples from other classes in the dataset have negligible or negative\nimpacts, although having the exact category example helps to increase\nperformance by 15.38%. We also quantify the consistency of model performance\nacross different classes within each task, finding that the coefficient of\nvariance (CV) ranges from 26.02% to 58.03% across models, implying that subject\nmatter expertise is needed - of 'difficult' classes - to achieve reliability in\nperformance. AgEval establishes baseline metrics for multimodal LLMs in\nagricultural applications, offering insights into their promise for enhancing\nplant stress phenotyping at scale. Benchmark and code can be accessed at:\nhttps://anonymous.4open.science/r/AgEval/\n","authors":["Muhammad Arbab Arshad","Talukder Zaki Jubery","Tirtho Roy","Rim Nassiri","Asheesh K. Singh","Arti Singh","Chinmay Hegde","Baskar Ganapathysubramanian","Aditya Balu","Adarsh Krishnamurthy","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2407.19617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19616v1","updated":"2024-07-29T00:18:17Z","published":"2024-07-29T00:18:17Z","title":"TopicTag: Automatic Annotation of NMF Topic Models Using Chain of\n Thought and Prompt Tuning with LLMs","summary":" Topic modeling is a technique for organizing and extracting themes from large\ncollections of unstructured text. Non-negative matrix factorization (NMF) is a\ncommon unsupervised approach that decomposes a term frequency-inverse document\nfrequency (TF-IDF) matrix to uncover latent topics and segment the dataset\naccordingly. While useful for highlighting patterns and clustering documents,\nNMF does not provide explicit topic labels, necessitating subject matter\nexperts (SMEs) to assign labels manually. We present a methodology for\nautomating topic labeling in documents clustered via NMF with automatic model\ndetermination (NMFk). By leveraging the output of NMFk and employing prompt\nengineering, we utilize large language models (LLMs) to generate accurate topic\nlabels. 
Our case study on over 34,000 scientific abstracts on Knowledge Graphs\ndemonstrates the effectiveness of our method in enhancing knowledge management\nand document organization.\n","authors":["Selma Wanna","Ryan Barron","Nick Solovyev","Maksim E. Eren","Manish Bhattarai","Kim Rasmussen","Boian S. Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2407.19616v1.pdf","comment":"Accepted to ACM Symposium on Document Engineering 2024 (DocEng 24),\n 2024"},{"id":"http://arxiv.org/abs/2407.20466v1","updated":"2024-07-29T23:48:07Z","published":"2024-07-29T23:48:07Z","title":"A Method for Fast Autonomy Transfer in Reinforcement Learning","summary":" This paper introduces a novel reinforcement learning (RL) strategy designed\nto facilitate rapid autonomy transfer by utilizing pre-trained critic value\nfunctions from multiple environments. Unlike traditional methods that require\nextensive retraining or fine-tuning, our approach integrates existing\nknowledge, enabling an RL agent to adapt swiftly to new settings without\nrequiring extensive computational resources. Our contributions include\ndevelopment of the Multi-Critic Actor-Critic (MCAC) algorithm, establishing its\nconvergence, and empirical evidence demonstrating its efficacy. Our\nexperimental results show that MCAC significantly outperforms the baseline\nactor-critic algorithm, achieving up to 22.76x faster autonomy transfer and\nhigher reward accumulation. This advancement underscores the potential of\nleveraging accumulated knowledge for efficient adaptation in RL applications.\n","authors":["Dinuka Sahabandu","Bhaskar Ramasubramanian","Michail Alexiou","J. Sukarno Mertoguno","Linda Bushnell","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2407.20466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20462v1","updated":"2024-07-29T23:41:26Z","published":"2024-07-29T23:41:26Z","title":"Graphite: A Graph-based Extreme Multi-Label Short Text Classifier for\n Keyphrase Recommendation","summary":" Keyphrase Recommendation has been a pivotal problem in advertising and\ne-commerce where advertisers/sellers are recommended keyphrases (search\nqueries) to bid on to increase their sales. It is a challenging task due to the\nplethora of items shown on online platforms and various possible queries that\nusers search while showing varying interest in the displayed items. Moreover,\nquery/keyphrase recommendations need to be made in real-time and in a\nresource-constrained environment. This problem can be framed as an Extreme\nMulti-label (XML) Short text classification by tagging the input text with\nkeywords as labels. Traditional neural network models are either infeasible or\nhave slower inference latency due to large label spaces. We present Graphite, a\ngraph-based classifier model that provides real-time keyphrase recommendations\nthat are on par with standard text classification models. Furthermore, it\ndoesn't utilize GPU resources, which can be limited in production environments.\nDue to its lightweight nature and smaller footprint, it can train on very large\ndatasets, where state-of-the-art XML models fail due to extreme resource\nrequirements. Graphite is deterministic, transparent, and intrinsically more\ninterpretable than neural network-based models. 
We present a comprehensive\nanalysis of our model's performance across forty categories spanning eBay's\nEnglish-speaking sites.\n","authors":["Ashirbad Mishra","Soumik Dey","Jinyu Zhao","Marshall Wu","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2407.20462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20454v1","updated":"2024-07-29T23:18:55Z","published":"2024-07-29T23:18:55Z","title":"CoMMIT: Coordinated Instruction Tuning for Multimodal Large Language\n Models","summary":" Instruction tuning in multimodal large language models (MLLMs) aims to\nsmoothly integrate a backbone LLM with a pre-trained feature encoder for\ndownstream tasks. The major challenge is how to efficiently find the synergy\nthrough cooperative learning where LLMs adapt their reasoning abilities in\ndownstream tasks while feature encoders adjust their encoding to provide more\nrelevant modal information. In this paper, we analyze the MLLM instruction\ntuning from both theoretical and empirical perspectives, where we find\nunbalanced learning between the two components, i.e., the feature encoder and\nthe LLM, can cause diminishing learning gradients that slow the model\nconvergence and often lead to sub-optimal results due to insufficient learning.\nInspired by our findings, we propose a measurement to quantitatively evaluate\nthe learning balance, based on which we further design a dynamic learning\nscheduler that better coordinates the learning. In addition, we introduce an\nauxiliary loss regularization method to promote updating of the generation\ndistribution of MLLMs considering the learning state of each model component,\nwhich potentially prevents each component from gradient diminishing and enables\na more accurate estimation of the learning balance coefficient. We conduct\nexperiments with multiple LLM backbones and feature encoders, where our\ntechniques are model-agnostic and can be generically integrated with various\nMLLM backbones. Experiment results on multiple downstream tasks and modalities\nin vision and audio, demonstrate the proposed method's better efficiency and\neffectiveness in MLLM instruction tuning.\n","authors":["Junda Wu","Xintong Li","Tong Yu","Yu Wang","Xiang Chen","Jiuxiang Gu","Lina Yao","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2407.20454v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/1907.11975v2","updated":"2024-07-29T23:06:12Z","published":"2019-07-27T20:42:01Z","title":"Blocking Bandits","summary":" We consider a novel stochastic multi-armed bandit setting, where playing an\narm makes it unavailable for a fixed number of time slots thereafter. This\nmodels situations where reusing an arm too often is undesirable (e.g. making\nthe same product recommendation repeatedly) or infeasible (e.g. compute job\nscheduling on machines). We show that with prior knowledge of the rewards and\ndelays of all the arms, the problem of optimizing cumulative reward does not\nadmit any pseudo-polynomial time algorithm (in the number of arms) unless\nrandomized exponential time hypothesis is false, by mapping to the PINWHEEL\nscheduling problem. Subsequently, we show that a simple greedy algorithm that\nplays the available arm with the highest reward is asymptotically $(1-1/e)$\noptimal. 
When the rewards are unknown, we design a UCB-based algorithm which is\nshown to have $c \\log T + o(\\log T)$ cumulative regret against the greedy\nalgorithm, leveraging the free exploration of arms due to the unavailability.\nFinally, when all the delays are equal, the problem reduces to Combinatorial\nSemi-bandits providing us with a lower bound of $c' \\log T + \\omega(\\log T)$.\n","authors":["Soumya Basu","Rajat Sen","Sujay Sanghavi","Sanjay Shakkottai"],"pdf_url":"https://arxiv.org/pdf/1907.11975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20445v1","updated":"2024-07-29T22:53:32Z","published":"2024-07-29T22:53:32Z","title":"Futga: Towards Fine-grained Music Understanding through\n Temporally-enhanced Generative Augmentation","summary":" Existing music captioning methods are limited to generating concise global\ndescriptions of short music clips, which fail to capture fine-grained musical\ncharacteristics and time-aware musical changes. To address these limitations,\nwe propose FUTGA, a model equipped with fine-grained music understanding\ncapabilities through learning from generative augmentation with temporal\ncompositions. We leverage existing music caption datasets and large language\nmodels (LLMs) to synthesize fine-grained music captions with structural\ndescriptions and time boundaries for full-length songs. Augmented by the\nproposed synthetic dataset, FUTGA is enabled to identify the music's temporal\nchanges at key transition points and their musical functions, as well as\ngenerate detailed descriptions for each music segment. We further introduce a\nfull-length music caption dataset generated by FUTGA, as the augmentation of\nthe MusicCaps and the Song Describer datasets. We evaluate the automatically\ngenerated captions on several downstream tasks, including music generation and\nretrieval. The experiments demonstrate the quality of the generated captions\nand the better performance in various downstream tasks achieved by the proposed\nmusic captioning approach. Our code and datasets can be found in\n\\href{https://huggingface.co/JoshuaW1997/FUTGA}{\\textcolor{blue}{https://huggingface.co/JoshuaW1997/FUTGA}}.\n","authors":["Junda Wu","Zachary Novack","Amit Namburi","Jiaheng Dai","Hao-Wen Dong","Zhouhang Xie","Carol Chen","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2407.20445v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2407.20444v1","updated":"2024-07-29T22:49:59Z","published":"2024-07-29T22:49:59Z","title":"Importance Corrected Neural JKO Sampling","summary":" In order to sample from an unnormalized probability density function, we\npropose to combine continuous normalizing flows (CNFs) with\nrejection-resampling steps based on importance weights. We relate the iterative\ntraining of CNFs with regularized velocity fields to a JKO scheme and prove\nconvergence of the involved velocity fields to the velocity field of the\nWasserstein gradient flow (WGF). The alternation of local flow steps and\nnon-local rejection-resampling steps allows overcoming local minima or slow\nconvergence of the WGF for multimodal distributions. Since the proposal of the\nrejection step is generated by the model itself, it does not suffer from common\ndrawbacks of classical rejection schemes. The arising model can be trained\niteratively, reduces the reverse Kullback-Leibler (KL) loss function in each\nstep, allows generating iid samples and moreover allows for evaluations of the\ngenerated underlying density. 
Numerical examples show that our method yields\naccurate results on various test distributions including high-dimensional\nmultimodal targets and outperforms the state of the art in almost all cases\nsignificantly.\n","authors":["Johannes Hertrich","Robert Gruhlke"],"pdf_url":"https://arxiv.org/pdf/2407.20444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.00002v3","updated":"2024-07-29T22:31:49Z","published":"2021-10-28T04:13:53Z","title":"Fair Incentives for Repeated Engagement","summary":" We study a decision-maker's problem of finding optimal monetary incentive\nschemes for retention when faced with agents whose participation decisions\n(stochastically) depend on the incentive they receive. Our focus is on policies\nconstrained to fulfill two fairness properties that preclude outcomes wherein\ndifferent groups of agents experience different treatment on average. We\nformulate the problem as a high-dimensional stochastic optimization problem,\nand study it through the use of a closely related deterministic variant. We\nshow that the optimal static solution to this deterministic variant is\nasymptotically optimal for the dynamic problem under fairness constraints.\nThough solving for the optimal static solution gives rise to a non-convex\noptimization problem, we uncover a structural property that allows us to design\na tractable, fast-converging heuristic policy. Traditional schemes for\nretention ignore fairness constraints; indeed, the goal in these is to use\ndifferentiation to incentivize repeated engagement with the system. Our work\n(i) shows that even in the absence of explicit discrimination, dynamic policies\nmay unintentionally discriminate between agents of different types by varying\nthe type composition of the system, and (ii) presents an asymptotically optimal\npolicy to avoid such discriminatory outcomes.\n","authors":["Daniel Freund","Chamsi Hssaine"],"pdf_url":"https://arxiv.org/pdf/2111.00002v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18290v3","updated":"2024-07-29T22:26:36Z","published":"2023-05-29T17:57:46Z","title":"Direct Preference Optimization: Your Language Model is Secretly a Reward\n Model","summary":" While large-scale unsupervised language models (LMs) learn broad world\nknowledge and some reasoning skills, achieving precise control of their\nbehavior is difficult due to the completely unsupervised nature of their\ntraining. Existing methods for gaining such steerability collect human labels\nof the relative quality of model generations and fine-tune the unsupervised LM\nto align with these preferences, often with reinforcement learning from human\nfeedback (RLHF). However, RLHF is a complex and often unstable procedure, first\nfitting a reward model that reflects the human preferences, and then\nfine-tuning the large unsupervised LM using reinforcement learning to maximize\nthis estimated reward without drifting too far from the original model. In this\npaper we introduce a new parameterization of the reward model in RLHF that\nenables extraction of the corresponding optimal policy in closed form, allowing\nus to solve the standard RLHF problem with only a simple classification loss.\nThe resulting algorithm, which we call Direct Preference Optimization (DPO), is\nstable, performant, and computationally lightweight, eliminating the need for\nsampling from the LM during fine-tuning or performing significant\nhyperparameter tuning. 
Our experiments show that DPO can fine-tune LMs to align\nwith human preferences as well as or better than existing methods. Notably,\nfine-tuning with DPO exceeds PPO-based RLHF in ability to control sentiment of\ngenerations, and matches or improves response quality in summarization and\nsingle-turn dialogue while being substantially simpler to implement and train.\n","authors":["Rafael Rafailov","Archit Sharma","Eric Mitchell","Stefano Ermon","Christopher D. Manning","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2305.18290v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06322v2","updated":"2024-07-29T22:17:31Z","published":"2024-07-08T18:38:52Z","title":"MagMax: Leveraging Model Merging for Seamless Continual Learning","summary":" This paper introduces a continual learning approach named MagMax, which\nutilizes model merging to enable large pre-trained models to continuously learn\nfrom new data without forgetting previously acquired knowledge. Distinct from\ntraditional continual learning methods that aim to reduce forgetting during\ntask training, MagMax combines sequential fine-tuning with a maximum magnitude\nweight selection for effective knowledge integration across tasks. Our initial\ncontribution is an extensive examination of model merging techniques, revealing\nthat simple approaches like weight averaging and random weight selection\nsurprisingly hold up well in various continual learning contexts. More\nimportantly, we present MagMax, a novel model-merging strategy that enables\ncontinual learning of large pre-trained models for successive tasks. Our\nthorough evaluation demonstrates the superiority of MagMax in various\nscenarios, including class- and domain-incremental learning settings. The code\nis available at this URL: https://github.com/danielm1405/magmax.\n","authors":["Daniel Marczak","Bartłomiej Twardowski","Tomasz Trzciński","Sebastian Cygert"],"pdf_url":"https://arxiv.org/pdf/2407.06322v2.pdf","comment":"Accepted for ECCV2024"},{"id":"http://arxiv.org/abs/2407.20432v1","updated":"2024-07-29T21:54:57Z","published":"2024-07-29T21:54:57Z","title":"Neural Surrogate HMC: Accelerated Hamiltonian Monte Carlo with a Neural\n Network Surrogate Likelihood","summary":" Bayesian Inference with Markov Chain Monte Carlo requires efficient\ncomputation of the likelihood function. In some scientific applications, the\nlikelihood must be computed by numerically solving a partial differential\nequation, which can be prohibitively expensive. We demonstrate that some such\nproblems can be made tractable by amortizing the computation with a surrogate\nlikelihood function implemented by a neural network. We show that this has two\nadditional benefits: reducing noise in the likelihood evaluations and providing\nfast gradient calculations. 
In experiments, the approach is applied to a model\nof heliospheric transport of galactic cosmic rays, where it enables efficient\nsampling from the posterior of latent parameters in the Parker equation.\n","authors":["Linnea M Wolniewicz","Peter Sadowski","Claudio Corti"],"pdf_url":"https://arxiv.org/pdf/2407.20432v1.pdf","comment":"5 pages, 3 figures, accepted at SPAICE Conference 2024"},{"id":"http://arxiv.org/abs/2404.05062v2","updated":"2024-07-29T21:39:17Z","published":"2024-04-07T20:16:37Z","title":"New methods to compute the generalized chi-square distribution","summary":" We present several new mathematical methods (ray-trace, inverse Fourier\ntransform and ellipse) and open-source software to compute the cdf, pdf and\ninverse cdf of the generalized chi-square distribution. Some methods are geared\nfor speed, while others are designed to be accurate far into the tails, using\nwhich we can also measure large values of the discriminability index d' between\nmultinormals. We characterize the performance and limitations of these and\nprevious methods, and recommend the best methods to use for each part of each\ntype of distribution. We also demonstrate the speed and accuracy of our new\nmethods against previous methods across a wide sample of distributions.\n","authors":["Abhranil Das"],"pdf_url":"https://arxiv.org/pdf/2404.05062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16731v3","updated":"2024-07-29T21:32:01Z","published":"2023-12-27T22:05:42Z","title":"Infinite dSprites for Disentangled Continual Learning: Separating Memory\n Edits from Generalization","summary":" The ability of machine learning systems to learn continually is hindered by\ncatastrophic forgetting, the tendency of neural networks to overwrite\npreviously acquired knowledge when learning a new task. Existing methods\nmitigate this problem through regularization, parameter isolation, or\nrehearsal, but they are typically evaluated on benchmarks comprising only a\nhandful of tasks. In contrast, humans are able to learn over long time horizons\nin dynamic, open-world environments, effortlessly memorizing unfamiliar objects\nand reliably recognizing them under various transformations. To make progress\ntowards closing this gap, we introduce Infinite dSprites, a parsimonious tool\nfor creating continual classification and disentanglement benchmarks of\narbitrary length and with full control over generative factors. We show that\nover a sufficiently long time horizon, the performance of all major types of\ncontinual learning methods deteriorates on this simple benchmark. This result\nhighlights an important and previously overlooked aspect of continual learning:\ngiven a finite modelling capacity and an arbitrarily long learning horizon,\nefficient learning requires memorizing class-specific information and\naccumulating knowledge about general mechanisms. In a simple setting with\ndirect supervision on the generative factors, we show how learning\nclass-agnostic transformations offers a way to circumvent catastrophic\nforgetting and improve classification accuracy over time. Our approach sets the\nstage for continual learning over hundreds of tasks with explicit control over\nmemorization and forgetting, emphasizing open-set classification and one-shot\ngeneralization.\n","authors":["Sebastian Dziadzio","Çağatay Yıldız","Gido M. 
van de Ven","Tomasz Trzciński","Tinne Tuytelaars","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2312.16731v3.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.20421v1","updated":"2024-07-29T21:22:53Z","published":"2024-07-29T21:22:53Z","title":"Event-based Optical Flow on Neuromorphic Processor: ANN vs. SNN\n Comparison based on Activation Sparsification","summary":" Spiking neural networks (SNNs) for event-based optical flow are claimed to be\ncomputationally more efficient than their artificial neural networks (ANNs)\ncounterparts, but a fair comparison is missing in the literature. In this work,\nwe propose an event-based optical flow solution based on activation\nsparsification and a neuromorphic processor, SENECA. SENECA has an event-driven\nprocessing mechanism that can exploit the sparsity in ANN activations and SNN\nspikes to accelerate the inference of both types of neural networks. The ANN\nand the SNN for comparison have similar low activation/spike density (~5%)\nthanks to our novel sparsification-aware training. In the hardware-in-loop\nexperiments designed to deduce the average time and energy consumption, the SNN\nconsumes 44.9ms and 927.0 microjoules, which are 62.5% and 75.2% of the ANN's\nconsumption, respectively. We find that SNN's higher efficiency attributes to\nits lower pixel-wise spike density (43.5% vs. 66.5%) that requires fewer memory\naccess operations for neuron states.\n","authors":["Yingfu Xu","Guangzhi Tang","Amirreza Yousefzadeh","Guido de Croon","Manolis Sifalakis"],"pdf_url":"https://arxiv.org/pdf/2407.20421v1.pdf","comment":"18 pages, 12 figures, 4 tables"},{"id":"http://arxiv.org/abs/2312.06106v3","updated":"2024-07-29T21:09:11Z","published":"2023-12-11T04:24:11Z","title":"AUGCAL: Improving Sim2Real Adaptation by Uncertainty Calibration on\n Augmented Synthetic Images","summary":" Synthetic data (SIM) drawn from simulators have emerged as a popular\nalternative for training models where acquiring annotated real-world images is\ndifficult. However, transferring models trained on synthetic images to\nreal-world applications can be challenging due to appearance disparities. A\ncommonly employed solution to counter this SIM2REAL gap is unsupervised domain\nadaptation, where models are trained using labeled SIM data and unlabeled REAL\ndata. Mispredictions made by such SIM2REAL adapted models are often associated\nwith miscalibration - stemming from overconfident predictions on real data. In\nthis paper, we introduce AUGCAL, a simple training-time patch for unsupervised\nadaptation that improves SIM2REAL adapted models by - (1) reducing overall\nmiscalibration, (2) reducing overconfidence in incorrect predictions and (3)\nimproving confidence score reliability by better guiding misclassification\ndetection - all while retaining or improving SIM2REAL performance. Given a base\nSIM2REAL adaptation algorithm, at training time, AUGCAL involves replacing\nvanilla SIM images with strongly augmented views (AUG intervention) and\nadditionally optimizing for a training time calibration loss on augmented SIM\npredictions (CAL intervention). We motivate AUGCAL using a brief analytical\njustification of how to reduce miscalibration on unlabeled REAL data. 
Through\nour experiments, we empirically show the efficacy of AUGCAL across multiple\nadaptation methods, backbones, tasks and shifts.\n","authors":["Prithvijit Chattopadhyay","Bharat Goyal","Boglarka Ecsedi","Viraj Prabhu","Judy Hoffman"],"pdf_url":"https://arxiv.org/pdf/2312.06106v3.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2309.06979v3","updated":"2024-07-29T20:51:25Z","published":"2023-09-13T14:15:03Z","title":"Auto-Regressive Next-Token Predictors are Universal Learners","summary":" Large language models display remarkable capabilities in logical and\nmathematical reasoning, allowing them to solve complex tasks. Interestingly,\nthese abilities emerge in networks trained on the simple task of next-token\nprediction. In this work, we present a theoretical framework for studying\nauto-regressive next-token predictors. We demonstrate that even simple models\nsuch as linear next-token predictors, trained on Chain-of-Thought (CoT) data,\ncan approximate any function efficiently computed by a Turing machine. We\nintroduce a new complexity measure -- length complexity -- which measures the\nnumber of intermediate tokens in a CoT sequence required to approximate some\ntarget function, and analyze the interplay between length complexity and other\nnotions of complexity. Finally, we show experimentally that simple next-token\npredictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs),\ndisplay non-trivial performance on text generation and arithmetic tasks. Our\nresults demonstrate that the power of today's LLMs can be attributed, to a\ngreat extent, to the auto-regressive next-token training scheme, and not\nnecessarily to a particular choice of architecture.\n","authors":["Eran Malach"],"pdf_url":"https://arxiv.org/pdf/2309.06979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04533v2","updated":"2024-07-29T20:40:09Z","published":"2023-12-07T18:51:19Z","title":"Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language\n Models","summary":" We introduce Dream2Real, a robotics framework which integrates\nvision-language models (VLMs) trained on 2D data into a 3D object rearrangement\npipeline. This is achieved by the robot autonomously constructing a 3D\nrepresentation of the scene, where objects can be rearranged virtually and an\nimage of the resulting arrangement rendered. These renders are evaluated by a\nVLM, so that the arrangement which best satisfies the user instruction is\nselected and recreated in the real world with pick-and-place. This enables\nlanguage-conditioned rearrangement to be performed zero-shot, without needing\nto collect a training dataset of example arrangements. Results on a series of\nreal-world tasks show that this framework is robust to distractors,\ncontrollable by language, capable of understanding complex multi-object\nrelations, and readily applicable to both tabletop and 6-DoF rearrangement\ntasks.\n","authors":["Ivan Kapelyukh","Yifei Ren","Ignacio Alzugaray","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2312.04533v2.pdf","comment":"ICRA 2024. 
Project webpage with robot videos:\n https://www.robot-learning.uk/dream2real"},{"id":"http://arxiv.org/abs/2312.04474v4","updated":"2024-07-29T20:21:37Z","published":"2023-12-07T17:51:43Z","title":"Chain of Code: Reasoning with a Language Model-Augmented Code Emulator","summary":" Code provides a general syntactic structure to build complex programs and\nperform precise computations when paired with a code interpreter - we\nhypothesize that language models (LMs) can leverage code-writing to improve\nChain of Thought reasoning not only for logic and arithmetic tasks, but also\nfor semantic ones (and in particular, those that are a mix of both). For\nexample, consider prompting an LM to write code that counts the number of times\nit detects sarcasm in an essay: the LM may struggle to write an implementation\nfor \"detect_sarcasm(string)\" that can be executed by the interpreter (handling\nthe edge cases would be insurmountable). However, LMs may still produce a valid\nsolution if they not only write code, but also selectively \"emulate\" the\ninterpreter by generating the expected output of \"detect_sarcasm(string)\". In\nthis work, we propose Chain of Code (CoC), a simple yet surprisingly effective\nextension that improves LM code-driven reasoning. The key idea is to encourage\nLMs to format semantic sub-tasks in a program as flexible pseudocode that the\ninterpreter can explicitly catch undefined behaviors and hand off to simulate\nwith an LM (as an \"LMulator\"). Experiments demonstrate that Chain of Code\noutperforms Chain of Thought and other baselines across a variety of\nbenchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of 12% over\nChain of Thought. In a nutshell, CoC broadens the scope of reasoning questions\nthat LMs can answer by \"thinking in code\".\n","authors":["Chengshu Li","Jacky Liang","Andy Zeng","Xinyun Chen","Karol Hausman","Dorsa Sadigh","Sergey Levine","Li Fei-Fei","Fei Xia","Brian Ichter"],"pdf_url":"https://arxiv.org/pdf/2312.04474v4.pdf","comment":"ICML 2024 Oral; Project webpage: https://chain-of-code.github.io"},{"id":"http://arxiv.org/abs/2407.20395v1","updated":"2024-07-29T19:42:22Z","published":"2024-07-29T19:42:22Z","title":"Dense Self-Supervised Learning for Medical Image Segmentation","summary":" Deep learning has revolutionized medical image segmentation, but it relies\nheavily on high-quality annotations. The time, cost and expertise required to\nlabel images at the pixel-level for each new task has slowed down widespread\nadoption of the paradigm. We propose Pix2Rep, a self-supervised learning (SSL)\napproach for few-shot segmentation, that reduces the manual annotation burden\nby learning powerful pixel-level representations directly from unlabeled\nimages. Pix2Rep is a novel pixel-level loss and pre-training paradigm for\ncontrastive SSL on whole images. It is applied to generic encoder-decoder deep\nlearning backbones (e.g., U-Net). Whereas most SSL methods enforce invariance\nof the learned image-level representations under intensity and spatial image\naugmentations, Pix2Rep enforces equivariance of the pixel-level\nrepresentations. We demonstrate the framework on a task of cardiac MRI\nsegmentation. Results show improved performance compared to existing semi- and\nself-supervised approaches; and a 5-fold reduction in the annotation burden for\nequivalent performance versus a fully supervised U-Net baseline. This includes\na 30% (resp. 31%) DICE improvement for one-shot segmentation under\nlinear-probing (resp. fine-tuning). 
Finally, we also integrate the novel\nPix2Rep concept with the Barlow Twins non-contrastive SSL, which leads to even\nbetter segmentation performance.\n","authors":["Maxime Seince","Loic Le Folgoc","Luiz Augusto Facury de Souza","Elsa Angelini"],"pdf_url":"https://arxiv.org/pdf/2407.20395v1.pdf","comment":"Accepted at MIDL 2024"},{"id":"http://arxiv.org/abs/2407.20387v1","updated":"2024-07-29T19:26:24Z","published":"2024-07-29T19:26:24Z","title":"Two-Phase Segmentation Approach for Accurate Left Ventricle Segmentation\n in Cardiac MRI using Machine Learning","summary":" Accurate segmentation of the Left Ventricle (LV) holds substantial importance\ndue to its implications in disease detection, regional analysis, and the\ndevelopment of complex models for cardiac surgical planning. CMR is a golden\nstandard for diagnosis of serveral cardiac diseases. LV in CMR comprises of\nthree distinct sections: Basal, Mid-Ventricle, and Apical. This research\nfocuses on the precise segmentation of the LV from Cardiac MRI (CMR) scans,\njoining with the capabilities of Machine Learning (ML). The central challenge\nin this research revolves around the absence of a set of parameters applicable\nto all three types of LV slices. Parameters optimized for basal slices often\nfall short when applied to mid-ventricular and apical slices, and vice versa.\nTo handle this issue, a new method is proposed to enhance LV segmentation. The\nproposed method involves using distinct sets of parameters for each type of\nslice, resulting in a two-phase segmentation approach. The initial phase\ncategorizes images into three groups based on the type of LV slice, while the\nsecond phase aims to segment CMR images using parameters derived from the\npreceding phase. A publicly available dataset (Automated Cardiac Diagnosis\nChallenge (ACDC)) is used. 10-Fold Cross Validation is used and it achieved a\nmean score of 0.9228. Comprehensive testing indicates that the best parameter\nset for a particular type of slice does not perform adequately for the other\nslice types. All results show that the proposed approach fills a critical void\nin parameter standardization through a two-phase segmentation model for the LV,\naiming to not only improve the accuracy of cardiac image analysis but also\ncontribute advancements to the field of LV segmentation.\n","authors":["Maria Tamoor","Abbas Raza Ali","Philemon Philip","Ruqqayia Adil","Rabia Shahid","Asma Naseer"],"pdf_url":"https://arxiv.org/pdf/2407.20387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20055v2","updated":"2024-07-29T19:14:39Z","published":"2024-06-28T17:07:11Z","title":"SpotlessSplats: Ignoring Distractors in 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) is a promising technique for 3D reconstruction,\noffering efficient training and rendering speeds, making it suitable for\nreal-time applications.However, current methods require highly controlled\nenvironments (no moving people or wind-blown elements, and consistent lighting)\nto meet the inter-view consistency assumption of 3DGS. This makes\nreconstruction of real-world captures problematic. We present SpotLessSplats,\nan approach that leverages pre-trained and general-purpose features coupled\nwith robust optimization to effectively ignore transient distractors. Our\nmethod achieves state-of-the-art reconstruction quality both visually and\nquantitatively, on casual captures. 
Additional results available at:\nhttps://spotlesssplats.github.io\n","authors":["Sara Sabour","Lily Goli","George Kopanas","Mark Matthews","Dmitry Lagun","Leonidas Guibas","Alec Jacobson","David J. Fleet","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2406.20055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19871v2","updated":"2024-07-29T18:48:45Z","published":"2023-05-31T14:08:48Z","title":"There is more to graphs than meets the eye: Learning universal features\n with self-supervision","summary":" We study the problem of learning features through self-supervision that are\ngeneralisable to multiple graphs. State-of-the-art graph self-supervision\nrestricts training to only one graph, resulting in graph-specific models that\nare incompatible with different but related graphs. We hypothesize that\ntraining with more than one graph that belong to the same family can improve\nthe quality of the learnt representations. However, learning universal features\nfrom disparate node/edge features in different graphs is non-trivial. To\naddress this challenge, we first homogenise the disparate features with\ngraph-specific encoders that transform the features into a common space. A\nuniversal representation learning module then learns generalisable features on\nthis common space. We show that compared to traditional self-supervision with\none graph, our approach results in (1) better performance on downstream node\nclassification, (2) learning features that can be re-used for unseen graphs of\nthe same family, (3) more efficient training and (4) compact yet generalisable\nmodels. We also show ability of the proposed framework to deliver these\nbenefits for relatively larger graphs. In this paper, we present a principled\nway to design foundation graph models that learn from more than one graph in an\nend-to-end manner, while bridging the gap between self-supervised and\nsupervised performance.\n","authors":["Laya Das","Sai Munikoti","Nrushad Joshi","Mahantesh Halappanavar"],"pdf_url":"https://arxiv.org/pdf/2305.19871v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2302.11939,\n arXiv:2301.13287, arXiv:2305.12686, arXiv:2305.02299"},{"id":"http://arxiv.org/abs/2401.14283v2","updated":"2024-07-29T18:46:50Z","published":"2024-01-25T16:15:27Z","title":"Information Leakage Detection through Approximate Bayes-optimal\n Prediction","summary":" In today's data-driven world, the proliferation of publicly available\ninformation raises security concerns due to the information leakage (IL)\nproblem. IL involves unintentionally exposing sensitive information to\nunauthorized parties via observable system information. Conventional\nstatistical approaches rely on estimating mutual information (MI) between\nobservable and secret information for detecting ILs, face challenges of the\ncurse of dimensionality, convergence, computational complexity, and MI\nmisestimation. Though effective, emerging supervised machine learning based\napproaches to detect ILs are limited to binary system sensitive information and\nlack a comprehensive framework. To address these limitations, we establish a\ntheoretical framework using statistical learning theory and information theory\nto quantify and detect IL accurately. Using automated machine learning, we\ndemonstrate that MI can be accurately estimated by approximating the typically\nunknown Bayes predictor's log-loss and accuracy. Based on this, we show how MI\ncan effectively be estimated to detect ILs. 
Our method performs superior to\nstate-of-the-art baselines in an empirical study considering synthetic and\nreal-world OpenSSL TLS server datasets.\n","authors":["Pritha Gupta","Marcel Wever","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2401.14283v2.pdf","comment":"Under submission in Information Sciences"},{"id":"http://arxiv.org/abs/2407.20371v1","updated":"2024-07-29T18:42:39Z","published":"2024-07-29T18:42:39Z","title":"Gender, Race, and Intersectional Bias in Resume Screening via Language\n Model Retrieval","summary":" Artificial intelligence (AI) hiring tools have revolutionized resume\nscreening, and large language models (LLMs) have the potential to do the same.\nHowever, given the biases which are embedded within LLMs, it is unclear whether\nthey can be used in this scenario without disadvantaging groups based on their\nprotected attributes. In this work, we investigate the possibilities of using\nLLMs in a resume screening setting via a document retrieval framework that\nsimulates job candidate selection. Using that framework, we then perform a\nresume audit study to determine whether a selection of Massive Text Embedding\n(MTE) models are biased in resume screening scenarios. We simulate this for\nnine occupations, using a collection of over 500 publicly available resumes and\n500 job descriptions. We find that the MTEs are biased, significantly favoring\nWhite-associated names in 85.1\\% of cases and female-associated names in only\n11.1\\% of cases, with a minority of cases showing no statistically significant\ndifferences. Further analyses show that Black males are disadvantaged in up to\n100\\% of cases, replicating real-world patterns of bias in employment settings,\nand validate three hypotheses of intersectionality. We also find an impact of\ndocument length as well as the corpus frequency of names in the selection of\nresumes. These findings have implications for widely used AI tools that are\nautomating employment, fairness, and tech policy.\n","authors":["Kyra Wilson","Aylin Caliskan"],"pdf_url":"https://arxiv.org/pdf/2407.20371v1.pdf","comment":"To be published in Proceedings of the 2024 AAAI/ACM Conference on AI,\n Ethics, and Society; code available at\n https://github.com/kyrawilson/Resume-Screening-Bias"},{"id":"http://arxiv.org/abs/2404.13646v2","updated":"2024-07-29T18:38:43Z","published":"2024-04-21T12:41:30Z","title":"Physics-informed Discretization-independent Deep Compositional Operator\n Network","summary":" Solving parametric Partial Differential Equations (PDEs) for a broad range of\nparameters is a critical challenge in scientific computing. To this end, neural\noperators, which \\textcolor{black}{predicts the PDE solution with variable PDE\nparameter inputs}, have been successfully used. However, the training of neural\noperators typically demands large training datasets, the acquisition of which\ncan be prohibitively expensive. To address this challenge, physics-informed\ntraining can offer a cost-effective strategy. However, current physics-informed\nneural operators face limitations, either in handling irregular domain shapes\nor in in generalizing to various discrete representations of PDE parameters. In\nthis research, we introduce a novel physics-informed model architecture which\ncan generalize to various discrete representations of PDE parameters and\nirregular domain shapes. 
Particularly, inspired by deep operator neural\nnetworks, our model involves a discretization-independent learning of parameter\nembedding repeatedly, and this parameter embedding is integrated with the\nresponse embeddings through multiple compositional layers, for more\nexpressivity. Numerical results demonstrate the accuracy and efficiency of the\nproposed method.\n","authors":["Weiheng Zhong","Hadi Meidani"],"pdf_url":"https://arxiv.org/pdf/2404.13646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05480v3","updated":"2024-07-29T18:34:37Z","published":"2024-05-09T00:37:56Z","title":"FloorSet -- a VLSI Floorplanning Dataset with Design Constraints of\n Real-World SoCs","summary":" Floorplanning for systems-on-a-chip (SoCs) and its sub-systems is a crucial\nand non-trivial step of the physical design flow. It represents a difficult\ncombinatorial optimization problem. A typical large scale SoC with 120\npartitions generates a search-space of nearly 10E250. As novel machine learning\n(ML) approaches emerge to tackle such problems, there is a growing need for a\nmodern benchmark that comprises a large training dataset and performance\nmetrics that better reflect real-world constraints and objectives compared to\nexisting benchmarks. To address this need, we present FloorSet -- two\ncomprehensive datasets of synthetic fixed-outline floorplan layouts that\nreflect the distribution of real SoCs. Each dataset has 1M training samples and\n100 test samples where each sample is a synthetic floor-plan. FloorSet-Prime\ncomprises fully-abutted rectilinear partitions and near-optimal wire-length. A\nsimplified dataset that reflects early design phases, FloorSet-Lite comprises\nrectangular partitions, with under 5 percent white-space and near-optimal\nwire-length. Both datasets define hard constraints seen in modern design flows\nsuch as shape constraints, edge-affinity, grouping constraints, and\npre-placement constraints. FloorSet is intended to spur fundamental research on\nlarge-scale constrained optimization problems. Crucially, FloorSet alleviates\nthe core issue of reproducibility in modern ML driven solutions to such\nproblems. FloorSet is available as an open-source repository for the research\ncommunity.\n","authors":["Uday Mallappa","Hesham Mostafa","Mikhail Galkin","Mariano Phielipp","Somdeb Majumdar"],"pdf_url":"https://arxiv.org/pdf/2405.05480v3.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2407.20367v1","updated":"2024-07-29T18:31:42Z","published":"2024-07-29T18:31:42Z","title":"Mixed Newton Method for Optimization in Complex Spaces","summary":" In this paper, we modify and apply the recently introduced Mixed Newton\nMethod, which is originally designed for minimizing real-valued functions of\ncomplex variables, to the minimization of real-valued functions of real\nvariables by extending the functions to complex space. We show that arbitrary\nregularizations preserve the favorable local convergence properties of the\nmethod, and construct a special type of regularization used to prevent\nconvergence to complex minima. 
We compare several variants of the method\napplied to training neural networks with real and complex parameters.\n","authors":["Nikita Yudin","Roland Hildebrand","Sergey Bakhurin","Alexander Degtyarev","Anna Lisachenko","Ilya Kuruzov","Andrei Semenov","Mohammad Alkousa"],"pdf_url":"https://arxiv.org/pdf/2407.20367v1.pdf","comment":"16 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.13922v2","updated":"2024-07-29T18:29:50Z","published":"2024-07-18T22:22:49Z","title":"Synthetic Counterfactual Faces","summary":" Computer vision systems have been deployed in various applications involving\nbiometrics like human faces. These systems can identify social media users,\nsearch for missing persons, and verify identity of individuals. While computer\nvision models are often evaluated for accuracy on available benchmarks, more\nannotated data is necessary to learn about their robustness and fairness\nagainst semantic distributional shifts in input data, especially in face data.\nAmong annotated data, counterfactual examples grant strong explainability\ncharacteristics. Because collecting natural face data is prohibitively\nexpensive, we put forth a generative AI-based framework to construct targeted,\ncounterfactual, high-quality synthetic face data. Our synthetic data pipeline\nhas many use cases, including face recognition systems sensitivity evaluations\nand image understanding system probes. The pipeline is validated with multiple\nuser studies. We showcase the efficacy of our face generation pipeline on a\nleading commercial vision model. We identify facial attributes that cause\nvision systems to fail.\n","authors":["Guruprasad V Ramesh","Harrison Rosenberg","Ashish Hooda","Shimaa Ahmed Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2407.13922v2.pdf","comment":"Paper under review. Full text and results will be updated after\n acceptance"},{"id":"http://arxiv.org/abs/2406.00045v2","updated":"2024-07-29T18:19:35Z","published":"2024-05-28T05:10:40Z","title":"Personalized Steering of Large Language Models: Versatile Steering\n Vectors Through Bi-directional Preference Optimization","summary":" Researchers have been studying approaches to steer the behavior of Large\nLanguage Models (LLMs) and build personalized LLMs tailored for various\napplications. While fine-tuning seems to be a direct solution, it requires\nsubstantial computational resources and may significantly affect the utility of\nthe original LLM. Recent endeavors have introduced more lightweight strategies,\nfocusing on extracting \"steering vectors\" to guide the model's output toward\ndesired behaviors by adjusting activations within specific layers of the LLM's\ntransformer architecture. However, such steering vectors are directly extracted\nfrom the activations of human preference data and thus often lead to suboptimal\nresults and occasional failures, especially in alignment-related scenarios.\nThis work proposes an innovative approach that could produce more effective\nsteering vectors through bi-directional preference optimization. Our method is\ndesigned to allow steering vectors to directly influence the generation\nprobability of contrastive human preference data pairs, thereby offering a more\nprecise representation of the target behavior. By carefully adjusting the\ndirection and magnitude of the steering vector, we enabled personalized control\nover the desired behavior across a spectrum of intensities. 
Extensive\nexperimentation across various open-ended generation tasks, particularly\nfocusing on steering AI personas, has validated the efficacy of our approach.\nMoreover, we comprehensively investigate critical alignment-concerning\nscenarios, such as managing truthfulness, mitigating hallucination, and\naddressing jailbreaking attacks. Remarkably, our method can still demonstrate\noutstanding steering effectiveness across these scenarios. Furthermore, we\nshowcase the transferability of our steering vectors across different\nmodels/LoRAs and highlight the synergistic benefits of applying multiple\nvectors simultaneously.\n","authors":["Yuanpu Cao","Tianrong Zhang","Bochuan Cao","Ziyi Yin","Lu Lin","Fenglong Ma","Jinghui Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00418v2","updated":"2024-07-29T18:08:39Z","published":"2024-06-01T12:31:15Z","title":"GATE: How to Keep Out Intrusive Neighbors","summary":" Graph Attention Networks (GATs) are designed to provide flexible neighborhood\naggregation that assigns weights to neighbors according to their importance. In\npractice, however, GATs are often unable to switch off task-irrelevant\nneighborhood aggregation, as we show experimentally and analytically. To\naddress this challenge, we propose GATE, a GAT extension that holds three major\nadvantages: i) It alleviates over-smoothing by addressing its root cause of\nunnecessary neighborhood aggregation. ii) Similarly to perceptrons, it benefits\nfrom higher depth as it can still utilize additional layers for (non-)linear\nfeature transformations in case of (nearly) switched-off neighborhood\naggregation. iii) By down-weighting connections to unrelated neighbors, it\noften outperforms GATs on real-world heterophilic datasets. To further validate\nour claims, we construct a synthetic test bed to analyze a model's ability to\nutilize the appropriate amount of neighborhood aggregation, which could be of\nindependent interest.\n","authors":["Nimrah Mustafa","Rebekka Burkholz"],"pdf_url":"https://arxiv.org/pdf/2406.00418v2.pdf","comment":"26 pages. Published at the International Conference on Machine\n Learning (ICML), 2024"},{"id":"http://arxiv.org/abs/2308.02594v3","updated":"2024-07-29T18:07:54Z","published":"2023-08-03T21:08:51Z","title":"SMARLA: A Safety Monitoring Approach for Deep Reinforcement Learning\n Agents","summary":" Deep reinforcement learning algorithms (DRL) are increasingly being used in\nsafety-critical systems. Ensuring the safety of DRL agents is a critical\nconcern in such contexts. However, relying solely on testing is not sufficient\nto ensure safety as it does not offer guarantees. Building safety monitors is\none solution to alleviate this challenge. This paper proposes SMARLA, a machine\nlearning-based safety monitoring approach designed for DRL agents. For\npractical reasons, SMARLA is agnostic to the type of DRL agent's inputs.\nFurther, it is designed to be black-box (as it does not require access to the\ninternals or training data of the agent) by leveraging state abstraction to\nfacilitate the learning of safety violation prediction models from the agent's\nstates using a reduced state space. We quantitatively and qualitatively\nvalidated SMARLA on three well-known RL case studies. 
Empirical results reveal\nthat SMARLA achieves accurate violation prediction with a low false positive\nrate and can predict safety violations at an early stage, approximately halfway\nthrough the execution of the agent, before violations occur.\n","authors":["Amirhossein Zolfagharian","Manel Abdellatif","Lionel C. Briand","Ramesh S"],"pdf_url":"https://arxiv.org/pdf/2308.02594v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01361v2","updated":"2024-07-29T18:07:08Z","published":"2024-02-02T12:29:18Z","title":"To the Max: Reinventing Reward in Reinforcement Learning","summary":" In reinforcement learning (RL), different reward functions can define the\nsame optimal policy but result in drastically different learning performance.\nFor some, the agent gets stuck with a suboptimal behavior, and for others, it\nsolves the task efficiently. Choosing a good reward function is hence an\nextremely important yet challenging problem. In this paper, we explore an\nalternative approach for using rewards for learning. We introduce\n\\textit{max-reward RL}, where an agent optimizes the maximum rather than the\ncumulative reward. Unlike earlier works, our approach works for deterministic\nand stochastic environments and can be easily combined with state-of-the-art RL\nalgorithms. In the experiments, we study the performance of max-reward RL\nalgorithms in two goal-reaching environments from Gymnasium-Robotics and\ndemonstrate its benefits over standard RL. The code is available at\nhttps://github.com/veviurko/To-the-Max.\n","authors":["Grigorii Veviurko","Wendelin Böhmer","Mathijs de Weerdt"],"pdf_url":"https://arxiv.org/pdf/2402.01361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20352v1","updated":"2024-07-29T18:06:29Z","published":"2024-07-29T18:06:29Z","title":"Designing Time-Series Models With Hypernetworks & Adversarial Portfolios","summary":" This article describes the methods that achieved 4th and 6th place in the\nforecasting and investment challenges, respectively, of the M6 competition,\nultimately securing the 1st place in the overall duathlon ranking. In the\nforecasting challenge, we tested a novel meta-learning model that utilizes\nhypernetworks to design a parametric model tailored to a specific family of\nforecasting tasks. This approach allowed us to leverage similarities observed\nacross individual forecasting tasks while also acknowledging potential\nheterogeneity in their data generating processes. The model's training can be\ndirectly performed with backpropagation, eliminating the need for reliance on\nhigher-order derivatives and is equivalent to a simultaneous search over the\nspace of parametric functions and their optimal parameter values. The proposed\nmodel's capabilities extend beyond M6, demonstrating superiority over\nstate-of-the-art meta-learning methods in the sinusoidal regression task and\noutperforming conventional parametric models on time-series from the M4\ncompetition. 
In the investment challenge, we adjusted portfolio weights to\ninduce greater or smaller correlation between our submission and that of other\nparticipants, depending on the current ranking, aiming to maximize the\nprobability of achieving a good rank.\n","authors":["Filip Staněk"],"pdf_url":"https://arxiv.org/pdf/2407.20352v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.20124v1","updated":"2024-07-29T15:54:43Z","published":"2024-07-29T15:54:43Z","title":"AxiomVision: Accuracy-Guaranteed Adaptive Visual Model Selection for\n Perspective-Aware Video Analytics","summary":" The rapid evolution of multimedia and computer vision technologies requires\nadaptive visual model deployment strategies to effectively handle diverse tasks\nand varying environments. This work introduces AxiomVision, a novel framework\nthat can guarantee accuracy by leveraging edge computing to dynamically select\nthe most efficient visual models for video analytics under diverse scenarios.\nUtilizing a tiered edge-cloud architecture, AxiomVision enables the deployment\nof a broad spectrum of visual models, from lightweight to complex DNNs, that\ncan be tailored to specific scenarios while considering camera source impacts.\nIn addition, AxiomVision provides three core innovations: (1) a dynamic visual\nmodel selection mechanism utilizing continual online learning, (2) an efficient\nonline method that efficiently takes into account the influence of the camera's\nperspective, and (3) a topology-driven grouping approach that accelerates the\nmodel selection process. With rigorous theoretical guarantees, these\nadvancements provide a scalable and effective solution for visual tasks\ninherent to multimedia systems, such as object detection, classification, and\ncounting. Empirically, AxiomVision achieves a 25.7\\% improvement in accuracy.\n","authors":["Xiangxiang Dai","Zeyu Zhang","Peng Yang","Yuedong Xu","Xutong Liu","John C. S. Lui"],"pdf_url":"https://arxiv.org/pdf/2407.20124v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2303.03857v3","updated":"2024-07-29T15:29:23Z","published":"2023-03-07T12:49:45Z","title":"Leveraging Pre-trained AudioLDM for Sound Generation: A Benchmark Study","summary":" Deep neural networks have recently achieved breakthroughs in sound\ngeneration. Despite the outstanding sample quality, current sound generation\nmodels face issues on small-scale datasets (e.g., overfitting), significantly\nlimiting performance. In this paper, we make the first attempt to investigate\nthe benefits of pre-training on sound generation with AudioLDM, the\ncutting-edge model for audio generation, as the backbone. Our study\ndemonstrates the advantages of the pre-trained AudioLDM, especially in\ndata-scarcity scenarios. In addition, the baselines and evaluation protocol for\nsound generation systems are not consistent enough to compare different studies\ndirectly. Aiming to facilitate further study on sound generation tasks, we\nbenchmark the sound generation task on various frequently-used datasets. We\nhope our results on transfer learning and benchmarks can provide references for\nfurther research on conditional sound generation.\n","authors":["Yi Yuan","Haohe Liu","Jinhua Liang","Xubo Liu","Mark D. 
Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2303.03857v3.pdf","comment":"Updated for EUSIPCO 2023 proceedings version"},{"id":"http://arxiv.org/abs/2407.19988v1","updated":"2024-07-29T13:20:22Z","published":"2024-07-29T13:20:22Z","title":"HeadsetOff: Enabling Photorealistic Video Conferencing on Economical VR\n Headsets","summary":" Virtual Reality (VR) headsets have become increasingly popular for remote\ncollaboration, but video conferencing poses challenges when the user's face is\ncovered by the headset. Existing solutions have limitations in terms of\naccessibility. In this paper, we propose HeadsetOff, a novel system that\nachieves photorealistic video conferencing on economical VR headsets by\nleveraging voice-driven face reconstruction. HeadsetOff consists of three main\ncomponents: a multimodal attention-based predictor, a generator, and an\nadaptive controller. The predictor effectively predicts user future behavior\nbased on different modalities. The generator employs voice input, head motion,\nand eye blink to animate the human face. The adaptive controller dynamically\nselects the appropriate generator model based on the trade-off between video\nquality and delay, aiming to maximize Quality of Experience while minimizing\nlatency. Experimental results demonstrate the effectiveness of HeadsetOff in\nachieving high-quality, low-latency video conferencing on economical VR\nheadsets.\n","authors":["Yili Jin","Xize Duan","Fangxin Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19988v1.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.19976v1","updated":"2024-07-29T13:09:26Z","published":"2024-07-29T13:09:26Z","title":"MambaGesture: Enhancing Co-Speech Gesture Generation with Mamba and\n Disentangled Multi-Modality Fusion","summary":" Co-speech gesture generation is crucial for producing synchronized and\nrealistic human gestures that accompany speech, enhancing the animation of\nlifelike avatars in virtual environments. While diffusion models have shown\nimpressive capabilities, current approaches often overlook a wide range of\nmodalities and their interactions, resulting in less dynamic and contextually\nvaried gestures. To address these challenges, we present MambaGesture, a novel\nframework integrating a Mamba-based attention block, MambaAttn, with a\nmulti-modality feature fusion module, SEAD. The MambaAttn block combines the\nsequential data processing strengths of the Mamba model with the contextual\nrichness of attention mechanisms, enhancing the temporal coherence of generated\ngestures. SEAD adeptly fuses audio, text, style, and emotion modalities,\nemploying disentanglement to deepen the fusion process and yield gestures with\ngreater realism and diversity. 
Our approach, rigorously evaluated on the\nmulti-modal BEAT dataset, demonstrates significant improvements in Fr\\'echet\nGesture Distance (FGD), diversity scores, and beat alignment, achieving\nstate-of-the-art performance in co-speech gesture generation.\n","authors":["Chencan Fu","Yabiao Wang","Jiangning Zhang","Zhengkai Jiang","Xiaofeng Mao","Jiafu Wu","Weijian Cao","Chengjie Wang","Yanhao Ge","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19976v1.pdf","comment":"Accepted to ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.19651v1","updated":"2024-07-29T02:32:44Z","published":"2024-07-29T02:32:44Z","title":"ComNeck: Bridging Compressed Image Latents and Multimodal LLMs via\n Universal Transform-Neck","summary":" This paper presents the first-ever study of adapting compressed image latents\nto suit the needs of downstream vision tasks that adopt Multimodal Large\nLanguage Models (MLLMs). MLLMs have extended the success of large language\nmodels to modalities (e.g. images) beyond text, but their billion scale hinders\ndeployment on resource-constrained end devices. While cloud-hosted MLLMs could\nbe available, transmitting raw, uncompressed images captured by end devices to\nthe cloud requires an efficient image compression system. To address this, we\nfocus on emerging neural image compression and propose a novel framework with a\nlightweight transform-neck and a surrogate loss to adapt compressed image\nlatents for MLLM-based vision tasks. The proposed framework is generic and\napplicable to multiple application scenarios, where the neural image codec can\nbe (1) pre-trained for human perception without updating, (2) fully updated for\njoint human and machine perception, or (3) fully updated for only machine\nperception. The transform-neck trained with the surrogate loss is universal,\nfor it can serve various downstream vision tasks enabled by a variety of MLLMs\nthat share the same visual encoder. Our framework has the striking feature of\nexcluding the downstream MLLMs from training the transform-neck, and\npotentially the neural image codec as well. This stands out from most existing\ncoding for machine approaches that involve downstream networks in training and\nthus could be impractical when the networks are MLLMs. Extensive experiments on\ndifferent neural image codecs and various MLLM-based vision tasks show that our\nmethod achieves great rate-accuracy performance with much less complexity,\ndemonstrating its effectiveness.\n","authors":["Chia-Hao Kao","Cheng Chien","Yu-Jen Tseng","Yi-Hsin Chen","Alessandro Gnutti","Shao-Yuan Lo","Wen-Hsiao Peng","Riccardo Leonardi"],"pdf_url":"https://arxiv.org/pdf/2407.19651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19625v1","updated":"2024-07-29T01:06:45Z","published":"2024-07-29T01:06:45Z","title":"LoginMEA: Local-to-Global Interaction Network for Multi-modal Entity\n Alignment","summary":" Multi-modal entity alignment (MMEA) aims to identify equivalent entities\nbetween two multi-modal knowledge graphs (MMKGs), whose entities can be\nassociated with relational triples and related images. Most previous studies\ntreat the graph structure as a special modality, and fuse different modality\ninformation with separate uni-modal encoders, neglecting valuable relational\nassociations in modalities. Other studies refine each uni-modal information\nwith graph structures, but may introduce unnecessary relations in specific\nmodalities. To this end, we propose a novel local-to-global interaction network\nfor MMEA, termed as LoginMEA. 
Particularly, we first fuse local multi-modal\ninteractions to generate holistic entity semantics and then refine them with\nglobal relational interactions of entity neighbors. In this design, the\nuni-modal information is fused adaptively, and can be refined with relations\naccordingly. To enrich local interactions of multi-modal entity information, we\ndevice modality weights and low-rank interactive fusion, allowing diverse\nimpacts and element-level interactions among modalities. To capture global\ninteractions of graph structures, we adopt relation reflection graph attention\nnetworks, which fully capture relational associations between entities.\nExtensive experiments demonstrate superior results of our method over 5\ncross-KG or bilingual benchmark datasets, indicating the effectiveness of\ncapturing local and global interactions.\n","authors":["Taoyu Su","Xinghua Zhang","Jiawei Sheng","Zhenyu Zhang","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19625v1.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2407.20341v1","updated":"2024-07-29T18:00:17Z","published":"2024-07-29T18:00:17Z","title":"BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger\n Visual Cues","summary":" Effectively aligning with human judgment when evaluating machine-generated\nimage captions represents a complex yet intriguing challenge. Existing\nevaluation metrics like CIDEr or CLIP-Score fall short in this regard as they\ndo not take into account the corresponding image or lack the capability of\nencoding fine-grained details and penalizing hallucinations. To overcome these\nissues, in this paper, we propose BRIDGE, a new learnable and reference-free\nimage captioning metric that employs a novel module to map visual features into\ndense vectors and integrates them into multi-modal pseudo-captions which are\nbuilt during the evaluation process. This approach results in a multimodal\nmetric that properly incorporates information from the input image without\nrelying on reference captions, bridging the gap between human judgment and\nmachine-generated image captions. Experiments spanning several datasets\ndemonstrate that our proposal achieves state-of-the-art results compared to\nexisting reference-free evaluation scores. Our source code and trained models\nare publicly available at: https://github.com/aimagelab/bridge-score.\n","authors":["Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2407.20341v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20337v1","updated":"2024-07-29T18:00:10Z","published":"2024-07-29T18:00:10Z","title":"Contrasting Deepfakes Diffusion via Contrastive Learning and\n Global-Local Similarities","summary":" Discerning between authentic content and that generated by advanced AI\nmethods has become increasingly challenging. While previous research primarily\naddresses the detection of fake faces, the identification of generated natural\nimages has only recently surfaced. This prompted the recent exploration of\nsolutions that employ foundation vision-and-language models, like CLIP.\nHowever, the CLIP embedding space is optimized for global image-to-text\nalignment and is not inherently designed for deepfake detection, neglecting the\npotential benefits of tailored training and local image features. In this\nstudy, we propose CoDE (Contrastive Deepfake Embeddings), a novel embedding\nspace specifically designed for deepfake detection. 
CoDE is trained via\ncontrastive learning by additionally enforcing global-local similarities. To\nsustain the training of our model, we generate a comprehensive dataset that\nfocuses on images generated by diffusion models and encompasses a collection of\n9.2 million images produced by using four different generators. Experimental\nresults demonstrate that CoDE achieves state-of-the-art accuracy on the newly\ncollected dataset, while also showing excellent generalization capabilities to\nunseen image generators. Our source code, trained models, and collected dataset\nare publicly available at: https://github.com/aimagelab/CoDE.\n","authors":["Lorenzo Baraldi","Federico Cocchi","Marcella Cornia","Lorenzo Baraldi","Alessandro Nicolosi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2407.20337v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.19704v1","updated":"2024-07-29T04:56:56Z","published":"2024-07-29T04:56:56Z","title":"UNQA: Unified No-Reference Quality Assessment for Audio, Image, Video,\n and Audio-Visual Content","summary":" As multimedia data flourishes on the Internet, quality assessment (QA) of\nmultimedia data becomes paramount for digital media applications. Since\nmultimedia data includes multiple modalities including audio, image, video, and\naudio-visual (A/V) content, researchers have developed a range of QA methods to\nevaluate the quality of different modality data. While they exclusively focus\non addressing the single modality QA issues, a unified QA model that can handle\ndiverse media across multiple modalities is still missing, whereas the latter\ncan better resemble human perception behaviour and also have a wider range of\napplications. In this paper, we propose the Unified No-reference Quality\nAssessment model (UNQA) for audio, image, video, and A/V content, which tries\nto train a single QA model across different media modalities. To tackle the\nissue of inconsistent quality scales among different QA databases, we develop a\nmulti-modality strategy to jointly train UNQA on multiple QA databases. Based\non the input modality, UNQA selectively extracts the spatial features, motion\nfeatures, and audio features, and calculates a final quality score via the four\ncorresponding modality regression modules. Compared with existing QA methods,\nUNQA has two advantages: 1) the multi-modality training strategy makes the QA\nmodel learn more general and robust quality-aware feature representation as\nevidenced by the superior performance of UNQA compared to state-of-the-art QA\nmethods. 2) UNQA reduces the number of models required to assess multimedia\ndata across different modalities. and is friendly to deploy to practical\napplications.\n","authors":["Yuqin Cao","Xiongkuo Min","Yixuan Gao","Wei Sun","Weisi Lin","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.19704v1.pdf","comment":null}]},"2024-07-28T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.10813v4","updated":"2024-07-28T23:37:51Z","published":"2023-11-17T18:59:56Z","title":"A Language Agent for Autonomous Driving","summary":" Human-level driving is an ultimate goal of autonomous driving. Conventional\napproaches formulate autonomous driving as a perception-prediction-planning\nframework, yet their systems do not capitalize on the inherent reasoning\nability and experiential knowledge of humans. 
In this paper, we propose a\nfundamental paradigm shift from current pipelines, exploiting Large Language\nModels (LLMs) as a cognitive agent to integrate human-like intelligence into\nautonomous driving systems. Our approach, termed Agent-Driver, transforms the\ntraditional autonomous driving pipeline by introducing a versatile tool library\naccessible via function calls, a cognitive memory of common sense and\nexperiential knowledge for decision-making, and a reasoning engine capable of\nchain-of-thought reasoning, task planning, motion planning, and\nself-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive\ncommon sense and robust reasoning capabilities, thus enabling a more nuanced,\nhuman-like approach to autonomous driving. We evaluate our approach on the\nlarge-scale nuScenes benchmark, and extensive experiments substantiate that our\nAgent-Driver significantly outperforms the state-of-the-art driving methods by\na large margin. Our approach also demonstrates superior interpretability and\nfew-shot learning ability to these methods.\n","authors":["Jiageng Mao","Junjie Ye","Yuxi Qian","Marco Pavone","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2311.10813v4.pdf","comment":"COLM 2024. Project Page: https://usc-gvl.github.io/Agent-Driver/"},{"id":"http://arxiv.org/abs/2407.19600v1","updated":"2024-07-28T22:12:36Z","published":"2024-07-28T22:12:36Z","title":"You shall know a piece by the company it keeps. Chess plays as a data\n for word2vec models","summary":" In this paper, I apply linguistic methods of analysis to non-linguistic data,\nchess plays, metaphorically equating one with the other and seeking analogies.\nChess game notations are also a kind of text, and one can consider the records\nof moves or positions of pieces as words and statements in a certain language.\nIn this article I show how word embeddings (word2vec) can work on chess game\ntexts instead of natural language texts. I don't see how this representation of\nchess data can be used productively. It's unlikely that these vector models\nwill help engines or people choose the best move. But in a purely academic\nsense, it's clear that such methods of information representation capture\nsomething important about the very nature of the game, which doesn't\nnecessarily lead to a win.\n","authors":["Boris Orekhov"],"pdf_url":"https://arxiv.org/pdf/2407.19600v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.19594v1","updated":"2024-07-28T21:58:28Z","published":"2024-07-28T21:58:28Z","title":"Meta-Rewarding Language Models: Self-Improving Alignment with\n LLM-as-a-Meta-Judge","summary":" Large Language Models (LLMs) are rapidly surpassing human knowledge in many\ndomains. While improving these models traditionally relies on costly human\ndata, recent self-rewarding mechanisms (Yuan et al., 2024) have shown that LLMs\ncan improve by judging their own responses instead of relying on human\nlabelers. However, existing methods have primarily focused on improving model\nresponses rather than judgment capabilities, resulting in rapid saturation\nduring iterative training. 
To address this issue, we introduce a novel\nMeta-Rewarding step to the self-improvement process, where the model judges its\nown judgements and uses that feedback to refine its judgment skills.\nSurprisingly, this unsupervised approach improves the model's ability to judge\n{\\em and} follow instructions, as demonstrated by a win rate improvement of\nLlama-3-8B-Instruct from 22.9% to 39.4% on AlpacaEval 2, and 20.6% to 29.1% on\nArena-Hard. These results strongly suggest the potential for self-improving\nmodels without human supervision.\n","authors":["Tianhao Wu","Weizhe Yuan","Olga Golovneva","Jing Xu","Yuandong Tian","Jiantao Jiao","Jason Weston","Sainbayar Sukhbaatar"],"pdf_url":"https://arxiv.org/pdf/2407.19594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19584v1","updated":"2024-07-28T20:50:53Z","published":"2024-07-28T20:50:53Z","title":"SaulLM-54B & SaulLM-141B: Scaling Up Domain Adaptation for the Legal\n Domain","summary":" In this paper, we introduce SaulLM-54B and SaulLM-141B, two large language\nmodels (LLMs) tailored for the legal sector. These models, which feature\narchitectures of 54 billion and 141 billion parameters, respectively, are based\non the Mixtral architecture. The development of SaulLM-54B and SaulLM-141B is\nguided by large-scale domain adaptation, divided into three strategies: (1) the\nexploitation of continued pretraining involving a base corpus that includes\nover 540 billion of legal tokens, (2) the implementation of a specialized legal\ninstruction-following protocol, and (3) the alignment of model outputs with\nhuman preferences in legal interpretations. The integration of synthetically\ngenerated data in the second and third steps enhances the models' capabilities\nin interpreting and processing legal texts, effectively reaching\nstate-of-the-art performance and outperforming previous open-source models on\nLegalBench-Instruct. This work explores the trade-offs involved in\ndomain-specific adaptation at this scale, offering insights that may inform\nfuture studies on domain adaptation using strong decoder models. Building upon\nSaulLM-7B, this study refines the approach to produce an LLM better equipped\nfor legal tasks. We are releasing base, instruct, and aligned versions on top\nof SaulLM-54B and SaulLM-141B under the MIT License to facilitate reuse and\ncollaborative research.\n","authors":["Pierre Colombo","Telmo Pires","Malik Boudiaf","Rui Melo","Dominic Culver","Sofia Morgado","Etienne Malaboeuf","Gabriel Hautreux","Johanne Charpentier","Michael Desa"],"pdf_url":"https://arxiv.org/pdf/2407.19584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19580v1","updated":"2024-07-28T20:39:16Z","published":"2024-07-28T20:39:16Z","title":"Memory-efficient Training of LLMs with Larger Mini-batches","summary":" Training with larger mini-batches improves the performance and convergence\nrate of training machine learning models. However, training with large\nmini-batches becomes prohibitive for Large Language Models (LLMs) with billions\nof parameters, due to the large GPU memory requirement. To address this\nproblem, we propose finding small mini-batches that simulate the dynamics of\ntraining with larger mini-batches. Specifically, we formulate selecting smaller\nmini-batches of examples that closely capture gradients of large mini-batches\nas a submodular maximization problem. Nevertheless, the very large\ndimensionality of the gradients makes the problem very challenging to solve. 
To\naddress this, we leverage ideas from zeroth-order optimization and neural\nnetwork pruning to find lower-dimensional gradient estimates that allow finding\nhigh-quality subsets effectively with a limited amount of memory. We prove the\nsuperior convergence rate of training on the small mini-batches found by our\nmethod and empirically show its effectiveness. Our method can effectively\nreduce the memory requirement by 2x and speed up training by 1.3x, as we\nconfirm for fine-tuning Phi-2 on MathInstruct. Our method can be easily stacked\nwith LoRA and other memory-efficient methods to further reduce the memory\nrequirements of training LLMs.\n","authors":["Dang Nguyen","Wenhan Yang","Rathul Anand","Yu Yang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2407.19580v1.pdf","comment":"15 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.07895v2","updated":"2024-07-28T19:58:08Z","published":"2024-07-10T17:59:43Z","title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large\n Multimodal Models","summary":" Visual instruction tuning has made considerable strides in enhancing the\ncapabilities of Large Multimodal Models (LMMs). However, existing open LMMs\nlargely focus on single-image tasks, their applications to multi-image\nscenarios remains less explored. Additionally, prior LMM research separately\ntackles different scenarios, leaving it impossible to generalize cross\nscenarios with new emerging capabilities. To this end, we introduce\nLLaVA-NeXT-Interleave, which simultaneously tackles Multi-image, Multi-frame\n(video), Multi-view (3D), and Multi-patch (single-image) scenarios in LMMs. To\nenable these capabilities, we regard the interleaved data format as a general\ntemplate and compile the M4-Instruct dataset with 1,177.6k samples, spanning 4\nprimary domains with 14 tasks and 41 datasets. We also curate the\nLLaVA-Interleave Bench to comprehensively evaluate the multi-image performance\nof LMMs. Through extensive experiments, LLaVA-NeXT-Interleave achieves leading\nresults in multi-image, video, and 3D benchmarks, while maintaining the\nperformance of single-image tasks. Besides, our model also exhibits several\nemerging capabilities, e.g., transferring tasks across different settings and\nmodalities. Code is available at https://github.com/LLaVA-VL/LLaVA-NeXT\n","authors":["Feng Li","Renrui Zhang","Hao Zhang","Yuanhan Zhang","Bo Li","Wei Li","Zejun Ma","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2407.07895v2.pdf","comment":"Project Page:\n https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/"},{"id":"http://arxiv.org/abs/2407.19568v1","updated":"2024-07-28T19:27:06Z","published":"2024-07-28T19:27:06Z","title":"Are LLMs Good Annotators for Discourse-level Event Relation Extraction?","summary":" Large Language Models (LLMs) have demonstrated proficiency in a wide array of\nnatural language processing tasks. However, its effectiveness over\ndiscourse-level event relation extraction (ERE) tasks remains unexplored. In\nthis paper, we assess the effectiveness of LLMs in addressing discourse-level\nERE tasks characterized by lengthy documents and intricate relations\nencompassing coreference, temporal, causal, and subevent types. Evaluation is\nconducted using an commercial model, GPT-3.5, and an open-source model,\nLLaMA-2. Our study reveals a notable underperformance of LLMs compared to the\nbaseline established through supervised learning. 
Although Supervised\nFine-Tuning (SFT) can improve LLMs performance, it does not scale well compared\nto the smaller supervised baseline model. Our quantitative and qualitative\nanalysis shows that LLMs have several weaknesses when applied for extracting\nevent relations, including a tendency to fabricate event mentions, and failures\nto capture transitivity rules among relations, detect long distance relations,\nor comprehend contexts with dense event mentions.\n","authors":["Kangda Wei","Aayush Gautam","Ruihong Huang"],"pdf_url":"https://arxiv.org/pdf/2407.19568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19528v1","updated":"2024-07-28T16:34:53Z","published":"2024-07-28T16:34:53Z","title":"Motamot: A Dataset for Revealing the Supremacy of Large Language Models\n over Transformer Models in Bengali Political Sentiment Analysis","summary":" Sentiment analysis is the process of identifying and categorizing people's\nemotions or opinions regarding various topics. Analyzing political sentiment is\ncritical for understanding the complexities of public opinion processes,\nespecially during election seasons. It gives significant information on voter\npreferences, attitudes, and current trends. In this study, we investigate\npolitical sentiment analysis during Bangladeshi elections, specifically\nexamining how effectively Pre-trained Language Models (PLMs) and Large Language\nModels (LLMs) capture complex sentiment characteristics. Our study centers on\nthe creation of the \"Motamot\" dataset, comprising 7,058 instances annotated\nwith positive and negative sentiments, sourced from diverse online newspaper\nportals, forming a comprehensive resource for political sentiment analysis. We\nmeticulously evaluate the performance of various PLMs including BanglaBERT,\nBangla BERT Base, XLM-RoBERTa, mBERT, and sahajBERT, alongside LLMs such as\nGemini 1.5 Pro and GPT 3.5 Turbo. Moreover, we explore zero-shot and few-shot\nlearning strategies to enhance our understanding of political sentiment\nanalysis methodologies. Our findings underscore BanglaBERT's commendable\naccuracy of 88.10% among PLMs. However, the exploration into LLMs reveals even\nmore promising results. Through the adept application of Few-Shot learning\ntechniques, Gemini 1.5 Pro achieves an impressive accuracy of 96.33%,\nsurpassing the remarkable performance of GPT 3.5 Turbo, which stands at 94%.\nThis underscores Gemini 1.5 Pro's status as the superior performer in this\ncomparison.\n","authors":["Fatema Tuj Johora Faria","Mukaffi Bin Moin","Rabeya Islam Mumu","Md Mahabubul Alam Abir","Abrar Nawar Alfy","Mohammad Shafiul Alam"],"pdf_url":"https://arxiv.org/pdf/2407.19528v1.pdf","comment":"Accepted for publication in \"The IEEE Region 10 Symposium (TENSYMP\n 2024)\""},{"id":"http://arxiv.org/abs/2407.19527v1","updated":"2024-07-28T16:34:25Z","published":"2024-07-28T16:34:25Z","title":"Open Sentence Embeddings for Portuguese with the Serafim PT* encoders\n family","summary":" Sentence encoder encode the semantics of their input, enabling key downstream\napplications such as classification, clustering, or retrieval. In this paper,\nwe present Serafim PT*, a family of open-source sentence encoders for\nPortuguese with various sizes, suited to different hardware/compute budgets.\nEach model exhibits state-of-the-art performance and is made openly available\nunder a permissive license, allowing its use for both commercial and research\npurposes. 
Besides the sentence encoders, this paper contributes a systematic\nstudy and lessons learned concerning the selection criteria of learning\nobjectives and parameters that support top-performing encoders.\n","authors":["Luís Gomes","António Branco","João Silva","João Rodrigues","Rodrigo Santos"],"pdf_url":"https://arxiv.org/pdf/2407.19527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19526v1","updated":"2024-07-28T16:31:09Z","published":"2024-07-28T16:31:09Z","title":"Impact of Decoding Methods on Human Alignment of Conversational LLMs","summary":" To be included into chatbot systems, Large language models (LLMs) must be\naligned with human conversational conventions. However, being trained mainly on\nweb-scraped data gives existing LLMs a voice closer to informational text than\nactual human speech. In this paper, we examine the effect of decoding methods\non the alignment between LLM-generated and human conversations, including Beam\nSearch, Top K Sampling, and Nucleus Sampling. We present new measures of\nalignment in substance, style, and psychometric orientation, and experiment\nwith two conversation datasets. Our results provide subtle insights: better\nalignment is attributed to fewer beams in Beam Search and lower values of P in\nNucleus Sampling. We also find that task-oriented and open-ended datasets\nperform differently in terms of alignment, indicating the significance of\ntaking into account the context of the interaction.\n","authors":["Shaz Furniturewala","Kokil Jaidka","Yashvardhan Sharma"],"pdf_url":"https://arxiv.org/pdf/2407.19526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16089v2","updated":"2024-07-28T16:08:29Z","published":"2024-05-25T06:41:23Z","title":"Towards Completeness-Oriented Tool Retrieval for Large Language Models","summary":" Recently, integrating external tools with Large Language Models (LLMs) has\ngained significant attention as an effective strategy to mitigate the\nlimitations inherent in their pre-training data. However, real-world systems\noften incorporate a wide array of tools, making it impractical to input all\ntools into LLMs due to length limitations and latency constraints. Therefore,\nto fully exploit the potential of tool-augmented LLMs, it is crucial to develop\nan effective tool retrieval system. Existing tool retrieval methods primarily\nfocus on semantic matching between user queries and tool descriptions,\nfrequently leading to the retrieval of redundant, similar tools. Consequently,\nthese methods fail to provide a complete set of diverse tools necessary for\naddressing the multifaceted problems encountered by LLMs. In this paper, we\npropose a novel modelagnostic COllaborative Learning-based Tool Retrieval\napproach, COLT, which captures not only the semantic similarities between user\nqueries and tool descriptions but also takes into account the collaborative\ninformation of tools. Specifically, we first fine-tune the PLM-based retrieval\nmodels to capture the semantic relationships between queries and tools in the\nsemantic learning stage. Subsequently, we construct three bipartite graphs\namong queries, scenes, and tools and introduce a dual-view graph collaborative\nlearning framework to capture the intricate collaborative relationships among\ntools during the collaborative learning stage. Extensive experiments on both\nthe open benchmark and the newly introduced ToolLens dataset show that COLT\nachieves superior performance. 
Notably, the performance of BERT-mini (11M) with\nour proposed model framework outperforms BERT-large (340M), which has 30 times\nmore parameters. Furthermore, we will release ToolLens publicly to facilitate\nfuture research on tool retrieval.\n","authors":["Changle Qu","Sunhao Dai","Xiaochi Wei","Hengyi Cai","Shuaiqiang Wang","Dawei Yin","Jun Xu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.16089v2.pdf","comment":"Accepted by CIKM 2024; GitHub: https://github.com/quchangle1/COLT"},{"id":"http://arxiv.org/abs/2407.18003v2","updated":"2024-07-28T14:42:12Z","published":"2024-07-25T12:56:22Z","title":"Keep the Cost Down: A Review on Methods to Optimize LLM' s KV-Cache\n Consumption","summary":" Large Language Models (LLMs), epitomized by ChatGPT' s release in late 2022,\nhave revolutionized various industries with their advanced language\ncomprehension. However, their efficiency is challenged by the Transformer\narchitecture' s struggle with handling long texts. KV-Cache has emerged as a\npivotal solution to this issue, converting the time complexity of token\ngeneration from quadratic to linear, albeit with increased GPU memory overhead\nproportional to conversation length. With the development of the LLM community\nand academia, various KV-Cache compression methods have been proposed. In this\nreview, we dissect the various properties of KV-Cache and elaborate on various\nmethods currently used to optimize the KV-Cache space usage of LLMs. These\nmethods span the pre-training phase, deployment phase, and inference phase, and\nwe summarize the commonalities and differences among these methods.\nAdditionally, we list some metrics for evaluating the long-text capabilities of\nlarge language models, from both efficiency and capability perspectives. Our\nreview thus sheds light on the evolving landscape of LLM optimization, offering\ninsights into future advancements in this dynamic field.\n","authors":["Luohe Shi","Hongyi Zhang","Yao Yao","Zuchao Li","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.18003v2.pdf","comment":"to be published in CoLM 2024"},{"id":"http://arxiv.org/abs/2404.01582v2","updated":"2024-07-28T13:12:03Z","published":"2024-04-01T12:20:34Z","title":"BERT-Enhanced Retrieval Tool for Homework Plagiarism Detection System","summary":" Text plagiarism detection task is a common natural language processing task\nthat aims to detect whether a given text contains plagiarism or copying from\nother texts. In existing research, detection of high level plagiarism is still\na challenge due to the lack of high quality datasets. In this paper, we propose\na plagiarized text data generation method based on GPT-3.5, which produces\n32,927 pairs of text plagiarism detection datasets covering a wide range of\nplagiarism methods, bridging the gap in this part of research. Meanwhile, we\npropose a plagiarism identification method based on Faiss with BERT with high\nefficiency and high accuracy. 
Our experiments show that the performance of this\nmodel outperforms other models in several metrics, including 98.86\\%, 98.90%,\n98.86%, and 0.9888 for Accuracy, Precision, Recall, and F1 Score, respectively.\nAt the end, we also provide a user-friendly demo platform that allows users to\nupload a text library and intuitively participate in the plagiarism analysis.\n","authors":["Jiarong Xian","Jibao Yuan","Peiwei Zheng","Dexian Chen","Nie yuntao"],"pdf_url":"https://arxiv.org/pdf/2404.01582v2.pdf","comment":"arXiv admin note: text overlap with arXiv:1604.06573 by other authors"},{"id":"http://arxiv.org/abs/2407.19474v1","updated":"2024-07-28T11:56:03Z","published":"2024-07-28T11:56:03Z","title":"Visual Riddles: a Commonsense and World Knowledge Challenge for Large\n Vision and Language Models","summary":" Imagine observing someone scratching their arm; to understand why, additional\ncontext would be necessary. However, spotting a mosquito nearby would\nimmediately offer a likely explanation for the person's discomfort, thereby\nalleviating the need for further information. This example illustrates how\nsubtle visual cues can challenge our cognitive skills and demonstrates the\ncomplexity of interpreting visual scenarios. To study these skills, we present\nVisual Riddles, a benchmark aimed to test vision and language models on visual\nriddles requiring commonsense and world knowledge. The benchmark comprises 400\nvisual riddles, each featuring a unique image created by a variety of\ntext-to-image models, question, ground-truth answer, textual hint, and\nattribution. Human evaluation reveals that existing models lag significantly\nbehind human performance, which is at 82\\% accuracy, with Gemini-Pro-1.5\nleading with 40\\% accuracy. Our benchmark comes with automatic evaluation tasks\nto make assessment scalable. These findings underscore the potential of Visual\nRiddles as a valuable resource for enhancing vision and language models'\ncapabilities in interpreting complex visual scenarios.\n","authors":["Nitzan Bitton-Guetta","Aviv Slobodkin","Aviya Maimon","Eliya Habba","Royi Rassin","Yonatan Bitton","Idan Szpektor","Amir Globerson","Yuval Elovici"],"pdf_url":"https://arxiv.org/pdf/2407.19474v1.pdf","comment":"https://visual-riddles.github.io/"},{"id":"http://arxiv.org/abs/2310.00322v5","updated":"2024-07-28T09:39:01Z","published":"2023-09-30T09:35:50Z","title":"Evolving Diverse Red-team Language Models in Multi-round Multi-agent\n Games","summary":" The primary challenge in deploying Large Language Model (LLM) is ensuring its\nharmlessness. Red team can identify vulnerabilities by attacking LLM to attain\nsafety. However, current efforts heavily rely on single-round prompt designs\nand unilateral red team optimizations against fixed blue teams. These static\napproaches lead to significant reductions in generation diversity, known as the\nmode collapse, which makes it difficult to discover the potential risks in the\nincreasingly complex human-LLM interactions. Here we introduce dynamic Red Team\nGame (RTG) to comprehensively analyze the multi-round offensive and defensive\ninteractions between red team and blue team. Furthermore, we develop a Gamified\nRed Team Solver (GRTS) with diversity measures to mitigate mode collapse and\ntheoretically guarantee the convergence of approximate Nash equilibrium which\nresults in better strategies for both teams. 
Empirical results demonstrate that\nGRTS explore diverse and implicit attacks to adaptively exploit various LLMs,\nsurpassing the constraints of specific modes. Insightfully, the geometrical\nstructure we unveil of the red team task aligns with the spinning top\nhypothesis, confirming the necessity of constructing a diverse LLM population\nas a promising proxy for heterogeneous human expert red-teamers. This paves the\nway for scalable toxicity detection and safe alignment for LLMs.\n","authors":["Chengdong Ma","Ziran Yang","Hai Ci","Jun Gao","Minquan Gao","Xuehai Pan","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2310.00322v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19435v1","updated":"2024-07-28T09:25:59Z","published":"2024-07-28T09:25:59Z","title":"ASI-Seg: Audio-Driven Surgical Instrument Segmentation with Surgeon\n Intention Understanding","summary":" Surgical instrument segmentation is crucial in surgical scene understanding,\nthereby facilitating surgical safety. Existing algorithms directly detected all\ninstruments of pre-defined categories in the input image, lacking the\ncapability to segment specific instruments according to the surgeon's\nintention. During different stages of surgery, surgeons exhibit varying\npreferences and focus toward different surgical instruments. Therefore, an\ninstrument segmentation algorithm that adheres to the surgeon's intention can\nminimize distractions from irrelevant instruments and assist surgeons to a\ngreat extent. The recent Segment Anything Model (SAM) reveals the capability to\nsegment objects following prompts, but the manual annotations for prompts are\nimpractical during the surgery. To address these limitations in operating\nrooms, we propose an audio-driven surgical instrument segmentation framework,\nnamed ASI-Seg, to accurately segment the required surgical instruments by\nparsing the audio commands of surgeons. Specifically, we propose an\nintention-oriented multimodal fusion to interpret the segmentation intention\nfrom audio commands and retrieve relevant instrument details to facilitate\nsegmentation. Moreover, to guide our ASI-Seg segment of the required surgical\ninstruments, we devise a contrastive learning prompt encoder to effectively\ndistinguish the required instruments from the irrelevant ones. Therefore, our\nASI-Seg promotes the workflow in the operating rooms, thereby providing\ntargeted support and reducing the cognitive load on surgeons. Extensive\nexperiments are performed to validate the ASI-Seg framework, which reveals\nremarkable advantages over classical state-of-the-art and medical SAMs in both\nsemantic segmentation and intention-oriented segmentation. The source code is\navailable at https://github.com/Zonmgin-Zhang/ASI-Seg.\n","authors":["Zhen Chen","Zongming Zhang","Wenwu Guo","Xingjian Luo","Long Bai","Jinlin Wu","Hongliang Ren","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19435v1.pdf","comment":"This work is accepted by IROS 2024 (Oral)"},{"id":"http://arxiv.org/abs/2407.00569v2","updated":"2024-07-28T08:08:47Z","published":"2024-06-30T03:04:11Z","title":"Investigating and Mitigating the Multimodal Hallucination Snowballing in\n Large Vision-Language Models","summary":" Though advanced in understanding visual information with human languages,\nLarge Vision-Language Models (LVLMs) still suffer from multimodal\nhallucinations. 
A natural concern is that during multimodal interaction, the\ngenerated hallucinations could influence the LVLMs' subsequent generation.\nThus, we raise a question: When presented with a query relevant to the\npreviously generated hallucination, will LVLMs be misled and respond\nincorrectly, even though the ground visual information exists? To answer this,\nwe propose a framework called MMHalSnowball to evaluate LVLMs' behaviors when\nencountering generated hallucinations, where LVLMs are required to answer\nspecific visual questions within a curated hallucinatory conversation.\nCrucially, our experiment shows that the performance of open-source LVLMs drops\nby at least $31\\%$, indicating that LVLMs are prone to accept the generated\nhallucinations and make false claims that they would not have supported without\ndistractions. We term this phenomenon Multimodal Hallucination Snowballing. To\nmitigate this, we further propose a training-free method called Residual Visual\nDecoding, where we revise the output distribution of LVLMs with the one derived\nfrom the residual visual input, providing models with direct access to the\nvisual information. Experiments show that our method can mitigate more than\n$24\\%$ of the snowballed multimodal hallucination while maintaining\ncapabilities.\n","authors":["Weihong Zhong","Xiaocheng Feng","Liang Zhao","Qiming Li","Lei Huang","Yuxuan Gu","Weitao Ma","Yuan Xu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2407.00569v2.pdf","comment":"Accepted to ACL 2024 Main Conference. 21 pages, 20 figures"},{"id":"http://arxiv.org/abs/2407.14076v2","updated":"2024-07-28T07:46:26Z","published":"2024-07-19T07:12:43Z","title":"Domain-Specific Pretraining of Language Models: A Comparative Study in\n the Medical Field","summary":" There are many cases where LLMs are used for specific tasks in a single\ndomain. These usually require less general, but more domain-specific knowledge.\nHighly capable, general-purpose state-of-the-art language models like GPT-4 or\nClaude-3-opus can often be used for such tasks, but they are very large and\ncannot be run locally, even if they were not proprietary. This can be a problem\nwhen working with sensitive data. This paper focuses on domain-specific and\nmixed-domain pretraining as potentially more efficient methods than general\npretraining for specialized language models. We will take a look at work\nrelated to domain-specific pretraining, specifically in the medical area, and\ncompare benchmark results of specialized language models to general-purpose\nlanguage models.\n","authors":["Tobias Kerner"],"pdf_url":"https://arxiv.org/pdf/2407.14076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04308v3","updated":"2024-07-28T07:17:18Z","published":"2023-06-07T10:14:17Z","title":"Personality testing of Large Language Models: Limited temporal\n stability, but highlighted prosociality","summary":" As Large Language Models (LLMs) continue to gain popularity due to their\nhuman-like traits and the intimacy they offer to users, their societal impact\ninevitably expands. This leads to the rising necessity for comprehensive\nstudies to fully understand LLMs and reveal their potential opportunities,\ndrawbacks, and overall societal impact. With that in mind, this research\nconducted an extensive investigation into seven LLM's, aiming to assess the\ntemporal stability and inter-rater agreement on their responses on personality\ninstruments in two time points. 
In addition, LLMs personality profile was\nanalyzed and compared to human normative data. The findings revealed varying\nlevels of inter-rater agreement in the LLMs responses over a short time, with\nsome LLMs showing higher agreement (e.g., LIama3 and GPT-4o) compared to others\n(e.g., GPT-4 and Gemini). Furthermore, agreement depended on used instruments\nas well as on domain or trait. This implies the variable robustness in LLMs'\nability to reliably simulate stable personality characteristics. In the case of\nscales which showed at least fair agreement, LLMs displayed mostly a socially\ndesirable profile in both agentic and communal domains, as well as a prosocial\npersonality profile reflected in higher agreeableness and conscientiousness and\nlower Machiavellianism. Exhibiting temporal stability and coherent responses on\npersonality traits is crucial for AI systems due to their societal impact and\nAI safety concerns.\n","authors":["Bojana Bodroza","Bojana M. Dinic","Ljubisa Bojic"],"pdf_url":"https://arxiv.org/pdf/2306.04308v3.pdf","comment":"21 pages, 1 table"},{"id":"http://arxiv.org/abs/2407.19409v1","updated":"2024-07-28T06:10:47Z","published":"2024-07-28T06:10:47Z","title":"LLAVADI: What Matters For Multimodal Large Language Models Distillation","summary":" The recent surge in Multimodal Large Language Models (MLLMs) has showcased\ntheir remarkable potential for achieving generalized intelligence by\nintegrating visual understanding into Large Language Models.Nevertheless, the\nsheer model size of MLLMs leads to substantial memory and computational demands\nthat hinder their widespread deployment. In this work, we do not propose a new\nefficient model structure or train small-scale MLLMs from scratch. Instead, we\nfocus on what matters for training small-scale MLLMs through knowledge\ndistillation, which is the first step from the multimodal distillation\nperspective. Our extensive studies involve training strategies, model choices,\nand distillation algorithms in the knowledge distillation process. These\nresults show that joint alignment for both tokens and logit alignment plays\ncritical roles in teacher-student frameworks. In addition, we draw a series of\nintriguing observations from this study. By evaluating different benchmarks and\nproper strategy, even a 2.7B small-scale model can perform on par with larger\nmodels with 7B or 13B parameters. Our code and models will be publicly\navailable for further research.\n","authors":["Shilin Xu","Xiangtai Li","Haobo Yuan","Lu Qi","Yunhai Tong","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19400v1","updated":"2024-07-28T05:06:58Z","published":"2024-07-28T05:06:58Z","title":"Word Segmentation for Asian Languages: Chinese, Korean, and Japanese","summary":" We provide a detailed overview of various approaches to word segmentation of\nAsian Languages, specifically Chinese, Korean, and Japanese languages. For each\nlanguage, approaches to deal with word segmentation differs. We also include\nour analysis about certain advantages and disadvantages to each method. 
In\naddition, there is room for future work in this field.\n","authors":["Matthew Rho","Yexin Tian","Qin Chen"],"pdf_url":"https://arxiv.org/pdf/2407.19400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11581v2","updated":"2024-07-28T04:33:57Z","published":"2024-06-17T14:20:53Z","title":"Style Transfer with Multi-iteration Preference Optimization","summary":" Numerous recent techniques for text style transfer characterize their\napproaches as variants of reinforcement learning and preference optimization.\nIn this work, we consider the relationship between these approaches and a class\nof optimization approaches developed primarily for (non-neural) statistical\nmachine translation, formerly known as `tuning'. Inspired by these techniques\nfrom the past, we improve upon established preference optimization approaches,\nincorporating multiple iterations of exploration and optimization, and choosing\ncontrastive examples by following a `hope' vs `fear' sampling strategy.\nCognizant of the difference between machine translation and style transfer,\nhowever, we further tailor our framework with a new pseudo-parallel generation\nmethod and a dynamic weighted reward aggregation method to tackle the lack of\nparallel data and the need for a multi-objective reward. We evaluate our model\non two commonly used text style transfer datasets. Through automatic and human\nevaluation results we show the effectiveness and the superiority of our model\ncompared to state-of-the-art baselines.\n","authors":["Shuai Liu","Jonathan May"],"pdf_url":"https://arxiv.org/pdf/2406.11581v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08043v2","updated":"2024-07-28T04:29:43Z","published":"2024-03-12T19:34:54Z","title":"Authorship Style Transfer with Policy Optimization","summary":" Authorship style transfer aims to rewrite a given text into a specified\ntarget while preserving the original meaning in the source. Existing approaches\nrely on the availability of a large number of target style exemplars for model\ntraining. However, these overlook cases where a limited number of target style\nexamples are available. The development of parameter-efficient transfer\nlearning techniques and policy optimization (PO) approaches suggest lightweight\nPO is a feasible approach to low-resource style transfer. In this work, we\npropose a simple two-stage tune-and-optimize technique for low-resource textual\nstyle transfer. We apply our technique to authorship transfer as well as a\nlarger-data native language style task and in both cases find it outperforms\nstate-of-the-art baseline models.\n","authors":["Shuai Liu","Shantanu Agarwal","Jonathan May"],"pdf_url":"https://arxiv.org/pdf/2403.08043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09816v2","updated":"2024-07-28T04:00:52Z","published":"2024-07-13T09:22:33Z","title":"MaskMoE: Boosting Token-Level Learning via Routing Mask in\n Mixture-of-Experts","summary":" Scaling the size of a model enhances its capabilities but significantly\nincreases computation complexity. Mixture-of-Experts models (MoE) address the\nissue by allowing model size to scale up without substantially increasing\ntraining or inference costs. Despite their promising results, MoE models\nencounter several challenges. Primarily, for dynamic routing methods, the\ndispersion of training tokens across multiple experts can lead to underfitting,\nparticularly for infrequent tokens. 
Additionally, while fixed routing methods\ncan mitigate that issue, they compromise on the diversity of representations.\nIn this paper, we propose \\textbf{MaskMoE}, a method designed to enhance\ntoken-level learning by employing a routing \\textbf{mask}ing technique within\nthe \\textbf{M}ixture-\\textbf{o}f-\\textbf{E}xperts model. MaskMoE is capable of\nmaintaining representation diversity while achieving more comprehensive\ntraining. Experimental results demonstrate that our method outperforms previous\ndominant Mixture-of-Experts models in terms of both perplexity (PPL) and\ndownstream task performance.\n","authors":["Zhenpeng Su","Zijia Lin","Xue Bai","Xing Wu","Yizhe Xiong","Haoran Lian","Guangyuan Ma","Hui Chen","Guiguang Ding","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2407.09816v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2403.16303v4","updated":"2024-07-28T03:24:37Z","published":"2024-03-24T21:29:39Z","title":"Large Language Models in Biomedical and Health Informatics: A Review\n with Bibliometric Analysis","summary":" Large Language Models (LLMs) have rapidly become important tools in\nBiomedical and Health Informatics (BHI), enabling new ways to analyze data,\ntreat patients, and conduct research. This study aims to provide a\ncomprehensive overview of LLM applications in BHI, highlighting their\ntransformative potential and addressing the associated ethical and practical\nchallenges. We reviewed 1,698 research articles from January 2022 to December\n2023, categorizing them by research themes and diagnostic categories.\nAdditionally, we conducted network analysis to map scholarly collaborations and\nresearch dynamics. Our findings reveal a substantial increase in the potential\napplications of LLMs to a variety of BHI tasks, including clinical decision\nsupport, patient interaction, and medical document analysis. Notably, LLMs are\nexpected to be instrumental in enhancing the accuracy of diagnostic tools and\npatient care protocols. The network analysis highlights dense and dynamically\nevolving collaborations across institutions, underscoring the interdisciplinary\nnature of LLM research in BHI. A significant trend was the application of LLMs\nin managing specific disease categories such as mental health and neurological\ndisorders, demonstrating their potential to influence personalized medicine and\npublic health strategies. LLMs hold promising potential to further transform\nbiomedical research and healthcare delivery. While promising, the ethical\nimplications and challenges of model validation call for rigorous scrutiny to\noptimize their benefits in clinical settings. This survey serves as a resource\nfor stakeholders in healthcare, including researchers, clinicians, and\npolicymakers, to understand the current state and future potential of LLMs in\nBHI.\n","authors":["Huizi Yu","Lizhou Fan","Lingyao Li","Jiayan Zhou","Zihui Ma","Lu Xian","Wenyue Hua","Sijia He","Mingyu Jin","Yongfeng Zhang","Ashvin Gandhi","Xin Ma"],"pdf_url":"https://arxiv.org/pdf/2403.16303v4.pdf","comment":"62 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2404.00236v2","updated":"2024-07-28T01:02:21Z","published":"2024-03-30T03:56:53Z","title":"Enhancing Content-based Recommendation via Large Language Model","summary":" In real-world applications, users express different behaviors when they\ninteract with different items, including implicit click/like interactions, and\nexplicit comments/reviews interactions. 
Nevertheless, almost all recommender\nworks are focused on how to describe user preferences by the implicit\nclick/like interactions, to find the synergy of people. For the content-based\nexplicit comments/reviews interactions, some works attempt to utilize them to\nmine the semantic knowledge to enhance recommender models. However, they still\nneglect the following two points: (1) The content semantic is a universal world\nknowledge; how do we extract the multi-aspect semantic information to empower\ndifferent domains? (2) The user/item ID feature is a fundamental element for\nrecommender models; how do we align the ID and content semantic feature space?\nIn this paper, we propose a `plugin' semantic knowledge transferring method\n\\textbf{LoID}, which includes two major components: (1) LoRA-based large\nlanguage model pretraining to extract multi-aspect semantic information; (2)\nID-based contrastive objective to align their feature spaces. We conduct\nextensive experiments with SOTA baselines on real-world datasets, the detailed\nresults demonstrating significant improvements of our method LoID.\n","authors":["Wentao Xu","Qianqian Xie","Shuo Yang","Jiangxia Cao","Shuchao Pang"],"pdf_url":"https://arxiv.org/pdf/2404.00236v2.pdf","comment":"Accepted at CIKM 2024"},{"id":"http://arxiv.org/abs/2404.00026v4","updated":"2024-07-28T00:29:22Z","published":"2024-03-20T21:02:16Z","title":"Ink and Individuality: Crafting a Personalised Narrative in the Age of\n LLMs","summary":" Individuality and personalization comprise the distinctive characteristics\nthat make each writer unique and influence their words in order to effectively\nengage readers while conveying authenticity. However, our growing reliance on\nLLM-based writing assistants risks compromising our creativity and\nindividuality over time. We often overlook the negative impacts of this trend\non our creativity and uniqueness, despite the possible consequences. This study\ninvestigates these concerns by performing a brief survey to explore different\nperspectives and concepts, as well as trying to understand people's viewpoints,\nin conjunction with past studies in the area. Addressing these issues is\nessential for improving human-computer interaction systems and enhancing\nwriting assistants for personalization and individuality.\n","authors":["Azmine Toushik Wasi","Raima Islam","Mst Rafia Islam"],"pdf_url":"https://arxiv.org/pdf/2404.00026v4.pdf","comment":"8 Pages, 4 Figures. Accepted in The Third Workshop on Intelligent and\n Interactive Writing Assistants at CHI 2024"},{"id":"http://arxiv.org/abs/2404.00027v4","updated":"2024-07-28T00:26:14Z","published":"2024-03-20T21:06:42Z","title":"LLMs as Writing Assistants: Exploring Perspectives on Sense of Ownership\n and Reasoning","summary":" Sense of ownership in writing confines our investment of thoughts, time, and\ncontribution, leading to attachment to the output. However, using writing\nassistants introduces a mental dilemma, as some content isn't directly our\ncreation. For instance, we tend to credit Large Language Models (LLMs) more in\ncreative tasks, even though all tasks are equal for them. Additionally, while\nwe may not claim complete ownership of LLM-generated content, we freely claim\nauthorship. 
We conduct a short survey to examine these issues and understand\nunderlying cognitive processes in order to gain a better knowledge of\nhuman-computer interaction in writing and improve writing aid systems.\n","authors":["Azmine Toushik Wasi","Mst Rafia Islam","Raima Islam"],"pdf_url":"https://arxiv.org/pdf/2404.00027v4.pdf","comment":"8 Pages, 3 Figures. Accepted in The Third Workshop on Intelligent and\n Interactive Writing Assistants at CHI 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.10813v4","updated":"2024-07-28T23:37:51Z","published":"2023-11-17T18:59:56Z","title":"A Language Agent for Autonomous Driving","summary":" Human-level driving is an ultimate goal of autonomous driving. Conventional\napproaches formulate autonomous driving as a perception-prediction-planning\nframework, yet their systems do not capitalize on the inherent reasoning\nability and experiential knowledge of humans. In this paper, we propose a\nfundamental paradigm shift from current pipelines, exploiting Large Language\nModels (LLMs) as a cognitive agent to integrate human-like intelligence into\nautonomous driving systems. Our approach, termed Agent-Driver, transforms the\ntraditional autonomous driving pipeline by introducing a versatile tool library\naccessible via function calls, a cognitive memory of common sense and\nexperiential knowledge for decision-making, and a reasoning engine capable of\nchain-of-thought reasoning, task planning, motion planning, and\nself-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive\ncommon sense and robust reasoning capabilities, thus enabling a more nuanced,\nhuman-like approach to autonomous driving. We evaluate our approach on the\nlarge-scale nuScenes benchmark, and extensive experiments substantiate that our\nAgent-Driver significantly outperforms the state-of-the-art driving methods by\na large margin. Our approach also demonstrates superior interpretability and\nfew-shot learning ability to these methods.\n","authors":["Jiageng Mao","Junjie Ye","Yuxi Qian","Marco Pavone","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2311.10813v4.pdf","comment":"COLM 2024. Project Page: https://usc-gvl.github.io/Agent-Driver/"},{"id":"http://arxiv.org/abs/2407.19605v1","updated":"2024-07-28T22:35:08Z","published":"2024-07-28T22:35:08Z","title":"Look Hear: Gaze Prediction for Speech-directed Human Attention","summary":" For computer systems to effectively interact with humans using spoken\nlanguage, they need to understand how the words being generated affect the\nusers' moment-by-moment attention. Our study focuses on the incremental\nprediction of attention as a person is seeing an image and hearing a referring\nexpression defining the object in the scene that should be fixated by gaze. To\npredict the gaze scanpaths in this incremental object referral task, we\ndeveloped the Attention in Referral Transformer model or ART, which predicts\nthe human fixations spurred by each word in a referring expression. ART uses a\nmultimodal transformer encoder to jointly learn gaze behavior and its\nunderlying grounding tasks, and an autoregressive transformer decoder to\npredict, for each word, a variable number of fixations based on fixation\nhistory. To train ART, we created RefCOCO-Gaze, a large-scale dataset of 19,738\nhuman gaze scanpaths, corresponding to 2,094 unique image-expression pairs,\nfrom 220 participants performing our referral task. 
In our quantitative and\nqualitative analyses, ART not only outperforms existing methods in scanpath\nprediction, but also appears to capture several human attention patterns, such\nas waiting, scanning, and verification.\n","authors":["Sounak Mondal","Seoyoung Ahn","Zhibo Yang","Niranjan Balasubramanian","Dimitris Samaras","Gregory Zelinsky","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2407.19605v1.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2403.12977v2","updated":"2024-07-28T21:59:57Z","published":"2024-02-10T01:16:21Z","title":"SportsNGEN: Sustained Generation of Realistic Multi-player Sports\n Gameplay","summary":" We present a transformer decoder based sports simulation engine, SportsNGEN,\ntrained on sports player and ball tracking sequences, that is capable of\ngenerating sustained gameplay and accurately mimicking the decision making of\nreal players. By training on a large database of professional tennis tracking\ndata, we demonstrate that simulations produced by SportsNGEN can be used to\npredict the outcomes of rallies, determine the best shot choices at any point,\nand evaluate counterfactual or what if scenarios to inform coaching decisions\nand elevate broadcast coverage. By combining the generated simulations with a\nshot classifier and logic to start and end rallies, the system is capable of\nsimulating an entire tennis match. We evaluate SportsNGEN by comparing\nstatistics of the simulations with those of real matches between the same\nplayers. We show that the model output sampling parameters are crucial to\nsimulation realism and that SportsNGEN is probabilistically well-calibrated to\nreal data. In addition, a generic version of SportsNGEN can be customized to a\nspecific player by fine-tuning on the subset of match data that includes that\nplayer. Finally, we show qualitative results indicating the same approach works\nfor football.\n","authors":["Lachlan Thorpe","Lewis Bawden","Karanjot Vendal","John Bronskill","Richard E. Turner"],"pdf_url":"https://arxiv.org/pdf/2403.12977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19593v1","updated":"2024-07-28T21:26:33Z","published":"2024-07-28T21:26:33Z","title":"Bridging the Gap: Studio-like Avatar Creation from a Monocular Phone\n Capture","summary":" Creating photorealistic avatars for individuals traditionally involves\nextensive capture sessions with complex and expensive studio devices like the\nLightStage system. While recent strides in neural representations have enabled\nthe generation of photorealistic and animatable 3D avatars from quick phone\nscans, they have the capture-time lighting baked-in, lack facial details and\nhave missing regions in areas such as the back of the ears. Thus, they lag in\nquality compared to studio-captured avatars. In this paper, we propose a method\nthat bridges this gap by generating studio-like illuminated texture maps from\nshort, monocular phone captures. We do this by parameterizing the phone texture\nmaps using the $W^+$ space of a StyleGAN2, enabling near-perfect\nreconstruction. Then, we finetune a StyleGAN2 by sampling in the $W^+$\nparameterized space using a very small set of studio-captured textures as an\nadversarial training signal. To further enhance the realism and accuracy of\nfacial details, we super-resolve the output of the StyleGAN2 using carefully\ndesigned diffusion model that is guided by image gradients of the\nphone-captured texture map. 
Once trained, our method excels at producing\nstudio-like facial texture maps from casual monocular smartphone videos.\nDemonstrating its capabilities, we showcase the generation of photorealistic,\nuniformly lit, complete avatars from monocular phone captures.\n\\href{http://shahrukhathar.github.io/2024/07/22/Bridging.html}{The project page\ncan be found here.}\n","authors":["ShahRukh Athar","Shunsuke Saito","Zhengyu Yang","Stanislav Pidhorsky","Chen Cao"],"pdf_url":"https://arxiv.org/pdf/2407.19593v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2403.14027v3","updated":"2024-07-28T20:55:51Z","published":"2024-03-20T22:52:34Z","title":"EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship\n Detection through Edge-Cloud Collaboration","summary":" Detecting marine objects inshore presents challenges owing to algorithmic\nintricacies and complexities in system deployment. We propose a\ndifficulty-aware edge-cloud collaborative sensing system that splits the task\ninto object localization and fine-grained classification. Objects are\nclassified either at the edge or within the cloud, based on their estimated\ndifficulty. The framework comprises a low-power device-tailored front-end model\nfor object localization, classification, and difficulty estimation, along with\na transformer-graph convolutional network-based back-end model for fine-grained\nclassification. Our system demonstrates superior performance (mAP@0.5 +4.3%})\non widely used marine object detection datasets, significantly reducing both\ndata transmission volume (by 95.43%) and energy consumption (by 72.7%}) at the\nsystem level. We validate the proposed system across various embedded system\nplatforms and in real-world scenarios involving drone deployment.\n","authors":["Wenjun Huang","Hanning Chen","Yang Ni","Arghavan Rezvani","Sanggeon Yun","Sungheon Jeon","Eric Pedley","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2403.14027v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07895v2","updated":"2024-07-28T19:58:08Z","published":"2024-07-10T17:59:43Z","title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large\n Multimodal Models","summary":" Visual instruction tuning has made considerable strides in enhancing the\ncapabilities of Large Multimodal Models (LMMs). However, existing open LMMs\nlargely focus on single-image tasks, their applications to multi-image\nscenarios remains less explored. Additionally, prior LMM research separately\ntackles different scenarios, leaving it impossible to generalize cross\nscenarios with new emerging capabilities. To this end, we introduce\nLLaVA-NeXT-Interleave, which simultaneously tackles Multi-image, Multi-frame\n(video), Multi-view (3D), and Multi-patch (single-image) scenarios in LMMs. To\nenable these capabilities, we regard the interleaved data format as a general\ntemplate and compile the M4-Instruct dataset with 1,177.6k samples, spanning 4\nprimary domains with 14 tasks and 41 datasets. We also curate the\nLLaVA-Interleave Bench to comprehensively evaluate the multi-image performance\nof LMMs. Through extensive experiments, LLaVA-NeXT-Interleave achieves leading\nresults in multi-image, video, and 3D benchmarks, while maintaining the\nperformance of single-image tasks. Besides, our model also exhibits several\nemerging capabilities, e.g., transferring tasks across different settings and\nmodalities. 
Code is available at https://github.com/LLaVA-VL/LLaVA-NeXT\n","authors":["Feng Li","Renrui Zhang","Hao Zhang","Yuanhan Zhang","Bo Li","Wei Li","Zejun Ma","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2407.07895v2.pdf","comment":"Project Page:\n https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/"},{"id":"http://arxiv.org/abs/2403.15382v2","updated":"2024-07-28T19:45:06Z","published":"2024-03-22T17:58:59Z","title":"DragAPart: Learning a Part-Level Motion Prior for Articulated Objects","summary":" We introduce DragAPart, a method that, given an image and a set of drags as\ninput, generates a new image of the same object that responds to the action of\nthe drags. Differently from prior works that focused on repositioning objects,\nDragAPart predicts part-level interactions, such as opening and closing a\ndrawer. We study this problem as a proxy for learning a generalist motion\nmodel, not restricted to a specific kinematic structure or object category. We\nstart from a pre-trained image generator and fine-tune it on a new synthetic\ndataset, Drag-a-Move, which we introduce. Combined with a new encoding for the\ndrags and dataset randomization, the model generalizes well to real images and\ndifferent categories. Compared to prior motion-controlled generators, we\ndemonstrate much better part-level motion understanding.\n","authors":["Ruining Li","Chuanxia Zheng","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2403.15382v2.pdf","comment":"Project page: https://dragapart.github.io/"},{"id":"http://arxiv.org/abs/2407.19564v1","updated":"2024-07-28T19:18:59Z","published":"2024-07-28T19:18:59Z","title":"Forecast-PEFT: Parameter-Efficient Fine-Tuning for Pre-trained Motion\n Forecasting Models","summary":" Recent progress in motion forecasting has been substantially driven by\nself-supervised pre-training. However, adapting pre-trained models for specific\ndownstream tasks, especially motion prediction, through extensive fine-tuning\nis often inefficient. This inefficiency arises because motion prediction\nclosely aligns with the masked pre-training tasks, and traditional full\nfine-tuning methods fail to fully leverage this alignment. To address this, we\nintroduce Forecast-PEFT, a fine-tuning strategy that freezes the majority of\nthe model's parameters, focusing adjustments on newly introduced prompts and\nadapters. This approach not only preserves the pre-learned representations but\nalso significantly reduces the number of parameters that need retraining,\nthereby enhancing efficiency. This tailored strategy, supplemented by our\nmethod's capability to efficiently adapt to different datasets, enhances model\nefficiency and ensures robust performance across datasets without the need for\nextensive retraining. Our experiments show that Forecast-PEFT outperforms\ntraditional full fine-tuning methods in motion prediction tasks, achieving\nhigher accuracy with only 17% of the trainable parameters typically required.\nMoreover, our comprehensive adaptation, Forecast-FT, further improves\nprediction performance, evidencing up to a 9.6% enhancement over conventional\nbaseline methods. 
Code will be available at\nhttps://github.com/csjfwang/Forecast-PEFT.\n","authors":["Jifeng Wang","Kaouther Messaoud","Yuejiang Liu","Juergen Gall","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2407.19564v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2404.11525v3","updated":"2024-07-28T18:53:18Z","published":"2024-04-17T16:16:12Z","title":"JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on\n Long-Tailed OCTA","summary":" The oxygen saturation level in the blood (SaO2) is crucial for health,\nparticularly in relation to sleep-related breathing disorders. However,\ncontinuous monitoring of SaO2 is time-consuming and highly variable depending\non patients' conditions. Recently, optical coherence tomography angiography\n(OCTA) has shown promising development in rapidly and effectively screening\neye-related lesions, offering the potential for diagnosing sleep-related\ndisorders. To bridge this gap, our paper presents three key contributions.\nFirstly, we propose JointViT, a novel model based on the Vision Transformer\narchitecture, incorporating a joint loss function for supervision. Secondly, we\nintroduce a balancing augmentation technique during data preprocessing to\nimprove the model's performance, particularly on the long-tail distribution\nwithin the OCTA dataset. Lastly, through comprehensive experiments on the OCTA\ndataset, our proposed method significantly outperforms other state-of-the-art\nmethods, achieving improvements of up to 12.28% in overall accuracy. This\nadvancement lays the groundwork for the future utilization of OCTA in\ndiagnosing sleep-related disorders. See project website\nhttps://steve-zeyu-zhang.github.io/JointViT\n","authors":["Zeyu Zhang","Xuyin Qi","Mingxi Chen","Guangxi Li","Ryan Pham","Ayub Qassim","Ella Berry","Zhibin Liao","Owen Siggs","Robert Mclaughlin","Jamie Craig","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2404.11525v3.pdf","comment":"Accepted to MIUA 2024 Oral"},{"id":"http://arxiv.org/abs/2407.19553v1","updated":"2024-07-28T18:20:08Z","published":"2024-07-28T18:20:08Z","title":"Exploring the Adversarial Robustness of CLIP for AI-generated Image\n Detection","summary":" In recent years, many forensic detectors have been proposed to detect\nAI-generated images and prevent their use for malicious purposes. Convolutional\nneural networks (CNNs) have long been the dominant architecture in this field\nand have been the subject of intense study. However, recently proposed\nTransformer-based detectors have been shown to match or even outperform\nCNN-based detectors, especially in terms of generalization. In this paper, we\nstudy the adversarial robustness of AI-generated image detectors, focusing on\nContrastive Language-Image Pretraining (CLIP)-based methods that rely on Visual\nTransformer backbones and comparing their performance with CNN-based methods.\nWe study the robustness to different adversarial attacks under a variety of\nconditions and analyze both numerical results and frequency-domain patterns.\nCLIP-based detectors are found to be vulnerable to white-box attacks just like\nCNN-based detectors. However, attacks do not easily transfer between CNN-based\nand CLIP-based methods. This is also confirmed by the different distribution of\nthe adversarial noise patterns in the frequency domain. 
Overall, this analysis\nprovides new insights into the properties of forensic detectors that can help\nto develop more effective strategies.\n","authors":["Vincenzo De Rosa","Fabrizio Guillaro","Giovanni Poggi","Davide Cozzolino","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2407.19553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19551v1","updated":"2024-07-28T18:16:41Z","published":"2024-07-28T18:16:41Z","title":"Improving Domain Adaptation Through Class Aware Frequency Transformation","summary":" In this work, we explore the usage of the Frequency Transformation for\nreducing the domain shift between the source and target domain (e.g., synthetic\nimage and real image respectively) towards solving the Domain Adaptation task.\nMost of the Unsupervised Domain Adaptation (UDA) algorithms focus on reducing\nthe global domain shift between labelled source and unlabelled target domains\nby matching the marginal distributions under a small domain gap assumption. UDA\nperformance degrades for the cases where the domain gap between source and\ntarget distribution is large. In order to bring the source and the target\ndomains closer, we propose a novel approach based on traditional image\nprocessing technique Class Aware Frequency Transformation (CAFT) that utilizes\npseudo label based class consistent low-frequency swapping for improving the\noverall performance of the existing UDA algorithms. The proposed approach, when\ncompared with the state-of-the-art deep learning based methods, is\ncomputationally more efficient and can easily be plugged into any existing UDA\nalgorithm to improve its performance. Additionally, we introduce a novel\napproach based on absolute difference of top-2 class prediction probabilities\n(ADT2P) for filtering target pseudo labels into clean and noisy sets. Samples\nwith clean pseudo labels can be used to improve the performance of unsupervised\nlearning algorithms. We name the overall framework as CAFT++. We evaluate the\nsame on the top of different UDA algorithms across many public domain\nadaptation datasets. Our extensive experiments indicate that CAFT++ is able to\nachieve significant performance gains across all the popular benchmarks.\n","authors":["Vikash Kumar","Himanshu Patil","Rohit Lal","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2407.19551v1.pdf","comment":"Accepted at the International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2407.19548v1","updated":"2024-07-28T17:58:35Z","published":"2024-07-28T17:58:35Z","title":"Cycle3D: High-quality and Consistent Image-to-3D Generation via\n Generation-Reconstruction Cycle","summary":" Recent 3D large reconstruction models typically employ a two-stage process,\nincluding first generate multi-view images by a multi-view diffusion model, and\nthen utilize a feed-forward model to reconstruct images to 3D content.However,\nmulti-view diffusion models often produce low-quality and inconsistent images,\nadversely affecting the quality of the final 3D reconstruction. To address this\nissue, we propose a unified 3D generation framework called Cycle3D, which\ncyclically utilizes a 2D diffusion-based generation module and a feed-forward\n3D reconstruction module during the multi-step diffusion process. 
Concretely,\nthe 2D diffusion model is applied for generating high-quality texture, and the\nreconstruction model guarantees multi-view consistency. Moreover, the 2D diffusion\nmodel can further control the generated content and inject reference-view\ninformation for unseen views, thereby enhancing the diversity and texture\nconsistency of 3D generation during the denoising process. Extensive\nexperiments demonstrate the superior ability of our method to create 3D content\nwith high quality and consistency compared with state-of-the-art baselines.\n","authors":["Zhenyu Tang","Junwu Zhang","Xinhua Cheng","Wangbo Yu","Chaoran Feng","Yatian Pang","Bin Lin","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.19548v1.pdf","comment":"Project page: https://pku-yuangroup.github.io/Cycle3D/"},{"id":"http://arxiv.org/abs/2407.19547v1","updated":"2024-07-28T17:46:15Z","published":"2024-07-28T17:46:15Z","title":"Temporal Feature Matters: A Framework for Diffusion Model Quantization","summary":" Diffusion models, widely used for image generation, face significant\nchallenges related to their broad applicability due to prolonged inference\ntimes and high memory demands. Efficient Post-Training Quantization (PTQ) is\ncrucial to address these issues in traditional models. Unlike those models,\ndiffusion models critically rely on the time-step $t$ for effective multi-round\ndenoising. Typically, $t$ from the finite set $\\{1, \\ldots, T\\}$ is encoded\ninto a hypersensitive temporal feature by several modules, entirely independent\nof the sampling data. However, existing PTQ methods do not optimize these\nmodules individually. Instead, they employ unsuitable reconstruction objectives\nand complex calibration methods, leading to significant disturbances in the\ntemporal feature and denoising trajectory. To address these challenges, we\nintroduce a novel quantization framework: 1)~TIB-based Maintenance: Based on\nour innovative Temporal Information Block~(TIB) definition, Temporal\nInformation-aware Reconstruction~(TIAR) and Finite Set Calibration~(FSC) are\ndeveloped to efficiently align full-precision temporal features. 2)~Cache-based\nMaintenance: Instead of indirect and complex optimization for the related\nmodules, pre-computing and caching quantized counterparts of temporal features\nare developed to minimize errors. 3)~Disturbance-aware Selection: Employ\ntemporal feature errors to guide a fine-grained selection for superior\nmaintenance. This framework preserves most of the temporal information and\nensures high-quality end-to-end generation. Extensive testing on various\ndatasets and diffusion models confirms our superior results. Notably, our\napproach closely matches the performance of the full-precision model under\n4-bit quantization. Furthermore, the quantized SD-XL model achieves hardware\nacceleration of 2.20$\\times$ on CPU and 5.76$\\times$ on GPU, demonstrating its\nefficiency.\n","authors":["Yushi Huang","Ruihao Gong","Xianglong Liu","Jing Liu","Yuhang Li","Jiwen Lu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2407.19547v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2311.16503"},{"id":"http://arxiv.org/abs/2407.19546v1","updated":"2024-07-28T17:38:21Z","published":"2024-07-28T17:38:21Z","title":"XLIP: Cross-modal Attention Masked Modelling for Medical Language-Image\n Pre-Training","summary":" Vision-and-language pretraining (VLP) in the medical field utilizes\ncontrastive learning on image-text pairs to achieve effective transfer across\ntasks. 
Yet, current VLP approaches with the masked modelling strategy face two\nchallenges when applied to the medical domain. First, current models struggle\nto accurately reconstruct key pathological features due to the scarcity of\nmedical data. Second, most methods only adopt either paired image-text or\nimage-only data, failing to exploit the combination of both paired and unpaired\ndata. To this end, this paper proposes a XLIP (Masked modelling for medical\nLanguage-Image Pre-training) framework to enhance pathological learning and\nfeature learning via unpaired data. First, we introduce the attention-masked\nimage modelling (AttMIM) and entity-driven masked language modelling module\n(EntMLM), which learns to reconstruct pathological visual and textual tokens\nvia multi-modal feature interaction, thus improving medical-enhanced features.\nThe AttMIM module masks a portion of the image features that are highly\nresponsive to textual features. This allows XLIP to improve the reconstruction\nof highly similar image data in medicine efficiency. Second, our XLIP\ncapitalizes unpaired data to enhance multimodal learning by introducing\ndisease-kind prompts. The experimental results show that XLIP achieves SOTA for\nzero-shot and fine-tuning classification performance on five datasets. Our code\nwill be available at https://github.com/White65534/XLIP\n","authors":["Biao Wu","Yutong Xie","Zeyu Zhang","Minh Hieu Phan","Qi Chen","Ling Chen","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2407.19546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19542v1","updated":"2024-07-28T17:24:14Z","published":"2024-07-28T17:24:14Z","title":"UniVoxel: Fast Inverse Rendering by Unified Voxelization of Scene\n Representation","summary":" Typical inverse rendering methods focus on learning implicit neural scene\nrepresentations by modeling the geometry, materials and illumination\nseparately, which entails significant computations for optimization. In this\nwork we design a Unified Voxelization framework for explicit learning of scene\nrepresentations, dubbed UniVoxel, which allows for efficient modeling of the\ngeometry, materials and illumination jointly, thereby accelerating the inverse\nrendering significantly. To be specific, we propose to encode a scene into a\nlatent volumetric representation, based on which the geometry, materials and\nillumination can be readily learned via lightweight neural networks in a\nunified manner. Particularly, an essential design of UniVoxel is that we\nleverage local Spherical Gaussians to represent the incident light radiance,\nwhich enables the seamless integration of modeling illumination into the\nunified voxelization framework. Such novel design enables our UniVoxel to model\nthe joint effects of direct lighting, indirect lighting and light visibility\nefficiently without expensive multi-bounce ray tracing. Extensive experiments\non multiple benchmarks covering diverse scenes demonstrate that UniVoxel boosts\nthe optimization efficiency significantly compared to other methods, reducing\nthe per-scene training time from hours to 18 minutes, while achieving favorable\nreconstruction quality. 
Code is available at\nhttps://github.com/freemantom/UniVoxel.\n","authors":["Shuang Wu","Songlin Tang","Guangming Lu","Jianzhuang Liu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2407.19542v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2402.07245v3","updated":"2024-07-28T17:11:28Z","published":"2024-02-11T17:09:21Z","title":"Semi-Mamba-UNet: Pixel-Level Contrastive and Pixel-Level\n Cross-Supervised Visual Mamba-based UNet for Semi-Supervised Medical Image\n Segmentation","summary":" Medical image segmentation is essential in diagnostics, treatment planning,\nand healthcare, with deep learning offering promising advancements. Notably,\nthe convolutional neural network (CNN) excels in capturing local image\nfeatures, whereas the Vision Transformer (ViT) adeptly models long-range\ndependencies through multi-head self-attention mechanisms. Despite their\nstrengths, both the CNN and ViT face challenges in efficiently processing\nlong-range dependencies in medical images, often requiring substantial\ncomputational resources. This issue, combined with the high cost and limited\navailability of expert annotations, poses significant obstacles to achieving\nprecise segmentation. To address these challenges, this study introduces\nSemi-Mamba-UNet, which integrates a purely visual Mamba-based U-shaped\nencoder-decoder architecture with a conventional CNN-based UNet into a\nsemi-supervised learning (SSL) framework. This innovative SSL approach\nleverages both networks to generate pseudo-labels and cross-supervise one\nanother at the pixel level simultaneously, drawing inspiration from consistency\nregularisation techniques. Furthermore, we introduce a self-supervised\npixel-level contrastive learning strategy that employs a pair of projectors to\nenhance the feature learning capabilities further, especially on unlabelled\ndata. Semi-Mamba-UNet was comprehensively evaluated on two publicly available\nsegmentation dataset and compared with seven other SSL frameworks with both\nCNN- or ViT-based UNet as the backbone network, highlighting the superior\nperformance of the proposed method. The source code of Semi-Mamba-Unet, all\nbaseline SSL frameworks, the CNN- and ViT-based networks, and the two\ncorresponding datasets are made publicly accessible.\n","authors":["Chao Ma","Ziyang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.07245v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11316v2","updated":"2024-07-28T16:56:41Z","published":"2024-07-16T02:02:51Z","title":"BUSClean: Open-source software for breast ultrasound image\n pre-processing and knowledge extraction for medical AI","summary":" Development of artificial intelligence (AI) for medical imaging demands\ncuration and cleaning of large-scale clinical datasets comprising hundreds of\nthousands of images. Some modalities, such as mammography, contain highly\nstandardized imaging. In contrast, breast ultrasound imaging (BUS) can contain\nmany irregularities not indicated by scan metadata, such as enhanced scan\nmodes, sonographer annotations, or additional views. We present an open-source\nsoftware solution for automatically processing clinical BUS datasets. The\nalgorithm performs BUS scan filtering, cleaning, and knowledge extraction from\nsonographer annotations. Its modular design enables users to adapt it to new\nsettings. 
Experiments on an internal testing dataset of 430 clinical BUS images\nachieve >95% sensitivity and >98% specificity in detecting every type of text\nannotation, >98% sensitivity and specificity in detecting scans with blood flow\nhighlighting, alternative scan modes, or invalid scans. A case study on a\ncompletely external, public dataset of BUS scans found that BUSClean identified\ntext annotations and scans with blood flow highlighting with 88.6% and 90.9%\nsensitivity and 98.3% and 99.9% specificity, respectively. Adaptation of the\nlesion caliper detection method to account for a type of caliper specific to\nthe case study demonstrates intended use of BUSClean in new data distributions\nand improved performance in lesion caliper detection from 43.3% and 93.3%\nout-of-the-box to 92.1% and 92.3% sensitivity and specificity, respectively.\nSource code, example notebooks, and sample data are available at\nhttps://github.com/hawaii-ai/bus-cleaning.\n","authors":["Arianna Bunnell","Kailee Hung","John A. Shepherd","Peter Sadowski"],"pdf_url":"https://arxiv.org/pdf/2407.11316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19524v1","updated":"2024-07-28T16:24:07Z","published":"2024-07-28T16:24:07Z","title":"VersusDebias: Universal Zero-Shot Debiasing for Text-to-Image Models via\n SLM-Based Prompt Engineering and Generative Adversary","summary":" With the rapid development of Text-to-Image models, biases in human image\ngeneration against demographic social groups attract more and more concern.\nExisting methods are designed based on certain models with fixed prompts,\nunable to accommodate the trend of high-speed updating of Text-to-Image (T2I)\nmodels and variable prompts in practical scenes. Additionally, they fail to\nconsider the possibility of hallucinations, leading to deviations between\nexpected and actual results. To address this issue, we introduce VersusDebias,\na novel and universal debiasing framework for biases in T2I models, consisting\nof one generative adversarial mechanism (GAM) and one debiasing generation\nmechanism using a small language model (SLM). The self-adaptive GAM generates\nspecialized attribute arrays for each prompt to diminish the influence of\nhallucinations from T2I models. The SLM uses prompt engineering to generate\ndebiased prompts for the T2I model, providing zero-shot debiasing ability and\ncustom optimization for different models. Extensive experiments demonstrate\nVersusDebias's capability to rectify biases on arbitrary models across multiple\nprotected attributes simultaneously, including gender, race, and age.\nFurthermore, VersusDebias outperforms existing methods in both zero-shot and\nfew-shot situations, illustrating its extraordinary utility. Our work is openly\naccessible to the research community to ensure reproducibility.\n","authors":["Hanjun Luo","Ziye Deng","Haoyu Huang","Xuecheng Liu","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19520v1","updated":"2024-07-28T16:01:32Z","published":"2024-07-28T16:01:32Z","title":"Ego-VPA: Egocentric Video Understanding with Parameter-efficient\n Adaptation","summary":" Video understanding typically requires fine-tuning the large backbone when\nadapting to new domains. In this paper, we leverage the egocentric video\nfoundation models (Ego-VFMs) based on video-language pre-training and propose a\nparameter-efficient adaptation for egocentric video tasks, namely Ego-VPA. 
It\nemploys a local sparse approximation for each video frame/text feature using\nthe basis prompts, and the selected basis prompts are used to synthesize\nvideo/text prompts. Since the basis prompts are shared across frames and\nmodalities, it models context fusion and cross-modal transfer in an efficient\nfashion. Experiments show that Ego-VPA excels in lightweight adaptation (with\nonly 0.84% learnable parameters), largely improving over baselines and reaching\nthe performance of full fine-tuning.\n","authors":["Tz-Ying Wu","Kyle Min","Subarna Tripathi","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2407.19520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19518v1","updated":"2024-07-28T15:54:37Z","published":"2024-07-28T15:54:37Z","title":"Solving Short-Term Relocalization Problems In Monocular Keyframe Visual\n SLAM Using Spatial And Semantic Data","summary":" In Monocular Keyframe Visual Simultaneous Localization and Mapping (MKVSLAM)\nframeworks, when incremental position tracking fails, global pose has to be\nrecovered in a short-time window, also known as short-term relocalization. This\ncapability is crucial for mobile robots to have reliable navigation, build\naccurate maps, and have precise behaviors around human collaborators. This\npaper focuses on the development of robust short-term relocalization\ncapabilities for mobile robots using a monocular camera system. A novel\nmultimodal keyframe descriptor is introduced, that contains semantic\ninformation of objects detected in the environment and the spatial information\nof the camera. Using this descriptor, a new Keyframe-based Place Recognition\n(KPR) method is proposed that is formulated as a multi-stage keyframe filtering\nalgorithm, leading to a new relocalization pipeline for MKVSLAM systems. The\nproposed approach is evaluated over several indoor GPS denied datasets and\ndemonstrates accurate pose recovery, in comparison to a bag-of-words approach.\n","authors":["Azmyin Md. Kamal","Nenyi K. N. Dadson","Donovan Gegg","Corina Barbalata"],"pdf_url":"https://arxiv.org/pdf/2407.19518v1.pdf","comment":"8 pages, Keywords: VSLAM, Localization, Semantics. Presented in 2024\n IEEE/ASME International Conference on Advanced Intelligent Mechatronics (AIM)"},{"id":"http://arxiv.org/abs/2403.10997v2","updated":"2024-07-28T15:53:39Z","published":"2024-03-16T18:50:44Z","title":"N2F2: Hierarchical Scene Understanding with Nested Neural Feature Fields","summary":" Understanding complex scenes at multiple levels of abstraction remains a\nformidable challenge in computer vision. To address this, we introduce Nested\nNeural Feature Fields (N2F2), a novel approach that employs hierarchical\nsupervision to learn a single feature field, wherein different dimensions\nwithin the same high-dimensional feature encode scene properties at varying\ngranularities. Our method allows for a flexible definition of hierarchies,\ntailored to either the physical dimensions or semantics or both, thereby\nenabling a comprehensive and nuanced understanding of scenes. We leverage a 2D\nclass-agnostic segmentation model to provide semantically meaningful pixel\ngroupings at arbitrary scales in the image space, and query the CLIP\nvision-encoder to obtain language-aligned embeddings for each of these\nsegments. 
Our proposed hierarchical supervision method then assigns different\nnested dimensions of the feature field to distill the CLIP embeddings using\ndeferred volumetric rendering at varying physical scales, creating a\ncoarse-to-fine representation. Extensive experiments show that our approach\noutperforms the state-of-the-art feature field distillation methods on tasks\nsuch as open-vocabulary 3D segmentation and localization, demonstrating the\neffectiveness of the learned nested feature field.\n","authors":["Yash Bhalgat","Iro Laina","João F. Henriques","Andrew Zisserman","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2403.10997v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.19514v1","updated":"2024-07-28T15:38:58Z","published":"2024-07-28T15:38:58Z","title":"Detached and Interactive Multimodal Learning","summary":" Recently, Multimodal Learning (MML) has gained significant interest as it\ncompensates for single-modality limitations through comprehensive complementary\ninformation within multimodal data. However, traditional MML methods generally\nuse the joint learning framework with a uniform learning objective that can\nlead to the modality competition issue, where feedback predominantly comes from\ncertain modalities, limiting the full potential of others. In response to this\nchallenge, this paper introduces DI-MML, a novel detached MML framework\ndesigned to learn complementary information across modalities under the premise\nof avoiding modality competition. Specifically, DI-MML addresses competition by\nseparately training each modality encoder with isolated learning objectives. It\nfurther encourages cross-modal interaction via a shared classifier that defines\na common feature space and employing a dimension-decoupled unidirectional\ncontrastive (DUC) loss to facilitate modality-level knowledge transfer.\nAdditionally, to account for varying reliability in sample pairs, we devise a\ncertainty-aware logit weighting strategy to effectively leverage complementary\ninformation at the instance level during inference. Extensive experiments\nconducted on audio-visual, flow-image, and front-rear view datasets show the\nsuperior performance of our proposed method. The code is released at\nhttps://github.com/fanyunfeng-bit/DI-MML.\n","authors":["Yunfeng Fan","Wenchao Xu","Haozhao Wang","Junhong Liu","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2407.19514v1.pdf","comment":"Accepted by ACM MM 24"},{"id":"http://arxiv.org/abs/2407.19512v1","updated":"2024-07-28T15:29:07Z","published":"2024-07-28T15:29:07Z","title":"Large-scale cervical precancerous screening via AI-assisted cytology\n whole slide image analysis","summary":" Cervical Cancer continues to be the leading gynecological malignancy, posing\na persistent threat to women's health on a global scale. Early screening via\ncytology Whole Slide Image (WSI) diagnosis is critical to prevent this Cancer\nprogression and improve survival rate, but pathologist's single test suffers\ninevitable false negative due to the immense number of cells that need to be\nreviewed within a WSI. Though computer-aided automated diagnostic models can\nserve as strong complement for pathologists, their effectiveness is hampered by\nthe paucity of extensive and detailed annotations, coupled with the limited\ninterpretability and robustness. These factors significantly hinder their\npractical applicability and reliability in clinical settings. 
To tackle these\nchallenges, we develop an AI approach, which is a Scalable Technology for\nRobust and Interpretable Diagnosis built on Extensive data (STRIDE) of cervical\ncytology. STRIDE addresses the bottleneck of limited annotations by integrating\npatient-level labels with a small portion of cell-level labels through an\nend-to-end training strategy, facilitating scalable learning across extensive\ndatasets. To further improve the robustness to real-world domain shifts of\ncytology slide-making and imaging, STRIDE employs color adversarial samples\ntraining that mimic staining and imaging variations. Lastly, to achieve\npathologist-level interpretability for the trustworthiness in clinical\nsettings, STRIDE can generate explanatory textual descriptions that simulates\npathologists' diagnostic processes by cell image feature and textual\ndescription alignment. Conducting extensive experiments and evaluations in 183\nmedical centers with a dataset of 341,889 WSIs and 0.1 billion cells from\ncervical cytology patients, STRIDE has demonstrated a remarkable superiority\nover previous state-of-the-art techniques.\n","authors":["Honglin Li","Yusuan Sun","Chenglu Zhu","Yunlong Zhang","Shichuan Zhang","Zhongyi Shui","Pingyi Chen","Jingxiong Li","Sunyi Zheng","Can Cui","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19510v1","updated":"2024-07-28T15:14:07Z","published":"2024-07-28T15:14:07Z","title":"EPD: Long-term Memory Extraction, Context-awared Planning and\n Multi-iteration Decision @ EgoPlan Challenge ICML 2024","summary":" In this technical report, we present our solution for the EgoPlan Challenge\nin ICML 2024. To address the real-world egocentric task planning problem, we\nintroduce a novel planning framework which comprises three stages: long-term\nmemory Extraction, context-awared Planning, and multi-iteration Decision, named\nEPD. Given the task goal, task progress, and current observation, the\nextraction model first extracts task-relevant memory information from the\nprogress video, transforming the complex long video into summarized memory\ninformation. The planning model then combines the context of the memory\ninformation with fine-grained visual information from the current observation\nto predict the next action. Finally, through multi-iteration decision-making,\nthe decision model comprehensively understands the task situation and current\nstate to make the most realistic planning decision. On the EgoPlan-Test set,\nEPD achieves a planning accuracy of 53.85% over 1,584 egocentric task planning\nquestions. We have made all codes available at https://github.com/Kkskkkskr/EPD .\n","authors":["Letian Shi","Qi Lv","Xiang Deng","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2407.19510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19507v1","updated":"2024-07-28T14:58:07Z","published":"2024-07-28T14:58:07Z","title":"WeCromCL: Weakly Supervised Cross-Modality Contrastive Learning for\n Transcription-only Supervised Text Spotting","summary":" Transcription-only Supervised Text Spotting aims to learn text spotters\nrelying only on transcriptions but no text boundaries for supervision, thus\neliminating expensive boundary annotation. 
The crux of this task lies in\nlocating each transcription in scene text images without location annotations.\nIn this work, we formulate this challenging problem as a Weakly Supervised\nCross-modality Contrastive Learning problem, and design a simple yet effective\nmodel dubbed WeCromCL that is able to detect each transcription in a scene\nimage in a weakly supervised manner. Unlike typical methods for cross-modality\ncontrastive learning that focus on modeling the holistic semantic correlation\nbetween an entire image and a text description, our WeCromCL conducts atomistic\ncontrastive learning to model the character-wise appearance consistency between\na text transcription and its correlated region in a scene image to detect an\nanchor point for the transcription in a weakly supervised manner. The detected\nanchor points by WeCromCL are further used as pseudo location labels to guide\nthe learning of text spotting. Extensive experiments on four challenging\nbenchmarks demonstrate the superior performance of our model over other\nmethods. Code will be released.\n","authors":["Jingjing Wu","Zhengyao Fang","Pengyuan Lyu","Chengquan Zhang","Fanglin Chen","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2407.19507v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2401.00403v2","updated":"2024-07-28T14:33:47Z","published":"2023-12-31T05:37:27Z","title":"Overcome Modal Bias in Multi-modal Federated Learning via Balanced\n Modality Selection","summary":" Selecting proper clients to participate in each federated learning (FL) round\nis critical to effectively harness a broad range of distributed data. Existing\nclient selection methods simply consider the mining of distributed uni-modal\ndata, yet, their effectiveness may diminish in multi-modal FL (MFL) as the\nmodality imbalance problem not only impedes the collaborative local training\nbut also leads to a severe global modality-level bias. We empirically reveal\nthat local training with a certain single modality may contribute more to the\nglobal model than training with all local modalities. To effectively exploit\nthe distributed multiple modalities, we propose a novel Balanced Modality\nSelection framework for MFL (BMSFed) to overcome the modal bias. On the one\nhand, we introduce a modal enhancement loss during local training to alleviate\nlocal imbalance based on the aggregated global prototypes. On the other hand,\nwe propose the modality selection aiming to select subsets of local modalities\nwith great diversity and achieving global modal balance simultaneously. Our\nextensive experiments on audio-visual, colored-gray, and front-back datasets\nshowcase the superiority of BMSFed over baselines and its effectiveness in\nmulti-modal data exploitation.\n","authors":["Yunfeng Fan","Wenchao Xu","Haozhao Wang","Fushuo Huo","Jinyu Chen","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2401.00403v2.pdf","comment":"Accepted by ECCV24, 23 pages"},{"id":"http://arxiv.org/abs/2312.08977v3","updated":"2024-07-28T14:09:32Z","published":"2023-12-14T14:26:57Z","title":"Weighted Ensemble Models Are Strong Continual Learners","summary":" In this work, we study the problem of continual learning (CL) where the goal\nis to learn a model on a sequence of tasks, such that the data from the\nprevious tasks becomes unavailable while learning on the current task data. 
CL\nis essentially a balancing act between being able to learn on the new task\n(i.e., plasticity) and maintaining the performance on the previously learned\nconcepts (i.e., stability). Intending to address the stability-plasticity\ntrade-off, we propose to perform weight-ensembling of the model parameters of\nthe previous and current tasks. This weighted-ensembled model, which we call\nContinual Model Averaging (or CoMA), attains high accuracy on the current task\nby leveraging plasticity, while not deviating too far from the previous weight\nconfiguration, ensuring stability. We also propose an improved variant of CoMA,\nnamed Continual Fisher-weighted Model Averaging (or CoFiMA), that selectively\nweighs each parameter in the weights ensemble by leveraging the Fisher\ninformation of the weights of the model. Both variants are conceptually simple,\neasy to implement, and effective in attaining state-of-the-art performance on\nseveral standard CL benchmarks. Code is available at:\nhttps://github.com/IemProg/CoFiMA.\n","authors":["Imad Eddine Marouf","Subhankar Roy","Enzo Tartaglione","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2312.08977v3.pdf","comment":"Accepted for ECCV2024, Code: https://github.com/IemProg/CoFiMA"},{"id":"http://arxiv.org/abs/2407.19497v1","updated":"2024-07-28T13:57:03Z","published":"2024-07-28T13:57:03Z","title":"Skeleton-based Group Activity Recognition via Spatial-Temporal Panoramic\n Graph","summary":" Group Activity Recognition aims to understand collective activities from\nvideos. Existing solutions primarily rely on the RGB modality, which encounters\nchallenges such as background variations, occlusions, motion blurs, and\nsignificant computational overhead. Meanwhile, current keypoint-based methods\noffer a lightweight and informative representation of human motions but\nnecessitate accurate individual annotations and specialized interaction\nreasoning modules. To address these limitations, we design a panoramic graph\nthat incorporates multi-person skeletons and objects to encapsulate group\nactivity, offering an effective alternative to RGB video. This panoramic graph\nenables Graph Convolutional Network (GCN) to unify intra-person, inter-person,\nand person-object interactive modeling through spatial-temporal graph\nconvolutions. In practice, we develop a novel pipeline that extracts skeleton\ncoordinates using pose estimation and tracking algorithms and employ\nMulti-person Panoramic GCN (MP-GCN) to predict group activities. Extensive\nexperiments on Volleyball and NBA datasets demonstrate that the MP-GCN achieves\nstate-of-the-art performance in both accuracy and efficiency. Notably, our\nmethod outperforms RGB-based approaches by using only estimated 2D keypoints as\ninput. Code is available at https://github.com/mgiant/MP-GCN\n","authors":["Zhengcen Li","Xinle Chang","Yueran Li","Jingyong Su"],"pdf_url":"https://arxiv.org/pdf/2407.19497v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.16089v2","updated":"2024-07-28T16:08:29Z","published":"2024-05-25T06:41:23Z","title":"Towards Completeness-Oriented Tool Retrieval for Large Language Models","summary":" Recently, integrating external tools with Large Language Models (LLMs) has\ngained significant attention as an effective strategy to mitigate the\nlimitations inherent in their pre-training data. 
However, real-world systems\noften incorporate a wide array of tools, making it impractical to input all\ntools into LLMs due to length limitations and latency constraints. Therefore,\nto fully exploit the potential of tool-augmented LLMs, it is crucial to develop\nan effective tool retrieval system. Existing tool retrieval methods primarily\nfocus on semantic matching between user queries and tool descriptions,\nfrequently leading to the retrieval of redundant, similar tools. Consequently,\nthese methods fail to provide a complete set of diverse tools necessary for\naddressing the multifaceted problems encountered by LLMs. In this paper, we\npropose a novel model-agnostic COllaborative Learning-based Tool Retrieval\napproach, COLT, which not only captures the semantic similarities between user\nqueries and tool descriptions but also takes into account the collaborative\ninformation of tools. Specifically, we first fine-tune the PLM-based retrieval\nmodels to capture the semantic relationships between queries and tools in the\nsemantic learning stage. Subsequently, we construct three bipartite graphs\namong queries, scenes, and tools and introduce a dual-view graph collaborative\nlearning framework to capture the intricate collaborative relationships among\ntools during the collaborative learning stage. Extensive experiments on both\nthe open benchmark and the newly introduced ToolLens dataset show that COLT\nachieves superior performance. Notably, the performance of BERT-mini (11M) with\nour proposed model framework outperforms BERT-large (340M), which has 30 times\nmore parameters. Furthermore, we will release ToolLens publicly to facilitate\nfuture research on tool retrieval.\n","authors":["Changle Qu","Sunhao Dai","Xiaochi Wei","Hengyi Cai","Shuaiqiang Wang","Dawei Yin","Jun Xu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.16089v2.pdf","comment":"Accepted by CIKM 2024; GitHub: https://github.com/quchangle1/COLT"},{"id":"http://arxiv.org/abs/2407.17234v2","updated":"2024-07-28T15:00:40Z","published":"2024-07-24T12:42:41Z","title":"Intent-guided Heterogeneous Graph Contrastive Learning for\n Recommendation","summary":" Contrastive Learning (CL)-based recommender systems have gained prominence in\nthe context of Heterogeneous Graph (HG) due to their capacity to enhance the\nconsistency of representations across different views. However, existing\nframeworks often neglect the fact that user-item interactions within HG are\ngoverned by diverse latent intents (e.g., brand preferences or demographic\ncharacteristics of item audiences), which are pivotal in capturing fine-grained\nrelations. The exploration of these underlying intents, particularly through\nthe lens of meta-paths in HGs, presents us with two principal challenges: i)\nHow to integrate CL with intents; ii) How to mitigate noise from\nmeta-path-driven intents.\n To address these challenges, we propose an innovative framework termed\nIntent-guided Heterogeneous Graph Contrastive Learning (IHGCL), which is designed\nto enhance CL-based recommendation by capturing the intents contained within\nmeta-paths. Specifically, the IHGCL framework includes: i) a meta-path-based\nDual Contrastive Learning (DCL) approach to effectively integrate intents into\nthe recommendation, constructing intent-intent contrast and intent-interaction\ncontrast; ii) a Bottlenecked AutoEncoder (BAE) that combines mask propagation\nwith the information bottleneck principle to significantly reduce noise\nperturbations introduced by meta-paths. 
Empirical evaluations conducted across\nsix distinct datasets demonstrate the superior performance of our IHGCL\nframework relative to conventional baseline methods. Our model implementation\nis available at https://github.com/wangyu0627/IHGCL.\n","authors":["Lei Sang","Yu Wang","Yi Zhang","Yiwen Zhang","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2407.17234v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.01582v2","updated":"2024-07-28T13:12:03Z","published":"2024-04-01T12:20:34Z","title":"BERT-Enhanced Retrieval Tool for Homework Plagiarism Detection System","summary":" Text plagiarism detection is a common natural language processing task\nthat aims to detect whether a given text contains plagiarism or copying from\nother texts. In existing research, detection of high-level plagiarism is still\na challenge due to the lack of high-quality datasets. In this paper, we propose\na plagiarized text data generation method based on GPT-3.5, which produces\n32,927 pairs of text plagiarism detection datasets covering a wide range of\nplagiarism methods, bridging the gap in this part of research. Meanwhile, we\npropose a plagiarism identification method based on Faiss with BERT with high\nefficiency and high accuracy. Our experiments show that this\nmodel outperforms other models in several metrics, achieving 98.86%, 98.90%,\n98.86%, and 0.9888 for Accuracy, Precision, Recall, and F1 Score, respectively.\nAt the end, we also provide a user-friendly demo platform that allows users to\nupload a text library and intuitively participate in the plagiarism analysis.\n","authors":["Jiarong Xian","Jibao Yuan","Peiwei Zheng","Dexian Chen","Nie yuntao"],"pdf_url":"https://arxiv.org/pdf/2404.01582v2.pdf","comment":"arXiv admin note: text overlap with arXiv:1604.06573 by other authors"},{"id":"http://arxiv.org/abs/2407.19469v1","updated":"2024-07-28T11:46:55Z","published":"2024-07-28T11:46:55Z","title":"Interpretable Triplet Importance for Personalized Ranking","summary":" Personalized item ranking has been a crucial component contributing to the\nperformance of recommender systems. As a representative approach, pairwise\nranking directly optimizes the ranking with user implicit feedback by\nconstructing (\\textit{user}, \\textit{positive item}, \\textit{negative item})\ntriplets. Several recent works have noticed that treating all triplets equally\nmay hardly achieve the best effects. They assign different importance scores to\nnegative items, user-item pairs, or triplets, respectively. However, almost all\nthe generated importance scores are groundless and hard to interpret, thus far\nfrom trustworthy and transparent. To tackle these, we propose the\n\\textit{Triplet Shapley} -- a Shapley value-based method to measure the triplet\nimportance in an interpretable manner. Due to the huge number of triplets, we\ntransform the original Shapley value calculation to the Monte Carlo (MC)\napproximation, where the guarantee for the approximation unbiasedness is also\nprovided. To stabilize the MC approximation, we adopt a control\ncovariates-based method. 
Finally, we utilize the triplet Shapley value to guide\nthe resampling of important triplets for benefiting the model learning.\nExtensive experiments are conducted on six public datasets involving classical\nmatrix factorization- and graph neural network-based recommendation models.\nEmpirical results and subsequent analysis show that our model consistently\noutperforms the state-of-the-art methods.\n","authors":["Bowei He","Chen Ma"],"pdf_url":"https://arxiv.org/pdf/2407.19469v1.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2407.19467v1","updated":"2024-07-28T11:36:47Z","published":"2024-07-28T11:36:47Z","title":"Enhancing Taobao Display Advertising with Multimodal Representations:\n Challenges, Approaches and Insights","summary":" Despite the recognized potential of multimodal data to improve model\naccuracy, many large-scale industrial recommendation systems, including Taobao\ndisplay advertising system, predominantly depend on sparse ID features in their\nmodels. In this work, we explore approaches to leverage multimodal data to\nenhance the recommendation accuracy. We start from identifying the key\nchallenges in adopting multimodal data in a manner that is both effective and\ncost-efficient for industrial systems. To address these challenges, we\nintroduce a two-phase framework, including: 1) the pre-training of multimodal\nrepresentations to capture semantic similarity, and 2) the integration of these\nrepresentations with existing ID-based models. Furthermore, we detail the\narchitecture of our production system, which is designed to facilitate the\ndeployment of multimodal representations. Since the integration of multimodal\nrepresentations in mid-2023, we have observed significant performance\nimprovements in Taobao display advertising system. We believe that the insights\nwe have gathered will serve as a valuable resource for practitioners seeking to\nleverage multimodal data in their systems.\n","authors":["Xiang-Rong Sheng","Feifan Yang","Litong Gong","Biao Wang","Zhangming Chan","Yujing Zhang","Yueyao Cheng","Yong-Nan Zhu","Tiezheng Ge","Han Zhu","Yuning Jiang","Jian Xu","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2407.19467v1.pdf","comment":"Accepted at CIKM 2024"},{"id":"http://arxiv.org/abs/2404.00236v2","updated":"2024-07-28T01:02:21Z","published":"2024-03-30T03:56:53Z","title":"Enhancing Content-based Recommendation via Large Language Model","summary":" In real-world applications, users express different behaviors when they\ninteract with different items, including implicit click/like interactions, and\nexplicit comments/reviews interactions. Nevertheless, almost all recommender\nworks are focused on how to describe user preferences by the implicit\nclick/like interactions, to find the synergy of people. For the content-based\nexplicit comments/reviews interactions, some works attempt to utilize them to\nmine the semantic knowledge to enhance recommender models. However, they still\nneglect the following two points: (1) The content semantic is a universal world\nknowledge; how do we extract the multi-aspect semantic information to empower\ndifferent domains? 
(2) The user/item ID feature is a fundamental element for\nrecommender models; how do we align the ID and content semantic feature space?\nIn this paper, we propose a `plugin' semantic knowledge transferring method\n\\textbf{LoID}, which includes two major components: (1) LoRA-based large\nlanguage model pretraining to extract multi-aspect semantic information; (2)\nID-based contrastive objective to align their feature spaces. We conduct\nextensive experiments with SOTA baselines on real-world datasets, the detailed\nresults demonstrating significant improvements of our method LoID.\n","authors":["Wentao Xu","Qianqian Xie","Shuo Yang","Jiangxia Cao","Shuchao Pang"],"pdf_url":"https://arxiv.org/pdf/2404.00236v2.pdf","comment":"Accepted at CIKM 2024"},{"id":"http://arxiv.org/abs/2404.00026v4","updated":"2024-07-28T00:29:22Z","published":"2024-03-20T21:02:16Z","title":"Ink and Individuality: Crafting a Personalised Narrative in the Age of\n LLMs","summary":" Individuality and personalization comprise the distinctive characteristics\nthat make each writer unique and influence their words in order to effectively\nengage readers while conveying authenticity. However, our growing reliance on\nLLM-based writing assistants risks compromising our creativity and\nindividuality over time. We often overlook the negative impacts of this trend\non our creativity and uniqueness, despite the possible consequences. This study\ninvestigates these concerns by performing a brief survey to explore different\nperspectives and concepts, as well as trying to understand people's viewpoints,\nin conjunction with past studies in the area. Addressing these issues is\nessential for improving human-computer interaction systems and enhancing\nwriting assistants for personalization and individuality.\n","authors":["Azmine Toushik Wasi","Raima Islam","Mst Rafia Islam"],"pdf_url":"https://arxiv.org/pdf/2404.00026v4.pdf","comment":"8 Pages, 4 Figures. Accepted in The Third Workshop on Intelligent and\n Interactive Writing Assistants at CHI 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.14521v4","updated":"2024-07-28T22:59:57Z","published":"2024-01-25T21:26:49Z","title":"Towards Interpretable Physical-Conceptual Catchment-Scale Hydrological\n Modeling using the Mass-Conserving-Perceptron","summary":" We investigate the applicability of machine learning technologies to the\ndevelopment of parsimonious, interpretable, catchment-scale hydrologic models\nusing directed-graph architectures based on the mass-conserving perceptron\n(MCP) as the fundamental computational unit. Here, we focus on architectural\ncomplexity (depth) at a single location, rather than universal applicability\n(breadth) across large samples of catchments. The goal is to discover a minimal\nrepresentation (numbers of cell-states and flow paths) that represents the\ndominant processes that can explain the input-state-output behaviors of a given\ncatchment, with particular emphasis given to simulating the full range (high,\nmedium, and low) of flow dynamics. We find that a HyMod Like architecture with\nthree cell-states and two major flow pathways achieves such a representation at\nour study location, but that the additional incorporation of an input-bypass\nmechanism significantly improves the timing and shape of the hydrograph, while\nthe inclusion of bi-directional groundwater mass exchanges significantly\nenhances the simulation of baseflow. 
Overall, our results demonstrate the\nimportance of using multiple diagnostic metrics for model evaluation, while\nhighlighting the need for properly selecting and designing the training metrics\nbased on information-theoretic foundations that are better suited to extracting\ninformation across the full range of flow dynamics. This study sets the stage\nfor interpretable regional-scale MCP-based hydrological modeling (using large\nsample data) by using neural architecture search to determine appropriate\nminimal representations for catchments in different hydroclimatic regimes.\n","authors":["Yuan-Heng Wang","Hoshin V. Gupta"],"pdf_url":"https://arxiv.org/pdf/2401.14521v4.pdf","comment":"65 pages, 8 Figures, 4 Tables, 1 Supplementary Material"},{"id":"http://arxiv.org/abs/2309.15169v2","updated":"2024-07-28T22:41:20Z","published":"2023-09-26T18:05:19Z","title":"Revealing the Power of Masked Autoencoders in Traffic Forecasting","summary":" Traffic forecasting, crucial for urban planning, requires accurate\npredictions of spatial-temporal traffic patterns across urban areas. Existing\nresearch mainly focuses on designing complex models that capture\nspatial-temporal dependencies among variables explicitly. However, this field\nfaces challenges related to data scarcity and model stability, which results in\nlimited performance improvement. To address these issues, we propose\nSpatial-Temporal Masked AutoEncoders (STMAE), a plug-and-play framework\ndesigned to enhance existing spatial-temporal models on traffic prediction.\nSTMAE consists of two learning stages. In the pretraining stage, an encoder\nprocesses partially visible traffic data produced by a dual-masking strategy,\nincluding biased random walk-based spatial masking and patch-based temporal\nmasking. Subsequently, two decoders aim to reconstruct the masked counterparts\nfrom both spatial and temporal perspectives. The fine-tuning stage retains the\npretrained encoder and integrates it with decoders from existing backbones to\nimprove forecasting accuracy. Our results on traffic benchmarks show that STMAE\ncan largely enhance the forecasting capabilities of various spatial-temporal\nmodels.\n","authors":["Jiarui Sun","Yujie Fan","Chin-Chia Michael Yeh","Wei Zhang","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2309.15169v2.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2403.12977v2","updated":"2024-07-28T21:59:57Z","published":"2024-02-10T01:16:21Z","title":"SportsNGEN: Sustained Generation of Realistic Multi-player Sports\n Gameplay","summary":" We present a transformer decoder based sports simulation engine, SportsNGEN,\ntrained on sports player and ball tracking sequences, that is capable of\ngenerating sustained gameplay and accurately mimicking the decision making of\nreal players. By training on a large database of professional tennis tracking\ndata, we demonstrate that simulations produced by SportsNGEN can be used to\npredict the outcomes of rallies, determine the best shot choices at any point,\nand evaluate counterfactual or what if scenarios to inform coaching decisions\nand elevate broadcast coverage. By combining the generated simulations with a\nshot classifier and logic to start and end rallies, the system is capable of\nsimulating an entire tennis match. We evaluate SportsNGEN by comparing\nstatistics of the simulations with those of real matches between the same\nplayers. 
We show that the model output sampling parameters are crucial to\nsimulation realism and that SportsNGEN is probabilistically well-calibrated to\nreal data. In addition, a generic version of SportsNGEN can be customized to a\nspecific player by fine-tuning on the subset of match data that includes that\nplayer. Finally, we show qualitative results indicating the same approach works\nfor football.\n","authors":["Lachlan Thorpe","Lewis Bawden","Karanjot Vendal","John Bronskill","Richard E. Turner"],"pdf_url":"https://arxiv.org/pdf/2403.12977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10988v2","updated":"2024-07-28T21:52:11Z","published":"2024-05-16T06:05:16Z","title":"Flow Score Distillation for Diverse Text-to-3D Generation","summary":" Recent advancements in Text-to-3D generation have yielded remarkable\nprogress, particularly through methods that rely on Score Distillation Sampling\n(SDS). While SDS exhibits the capability to create impressive 3D assets, it is\nhindered by its inherent maximum-likelihood-seeking essence, resulting in\nlimited diversity in generation outcomes. In this paper, we discover that the\nDenoise Diffusion Implicit Models (DDIM) generation process (\\ie PF-ODE) can be\nsuccinctly expressed using an analogue of SDS loss. One step further, one can\nsee SDS as a generalized DDIM generation process. Following this insight, we\nshow that the noise sampling strategy in the noise addition stage significantly\nrestricts the diversity of generation results. To address this limitation, we\npresent an innovative noise sampling approach and introduce a novel text-to-3D\nmethod called Flow Score Distillation (FSD). Our validation experiments across\nvarious text-to-image Diffusion Models demonstrate that FSD substantially\nenhances generation diversity without compromising quality.\n","authors":["Runjie Yan","Kailu Wu","Kaisheng Ma"],"pdf_url":"https://arxiv.org/pdf/2405.10988v2.pdf","comment":"Consistent Flow Distillation is an improved version of this paper"},{"id":"http://arxiv.org/abs/2404.14701v2","updated":"2024-07-28T21:22:53Z","published":"2024-04-23T03:01:09Z","title":"Deep neural networks for choice analysis: Enhancing behavioral\n regularity with gradient regularization","summary":" Deep neural networks (DNNs) frequently present behaviorally irregular\npatterns, significantly limiting their practical potentials and theoretical\nvalidity in travel behavior modeling. This study proposes strong and weak\nbehavioral regularities as novel metrics to evaluate the monotonicity of\nindividual demand functions (known as the \"law of demand\"), and further designs\na constrained optimization framework with six gradient regularizers to enhance\nDNNs' behavioral regularity. The proposed framework is applied to travel survey\ndata from Chicago and London to examine the trade-off between predictive power\nand behavioral regularity for large vs. small sample scenarios and in-domain\nvs. out-of-domain generalizations. The results demonstrate that, unlike models\nwith strong behavioral foundations such as the multinomial logit, the benchmark\nDNNs cannot guarantee behavioral regularity. However, gradient regularization\n(GR) increases DNNs' behavioral regularity by around 6 percentage points (pp)\nwhile retaining their relatively high predictive power. In the small sample\nscenario, GR is more effective than in the large sample scenario,\nsimultaneously improving behavioral regularity by about 20 pp and\nlog-likelihood by around 1.7%. 
Comparing with the in-domain generalization of\nDNNs, GR works more effectively in out-of-domain generalization: it drastically\nimproves the behavioral regularity of poorly performing benchmark DNNs by\naround 65 pp, indicating the criticality of behavioral regularization for\nenhancing model transferability and application in forecasting. Moreover, the\nproposed framework is applicable to other NN-based choice models such as\nTasteNets. Future studies could use behavioral regularity as a metric along\nwith log-likelihood in evaluating travel demand models, and investigate other\nmethods to further enhance behavioral regularity when adopting complex machine\nlearning models.\n","authors":["Siqi Feng","Rui Yao","Stephane Hess","Ricardo A. Daziano","Timothy Brathwaite","Joan Walker","Shenhao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19580v1","updated":"2024-07-28T20:39:16Z","published":"2024-07-28T20:39:16Z","title":"Memory-efficient Training of LLMs with Larger Mini-batches","summary":" Training with larger mini-batches improves the performance and convergence\nrate of training machine learning models. However, training with large\nmini-batches becomes prohibitive for Large Language Models (LLMs) with billions\nof parameters, due to the large GPU memory requirement. To address this\nproblem, we propose finding small mini-batches that simulate the dynamics of\ntraining with larger mini-batches. Specifically, we formulate selecting smaller\nmini-batches of examples that closely capture gradients of large mini-batches\nas a submodular maximization problem. Nevertheless, the very large\ndimensionality of the gradients makes the problem very challenging to solve. To\naddress this, we leverage ideas from zeroth-order optimization and neural\nnetwork pruning to find lower-dimensional gradient estimates that allow finding\nhigh-quality subsets effectively with a limited amount of memory. We prove the\nsuperior convergence rate of training on the small mini-batches found by our\nmethod and empirically show its effectiveness. Our method can effectively\nreduce the memory requirement by 2x and speed up training by 1.3x, as we\nconfirm for fine-tuning Phi-2 on MathInstruct. Our method can be easily stacked\nwith LoRA and other memory-efficient methods to further reduce the memory\nrequirements of training LLMs.\n","authors":["Dang Nguyen","Wenhan Yang","Rathul Anand","Yu Yang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2407.19580v1.pdf","comment":"15 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.00082v2","updated":"2024-07-28T20:15:10Z","published":"2024-04-30T18:00:00Z","title":"Structure learning of Hamiltonians from real-time evolution","summary":" We study the problem of Hamiltonian structure learning from real-time\nevolution: given the ability to apply $e^{-\\mathrm{i} Ht}$ for an unknown local\nHamiltonian $H = \\sum_{a = 1}^m \\lambda_a E_a$ on $n$ qubits, the goal is to\nrecover $H$. This problem is already well-understood under the assumption that\nthe interaction terms, $E_a$, are given, and only the interaction strengths,\n$\\lambda_a$, are unknown. 
But how efficiently can we learn a local Hamiltonian\nwithout prior knowledge of its interaction structure?\n We present a new, general approach to Hamiltonian learning that not only\nsolves the challenging structure learning variant, but also resolves other open\nquestions in the area, all while achieving the gold standard of\nHeisenberg-limited scaling. In particular, our algorithm recovers the\nHamiltonian to $\\varepsilon$ error with total evolution time $O(\\log\n(n)/\\varepsilon)$, and has the following appealing properties: (1) it does not\nneed to know the Hamiltonian terms; (2) it works beyond the short-range\nsetting, extending to any Hamiltonian $H$ where the sum of terms interacting\nwith a qubit has bounded norm; (3) it evolves according to $H$ in constant time\n$t$ increments, thus achieving constant time resolution. As an application, we\ncan also learn Hamiltonians exhibiting power-law decay up to accuracy\n$\\varepsilon$ with total evolution time beating the standard limit of\n$1/\\varepsilon^2$.\n","authors":["Ainesh Bakshi","Allen Liu","Ankur Moitra","Ewin Tang"],"pdf_url":"https://arxiv.org/pdf/2405.00082v2.pdf","comment":"52 pages; v2 discussed more literature, qualified some claims"},{"id":"http://arxiv.org/abs/2407.07895v2","updated":"2024-07-28T19:58:08Z","published":"2024-07-10T17:59:43Z","title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large\n Multimodal Models","summary":" Visual instruction tuning has made considerable strides in enhancing the\ncapabilities of Large Multimodal Models (LMMs). However, existing open LMMs\nlargely focus on single-image tasks, their applications to multi-image\nscenarios remains less explored. Additionally, prior LMM research separately\ntackles different scenarios, leaving it impossible to generalize cross\nscenarios with new emerging capabilities. To this end, we introduce\nLLaVA-NeXT-Interleave, which simultaneously tackles Multi-image, Multi-frame\n(video), Multi-view (3D), and Multi-patch (single-image) scenarios in LMMs. To\nenable these capabilities, we regard the interleaved data format as a general\ntemplate and compile the M4-Instruct dataset with 1,177.6k samples, spanning 4\nprimary domains with 14 tasks and 41 datasets. We also curate the\nLLaVA-Interleave Bench to comprehensively evaluate the multi-image performance\nof LMMs. Through extensive experiments, LLaVA-NeXT-Interleave achieves leading\nresults in multi-image, video, and 3D benchmarks, while maintaining the\nperformance of single-image tasks. Besides, our model also exhibits several\nemerging capabilities, e.g., transferring tasks across different settings and\nmodalities. Code is available at https://github.com/LLaVA-VL/LLaVA-NeXT\n","authors":["Feng Li","Renrui Zhang","Hao Zhang","Yuanhan Zhang","Bo Li","Wei Li","Zejun Ma","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2407.07895v2.pdf","comment":"Project Page:\n https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/"},{"id":"http://arxiv.org/abs/2407.19567v1","updated":"2024-07-28T19:23:56Z","published":"2024-07-28T19:23:56Z","title":"Sharp Bounds for Poly-GNNs and the Effect of Graph Noise","summary":" We investigate the classification performance of graph neural networks with\ngraph-polynomial features, poly-GNNs, on the problem of semi-supervised node\nclassification. We analyze poly-GNNs under a general contextual stochastic\nblock model (CSBM) by providing a sharp characterization of the rate of\nseparation between classes in their output node representations. 
A question of\ninterest is whether this rate depends on the depth of the network $k$, i.e.,\nwhether deeper networks can achieve a faster separation? We provide a negative\nanswer to this question: for a sufficiently large graph, a depth $k > 1$\npoly-GNN exhibits the same rate of separation as a depth $k=1$ counterpart. Our\nanalysis highlights and quantifies the impact of ``graph noise'' in deep GNNs\nand shows how noise in the graph structure can dominate other sources of signal\nin the graph, negating any benefit further aggregation provides. Our analysis\nalso reveals subtle differences between even and odd-layered GNNs in how the\nfeature noise propagates.\n","authors":["Luciano Vinas","Arash A. Amini"],"pdf_url":"https://arxiv.org/pdf/2407.19567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19557v1","updated":"2024-07-28T18:44:49Z","published":"2024-07-28T18:44:49Z","title":"Neural stochastic Volterra equations: learning path-dependent dynamics","summary":" Stochastic Volterra equations (SVEs) serve as mathematical models for the\ntime evolutions of random systems with memory effects and irregular behaviour.\nWe introduce neural stochastic Volterra equations as a physics-inspired\narchitecture, generalizing the class of neural stochastic differential\nequations, and provide some theoretical foundation. Numerical experiments on\nvarious SVEs, like the disturbed pendulum equation, the generalized\nOrnstein--Uhlenbeck process and the rough Heston model are presented, comparing\nthe performance of neural SVEs, neural SDEs and Deep Operator Networks\n(DeepONets).\n","authors":["David J. Prömel","David Scheffels"],"pdf_url":"https://arxiv.org/pdf/2407.19557v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.14120v4","updated":"2024-07-28T18:28:55Z","published":"2023-11-23T17:30:31Z","title":"Weight fluctuations in (deep) linear neural networks and a derivation of\n the inverse-variance flatness relation","summary":" We investigate the stationary (late-time) training regime of single- and\ntwo-layer underparameterized linear neural networks within the continuum limit\nof stochastic gradient descent (SGD) for synthetic Gaussian data. In the case\nof a single-layer network in the weakly underparameterized regime, the spectrum\nof the noise covariance matrix deviates notably from the Hessian, which can be\nattributed to the broken detailed balance of SGD dynamics. The weight\nfluctuations are in this case generally anisotropic, but effectively experience\nan isotropic loss. For an underparameterized two-layer network, we describe the\nstochastic dynamics of the weights in each layer and analyze the associated\nstationary covariances. We identify the inter-layer coupling as a distinct\nsource of anisotropy for the weight fluctuations. In contrast to the\nsingle-layer case, the weight fluctuations are effectively subject to an\nanisotropic loss, the flatness of which is inversely related to the fluctuation\nvariance. We thereby provide an analytical derivation of the recently observed\ninverse variance-flatness relation in a model of a deep linear neural network.\n","authors":["Markus Gross","Arne P. 
Raulf","Christoph Räth"],"pdf_url":"https://arxiv.org/pdf/2311.14120v4.pdf","comment":"27 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.06884v2","updated":"2024-07-28T18:22:17Z","published":"2024-05-11T02:35:08Z","title":"Efficient PAC Learnability of Dynamical Systems Over Multilayer Networks","summary":" Networked dynamical systems are widely used as formal models of real-world\ncascading phenomena, such as the spread of diseases and information. Prior\nresearch has addressed the problem of learning the behavior of an unknown\ndynamical system when the underlying network has a single layer. In this work,\nwe study the learnability of dynamical systems over multilayer networks, which\nare more realistic and challenging. First, we present an efficient PAC learning\nalgorithm with provable guarantees to show that the learner only requires a\nsmall number of training examples to infer an unknown system. We further\nprovide a tight analysis of the Natarajan dimension which measures the model\ncomplexity. Asymptotically, our bound on the Nararajan dimension is tight for\nalmost all multilayer graphs. The techniques and insights from our work provide\nthe theoretical foundations for future investigations of learning problems for\nmultilayer dynamical systems.\n","authors":["Zirou Qiu","Abhijin Adiga","Madhav V. Marathe","S. S. Ravi","Daniel J. Rosenkrantz","Richard E. Stearns","Anil Vullikanti"],"pdf_url":"https://arxiv.org/pdf/2405.06884v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2405.03724v2","updated":"2024-07-28T17:34:22Z","published":"2024-05-06T04:00:00Z","title":"GraphSL: An Open-Source Library for Graph Source Localization Approaches\n and Benchmark Datasets","summary":" We introduce GraphSL, a new library for studying the graph source\nlocalization problem. graph diffusion and graph source localization are inverse\nproblems in nature: graph diffusion predicts information diffusions from\ninformation sources, while graph source localization predicts information\nsources from information diffusions. GraphSL facilitates the exploration of\nvarious graph diffusion models for simulating information diffusions and\nenables the evaluation of cutting-edge source localization approaches on\nestablished benchmark datasets. The source code of GraphSL is made available at\nGithub Repository (https://github.com/xianggebenben/GraphSL). Bug reports and\nfeedback can be directed to the Github issues page\n(https://github.com/xianggebenben/GraphSL/issues).\n","authors":["Junxiang Wang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.03724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19540v1","updated":"2024-07-28T17:14:27Z","published":"2024-07-28T17:14:27Z","title":"Overcoming Uncertain Incompleteness for Robust Multimodal Sequential\n Diagnosis Prediction via Knowledge Distillation and Random Data Erasing","summary":" In this paper, we present NECHO v2, a novel framework designed to enhance the\npredictive accuracy of multimodal sequential patient diagnoses under uncertain\nmissing visit sequences, a common challenge in clinical settings. Firstly, we\nmodify NECHO to handle uncertain modality representation dominance under the\nimperfect data. Next, we develop a systematic knowledge distillation by\nemploying the modified NECHO as both teacher and student. 
It encompasses a\nmodality-wise contrastive and hierarchical distillation, transformer\nrepresentation random distillation, along with other distillations to align\nrepresentations tightly and effectively. We also utilise random erasing on\nindividual data points within sequences during both training and distillation\nof teacher to lightly simulate scenario with missing visit information to\nfoster effective knowledge transfer. As a result, NECHO v2 verifies itself by\nshowing superiority in multimodal sequential diagnosis prediction on both\nbalanced and imbalanced incomplete settings on multimodal healthcare data.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2407.19540v1.pdf","comment":"5 pages, 1 figure, and 4 tables"},{"id":"http://arxiv.org/abs/2310.03447v3","updated":"2024-07-28T16:44:40Z","published":"2023-10-05T10:34:47Z","title":"FLAIM: AIM-based Synthetic Data Generation in the Federated Setting","summary":" Preserving individual privacy while enabling collaborative data sharing is\ncrucial for organizations. Synthetic data generation is one solution, producing\nartificial data that mirrors the statistical properties of private data. While\nnumerous techniques have been devised under differential privacy, they\npredominantly assume data is centralized. However, data is often distributed\nacross multiple clients in a federated manner. In this work, we initiate the\nstudy of federated synthetic tabular data generation. Building upon a SOTA\ncentral method known as AIM, we present DistAIM and FLAIM. We first show that\nit is straightforward to distribute AIM, extending a recent approach based on\nsecure multi-party computation which necessitates additional overhead, making\nit less suited to federated scenarios. We then demonstrate that naively\nfederating AIM can lead to substantial degradation in utility under the\npresence of heterogeneity. To mitigate both issues, we propose an augmented\nFLAIM approach that maintains a private proxy of heterogeneity. We simulate our\nmethods across a range of benchmark datasets under different degrees of\nheterogeneity and show we can improve utility while reducing overhead.\n","authors":["Samuel Maddock","Graham Cormode","Carsten Maple"],"pdf_url":"https://arxiv.org/pdf/2310.03447v3.pdf","comment":"Accepted to KDD 2024"},{"id":"http://arxiv.org/abs/2407.12068v2","updated":"2024-07-28T16:44:21Z","published":"2024-07-16T09:05:31Z","title":"Learning on Graphs with Large Language Models(LLMs): A Deep Dive into\n Model Robustness","summary":" Large Language Models (LLMs) have demonstrated remarkable performance across\nvarious natural language processing tasks. Recently, several LLMs-based\npipelines have been developed to enhance learning on graphs with text\nattributes, showcasing promising performance. However, graphs are well-known to\nbe susceptible to adversarial attacks and it remains unclear whether LLMs\nexhibit robustness in learning on graphs. To address this gap, our work aims to\nexplore the potential of LLMs in the context of adversarial attacks on graphs.\nSpecifically, we investigate the robustness against graph structural and\ntextual perturbations in terms of two dimensions: LLMs-as-Enhancers and\nLLMs-as-Predictors. 
Through extensive experiments, we find that, compared to\nshallow models, both LLMs-as-Enhancers and LLMs-as-Predictors offer superior\nrobustness against structural and textual attacks.Based on these findings, we\ncarried out additional analyses to investigate the underlying causes.\nFurthermore, we have made our benchmark library openly available to facilitate\nquick and fair evaluations, and to encourage ongoing innovative research in\nthis field.\n","authors":["Kai Guo","Zewen Liu","Zhikai Chen","Hongzhi Wen","Wei Jin","Jiliang Tang","Yi Chang"],"pdf_url":"https://arxiv.org/pdf/2407.12068v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.19590v1","updated":"2024-07-28T21:18:30Z","published":"2024-07-28T21:18:30Z","title":"The Future is Meta: Metadata, Formats and Perspectives towards\n Interactive and Personalized AV Content","summary":" The production of media content has undergone tremendous changes in recent\nyears. Multiple daily content updates are just as common for some platforms as\nis processing the provided content specifically for their target audiences.\nSuch features are made possible through metadata, which make information\naccessible by categorizing it. In conjunction with AI-supported tools, metadata\nare shaping the future of audio-visual content production, distribution and\nconsumption. It allows editors to effectively search through archives like in\nthe Tailored Media Project, broadcasters to provide content that is adapted to\nusers' surroundings like in the ARD Audiothek unterwegs project, or give users\nthe ability to experience audio-visual content from different perspectives like\nin the ORPHEUS project. Although these projects provide comprehensive insight\ninto the potential of metadata, their integration in existing infrastructures\nmeets several limitations. For example, content-related metadata may initially\nbe generated at some point during the production process but will then be lost\nat later stages due to current standards and incomplete software\nimplementations. In our contribution, we will discuss requirements and\npotential approaches and give an outlook on possible fields of application and\nuse-cases.\n","authors":["Alexander Weller","Werner Bleisteiner","Christian Hufnagel","Michael Iber"],"pdf_url":"https://arxiv.org/pdf/2407.19590v1.pdf","comment":"12 pages, 4 figures, submitted to the Tonmeistertagung 32"},{"id":"http://arxiv.org/abs/2407.19514v1","updated":"2024-07-28T15:38:58Z","published":"2024-07-28T15:38:58Z","title":"Detached and Interactive Multimodal Learning","summary":" Recently, Multimodal Learning (MML) has gained significant interest as it\ncompensates for single-modality limitations through comprehensive complementary\ninformation within multimodal data. However, traditional MML methods generally\nuse the joint learning framework with a uniform learning objective that can\nlead to the modality competition issue, where feedback predominantly comes from\ncertain modalities, limiting the full potential of others. In response to this\nchallenge, this paper introduces DI-MML, a novel detached MML framework\ndesigned to learn complementary information across modalities under the premise\nof avoiding modality competition. Specifically, DI-MML addresses competition by\nseparately training each modality encoder with isolated learning objectives. 
It\nfurther encourages cross-modal interaction via a shared classifier that defines\na common feature space and employing a dimension-decoupled unidirectional\ncontrastive (DUC) loss to facilitate modality-level knowledge transfer.\nAdditionally, to account for varying reliability in sample pairs, we devise a\ncertainty-aware logit weighting strategy to effectively leverage complementary\ninformation at the instance level during inference. Extensive experiments\nconducted on audio-visual, flow-image, and front-rear view datasets show the\nsuperior performance of our proposed method. The code is released at\nhttps://github.com/fanyunfeng-bit/DI-MML.\n","authors":["Yunfeng Fan","Wenchao Xu","Haozhao Wang","Junhong Liu","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2407.19514v1.pdf","comment":"Accepted by ACM MM 24"},{"id":"http://arxiv.org/abs/2401.00403v2","updated":"2024-07-28T14:33:47Z","published":"2023-12-31T05:37:27Z","title":"Overcome Modal Bias in Multi-modal Federated Learning via Balanced\n Modality Selection","summary":" Selecting proper clients to participate in each federated learning (FL) round\nis critical to effectively harness a broad range of distributed data. Existing\nclient selection methods simply consider the mining of distributed uni-modal\ndata, yet, their effectiveness may diminish in multi-modal FL (MFL) as the\nmodality imbalance problem not only impedes the collaborative local training\nbut also leads to a severe global modality-level bias. We empirically reveal\nthat local training with a certain single modality may contribute more to the\nglobal model than training with all local modalities. To effectively exploit\nthe distributed multiple modalities, we propose a novel Balanced Modality\nSelection framework for MFL (BMSFed) to overcome the modal bias. On the one\nhand, we introduce a modal enhancement loss during local training to alleviate\nlocal imbalance based on the aggregated global prototypes. On the other hand,\nwe propose the modality selection aiming to select subsets of local modalities\nwith great diversity and achieving global modal balance simultaneously. Our\nextensive experiments on audio-visual, colored-gray, and front-back datasets\nshowcase the superiority of BMSFed over baselines and its effectiveness in\nmulti-modal data exploitation.\n","authors":["Yunfeng Fan","Wenchao Xu","Haozhao Wang","Fushuo Huo","Jinyu Chen","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2401.00403v2.pdf","comment":"Accepted by ECCV24, 23 pages"},{"id":"http://arxiv.org/abs/2407.19493v1","updated":"2024-07-28T13:23:43Z","published":"2024-07-28T13:23:43Z","title":"Official-NV: A News Video Dataset for Multimodal Fake News Detection","summary":" News media, especially video news media, have penetrated into every aspect of\ndaily life, which also brings the risk of fake news. Therefore, multimodal fake\nnews detection has recently received more attention. However, the number of\nfake news detection data sets for video modal is small, and these data sets are\ncomposed of unofficial videos uploaded by users, so there is too much useless\ndata. To solve this problem, we present in this paper a dataset named\nOfficial-NV, which consists of officially published news videos on Xinhua. We\ncrawled videos on Xinhua, and then extended the data set using LLM generation\nand manual modification. 
In addition, we benchmarked the data set presented in\nthis paper using a baseline model to demonstrate the advantage of Official-NV\nin multimodal fake news detection.\n","authors":["Yihao Wang","Lizhi Chen","Zhong Qian","Peifeng Li"],"pdf_url":"https://arxiv.org/pdf/2407.19493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19468v1","updated":"2024-07-28T11:39:40Z","published":"2024-07-28T11:39:40Z","title":"MVPbev: Multi-view Perspective Image Generation from BEV with Test-time\n Controllability and Generalizability","summary":" This work aims to address the multi-view perspective RGB generation from text\nprompts given Bird-Eye-View(BEV) semantics. Unlike prior methods that neglect\nlayout consistency, lack the ability to handle detailed text prompts, or are\nincapable of generalizing to unseen view points, MVPbev simultaneously\ngenerates cross-view consistent images of different perspective views with a\ntwo-stage design, allowing object-level control and novel view generation at\ntest-time. Specifically, MVPbev firstly projects given BEV semantics to\nperspective view with camera parameters, empowering the model to generalize to\nunseen view points. Then we introduce a multi-view attention module where\nspecial initialization and de-noising processes are introduced to explicitly\nenforce local consistency among overlapping views w.r.t. cross-view homography.\nLast but not least, MVPbev further allows test-time instance-level\ncontrollability by refining a pre-trained text-to-image diffusion model. Our\nextensive experiments on NuScenes demonstrate that our method is capable of\ngenerating high-resolution photorealistic images from text descriptions with\nthousands of training samples, surpassing the state-of-the-art methods under\nvarious evaluation metrics. We further demonstrate the advances of our method\nin terms of generalizability and controllability with the help of novel\nevaluation metrics and comprehensive human analysis. Our code, data, and model\ncan be found in \\url{https://github.com/kkaiwwana/MVPbev}.\n","authors":["Buyu Liu","Kai Wang","Yansong Liu","Jun Bao","Tingting Han","Jun Yu"],"pdf_url":"https://arxiv.org/pdf/2407.19468v1.pdf","comment":"Accepted by ACM MM24"},{"id":"http://arxiv.org/abs/2407.19456v1","updated":"2024-07-28T10:15:17Z","published":"2024-07-28T10:15:17Z","title":"An Inverse Partial Optimal Transport Framework for Music-guided Movie\n Trailer Generation","summary":" Trailer generation is a challenging video clipping task that aims to select\nhighlighting shots from long videos like movies and re-organize them in an\nattractive way. In this study, we propose an inverse partial optimal transport\n(IPOT) framework to achieve music-guided movie trailer generation. In\nparticular, we formulate the trailer generation task as selecting and sorting\nkey movie shots based on audio shots, which involves matching the latent\nrepresentations across visual and acoustic modalities. We learn a multi-modal\nlatent representation model in the proposed IPOT framework to achieve this aim.\nIn this framework, a two-tower encoder derives the latent representations of\nmovie and music shots, respectively, and an attention-assisted Sinkhorn\nmatching network parameterizes the grounding distance between the shots' latent\nrepresentations and the distribution of the movie shots. 
Taking the\ncorrespondence between the movie shots and its trailer music shots as the\nobserved optimal transport plan defined on the grounding distances, we learn\nthe model by solving an inverse partial optimal transport problem, leading to a\nbi-level optimization strategy. We collect real-world movies and their trailers\nto construct a dataset with abundant label information called CMTD and,\naccordingly, train and evaluate various automatic trailer generators. Compared\nwith state-of-the-art methods, our IPOT method consistently shows superiority\nin subjective visual effects and objective quantitative measurements.\n","authors":["Yutong Wang","Sidan Zhu","Hongteng Xu","Dixin Luo"],"pdf_url":"https://arxiv.org/pdf/2407.19456v1.pdf","comment":"acmmm2024"},{"id":"http://arxiv.org/abs/2407.19415v1","updated":"2024-07-28T07:06:28Z","published":"2024-07-28T07:06:28Z","title":"Start from Video-Music Retrieval: An Inter-Intra Modal Loss for Cross\n Modal Retrieval","summary":" The burgeoning short video industry has accelerated the advancement of\nvideo-music retrieval technology, assisting content creators in selecting\nappropriate music for their videos. In self-supervised training for\nvideo-to-music retrieval, the video and music samples in the dataset are\nseparated from the same video work, so they are all one-to-one matches. This\ndoes not match the real situation. In reality, a video can use different music\nas background music, and a music can be used as background music for different\nvideos. Many videos and music that are not in a pair may be compatible, leading\nto false negative noise in the dataset. A novel inter-intra modal (II) loss is\nproposed as a solution. By reducing the variation of feature distribution\nwithin the two modalities before and after the encoder, II loss can reduce the\nmodel's overfitting to such noise without removing it in a costly and laborious\nway. The video-music retrieval framework, II-CLVM (Contrastive Learning for\nVideo-Music Retrieval), incorporating the II Loss, achieves state-of-the-art\nperformance on the YouTube8M dataset. The framework II-CLVTM shows better\nperformance when retrieving music using multi-modal video information (such as\ntext in videos). Experiments are designed to show that II loss can effectively\nalleviate the problem of false negative noise in retrieval tasks. Experiments\nalso show that II loss improves various self-supervised and supervised\nuni-modal and cross-modal retrieval tasks, and can obtain good retrieval models\nwith a small amount of training samples.\n","authors":["Zeyu Chen","Pengfei Zhang","Kai Ye","Wei Dong","Xin Feng","Yana Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.19415v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.19410v1","updated":"2024-07-28T06:23:06Z","published":"2024-07-28T06:23:06Z","title":"AdaCoder: Adaptive Prompt Compression for Programmatic Visual Question\n Answering","summary":" Visual question answering aims to provide responses to natural language\nquestions given visual input. Recently, visual programmatic models (VPMs),\nwhich generate executable programs to answer questions through large language\nmodels (LLMs), have attracted research interest. However, they often require\nlong input prompts to provide the LLM with sufficient API usage details to\ngenerate relevant code. To address this limitation, we propose AdaCoder, an\nadaptive prompt compression framework for VPMs. AdaCoder operates in two\nphases: a compression phase and an inference phase. 
In the compression phase,\ngiven a preprompt that describes all API definitions in the Python language\nwith example snippets of code, a set of compressed preprompts is generated,\neach depending on a specific question type. In the inference phase, given an\ninput question, AdaCoder predicts the question type and chooses the appropriate\ncorresponding compressed preprompt to generate code to answer the question.\nNotably, AdaCoder employs a single frozen LLM and pre-defined prompts, negating\nthe necessity of additional training and maintaining adaptability across\ndifferent powerful black-box LLMs such as GPT and Claude. In experiments, we\napply AdaCoder to ViperGPT and demonstrate that it reduces token length by\n71.1%, while maintaining or even improving the performance of visual question\nanswering.\n","authors":["Mahiro Ukai","Shuhei Kurita","Atsushi Hashimoto","Yoshitaka Ushiku","Nakamasa Inoue"],"pdf_url":"https://arxiv.org/pdf/2407.19410v1.pdf","comment":null}]},"2024-07-27T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.02885v2","updated":"2024-07-27T23:52:43Z","published":"2024-07-03T07:59:52Z","title":"CogErgLLM: Exploring Large Language Model Systems Design Perspective\n Using Cognitive Ergonomics","summary":" Integrating cognitive ergonomics with LLMs is essential for enhancing safety,\nreliability, and user satisfaction in human-AI interactions. Current LLM design\noften lacks this integration, leading to systems that may not fully align with\nhuman cognitive capabilities and limitations. Insufficient focus on\nincorporating cognitive science methods exacerbates biases in LLM outputs,\nwhile inconsistent application of user-centered design principles results in\nsub-optimal user experiences. To address these challenges, our position paper\nexplores the critical integration of cognitive ergonomics principles into LLM\ndesign, aiming to provide a comprehensive framework and practical guidelines\nfor ethical LLM development. Through our contributions, we seek to advance\nunderstanding and practice in integrating cognitive ergonomics into LLM\nsystems, fostering safer, more reliable, and ethically sound human-AI\ninteractions.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2407.02885v2.pdf","comment":"8 Page, 3 Figures. Accepted to Large Language Models and Cognition @\n ICML 2024 (https://llm-cognition.github.io/#:~:text=CogErgLLM)"},{"id":"http://arxiv.org/abs/2404.03602v2","updated":"2024-07-27T22:01:50Z","published":"2024-04-04T17:19:47Z","title":"Evaluating LLMs at Detecting Errors in LLM Responses","summary":" With Large Language Models (LLMs) being widely used across various tasks,\ndetecting errors in their responses is increasingly crucial. However, little\nresearch has been conducted on error detection of LLM responses. Collecting\nerror annotations on LLM responses is challenging due to the subjective nature\nof many NLP tasks, and thus previous research focuses on tasks of little\npractical value (e.g., word sorting) or limited error types (e.g., faithfulness\nin summarization). 
This work introduces ReaLMistake, the first error detection\nbenchmark consisting of objective, realistic, and diverse errors made by LLMs.\nReaLMistake contains three challenging and meaningful tasks that introduce\nobjectively assessable errors in four categories (reasoning correctness,\ninstruction-following, context-faithfulness, and parameterized knowledge),\neliciting naturally observed and diverse errors in responses of GPT-4 and Llama\n2 70B annotated by experts. We use ReaLMistake to evaluate error detectors\nbased on 12 LLMs. Our findings show: 1) Top LLMs like GPT-4 and Claude 3 detect\nerrors made by LLMs at very low recall, and all LLM-based error detectors\nperform much worse than humans. 2) Explanations by LLM-based error detectors\nlack reliability. 3) LLMs-based error detection is sensitive to small changes\nin prompts but remains challenging to improve. 4) Popular approaches to\nimproving LLMs, including self-consistency and majority vote, do not improve\nthe error detection performance. Our benchmark and code are provided at\nhttps://github.com/psunlpgroup/ReaLMistake.\n","authors":["Ryo Kamoi","Sarkar Snigdha Sarathi Das","Renze Lou","Jihyun Janice Ahn","Yilun Zhao","Xiaoxin Lu","Nan Zhang","Yusen Zhang","Ranran Haoran Zhang","Sujeeth Reddy Vummanthala","Salika Dave","Shaobo Qin","Arman Cohan","Wenpeng Yin","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.03602v2.pdf","comment":"COLM 2024, 46 pages, Benchmark and code:\n https://github.com/psunlpgroup/ReaLMistake"},{"id":"http://arxiv.org/abs/2407.19346v1","updated":"2024-07-27T22:00:52Z","published":"2024-07-27T22:00:52Z","title":"Polynomial Regression as a Task for Understanding In-context Learning\n Through Finetuning and Alignment","summary":" Simple function classes have emerged as toy problems to better understand\nin-context-learning in transformer-based architectures used for large language\nmodels. But previously proposed simple function classes like linear regression\nor multi-layer-perceptrons lack the structure required to explore things like\nprompting and alignment within models capable of in-context-learning. We\npropose univariate polynomial regression as a function class that is just rich\nenough to study prompting and alignment, while allowing us to visualize and\nunderstand what is going on clearly.\n","authors":["Max Wilcoxson","Morten Svendgård","Ria Doshi","Dylan Davis","Reya Vir","Anant Sahai"],"pdf_url":"https://arxiv.org/pdf/2407.19346v1.pdf","comment":"ICML Workshop on In-Context Learning"},{"id":"http://arxiv.org/abs/2407.19345v1","updated":"2024-07-27T21:56:23Z","published":"2024-07-27T21:56:23Z","title":"Inference-Time Selective Debiasing","summary":" We propose selective debiasing -- an inference-time safety mechanism that\naims to increase the overall quality of models in terms of prediction\nperformance and fairness in the situation when re-training a model is\nprohibitive. The method is inspired by selective prediction, where some\npredictions that are considered low quality are discarded at inference time. In\nour approach, we identify the potentially biased model predictions and, instead\nof discarding them, we debias them using LEACE -- a post-processing debiasing\nmethod. To select problematic predictions, we propose a bias quantification\napproach based on KL divergence, which achieves better results than standard UQ\nmethods. 
Experiments with text classification datasets demonstrate that\nselective debiasing helps to close the performance gap between post-processing\nmethods and at-training and pre-processing debiasing techniques.\n","authors":["Gleb Kuzmin","Nemeesh Yadav","Ivan Smirnov","Timothy Baldwin","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2407.19345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19342v1","updated":"2024-07-27T21:12:46Z","published":"2024-07-27T21:12:46Z","title":"Parameter-Efficient Fine-Tuning via Circular Convolution","summary":" Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large\nfoundation models, leveraging low-rank matrices $\\mathbf{A}$ and $\\mathbf{B}$\nto represent weight changes (\\textit{i.e.,} $\\Delta \\mathbf{W} = \\mathbf{B}\n\\mathbf{A}$). This method reduces trainable parameters and mitigates heavy\nmemory consumption associated with full delta matrices by sequentially\nmultiplying $\\mathbf{A}$ and $\\mathbf{B}$ with the activation. Despite its\nsuccess, the intrinsic low-rank characteristic may limit its performance.\nAlthough several variants have been proposed to address this issue, they often\noverlook the crucial computational and memory efficiency brought by LoRA. In\nthis paper, we propose \\underline{C}ir\\underline{c}ular \\underline{C}onvolution\n\\underline{A}daptation (C$^3$A), which not only achieves high-rank adaptation\nwith enhanced performance but also excels in both computational power and\nmemory utilization. Extensive experiments demonstrate that C$^3$A consistently\noutperforms LoRA and its variants across various fine-tuning tasks.\n","authors":["Aochuan Chen","Ziqi Gao","Zijing Liu","Yu Li","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.19342v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.19325v1","updated":"2024-07-27T19:17:10Z","published":"2024-07-27T19:17:10Z","title":"Do Language Models Have a Critical Period for Language Acquisition?","summary":" Humans appear to have a critical period (CP) for language acquisition: Second\nlanguage (L2) acquisition becomes harder after early childhood, and ceasing\nexposure to a first language (L1) after this period (but not before) typically\ndoes not lead to substantial loss of L1 proficiency. It is unknown whether\nthese CP effects result from innately determined brain maturation or as a\nstabilization of neural connections naturally induced by experience. In this\nstudy, we use language models (LMs) to test the extent to which these phenomena\nare peculiar to humans, or shared by a broader class of language learners. We\nvary the age of exposure by training LMs on language pairs in various\nexperimental conditions, and find that LMs, which lack any direct analog to\ninnate maturational stages, do not show CP effects when trained sequentially on\nL1 and L2. Our results contradict the claim that CP effects are an inevitable\nresult of learning in statistical learners, and they are consistent with an\ninnate mechanism for CP effects. We show that we can reverse-engineer the CP by\nintroducing a regularizer partway through training to simulate a maturational\ndecrease in plasticity. 
All in all, our results suggest that L1 learning on its\nown may not be enough to induce a CP, and additional engineering is necessary\nto make language models more cognitively plausible.\n","authors":["Ionut Constantinescu","Tiago Pimentel","Ryan Cotterell","Alex Warstadt"],"pdf_url":"https://arxiv.org/pdf/2407.19325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06214v2","updated":"2024-07-27T18:50:28Z","published":"2024-04-09T11:04:50Z","title":"[Call for Papers] The 2nd BabyLM Challenge: Sample-efficient pretraining\n on a developmentally plausible corpus","summary":" After last year's successful BabyLM Challenge, the competition will be hosted\nagain in 2024/2025. The overarching goals of the challenge remain the same;\nhowever, some of the competition rules will be different. The big changes for\nthis year's competition are as follows: First, we replace the loose track with\na paper track, which allows (for example) non-model-based submissions, novel\ncognitively-inspired benchmarks, or analysis techniques. Second, we are\nrelaxing the rules around pretraining data, and will now allow participants to\nconstruct their own datasets provided they stay within the 100M-word or\n10M-word budget. Third, we introduce a multimodal vision-and-language track,\nand will release a corpus of 50% text-only and 50% image-text multimodal data\nas a starting point for LM model training. The purpose of this CfP is to\nprovide rules for this year's challenge, explain these rule changes and their\nrationale in greater detail, give a timeline of this year's competition, and\nprovide answers to frequently asked questions from last year's challenge.\n","authors":["Leshem Choshen","Ryan Cotterell","Michael Y. Hu","Tal Linzen","Aaron Mueller","Candace Ross","Alex Warstadt","Ethan Wilcox","Adina Williams","Chengxu Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.06214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15454v4","updated":"2024-07-27T17:41:20Z","published":"2024-03-18T23:22:50Z","title":"Emotion Detection with Transformers: A Comparative Study","summary":" In this study, we explore the application of transformer-based models for\nemotion classification on text data. We train and evaluate several pre-trained\ntransformer models, on the Emotion dataset using different variants of\ntransformers. The paper also analyzes some factors that in-fluence the\nperformance of the model, such as the fine-tuning of the transformer layer, the\ntrainability of the layer, and the preprocessing of the text data. Our analysis\nreveals that commonly applied techniques like removing punctuation and stop\nwords can hinder model performance. This might be because transformers strength\nlies in understanding contextual relationships within text. Elements like\npunctuation and stop words can still convey sentiment or emphasis and removing\nthem might disrupt this context.\n","authors":["Mahdi Rezapour"],"pdf_url":"https://arxiv.org/pdf/2403.15454v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14050v3","updated":"2024-07-27T17:37:28Z","published":"2024-03-21T00:20:16Z","title":"Extracting Emotion Phrases from Tweets using BART","summary":" Sentiment analysis is a natural language processing task that aims to\nidentify and extract the emotional aspects of a text. However, many existing\nsentiment analysis methods primarily classify the overall polarity of a text,\noverlooking the specific phrases that convey sentiment. 
In this paper, we\napplied an approach to sentiment analysis based on a question-answering\nframework. Our approach leverages the power of Bidirectional Autoregressive\nTransformer (BART), a pre-trained sequence-to-sequence model, to extract a\nphrase from a given text that amplifies a given sentiment polarity. We create a\nnatural language question that identifies the specific emotion to extract and\nthen guide BART to pay attention to the relevant emotional cues in the text. We\nuse a classifier within BART to predict the start and end positions of the\nanswer span within the text, which helps to identify the precise boundaries of\nthe extracted emotion phrase. Our approach offers several advantages over most\nsentiment analysis studies, including capturing the complete context and\nmeaning of the text and extracting precise token spans that highlight the\nintended sentiment. We achieved an end loss of 87% and Jaccard score of 0.61.\n","authors":["Mahdi Rezapour"],"pdf_url":"https://arxiv.org/pdf/2403.14050v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08699v3","updated":"2024-07-27T17:22:19Z","published":"2024-04-10T16:30:09Z","title":"PoliTune: Analyzing the Impact of Data Selection and Fine-Tuning on\n Economic and Political Biases in Large Language Models","summary":" In an era where language models are increasingly integrated into\ndecision-making and communication, understanding the biases within Large\nLanguage Models (LLMs) becomes imperative, especially when these models are\napplied in the economic and political domains. This work investigates the\nimpact of fine-tuning and data selection on economic and political biases in\nLLMs. In this context, we introduce PoliTune, a fine-tuning methodology to\nexplore the systematic aspects of aligning LLMs with specific ideologies,\nmindful of the biases that arise from their extensive training on diverse\ndatasets. Distinct from earlier efforts that either focus on smaller models or\nentail resource-intensive pre-training, PoliTune employs Parameter-Efficient\nFine-Tuning (PEFT) techniques, which allow for the alignment of LLMs with\ntargeted ideologies by modifying a small subset of parameters. We introduce a\nsystematic method for using the open-source LLM Llama3-70B for dataset\nselection, annotation, and synthesizing a preferences dataset for Direct\nPreference Optimization (DPO) to align the model with a given political\nideology. We assess the effectiveness of PoliTune through both quantitative and\nqualitative evaluations of aligning open-source LLMs (Llama3-8B and Mistral-7B)\nto different ideologies. Our work analyzes the potential of embedding specific\nbiases into LLMs and contributes to the dialogue on the ethical application of\nAI, highlighting the importance of deploying AI in a manner that aligns with\nsocietal values.\n","authors":["Ahmed Agiza","Mohamed Mostagir","Sherief Reda"],"pdf_url":"https://arxiv.org/pdf/2404.08699v3.pdf","comment":"AIES '24: Proceedings of the 2024 AAAI/ACM Conference on AI, Ethics,\n and Society"},{"id":"http://arxiv.org/abs/2407.19302v1","updated":"2024-07-27T17:12:37Z","published":"2024-07-27T17:12:37Z","title":"IBMEA: Exploring Variational Information Bottleneck for Multi-modal\n Entity Alignment","summary":" Multi-modal entity alignment (MMEA) aims to identify equivalent entities\nbetween multi-modal knowledge graphs (MMKGs), where the entities can be\nassociated with related images. 
Most existing studies integrate multi-modal\ninformation heavily relying on the automatically-learned fusion module, rarely\nsuppressing the redundant information for MMEA explicitly. To this end, we\nexplore variational information bottleneck for multi-modal entity alignment\n(IBMEA), which emphasizes the alignment-relevant information and suppresses the\nalignment-irrelevant information in generating entity representations.\nSpecifically, we devise multi-modal variational encoders to generate\nmodal-specific entity representations as probability distributions. Then, we\npropose four modal-specific information bottleneck regularizers, limiting the\nmisleading clues in refining modal-specific entity representations. Finally, we\npropose a modal-hybrid information contrastive regularizer to integrate all the\nrefined modal-specific representations, enhancing the entity similarity between\nMMKGs to achieve MMEA. We conduct extensive experiments on two cross-KG and\nthree bilingual MMEA datasets. Experimental results demonstrate that our model\nconsistently outperforms previous state-of-the-art methods, and also shows\npromising and robust performance in low-resource and high-noise data scenarios.\n","authors":["Taoyu Su","Jiawei Sheng","Shicheng Wang","Xinghua Zhang","Hongbo Xu","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19302v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.19299v1","updated":"2024-07-27T16:48:03Z","published":"2024-07-27T16:48:03Z","title":"The Impact of LoRA Adapters for LLMs on Clinical NLP Classification\n Under Data Limitations","summary":" Fine-tuning Large Language Models (LLMs) for clinical Natural Language\nProcessing (NLP) poses significant challenges due to the domain gap and limited\ndata availability. This study investigates the effectiveness of various adapter\ntechniques, equivalent to Low-Rank Adaptation (LoRA), for fine-tuning LLMs in a\nresource-constrained hospital environment. We experimented with four\nstructures-Adapter, Lightweight, TinyAttention, and Gated Residual Network\n(GRN)-as final layers for clinical notes classification. We fine-tuned\nbiomedical pre-trained models, including CamemBERT-bio, AliBERT, and DrBERT,\nalongside two Transformer-based models. Our extensive experimental results\nindicate that i) employing adapter structures does not yield significant\nimprovements in fine-tuning biomedical pre-trained LLMs, and ii) simpler\nTransformer-based models, trained from scratch, perform better under resource\nconstraints. Among the adapter structures, GRN demonstrated superior\nperformance with accuracy, precision, recall, and an F1 score of 0.88.\nMoreover, the total training time for LLMs exceeded 1000 hours, compared to\nunder 6 hours for simpler transformer-based models, highlighting that LLMs are\nmore suitable for environments with extensive computational resources and\nlarger datasets. Consequently, this study demonstrates that simpler\nTransformer-based models can be effectively trained from scratch, providing a\nviable solution for clinical NLP tasks in low-resource environments with\nlimited data availability. 
By identifying the GRN as the most effective adapter\nstructure, we offer a practical approach to enhance clinical note\nclassification without requiring extensive computational resources.\n","authors":["Thanh-Dung Le","Ti Ti Nguyen","Vu Nguyen Ha"],"pdf_url":"https://arxiv.org/pdf/2407.19299v1.pdf","comment":"Under revisions"},{"id":"http://arxiv.org/abs/2308.02270v2","updated":"2024-07-27T16:32:40Z","published":"2023-08-04T11:47:19Z","title":"Redundancy Aware Multi-Reference Based Gainwise Evaluation of Extractive\n Summarization","summary":" The ROUGE metric is commonly used to evaluate extractive summarization task,\nbut it has been criticized for its lack of semantic awareness and its ignorance\nabout the ranking quality of the extractive summarizer. Previous research has\nintroduced a gain-based automated metric called Sem-nCG that addresses these\nissues, as it is both rank and semantic aware. However, it does not consider\nthe amount of redundancy present in a model summary and currently does not\nsupport evaluation with multiple reference summaries. It is essential to have a\nmodel summary that balances importance and diversity, but finding a metric that\ncaptures both of these aspects is challenging. In this paper, we propose a\nredundancy-aware Sem-nCG metric and demonstrate how the revised Sem-nCG metric\ncan be used to evaluate model summaries against multiple references as well\nwhich was missing in previous research. Experimental results demonstrate that\nthe revised Sem-nCG metric has a stronger correlation with human judgments\ncompared to the previous Sem-nCG metric and traditional ROUGE and BERTScore\nmetric for both single and multiple reference scenarios.\n","authors":["Mousumi Akter","Santu Karmaker"],"pdf_url":"https://arxiv.org/pdf/2308.02270v2.pdf","comment":"Accepted to KNOVENS 2024"},{"id":"http://arxiv.org/abs/2407.19262v1","updated":"2024-07-27T14:00:21Z","published":"2024-07-27T14:00:21Z","title":"Understanding Memorisation in LLMs: Dynamics, Influencing Factors, and\n Implications","summary":" Understanding whether and to what extent large language models (LLMs) have\nmemorised training data has important implications for the reliability of their\noutput and the privacy of their training data. In order to cleanly measure and\ndisentangle memorisation from other phenomena (e.g. in-context learning), we\ncreate an experimental framework that is based on repeatedly exposing LLMs to\nrandom strings. Our framework allows us to better understand the dynamics,\ni.e., the behaviour of the model, when repeatedly exposing it to random\nstrings. Using our framework, we make several striking observations: (a) we\nfind consistent phases of the dynamics across families of models (Pythia, Phi\nand Llama2), (b) we identify factors that make some strings easier to memorise\nthan others, and (c) we identify the role of local prefixes and global context\nin memorisation. We also show that sequential exposition to different random\nstrings has a significant effect on memorisation. Our results, often\nsurprising, have significant downstream implications in the study and usage of\nLLMs.\n","authors":["Till Speicher","Mohammad Aflah Khan","Qinyuan Wu","Vedant Nanda","Soumi Das","Bishwamittra Ghosh","Krishna P. Gummadi","Evimaria Terzi"],"pdf_url":"https://arxiv.org/pdf/2407.19262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19256v1","updated":"2024-07-27T13:41:43Z","published":"2024-07-27T13:41:43Z","title":"Stochastic Parrots or ICU Experts? 
Large Language Models in Critical\n Care Medicine: A Scoping Review","summary":" With the rapid development of artificial intelligence (AI), large language\nmodels (LLMs) have shown strong capabilities in natural language understanding,\nreasoning, and generation, attracting amounts of research interest in applying\nLLMs to health and medicine. Critical care medicine (CCM) provides diagnosis\nand treatment for critically ill patients who often require intensive\nmonitoring and interventions in intensive care units (ICUs). Can LLMs be\napplied to CCM? Are LLMs just like stochastic parrots or ICU experts in\nassisting clinical decision-making? This scoping review aims to provide a\npanoramic portrait of the application of LLMs in CCM. Literature in seven\ndatabases, including PubMed, Embase, Scopus, Web of Science, CINAHL, IEEE\nXplore, and ACM Digital Library, were searched from January 1, 2019, to June\n10, 2024. Peer-reviewed journal and conference articles that discussed the\napplication of LLMs in critical care settings were included. From an initial\n619 articles, 24 were selected for final review. This review grouped\napplications of LLMs in CCM into three categories: clinical decision support,\nmedical documentation and reporting, and medical education and doctor-patient\ncommunication. LLMs have advantages in handling unstructured data and do not\nrequire manual feature engineering. Meanwhile, applying LLMs to CCM faces\nchallenges, including hallucinations, poor interpretability, bias and alignment\nchallenges, and privacy and ethics issues. Future research should enhance model\nreliability and interpretability, integrate up-to-date medical knowledge, and\nstrengthen privacy and ethical guidelines. As LLMs evolve, they could become\nkey tools in CCM to help improve patient outcomes and optimize healthcare\ndelivery. This study is the first review of LLMs in CCM, aiding researchers,\nclinicians, and policymakers to understand the current status and future\npotentials of LLMs in CCM.\n","authors":["Tongyue Shi","Jun Ma","Zihan Yu","Haowei Xu","Minqi Xiong","Meirong Xiao","Yilin Li","Huiying Zhao","Guilan Kong"],"pdf_url":"https://arxiv.org/pdf/2407.19256v1.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.13408v2","updated":"2024-07-27T12:13:39Z","published":"2024-06-19T09:57:19Z","title":"SQLFixAgent: Towards Semantic-Accurate Text-to-SQL Parsing via\n Consistency-Enhanced Multi-Agent Collaboration","summary":" While fine-tuned large language models (LLMs) excel in generating\ngrammatically valid SQL in Text-to-SQL parsing, they often struggle to ensure\nsemantic accuracy in queries, leading to user confusion and diminished system\nusability. To tackle this challenge, we introduce SQLFixAgent, an innovative\nmulti-agent collaborative framework designed for detecting and repairing\nerroneous SQL. Our framework comprises a core agent, SQLRefiner, alongside two\nauxiliary agents: SQLReviewer and QueryCrafter. The SQLReviewer agent employs\nthe rubber duck debugging method to identify potential semantic mismatches\nbetween SQL statement and user query. If the error is detected, the\nQueryCrafter agent generates multiple SQL statements as candidate repairs using\na fine-tuned SQLTool. Subsequently, leveraging similar repair retrieval and\nfailure memory reflexion, the SQLRefiner agent selects the most fitting SQL\nstatement from the candidates as the final repair. We evaluated our proposed\nframework on five Text-to-SQL benchmarks. 
The experimental results show that\nour method consistently enhances the performance of the baseline model,\nspecifically achieving an execution accuracy improvement of over 3\\% on the\nBird benchmark. Our framework also has a higher token efficiency compared to\nother advanced methods, making it more competitive.\n","authors":["Jipeng Cen","Jiaxin Liu","Zhixu Li","Jingjing Wang"],"pdf_url":"https://arxiv.org/pdf/2406.13408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19240v2","updated":"2024-07-27T09:53:36Z","published":"2023-10-30T03:11:30Z","title":"M4LE: A Multi-Ability Multi-Range Multi-Task Multi-Domain Long-Context\n Evaluation Benchmark for Large Language Models","summary":" Managing long sequences has become an important and necessary feature for\nlarge language models (LLMs). However, it is still an open question of how to\ncomprehensively and systematically evaluate the long-sequence capability of\nLLMs. One of the reasons is that conventional and widely-used benchmarks mainly\nconsist of short sequences. In this paper, we propose M4LE, a Multi-ability,\nMulti-range, Multi-task, Multi-domain benchmark for Long-context Evaluation.\nM4LE is based on a diverse NLP task pool comprising 36 NLP datasets, 11 task\ntypes and 12 domains. To alleviate the scarcity of tasks with naturally long\nsequences and incorporate multiple-ability assessment, we propose an automatic\napproach (but with negligible human annotations) to convert short-sequence\ntasks into a unified long-sequence scenario where LLMs have to identify single\nor multiple relevant spans in long contexts based on explicit or semantic\nhints. Specifically, the scenario includes five different types of abilities:\n(1) explicit single-span; (2) semantic single-span; (3) explicit multiple-span;\n(4) semantic multiple-span; and (5) global context understanding. The resulting\nsamples in M4LE are evenly distributed from 1k to 8k input length. We conducted\na systematic evaluation on 11 well-established LLMs, especially those optimized\nfor long-sequence inputs. Our results reveal that: 1) Current LLMs struggle to\nunderstand long context, particularly when tasks require multiple-span\nattention. 2) Semantic retrieval task is more difficult for competent LLMs. 3)\nModels fine-tuned on longer text with position interpolation have comparable\nperformance to those using Neural Tangent Kernel (NTK) aware scaling methods\nwithout fine-tuning. We make our benchmark publicly available to encourage\nfuture research in this challenging area.\n","authors":["Wai-Chung Kwan","Xingshan Zeng","Yufei Wang","Yusen Sun","Liangyou Li","Lifeng Shang","Qun Liu","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2310.19240v2.pdf","comment":"Code and data are available at https://github.com/KwanWaiChung/M4LE"},{"id":"http://arxiv.org/abs/2407.19200v1","updated":"2024-07-27T08:00:27Z","published":"2024-07-27T08:00:27Z","title":"On Behalf of the Stakeholders: Trends in NLP Model Interpretability in\n the Era of LLMs","summary":" Recent advancements in NLP systems, particularly with the introduction of\nLLMs, have led to widespread adoption of these systems by a broad spectrum of\nusers across various domains, impacting decision-making, the job market,\nsociety, and scientific research. This surge in usage has led to an explosion\nin NLP model interpretability and analysis research, accompanied by numerous\ntechnical surveys. Yet, these surveys often overlook the needs and perspectives\nof explanation stakeholders. 
In this paper, we address three fundamental\nquestions: Why do we need interpretability, what are we interpreting, and how?\nBy exploring these questions, we examine existing interpretability paradigms,\ntheir properties, and their relevance to different stakeholders. We further\nexplore the practical implications of these paradigms by analyzing trends from\nthe past decade across multiple research fields. To this end, we retrieved\nthousands of papers and employed an LLM to characterize them. Our analysis\nreveals significant disparities between NLP developers and non-developer users,\nas well as between research fields, underscoring the diverse needs of\nstakeholders. For example, explanations of internal model components are rarely\nused outside the NLP field. We hope this paper informs the future design,\ndevelopment, and application of methods that align with the objectives and\nrequirements of various stakeholders.\n","authors":["Nitay Calderon","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2407.19200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19198v1","updated":"2024-07-27T07:34:49Z","published":"2024-07-27T07:34:49Z","title":"Towards the Dynamics of a DNN Learning Symbolic Interactions","summary":" This study proves the two-phase dynamics of a deep neural network (DNN)\nlearning interactions. Despite the long disappointing view of the faithfulness\nof post-hoc explanation of a DNN, in recent years, a series of theorems have\nbeen proven to show that given an input sample, a small number of interactions\nbetween input variables can be considered as primitive inference patterns,\nwhich can faithfully represent every detailed inference logic of the DNN on\nthis sample. Particularly, it has been observed that various DNNs all learn\ninteractions of different complexities with two-phase dynamics, and this well\nexplains how a DNN's generalization power changes from under-fitting to\nover-fitting. Therefore, in this study, we prove the dynamics of a DNN\ngradually encoding interactions of different complexities, which provides a\ntheoretically grounded mechanism for the over-fitting of a DNN. Experiments\nshow that our theory well predicts the real learning dynamics of various DNNs\non different tasks.\n","authors":["Qihan Ren","Yang Xu","Junpeng Zhang","Yue Xin","Dongrui Liu","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.19198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19196v1","updated":"2024-07-27T07:30:47Z","published":"2024-07-27T07:30:47Z","title":"Why Misinformation is Created? Detecting them by Integrating Intent\n Features","summary":" Various social media platforms, e.g., Twitter and Reddit, allow people to\ndisseminate a plethora of information more efficiently and conveniently.\nHowever, they are inevitably full of misinformation, causing damage to diverse\naspects of our daily lives. To reduce the negative impact, timely\nidentification of misinformation, namely Misinformation Detection (MD), has\nbecome an active research topic receiving widespread attention. As a complex\nphenomenon, the veracity of an article is influenced by various aspects. In\nthis paper, we are inspired by the opposition of intents between misinformation\nand real information. Accordingly, we propose to reason the intent of articles\nand form the corresponding intent features to promote the veracity\ndiscrimination of article features. 
To achieve this, we build a hierarchy of a\nset of intents for both misinformation and real information by referring to the\nexisting psychological theories, and we apply it to reason the intent of\narticles by progressively generating binary answers with an encoder-decoder\nstructure. We form the corresponding intent features and integrate it with the\ntoken features to achieve more discriminative article features for MD. Upon\nthese ideas, we suggest a novel MD method, namely Detecting Misinformation by\nIntegrating Intent featuRes (DM-INTER). To evaluate the performance of\nDM-INTER, we conduct extensive experiments on benchmark MD datasets. The\nexperimental results validate that DM-INTER can outperform the existing\nbaseline MD methods.\n","authors":["Bing Wang","Ximing Li","Changchun Li","Bo Fu","Songwen Pei","Shengsheng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.19196v1.pdf","comment":"11 pages, 3 figures. Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2305.06683v2","updated":"2024-07-27T07:22:15Z","published":"2023-05-11T09:40:24Z","title":"Cost-efficient Crowdsourcing for Span-based Sequence Labeling: Worker\n Selection and Data Augmentation","summary":" This paper introduces a novel crowdsourcing worker selection algorithm,\nenhancing annotation quality and reducing costs. Unlike previous studies\ntargeting simpler tasks, this study contends with the complexities of label\ninterdependencies in sequence labeling. The proposed algorithm utilizes a\nCombinatorial Multi-Armed Bandit (CMAB) approach for worker selection, and a\ncost-effective human feedback mechanism. The challenge of dealing with\nimbalanced and small-scale datasets, which hinders offline simulation of worker\nselection, is tackled using an innovative data augmentation method termed\nshifting, expanding, and shrinking (SES). Rigorous testing on CoNLL 2003 NER\nand Chinese OEI datasets showcased the algorithm's efficiency, with an increase\nin F1 score up to 100.04% of the expert-only baseline, alongside cost savings\nup to 65.97%. The paper also encompasses a dataset-independent test emulating\nannotation evaluation through a Bernoulli distribution, which still led to an\nimpressive 97.56% F1 score of the expert baseline and 59.88% cost savings.\nFurthermore, our approach can be seamlessly integrated into Reinforcement\nLearning from Human Feedback (RLHF) systems, offering a cost-effective solution\nfor obtaining human feedback.\n","authors":["Yujie Wang","Chao Huang","Liner Yang","Zhixuan Fang","Yaping Huang","Yang Liu","Jingsi Yu","Erhong Yang"],"pdf_url":"https://arxiv.org/pdf/2305.06683v2.pdf","comment":"Camera-ready version for CCL 2024"},{"id":"http://arxiv.org/abs/2407.19192v1","updated":"2024-07-27T07:16:07Z","published":"2024-07-27T07:16:07Z","title":"Harmfully Manipulated Images Matter in Multimodal Misinformation\n Detection","summary":" Nowadays, misinformation is widely spreading over various social media\nplatforms and causes extremely negative impacts on society. To combat this\nissue, automatically identifying misinformation, especially those containing\nmultimodal content, has attracted growing attention from the academic and\nindustrial communities, and induced an active research topic named Multimodal\nMisinformation Detection (MMD). Typically, existing MMD methods capture the\nsemantic correlation and inconsistency between multiple modalities, but neglect\nsome potential clues in multimodal content. 
Recent studies suggest that\nmanipulated traces of the images in articles are non-trivial clues for\ndetecting misinformation. Meanwhile, we find that the underlying intentions\nbehind the manipulation, e.g., harmful and harmless, also matter in MMD.\nAccordingly, in this work, we propose to detect misinformation by learning\nmanipulation features that indicate whether the image has been manipulated, as\nwell as intention features regarding the harmful and harmless intentions of the\nmanipulation. Unfortunately, the manipulation and intention labels that make\nthese features discriminative are unknown. To overcome the problem, we propose\ntwo weakly supervised signals as alternatives by introducing additional\ndatasets on image manipulation detection and formulating two classification\ntasks as positive and unlabeled learning problems. Based on these ideas, we\npropose a novel MMD method, namely Harmfully Manipulated Images Matter in MMD\n(HAMI-M3D). Extensive experiments across three benchmark datasets can\ndemonstrate that HAMI-M3D can consistently improve the performance of any MMD\nbaselines.\n","authors":["Bing Wang","Shengsheng Wang","Changchun Li","Renchu Guan","Ximing Li"],"pdf_url":"https://arxiv.org/pdf/2407.19192v1.pdf","comment":"Accepted by ACM MM 2024. Code:\n https://github.com/wangbing1416/HAMI-M3D"},{"id":"http://arxiv.org/abs/2407.17940v2","updated":"2024-07-27T06:56:06Z","published":"2024-07-25T10:58:42Z","title":"Positive Text Reframing under Multi-strategy Optimization","summary":" Differing from sentiment transfer, positive reframing seeks to substitute\nnegative perspectives with positive expressions while preserving the original\nmeaning. With the emergence of pre-trained language models (PLMs), it is\npossible to achieve acceptable results by fine-tuning PLMs. Nevertheless,\ngenerating fluent, diverse and task-constrained reframing text remains a\nsignificant challenge. To tackle this issue, a \\textbf{m}ulti-\\textbf{s}trategy\n\\textbf{o}ptimization \\textbf{f}ramework (MSOF) is proposed in this paper.\nStarting from the objective of positive reframing, we first design positive\nsentiment reward and content preservation reward to encourage the model to\ntransform the negative expressions of the original text while ensuring the\nintegrity and consistency of the semantics. Then, different decoding\noptimization approaches are introduced to improve the quality of text\ngeneration. Finally, based on the modeling formula of positive reframing, we\npropose a multi-dimensional re-ranking method that further selects candidate\nsentences from three dimensions: strategy consistency, text similarity and\nfluency. Extensive experiments on two Seq2Seq PLMs, BART and T5, demonstrate\nour framework achieves significant improvements on unconstrained and controlled\npositive reframing tasks.\n","authors":["Shutong Jia","Biwei Cao","Qingqing Gao","Jiuxin Cao","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19173v1","updated":"2024-07-27T05:04:49Z","published":"2024-07-27T05:04:49Z","title":"FarSSiBERT: A Novel Transformer-based Model for Semantic Similarity\n Measurement of Persian Social Networks Informal Texts","summary":" One fundamental task for NLP is to determine the similarity between two texts\nand evaluate the extent of their likeness. The previous methods for the Persian\nlanguage have low accuracy and are unable to comprehend the structure and\nmeaning of texts effectively. 
Additionally, these methods primarily focus on\nformal texts, but in real-world applications of text processing, there is a\nneed for robust methods that can handle colloquial texts. This requires\nalgorithms that consider the structure and significance of words based on\ncontext, rather than just the frequency of words. The lack of a proper dataset\nfor this task in the Persian language makes it important to develop such\nalgorithms and construct a dataset for Persian text. This paper introduces a\nnew transformer-based model to measure semantic similarity between Persian\ninformal short texts from social networks. In addition, a Persian dataset named\nFarSSiM has been constructed for this purpose, using real data from social\nnetworks and manually annotated and verified by a linguistic expert team. The\nproposed model involves training a large language model using the BERT\narchitecture from scratch. This model, called FarSSiBERT, is pre-trained on\napproximately 104 million Persian informal short texts from social networks,\nmaking it one of a kind in the Persian language. Moreover, a novel specialized\ninformal language tokenizer is provided that not only performs tokenization on\nformal texts well but also accurately identifies tokens that other Persian\ntokenizers are unable to recognize. It has been demonstrated that our proposed\nmodel outperforms ParsBERT, laBSE, and multilingual BERT in the Pearson and\nSpearman's coefficient criteria. Additionally, the pre-trained large language\nmodel has great potential for use in other NLP tasks on colloquial text and as\na tokenizer for less-known informal words.\n","authors":["Seyed Mojtaba Sadjadi","Zeinab Rajabi","Leila Rabiei","Mohammad-Shahram Moin"],"pdf_url":"https://arxiv.org/pdf/2407.19173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19164v1","updated":"2024-07-27T04:16:11Z","published":"2024-07-27T04:16:11Z","title":"Addressing Topic Leakage in Cross-Topic Evaluation for Authorship\n Verification","summary":" Authorship verification (AV) aims to identify whether a pair of texts has the\nsame author. We address the challenge of evaluating AV models' robustness\nagainst topic shifts. The conventional evaluation assumes minimal topic overlap\nbetween training and test data. However, we argue that there can still be topic\nleakage in test data, causing misleading model performance and unstable\nrankings. To address this, we propose an evaluation method called\nHeterogeneity-Informed Topic Sampling (HITS), which creates a smaller dataset\nwith a heterogeneously distributed topic set. Our experimental results\ndemonstrate that HITS-sampled datasets yield a more stable ranking of models\nacross random seeds and evaluation splits. Our contributions include: 1. An\nanalysis of causes and effects of topic leakage. 2. A demonstration of the HITS\nin reducing the effects of topic leakage, and 3. The Robust Authorship\nVerification bENchmark (RAVEN) that allows topic shortcut test to uncover AV\nmodels' reliance on topic-specific features.\n","authors":["Jitkapat Sawatphol","Can Udomcharoenchaikit","Sarana Nutanong"],"pdf_url":"https://arxiv.org/pdf/2407.19164v1.pdf","comment":"Accepted to publish at Transactions of the Association for\n Computational Linguistics"},{"id":"http://arxiv.org/abs/2404.14631v2","updated":"2024-07-27T03:13:19Z","published":"2024-04-23T00:05:48Z","title":"Learning Word Embedding with Better Distance Weighting and Window Size\n Scheduling","summary":" Distributed word representation (a.k.a. 
word embedding) is a key focus in\nnatural language processing (NLP). As a highly successful word embedding model,\nWord2Vec offers an efficient method for learning distributed word\nrepresentations on large datasets. However, Word2Vec lacks consideration for\ndistances between center and context words. We propose two novel methods,\nLearnable Formulated Weights (LFW) and Epoch-based Dynamic Window Size (EDWS),\nto incorporate distance information into two variants of Word2Vec, the\nContinuous Bag-of-Words (CBOW) model and the Continuous Skip-gram (Skip-gram)\nmodel. For CBOW, LFW uses a formula with learnable parameters that best\nreflects the relationship of influence and distance between words to calculate\ndistance-related weights for average pooling, providing insights for future NLP\ntext modeling research. For Skip-gram, we improve its dynamic window size\nstrategy to introduce distance information in a more balanced way. Experiments\nprove the effectiveness of LFW and EDWS in enhancing Word2Vec's performance,\nsurpassing previous state-of-the-art methods.\n","authors":["Chaohao Yang","Chris Ding"],"pdf_url":"https://arxiv.org/pdf/2404.14631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12500v2","updated":"2024-07-27T00:52:44Z","published":"2024-07-17T11:30:04Z","title":"Automate or Assist? The Role of Computational Models in Identifying\n Gendered Discourse in US Capital Trial Transcripts","summary":" The language used by US courtroom actors in criminal trials has long been\nstudied for biases. However, systematic studies for bias in high-stakes court\ntrials have been difficult, due to the nuanced nature of bias and the legal\nexpertise required. Large language models offer the possibility to automate\nannotation. But validating the computational approach requires both an\nunderstanding of how automated methods fit in existing annotation workflows and\nwhat they really offer. We present a case study of adding a computational model\nto a complex and high-stakes problem: identifying gender-biased language in US\ncapital trials for women defendants. Our team of experienced death-penalty\nlawyers and NLP technologists pursue a three-phase study: first annotating\nmanually, then training and evaluating computational models, and finally\ncomparing expert annotations to model predictions. Unlike many typical NLP\ntasks, annotating for gender bias in months-long capital trials is complicated,\nwith many individual judgment calls. Contrary to standard arguments for\nautomation that are based on efficiency and scalability, legal experts find the\ncomputational models most useful in providing opportunities to reflect on their\nown bias in annotation and to build consensus on annotation rules. This\nexperience suggests that seeking to replace experts with computational models\nfor complex annotation is both unrealistic and undesirable. Rather,\ncomputational models offer valuable opportunities to assist the legal experts\nin annotation-based studies.\n","authors":["Andrea W Wen-Yi","Kathryn Adamson","Nathalie Greenfield","Rachel Goldberg","Sandra Babcock","David Mimno","Allison Koenecke"],"pdf_url":"https://arxiv.org/pdf/2407.12500v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.03366v2","updated":"2024-07-27T21:25:00Z","published":"2023-08-07T07:41:01Z","title":"POSIT: Promotion of Semantic Item Tail via Adversarial Learning","summary":" In many recommendations, a handful of popular items (e.g., movies /\ntelevision shows, news, etc.) 
can be dominant in recommendations for many\nusers. However, we know that in a large catalog of items, users are likely\ninterested in more than what is popular. The dominance of popular items may\nmean that users will not see items that they would probably enjoy. In this\npaper, we propose a technique to overcome this problem using adversarial\nmachine learning. We define a metric to translate the user-level utility metric\nin terms of an advantage/disadvantage over items. We subsequently used that\nmetric in an adversarial learning framework to systematically promote\ndisadvantaged items. Distinctly, our method integrates a small-capacity model\nto produce semantically meaningful weights, leading to an algorithm that\nidentifies and promotes a semantically similar item within the learning\nprocess. In the empirical study, we evaluated the proposed technique on three\npublicly available datasets and seven competitive baselines. The result shows\nthat our proposed method not only improves the coverage, but also,\nsurprisingly, improves the overall performance.\n","authors":["Qiuling Xu","Pannaga Shivaswamy","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03366v2.pdf","comment":"EAI-KDD'2024. Code at https://github.com/qiulingxu/POSIT"},{"id":"http://arxiv.org/abs/2302.03561v3","updated":"2024-07-27T17:15:33Z","published":"2023-02-07T16:17:25Z","title":"Optimizing Audio Recommendations for the Long-Term: A Reinforcement\n Learning Perspective","summary":" We present a novel podcast recommender system deployed at industrial scale.\nThis system successfully optimizes personal listening journeys that unfold over\nmonths for hundreds of millions of listeners. In deviating from the pervasive\nindustry practice of optimizing machine learning algorithms for short-term\nproxy metrics, the system substantially improves long-term performance in A/B\ntests. The paper offers insights into how our methods cope with attribution,\ncoordination, and measurement challenges that usually hinder such long-term\noptimization. To contextualize these practical insights within a broader\nacademic framework, we turn to reinforcement learning (RL). Using the language\nof RL, we formulate a comprehensive model of users' recurring relationships\nwith a recommender system. Then, within this model, we identify our approach as\na policy improvement update to a component of the existing recommender system,\nenhanced by tailored modeling of value functions and user-state\nrepresentations. Illustrative offline experiments suggest this specialized\nmodeling reduces data requirements by as much as a factor of 120,000 compared\nto black-box approaches.\n","authors":["Lucas Maystre","Daniel Russo","Yu Zhao"],"pdf_url":"https://arxiv.org/pdf/2302.03561v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19239v1","updated":"2024-07-27T12:07:46Z","published":"2024-07-27T12:07:46Z","title":"MaTrRec: Uniting Mamba and Transformer for Sequential Recommendation","summary":" Sequential recommendation systems aim to provide personalized recommendations\nby analyzing dynamic preferences and dependencies within user behavior\nsequences. Recently, Transformer models can effectively capture user\npreferences. However, their quadratic computational complexity limits\nrecommendation performance on long interaction sequence data. 
Inspired by the\nrepresentative State Space Model (SSM), Mamba, which efficiently captures\nuser preferences in long interaction sequences with linear complexity, we find\nthat Mamba's recommendation effectiveness is limited in short interaction\nsequences, failing to recall items of actual interest to users and\nexacerbating the data sparsity cold start problem. To address this issue, we\ninnovatively propose a new model, MaTrRec, which combines the strengths of\nMamba and Transformer. This model fully leverages Mamba's advantages in\nhandling long-term dependencies and Transformer's global attention advantages\nin short-term dependencies, thereby enhancing predictive capabilities on both\nlong and short interaction sequence datasets while balancing model efficiency.\nNotably, our model significantly alleviates the data sparsity cold start problem,\nwith an improvement of up to 33% on the highly sparse Amazon Musical\nInstruments dataset. We conducted extensive experimental evaluations on five\nwidely used public datasets. The experimental results show that our model\noutperforms the current state-of-the-art sequential recommendation models on\nall five datasets. The code is available at\nhttps://github.com/Unintelligentmumu/MaTrRec.\n","authors":["Shun Zhang","Runsen Zhang","Zhirong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21065v1","updated":"2024-07-27T21:51:30Z","published":"2024-07-27T21:51:30Z","title":"LawLLM: Law Large Language Model for the US Legal System","summary":" In the rapidly evolving field of legal analytics, finding relevant cases and\naccurately predicting judicial outcomes are challenging because of the\ncomplexity of legal language, which often includes specialized terminology,\ncomplex syntax, and historical context. Moreover, the subtle distinctions\nbetween similar and precedent cases require a deep understanding of legal\nknowledge. Researchers often conflate these concepts, making it difficult to\ndevelop specialized techniques to effectively address these nuanced tasks. In\nthis paper, we introduce the Law Large Language Model (LawLLM), a multi-task\nmodel specifically designed for the US legal domain to address these\nchallenges. LawLLM excels at Similar Case Retrieval (SCR), Precedent Case\nRecommendation (PCR), and Legal Judgment Prediction (LJP). By clearly\ndistinguishing between precedent and similar cases, we provide essential\nclarity, guiding future research in developing specialized strategies for these\ntasks. We propose customized data preprocessing techniques for each task that\ntransform raw legal data into a trainable format. Furthermore, we also use\ntechniques such as in-context learning (ICL) and advanced information retrieval\nmethods in LawLLM. 
The evaluation results demonstrate that LawLLM consistently\noutperforms existing baselines in both zero-shot and few-shot scenarios,\noffering unparalleled multi-task capabilities and filling critical gaps in the\nlegal domain.\n","authors":["Dong Shu","Haoran Zhao","Xukun Liu","David Demeter","Mengnan Du","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21065v1.pdf","comment":"21 pages, 2 figures, accepted at the 33rd ACM International\n Conference on Information and Knowledge Management (CIKM 2024) for the\n Applied Research Paper track"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.19340v1","updated":"2024-07-27T21:00:36Z","published":"2024-07-27T21:00:36Z","title":"Integrating Large Language Models into a Tri-Modal Architecture for\n Automated Depression Classification","summary":" Major Depressive Disorder (MDD) is a pervasive mental health condition that\naffects 300 million people worldwide. This work presents a novel, BiLSTM-based\ntri-modal model-level fusion architecture for the binary classification of\ndepression from clinical interview recordings. The proposed architecture\nincorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses\na two-shot learning based GPT-4 model to process text data. This is the first\nwork to incorporate large language models into a multi-modal architecture for\nthis task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge\ncross-validation split and Leave-One-Subject-Out cross-validation split,\nsurpassing all baseline models and multiple state-of-the-art models. In\nLeave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score\nof 85.95%, a precision of 80%, and a recall of 92.86%.\n","authors":["Santosh V. Patapati"],"pdf_url":"https://arxiv.org/pdf/2407.19340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19302v1","updated":"2024-07-27T17:12:37Z","published":"2024-07-27T17:12:37Z","title":"IBMEA: Exploring Variational Information Bottleneck for Multi-modal\n Entity Alignment","summary":" Multi-modal entity alignment (MMEA) aims to identify equivalent entities\nbetween multi-modal knowledge graphs (MMKGs), where the entities can be\nassociated with related images. Most existing studies integrate multi-modal\ninformation heavily relying on the automatically-learned fusion module, rarely\nsuppressing the redundant information for MMEA explicitly. To this end, we\nexplore variational information bottleneck for multi-modal entity alignment\n(IBMEA), which emphasizes the alignment-relevant information and suppresses the\nalignment-irrelevant information in generating entity representations.\nSpecifically, we devise multi-modal variational encoders to generate\nmodal-specific entity representations as probability distributions. Then, we\npropose four modal-specific information bottleneck regularizers, limiting the\nmisleading clues in refining modal-specific entity representations. Finally, we\npropose a modal-hybrid information contrastive regularizer to integrate all the\nrefined modal-specific representations, enhancing the entity similarity between\nMMKGs to achieve MMEA. We conduct extensive experiments on two cross-KG and\nthree bilingual MMEA datasets. 
Experimental results demonstrate that our model\nconsistently outperforms previous state-of-the-art methods, and also shows\npromising and robust performance in low-resource and high-noise data scenarios.\n","authors":["Taoyu Su","Jiawei Sheng","Shicheng Wang","Xinghua Zhang","Hongbo Xu","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19302v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.19244v1","updated":"2024-07-27T12:44:21Z","published":"2024-07-27T12:44:21Z","title":"Radio Frequency Signal based Human Silhouette Segmentation: A Sequential\n Diffusion Approach","summary":" Radio frequency (RF) signals have been proved to be flexible for human\nsilhouette segmentation (HSS) under complex environments. Existing studies are\nmainly based on a one-shot approach, which lacks a coherent projection ability\nfrom the RF domain. Additionally, the spatio-temporal patterns have not been\nfully explored for human motion dynamics in HSS. Therefore, we propose a\ntwo-stage Sequential Diffusion Model (SDM) to progressively synthesize\nhigh-quality segmentation jointly with the considerations on motion dynamics.\nCross-view transformation blocks are devised to guide the diffusion model in a\nmulti-scale manner for comprehensively characterizing human related patterns in\nan individual frame such as directional projection from signal planes.\nMoreover, spatio-temporal blocks are devised to fine-tune the frame-level model\nto incorporate spatio-temporal contexts and motion dynamics, enhancing the\nconsistency of the segmentation maps. Comprehensive experiments on a public\nbenchmark -- HIBER demonstrate the state-of-the-art performance of our method\nwith an IoU 0.732. Our code is available at https://github.com/ph-w2000/SDM.\n","authors":["Penghui Wen","Kun Hu","Dong Yuan","Zhiyuan Ning","Changyang Li","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.19244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19225v1","updated":"2024-07-27T09:59:13Z","published":"2024-07-27T09:59:13Z","title":"Magic3DSketch: Create Colorful 3D Models From Sketch-Based 3D Modeling\n Guided by Text and Language-Image Pre-Training","summary":" The requirement for 3D content is growing as AR/VR application emerges. At\nthe same time, 3D modelling is only available for skillful experts, because\ntraditional methods like Computer-Aided Design (CAD) are often too\nlabor-intensive and skill-demanding, making it challenging for novice users.\nOur proposed method, Magic3DSketch, employs a novel technique that encodes\nsketches to predict a 3D mesh, guided by text descriptions and leveraging\nexternal prior knowledge obtained through text and language-image pre-training.\nThe integration of language-image pre-trained neural networks complements the\nsparse and ambiguous nature of single-view sketch inputs. Our method is also\nmore useful and offers higher degree of controllability compared to existing\ntext-to-3D approaches, according to our user study. Moreover, Magic3DSketch\nachieves state-of-the-art performance in both synthetic and real dataset with\nthe capability of producing more detailed structures and realistic shapes with\nthe help of text input. Users are also more satisfied with models obtained by\nMagic3DSketch according to our user study. Additionally, we are also the first,\nto our knowledge, add color based on text description to the sketch-derived\nshapes. 
By combining sketches and text guidance with the help of language-image\npretrained models, our Magic3DSketch can allow novice users to create custom 3D\nmodels with minimal effort and maximum creative freedom, with the potential to\nrevolutionize future 3D modeling pipelines.\n","authors":["Ying Zang","Yidong Han","Chaotao Ding","Jianqi Zhang","Tianrun Chen"],"pdf_url":"https://arxiv.org/pdf/2407.19225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19192v1","updated":"2024-07-27T07:16:07Z","published":"2024-07-27T07:16:07Z","title":"Harmfully Manipulated Images Matter in Multimodal Misinformation\n Detection","summary":" Nowadays, misinformation is widely spreading over various social media\nplatforms and causes extremely negative impacts on society. To combat this\nissue, automatically identifying misinformation, especially those containing\nmultimodal content, has attracted growing attention from the academic and\nindustrial communities, and induced an active research topic named Multimodal\nMisinformation Detection (MMD). Typically, existing MMD methods capture the\nsemantic correlation and inconsistency between multiple modalities, but neglect\nsome potential clues in multimodal content. Recent studies suggest that\nmanipulated traces of the images in articles are non-trivial clues for\ndetecting misinformation. Meanwhile, we find that the underlying intentions\nbehind the manipulation, e.g., harmful and harmless, also matter in MMD.\nAccordingly, in this work, we propose to detect misinformation by learning\nmanipulation features that indicate whether the image has been manipulated, as\nwell as intention features regarding the harmful and harmless intentions of the\nmanipulation. Unfortunately, the manipulation and intention labels that make\nthese features discriminative are unknown. To overcome the problem, we propose\ntwo weakly supervised signals as alternatives by introducing additional\ndatasets on image manipulation detection and formulating two classification\ntasks as positive and unlabeled learning problems. Based on these ideas, we\npropose a novel MMD method, namely Harmfully Manipulated Images Matter in MMD\n(HAMI-M3D). Extensive experiments across three benchmark datasets can\ndemonstrate that HAMI-M3D can consistently improve the performance of any MMD\nbaselines.\n","authors":["Bing Wang","Shengsheng Wang","Changchun Li","Renchu Guan","Ximing Li"],"pdf_url":"https://arxiv.org/pdf/2407.19192v1.pdf","comment":"Accepted by ACM MM 2024. Code:\n https://github.com/wangbing1416/HAMI-M3D"}]},"2024-07-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.21018v1","updated":"2024-07-30T17:59:08Z","published":"2024-07-30T17:59:08Z","title":"ThinK: Thinner Key Cache by Query-Driven Pruning","summary":" Large Language Models (LLMs) have revolutionized the field of natural\nlanguage processing, achieving unprecedented performance across a variety of\napplications by leveraging increased model sizes and sequence lengths. However,\nthe associated rise in computational and memory costs poses significant\nchallenges, particularly in managing long sequences due to the quadratic\ncomplexity of the transformer attention mechanism. This paper focuses on the\nlong-context scenario, addressing the inefficiencies in KV cache memory\nconsumption during inference. 
Unlike existing approaches that optimize the\nmemory based on the sequence lengths, we uncover that the channel dimension of\nthe KV cache exhibits significant redundancy, characterized by unbalanced\nmagnitude distribution and low-rank structure in attention weights. Based on\nthese observations, we propose ThinK, a novel query-dependent KV cache pruning\nmethod designed to minimize attention weight loss while selectively pruning the\nleast significant channels. Our approach not only maintains or enhances model\naccuracy but also achieves a reduction in memory costs by over 20% compared\nwith vanilla KV cache eviction methods. Extensive evaluations on the LLaMA3 and\nMistral models across various long-sequence datasets confirm the efficacy of\nThinK, setting a new precedent for efficient LLM deployment without\ncompromising performance. We also outline the potential of extending our method\nto value cache pruning, demonstrating ThinK's versatility and broad\napplicability in reducing both memory and computational overheads.\n","authors":["Yuhui Xu","Zhanming Jie","Hanze Dong","Lei Wang","Xudong Lu","Aojun Zhou","Amrita Saha","Caiming Xiong","Doyen Sahoo"],"pdf_url":"https://arxiv.org/pdf/2407.21018v1.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.05254v3","updated":"2024-07-30T17:58:45Z","published":"2024-01-10T16:32:25Z","title":"Language-based Valence and Arousal Expressions between the United States\n and China: a Cross-Cultural Examination","summary":" Although affective expressions of individuals have been extensively studied\nusing social media, research has primarily focused on the Western context.\nThere are substantial differences among cultures that contribute to their\naffective expressions. This paper examines the differences between Twitter (X)\nin the United States and Sina Weibo posts in China on two primary dimensions of\naffect - valence and arousal. We study the difference in the functional\nrelationship between arousal and valence (so-called V-shaped) among individuals\nin the US and China and explore the associated content differences.\nFurthermore, we correlate word usage and topics in both platforms to interpret\ntheir differences. We observe that for Twitter users, the variation in\nemotional intensity is less distinct between negative and positive emotions\ncompared to Weibo users, and there is a sharper escalation in arousal\ncorresponding with heightened emotions. From language features, we discover\nthat affective expressions are associated with personal life and feelings on\nTwitter, while on Weibo such discussions are about socio-political topics in\nthe society. These results suggest a West-East difference in the V-shaped\nrelationship between valence and arousal of affective expressions on social\nmedia influenced by content differences. Our findings have implications for\napplications and theories related to cultural differences in affective\nexpressions.\n","authors":["Young-Min Cho","Dandan Pang","Stuti Thapa","Garrick Sherman","Lyle Ungar","Louis Tay","Sharath Chandra Guntuku"],"pdf_url":"https://arxiv.org/pdf/2401.05254v3.pdf","comment":"preview"},{"id":"http://arxiv.org/abs/2407.21004v1","updated":"2024-07-30T17:51:44Z","published":"2024-07-30T17:51:44Z","title":"Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models\n for Hateful Meme Detection","summary":" Recent advances show that two-stream approaches have achieved outstanding\nperformance in hateful meme detection. 
However, hateful memes constantly evolve\nas new memes emerge by fusing progressive cultural ideas, making existing\nmethods obsolete or ineffective. In this work, we explore the potential of\nLarge Multimodal Models (LMMs) for hateful meme detection. To this end, we\npropose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE)\nPrompting, by integrating the evolution attribute and in-context information of\nmemes. Specifically, Evolver simulates the evolving and expressing process of\nmemes and reasons through LMMs in a step-by-step manner. First, an evolutionary\npair mining module retrieves the top-k most similar memes in the external\ncurated meme set with the input meme. Second, an evolutionary information\nextractor is designed to summarize the semantic regularities between the paired\nmemes for prompting. Finally, a contextual relevance amplifier enhances the\nin-context hatefulness information to boost the search for evolutionary\nprocesses. Extensive experiments on public FHM, MAMI, and HarM datasets show\nthat CoE prompting can be incorporated into existing LMMs to improve their\nperformance. More encouragingly, it can serve as an interpretive tool to\npromote the understanding of the evolution of social memes.\n","authors":["Jinfa Huang","Jinsheng Pan","Zhongwei Wan","Hanjia Lyu","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.21004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20990v1","updated":"2024-07-30T17:27:20Z","published":"2024-07-30T17:27:20Z","title":"From Feature Importance to Natural Language Explanations Using LLMs with\n RAG","summary":" As machine learning becomes increasingly integral to autonomous\ndecision-making processes involving human interaction, the necessity of\ncomprehending the model's outputs through conversational means increases. Most\nrecently, foundation models are being explored for their potential as post hoc\nexplainers, providing a pathway to elucidate the decision-making mechanisms of\npredictive models. In this work, we introduce traceable question-answering,\nleveraging an external knowledge repository to inform the responses of Large\nLanguage Models (LLMs) to user queries within a scene understanding task. This\nknowledge repository comprises contextual details regarding the model's output,\ncontaining high-level features, feature importance, and alternative\nprobabilities. We employ subtractive counterfactual reasoning to compute\nfeature importance, a method that entails analysing output variations resulting\nfrom decomposing semantic features. Furthermore, to maintain a seamless\nconversational flow, we integrate four key characteristics - social, causal,\nselective, and contrastive - drawn from social science research on human\nexplanations into a single-shot prompt, guiding the response generation\nprocess. Our evaluation demonstrates that explanations generated by the LLMs\nencompassed these elements, indicating its potential to bridge the gap between\ncomplex model outputs and natural language expressions.\n","authors":["Sule Tekkesinoglu","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2407.20990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13307v3","updated":"2024-07-30T16:11:48Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Inspired\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as a vision-and-language\ninteraction of biomedical fields. 
Previous works inherited the ideology of\ntraditional language generation tasks, aiming to generate paragraphs with high\nreadability as reports. Despite significant progress, the independence between\ndiseases-a specific property of RRG-was neglected, yielding the models being\nconfused by the co-occurrence of diseases brought on by the biased data\ndistribution, thus generating inaccurate reports. In this paper, to rethink\nthis issue, we first model the causal effects between the variables from a\ncausal perspective, through which we prove that the co-occurrence relationships\nbetween diseases on the biased distribution function as confounders, confusing\nthe accuracy through two backdoor paths, i.e. the Joint Vision Coupling and the\nConditional Sequential Coupling. Then, we proposed a novel model-agnostic\ncounterfactual augmentation method that contains two strategies, i.e. the\nPrototype-based Counterfactual Sample Synthesis (P-CSS) and the Magic-Cube-like\nCounterfactual Report Reconstruction (Cube), to intervene the backdoor paths,\nthus enhancing the accuracy and generalization of RRG models. Experimental\nresults on the widely used MIMIC-CXR dataset demonstrate the effectiveness of\nour proposed method. Additionally, a generalization performance is evaluated on\nIU X-Ray dataset, which verifies our work can effectively reduce the impact of\nco-occurrences caused by different distributions on the results.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Yan Liu","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v3.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2407.20910v1","updated":"2024-07-30T15:37:05Z","published":"2024-07-30T15:37:05Z","title":"Enabling Contextual Soft Moderation on Social Media through Contrastive\n Textual Deviation","summary":" Automated soft moderation systems are unable to ascertain if a post supports\nor refutes a false claim, resulting in a large number of contextual false\npositives. This limits their effectiveness, for example undermining trust in\nhealth experts by adding warnings to their posts or resorting to vague warnings\ninstead of granular fact-checks, which result in desensitizing users. In this\npaper, we propose to incorporate stance detection into existing automated\nsoft-moderation pipelines, with the goal of ruling out contextual false\npositives and providing more precise recommendations for social media content\nthat should receive warnings. We develop a textual deviation task called\nContrastive Textual Deviation (CTD) and show that it outperforms existing\nstance detection approaches when applied to soft moderation.We then integrate\nCTD into the stateof-the-art system for automated soft moderation Lambretta,\nshowing that our approach can reduce contextual false positives from 20% to\n2.1%, providing another important building block towards deploying reliable\nautomated soft moderation tools on social media.\n","authors":["Pujan Paudel","Mohammad Hammas Saeed","Rebecca Auger","Chris Wells","Gianluca Stringhini"],"pdf_url":"https://arxiv.org/pdf/2407.20910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20906v1","updated":"2024-07-30T15:26:36Z","published":"2024-07-30T15:26:36Z","title":"Automated Review Generation Method Based on Large Language Models","summary":" Literature research, vital for scientific advancement, is overwhelmed by the\nvast ocean of available information. 
Addressing this, we propose an automated\nreview generation method based on Large Language Models (LLMs) to streamline\nliterature processing and reduce cognitive load. In a case study on propane\ndehydrogenation (PDH) catalysts, our method swiftly generated comprehensive\nreviews from 343 articles, averaging seconds per article per LLM account.\nExtended analysis of 1041 articles provided deep insights into catalysts'\ncomposition, structure, and performance. Recognizing LLMs' hallucinations, we\nemployed a multi-layered quality control strategy, ensuring our method's\nreliability and effective hallucination mitigation. Expert verification\nconfirms the accuracy and citation integrity of generated reviews,\ndemonstrating LLM hallucination risks reduced to below 0.5% with over 95%\nconfidence. A released Windows application enables one-click review generation,\naiding researchers in tracking advancements and recommending literature. This\napproach showcases LLMs' role in enhancing scientific research productivity and\nsets the stage for further exploration.\n","authors":["Shican Wu","Xiao Ma","Dehui Luo","Lulu Li","Xiangcheng Shi","Xin Chang","Xiaoyun Lin","Ran Luo","Chunlei Pei","Zhi-Jian Zhao","Jinlong Gong"],"pdf_url":"https://arxiv.org/pdf/2407.20906v1.pdf","comment":"16 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.20899v1","updated":"2024-07-30T15:17:15Z","published":"2024-07-30T15:17:15Z","title":"Faithful and Plausible Natural Language Explanations for Image\n Classification: A Pipeline Approach","summary":" Existing explanation methods for image classification struggle to provide\nfaithful and plausible explanations. This paper addresses this issue by\nproposing a post-hoc natural language explanation method that can be applied to\nany CNN-based classifier without altering its training process or affecting\npredictive performance. By analysing influential neurons and the corresponding\nactivation maps, the method generates a faithful description of the\nclassifier's decision process in the form of a structured meaning\nrepresentation, which is then converted into text by a language model. Through\nthis pipeline approach, the generated explanations are grounded in the neural\nnetwork architecture, providing accurate insight into the classification\nprocess while remaining accessible to non-experts. Experimental results show\nthat the NLEs constructed by our method are significantly more plausible and\nfaithful. In particular, user interventions in the neural network structure\n(masking of neurons) are three times more effective than the baselines.\n","authors":["Adam Wojciechowski","Mateusz Lango","Ondrej Dusek"],"pdf_url":"https://arxiv.org/pdf/2407.20899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11960v3","updated":"2024-07-30T15:02:51Z","published":"2023-10-18T13:40:41Z","title":"Fast Multipole Attention: A Divide-and-Conquer Attention Mechanism for\n Long Sequences","summary":" Transformer-based models have achieved state-of-the-art performance in many\nareas. However, the quadratic complexity of self-attention with respect to the\ninput length hinders the applicability of Transformer-based models to long\nsequences. To address this, we present Fast Multipole Attention, a new\nattention mechanism that uses a divide-and-conquer strategy to reduce the time\nand memory complexity of attention for sequences of length $n$ from\n$\\mathcal{O}(n^2)$ to $\\mathcal{O}(n \\log n)$ or $O(n)$, while retaining a\nglobal receptive field. 
The hierarchical approach groups queries, keys, and\nvalues into $\\mathcal{O}( \\log n)$ levels of resolution, where groups at\ngreater distances are increasingly larger in size and the weights to compute\ngroup quantities are learned. As such, the interaction between tokens far from\neach other is considered in lower resolution in an efficient hierarchical\nmanner. The overall complexity of Fast Multipole Attention is $\\mathcal{O}(n)$\nor $\\mathcal{O}(n \\log n)$, depending on whether the queries are down-sampled\nor not. This multi-level divide-and-conquer strategy is inspired by fast\nsummation methods from $n$-body physics and the Fast Multipole Method. We\nperform evaluation on autoregressive and bidirectional language modeling tasks\nand compare our Fast Multipole Attention model with other efficient attention\nvariants on medium-size datasets. We find empirically that the Fast Multipole\nTransformer performs much better than other efficient transformers in terms of\nmemory size and accuracy. The Fast Multipole Attention mechanism has the\npotential to empower large language models with much greater sequence lengths,\ntaking the full context into account in an efficient, naturally hierarchical\nmanner during training and when generating long sequences.\n","authors":["Yanming Kang","Giang Tran","Hans De Sterck"],"pdf_url":"https://arxiv.org/pdf/2310.11960v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20884v1","updated":"2024-07-30T14:58:11Z","published":"2024-07-30T14:58:11Z","title":"Effective Black Box Testing of Sentiment Analysis Classification\n Networks","summary":" Transformer-based neural networks have demonstrated remarkable performance in\nnatural language processing tasks such as sentiment analysis. Nevertheless, the\nissue of ensuring the dependability of these complicated architectures through\ncomprehensive testing is still open. This paper presents a collection of\ncoverage criteria specifically designed to assess test suites created for\ntransformer-based sentiment analysis networks. Our approach utilizes input\nspace partitioning, a black-box method, by considering emotionally relevant\nlinguistic features such as verbs, adjectives, adverbs, and nouns. In order to\neffectively produce test cases that encompass a wide range of emotional\nelements, we utilize the k-projection coverage metric. This metric minimizes\nthe complexity of the problem by examining subsets of k features at the same\ntime, hence reducing dimensionality. Large language models are employed to\ngenerate sentences that display specific combinations of emotional features.\nThe findings from experiments obtained from a sentiment analysis dataset\nillustrate that our criteria and generated tests have led to an average\nincrease of 16\\% in test coverage. In addition, there is a corresponding\naverage decrease of 6.5\\% in model accuracy, showing the ability to identify\nvulnerabilities. 
Our work provides a foundation for improving the dependability\nof transformer-based sentiment analysis systems through comprehensive test\nevaluation.\n","authors":["Parsa Karbasizadeh","Fathiyeh Faghih","Pouria Golshanrad"],"pdf_url":"https://arxiv.org/pdf/2407.20884v1.pdf","comment":"This paper uses LaTeX with the IEEEtran.cls document class"},{"id":"http://arxiv.org/abs/2406.17055v3","updated":"2024-07-30T14:22:26Z","published":"2024-06-24T18:15:27Z","title":"Large Language Models Assume People are More Rational than We Really are","summary":" In order for AI systems to communicate effectively with people, they must\nunderstand how we make decisions. However, people's decisions are not always\nrational, so the implicit internal models of human decision-making in Large\nLanguage Models (LLMs) must account for this. Previous empirical evidence seems\nto suggest that these implicit models are accurate -- LLMs offer believable\nproxies of human behavior, acting how we expect humans would in everyday\ninteractions. However, by comparing LLM behavior and predictions to a large\ndataset of human decisions, we find that this is actually not the case: when\nboth simulating and predicting people's choices, a suite of cutting-edge LLMs\n(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more\nrational than we really are. Specifically, these models deviate from human\nbehavior and align more closely with a classic model of rational choice --\nexpected value theory. Interestingly, people also tend to assume that other\npeople are rational when interpreting their behavior. As a consequence, when we\ncompare the inferences that LLMs and people draw from the decisions of others\nusing another psychological dataset, we find that these inferences are highly\ncorrelated. Thus, the implicit decision-making models of LLMs appear to be\naligned with the human expectation that other people will act rationally,\nrather than with how people actually act.\n","authors":["Ryan Liu","Jiayi Geng","Joshua C. Peterson","Ilia Sucholutsky","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2406.17055v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07633v4","updated":"2024-07-30T13:14:55Z","published":"2023-08-15T08:31:05Z","title":"A Survey on Model Compression for Large Language Models","summary":" Large Language Models (LLMs) have transformed natural language processing\ntasks successfully. Yet, their large size and high computational needs pose\nchallenges for practical use, especially in resource-limited settings. Model\ncompression has emerged as a key research area to address these challenges.\nThis paper presents a survey of model compression techniques for LLMs. We cover\nmethods like quantization, pruning, and knowledge distillation, highlighting\nrecent advancements. We also discuss benchmarking strategies and evaluation\nmetrics crucial for assessing compressed LLMs. 
This survey offers valuable\ninsights for researchers and practitioners, aiming to enhance efficiency and\nreal-world applicability of LLMs while laying a foundation for future\nadvancements.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07633v4.pdf","comment":"Accepted for publication in TACL; a pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2407.18483v3","updated":"2024-07-30T12:15:34Z","published":"2024-07-26T03:23:31Z","title":"A Role-specific Guided Large Language Model for Ophthalmic Consultation\n Based on Stylistic Differentiation","summary":" Ophthalmology consultations are crucial for diagnosing, treating, and\npreventing eye diseases. However, the growing demand for consultations exceeds\nthe availability of ophthalmologists. By leveraging large pre-trained language\nmodels, we can design effective dialogues for specific scenarios, aiding in\nconsultations. Traditional fine-tuning strategies for question-answering tasks\nare impractical due to increasing model size and often ignore the patient-doctor\nrole function during consultations. In this paper, we propose EyeDoctor, an\nophthalmic medical questioning large language model that enhances accuracy\nthrough doctor-patient role perception guidance and an augmented knowledge base\nwith external disease information. Experimental results show EyeDoctor achieves\nhigher question-answering precision in ophthalmology consultations. Notably,\nEyeDoctor demonstrated a 7.25% improvement in Rouge-1 scores and a 10.16%\nimprovement in F1 scores on multi-round datasets compared to the second-best model,\nChatGPT, highlighting the importance of doctor-patient role differentiation and\ndynamic knowledge base expansion for intelligent medical consultations. EyeDoc\nis also available as a free web-based service, and the source code is available\nat https://github.com/sperfu/EyeDoc.\n","authors":["Laiyi Fu","Binbin Fan","Hongkai Du","Yanxiang Feng","Chunhua Li","Huping Song"],"pdf_url":"https://arxiv.org/pdf/2407.18483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18752v3","updated":"2024-07-30T12:05:11Z","published":"2024-07-26T14:07:00Z","title":"Knowledge Graph Structure as Prompt: Improving Small Language Models\n Capabilities for Knowledge-based Causal Discovery","summary":" Causal discovery aims to estimate causal structures among variables based on\nobservational data. Large Language Models (LLMs) offer a fresh perspective to\ntackle the causal discovery problem by reasoning on the metadata associated\nwith variables rather than their actual data values, an approach referred to as\nknowledge-based causal discovery. In this paper, we investigate the\ncapabilities of Small Language Models (SLMs, defined as LLMs with fewer than 1\nbillion parameters) with prompt-based learning for knowledge-based causal\ndiscovery. Specifically, we present KG Structure as Prompt, a novel approach\nfor integrating structural information from a knowledge graph, such as common\nneighbor nodes and metapaths, into prompt-based learning to enhance the\ncapabilities of SLMs. Experimental results on three types of biomedical and\nopen-domain datasets under few-shot settings demonstrate the effectiveness of\nour approach, surpassing most baselines and even conventional fine-tuning\napproaches trained on full datasets. 
Our findings further highlight the strong\ncapabilities of SLMs: in combination with knowledge graphs and prompt-based\nlearning, SLMs demonstrate the potential to surpass LLMs with larger number of\nparameters. Our code and datasets are available on GitHub.\n","authors":["Yuni Susanti","Michael Färber"],"pdf_url":"https://arxiv.org/pdf/2407.18752v3.pdf","comment":"accepted at ISWC'24"},{"id":"http://arxiv.org/abs/2406.06620v2","updated":"2024-07-30T12:03:09Z","published":"2024-06-07T14:34:28Z","title":"DualTime: A Dual-Adapter Multimodal Language Model for Time Series\n Representation","summary":" The recent rapid development of language models (LMs) has attracted attention\nin the field of time series, including multimodal time series modeling.\nHowever, we note that current time series multimodal methods are biased, often\nassigning a primary role to one modality while the other assumes a secondary\nrole. They overlook the mutual benefits and complementary of different\nmodalities. For example, in seizure diagnosis, relying solely on textual\nclinical reports makes it difficult to pinpoint the area and type of the\ndisease, while electroencephalograms (EEGs) alone cannot provide an accurate\ndiagnosis without considering the symptoms. In this study, based on the\ncomplementary information mining of time series multimodal data, we propose\nDualTime, a Dual-adapter multimodal language model for Time series\nrepresentation implementing temporal-primary and textual-primary modeling\nsimultaneously. By injecting lightweight adaption tokens, the LM pipeline\nshared by dual adapters encourages embedding alignment and achieves efficient\nfine-tuning. Empirically, our method outperforms state-of-the-art models in\nboth supervised and unsupervised settings, highlighting the complementary\nbenefits of different modalities. In addition, we conduct few-shot label\ntransfer experiments, which further verifies the transferability and\nexpressiveness of our proposed DualTime.\n","authors":["Weiqi Zhang","Jiexia Ye","Ziyue Li","Jia Li","Fugee Tsung"],"pdf_url":"https://arxiv.org/pdf/2406.06620v2.pdf","comment":"15 pages, 12 figure, 5 tables"},{"id":"http://arxiv.org/abs/2407.20756v1","updated":"2024-07-30T11:57:40Z","published":"2024-07-30T11:57:40Z","title":"SynthVLM: High-Efficiency and High-Quality Synthetic Data for Vision\n Language Models","summary":" Recently, with the rise of web images, managing and understanding large-scale\nimage datasets has become increasingly important. Vision Large Language Models\n(VLLMs) have recently emerged due to their robust vision-understanding\ncapabilities. However, training these models requires vast amounts of data,\nposing challenges to efficiency, effectiveness, data quality, and privacy. In\nthis paper, we introduce SynthVLM, a novel data synthesis pipeline for VLLMs.\nUnlike existing methods that generate captions from images, SynthVLM employs\nadvanced diffusion models and high-quality captions to automatically generate\nand select high-resolution images from captions, creating precisely aligned\nimage-text pairs. Leveraging these pairs, we achieve state-of-the-art (SoTA)\nperformance on various vision question answering tasks, maintaining high\nalignment quality and preserving advanced language abilities. Moreover,\nSynthVLM surpasses traditional GPT-4 Vision-based caption generation methods in\nperformance while significantly reducing computational overhead. 
Crucially, our\nmethod's reliance on purely generated data ensures the preservation of privacy,\nachieving SoTA performance with just 100k data points (only 18% of the official\ndataset size).\n","authors":["Zheng Liu","Hao Liang","Wentao Xiong","Qinhan Yu","Conghui He","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20750v1","updated":"2024-07-30T11:42:19Z","published":"2024-07-30T11:42:19Z","title":"JaColBERTv2.5: Optimising Multi-Vector Retrievers to Create\n State-of-the-Art Japanese Retrievers with Constrained Resources","summary":" Neural Information Retrieval has advanced rapidly in high-resource languages,\nbut progress in lower-resource ones such as Japanese has been hindered by data\nscarcity, among other challenges. Consequently, multilingual models have\ndominated Japanese retrieval, despite their computational inefficiencies and\ninability to capture linguistic nuances. While recent multi-vector monolingual\nmodels like JaColBERT have narrowed this gap, they still lag behind\nmultilingual methods in large-scale evaluations. This work addresses the\nsuboptimal training methods of multi-vector retrievers in lower-resource\nsettings, focusing on Japanese. We systematically evaluate and improve key\naspects of the inference and training settings of JaColBERT, and more broadly,\nmulti-vector models. We further enhance performance through a novel checkpoint\nmerging step, showcasing it to be an effective way of combining the benefits of\nfine-tuning with the generalization capabilities of the original checkpoint.\nBuilding on our analysis, we introduce a novel training recipe, resulting in\nthe JaColBERTv2.5 model. JaColBERTv2.5, with only 110 million parameters and\ntrained in under 15 hours on 4 A100 GPUs, significantly outperforms all\nexisting methods across all common benchmarks, reaching an average score of\n0.754, significantly above the previous best of 0.720. To support future\nresearch, we make our final models, intermediate checkpoints and all data used\npublicly available.\n","authors":["Benjamin Clavié"],"pdf_url":"https://arxiv.org/pdf/2407.20750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20743v1","updated":"2024-07-30T11:22:52Z","published":"2024-07-30T11:22:52Z","title":"Meltemi: The first open Large Language Model for Greek","summary":" We describe the development and capabilities of Meltemi 7B, the first open\nLarge Language Model for the Greek language. Meltemi 7B has 7 billion\nparameters and is trained on a 40 billion token Greek corpus. For the\ndevelopment of Meltemi 7B, we adapt Mistral, by continuous pretraining on the\nGreek Corpus. Meltemi 7B contains up-to-date information up to September 2023.\nFurthermore, we have translated and curated a Greek instruction corpus, which\nhas been used for the instruction-tuning of a chat model, named Meltemi 7B\nInstruct. Special care has been given to the alignment and the removal of toxic\ncontent for the Meltemi 7B Instruct. The developed models are evaluated on a\nbroad set of collected evaluation corpora, and examples of prompts and\nresponses are presented. 
Both Meltemi 7B and Meltemi 7B Instruct are available\nat https://huggingface.co/ilsp under the Apache 2.0 license.\n","authors":["Leon Voukoutis","Dimitris Roussis","Georgios Paraskevopoulos","Sokratis Sofianopoulos","Prokopis Prokopidis","Vassilis Papavasileiou","Athanasios Katsamanis","Stelios Piperidis","Vassilis Katsouros"],"pdf_url":"https://arxiv.org/pdf/2407.20743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20729v1","updated":"2024-07-30T10:51:51Z","published":"2024-07-30T10:51:51Z","title":"Adapting Safe-for-Work Classifier for Malaysian Language Text: Enhancing\n Alignment in LLM-Ops Framework","summary":" As large language models (LLMs) become increasingly integrated into\noperational workflows (LLM-Ops), there is a pressing need for effective\nguardrails to ensure safe and aligned interactions, including the ability to\ndetect potentially unsafe or inappropriate content across languages. However,\nexisting safe-for-work classifiers are primarily focused on English text. To\naddress this gap for the Malaysian language, we present a novel safe-for-work\ntext classifier tailored specifically for Malaysian language content. By\ncurating and annotating a first-of-its-kind dataset of Malaysian text spanning\nmultiple content categories, we trained a classification model capable of\nidentifying potentially unsafe material using state-of-the-art natural language\nprocessing techniques. This work represents an important step in enabling safer\ninteractions and content filtering to mitigate potential risks and ensure\nresponsible deployment of LLMs. To maximize accessibility and promote further\nresearch towards enhancing alignment in LLM-Ops for the Malaysian context, the\nmodel is publicly released at\nhttps://huggingface.co/malaysia-ai/malaysian-sfw-classifier.\n","authors":["Aisyah Razak","Ariff Nazhan","Kamarul Adha","Wan Adzhar Faiq Adzlan","Mas Aisyah Ahmad","Ammar Azman"],"pdf_url":"https://arxiv.org/pdf/2407.20729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17629v2","updated":"2024-07-30T10:33:01Z","published":"2024-07-24T20:38:13Z","title":"Papilusion at DAGPap24: Paper or Illusion? Detecting AI-generated\n Scientific Papers","summary":" This paper presents Papilusion, an AI-generated scientific text detector\ndeveloped within the DAGPap24 shared task on detecting automatically generated\nscientific papers. We propose an ensemble-based approach and conduct ablation\nstudies to analyze the effect of the detector configurations on the\nperformance. Papilusion is ranked 6th on the leaderboard, and we improve our\nperformance after the competition ended, achieving 99.46 (+9.63) of the\nF1-score on the official test set.\n","authors":["Nikita Andreev","Alexander Shirnin","Vladislav Mikhailov","Ekaterina Artemova"],"pdf_url":"https://arxiv.org/pdf/2407.17629v2.pdf","comment":"to appear in \"The 4th Workshop on Scholarly Document Processing @ ACL\n 2024\" proceedings"},{"id":"http://arxiv.org/abs/2406.00019v3","updated":"2024-07-30T10:09:13Z","published":"2024-05-23T07:14:21Z","title":"EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively\n Exploring Electronic Health Records","summary":" In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL\ndataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to\naddress critical yet underexplored aspects in text-to-SQL parsing:\ninteractivity, compositionality, and efficiency. 
To the best of our knowledge,\nEHR-SeqSQL is not only the largest but also the first medical text-to-SQL\ndataset benchmark to include sequential and contextual questions. We provide a\ndata split and the new test set designed to assess compositional generalization\nability. Our experiments demonstrate the superiority of a multi-turn approach\nover a single-turn approach in learning compositionality. Additionally, our\ndataset integrates specially crafted tokens into SQL queries to improve\nexecution efficiency. With EHR-SeqSQL, we aim to bridge the gap between\npractical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is\navailable at https://github.com/seonhee99/EHR-SeqSQL.\n","authors":["Jaehee Ryu","Seonhee Cho","Gyubok Lee","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2406.00019v3.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2407.20700v1","updated":"2024-07-30T09:53:55Z","published":"2024-07-30T09:53:55Z","title":"Industrial-Grade Smart Troubleshooting through Causal Technical Language\n Processing: a Proof of Concept","summary":" This paper describes the development of a causal diagnosis approach for\ntroubleshooting an industrial environment on the basis of the technical\nlanguage expressed in Return on Experience records. The proposed method\nleverages the vectorized linguistic knowledge contained in the distributed\nrepresentation of a Large Language Model, and the causal associations entailed\nby the embedded failure modes and mechanisms of the industrial assets. The\npaper presents the elementary but essential concepts of the solution, which is\nconceived as a causality-aware retrieval augmented generation system, and\nillustrates them experimentally on a real-world Predictive Maintenance setting.\nFinally, it discusses avenues of improvement for the maturity of the utilized\ncausal technology to meet the robustness challenges of increasingly complex\nscenarios in the industry.\n","authors":["Alexandre Trilla","Ossee Yiboe","Nenad Mijatovic","Jordi Vitrià"],"pdf_url":"https://arxiv.org/pdf/2407.20700v1.pdf","comment":"2nd Workshop on Causal Inference and Machine Learning in Practice at\n the KDD 2024 Conference. arXiv admin note: text overlap with arXiv:2407.11056"},{"id":"http://arxiv.org/abs/2405.03452v3","updated":"2024-07-30T09:51:41Z","published":"2024-05-06T13:23:57Z","title":"Large Language Models (LLMs) as Agents for Augmented Democracy","summary":" We explore an augmented democracy system built on off-the-shelf LLMs\nfine-tuned to augment data on citizen's preferences elicited over policies\nextracted from the government programs of the two main candidates of Brazil's\n2022 presidential election. We use a train-test cross-validation setup to\nestimate the accuracy with which the LLMs predict both: a subject's individual\npolitical choices and the aggregate preferences of the full sample of\nparticipants. At the individual level, we find that LLMs predict out of sample\npreferences more accurately than a \"bundle rule\", which would assume that\ncitizens always vote for the proposals of the candidate aligned with their\nself-reported political orientation. At the population level, we show that a\nprobabilistic sample augmented by an LLM provides a more accurate estimate of\nthe aggregate preferences of a population than the non-augmented probabilistic\nsample alone. 
Together, these results indicate that policy preference data\naugmented using LLMs can capture nuances that transcend party lines and\nrepresent a promising avenue of research for data augmentation.\n","authors":["Jairo Gudiño-Rosero","Umberto Grandi","César A. Hidalgo"],"pdf_url":"https://arxiv.org/pdf/2405.03452v3.pdf","comment":"24 pages main manuscript with 4 figures. 13 pages of supplementary\n material"},{"id":"http://arxiv.org/abs/2407.20685v1","updated":"2024-07-30T09:26:43Z","published":"2024-07-30T09:26:43Z","title":"CultureVo: The Serious Game of Utilizing Gen AI for Enhancing Cultural\n Intelligence","summary":" CultureVo, Inc. has developed the Integrated Culture Learning Suite (ICLS) to\ndeliver foundational knowledge of world cultures through a combination of\ninteractive lessons and gamified experiences. This paper explores how\nGenerative AI powered by open source Large Language Models is utilized within\nthe ICLS to enhance cultural intelligence. The suite employs Generative AI\ntechniques to automate the assessment of learner knowledge, analyze behavioral\npatterns, and manage interactions with non-player characters using real-time\nlearner assessment. Additionally, ICLS provides contextual hints and recommends\ncourse content by assessing learner proficiency, while Generative AI\nfacilitates the automated creation and validation of educational content.\n","authors":["Ajita Agarwala","Anupam Purwar","Viswanadhasai Rao"],"pdf_url":"https://arxiv.org/pdf/2407.20685v1.pdf","comment":"Fourth International Conference on AI-ML Systems, 8-11 October, 2024,\n Louisiana, USA"},{"id":"http://arxiv.org/abs/2401.06461v5","updated":"2024-07-30T09:26:04Z","published":"2024-01-12T09:15:20Z","title":"Between Lines of Code: Unraveling the Distinct Patterns of Machine and\n Human Programmers","summary":" Large language models have catalyzed an unprecedented wave in code\ngeneration. While achieving significant advances, they blur the distinctions\nbetween machine- and human-authored source code, causing integrity and\nauthenticity issues of software artifacts. Previous methods such as DetectGPT\nhave proven effective in discerning machine-generated texts, but they do not\nidentify and harness the unique patterns of machine-generated code. Thus, their\napplicability falters when applied to code. In this paper, we carefully study\nthe specific patterns that characterize machine- and human-authored code.\nThrough a rigorous analysis of code attributes such as lexical diversity,\nconciseness, and naturalness, we expose unique patterns inherent to each\nsource. We particularly notice that the syntactic segmentation of code is a\ncritical factor in identifying its provenance. Based on our findings, we\npropose DetectCodeGPT, a novel method for detecting machine-generated code,\nwhich improves DetectGPT by capturing the distinct stylized patterns of code.\nDiverging from conventional techniques that depend on external LLMs for\nperturbations, DetectCodeGPT perturbs the code corpus by strategically\ninserting spaces and newlines, ensuring both efficacy and efficiency.\nExperimental results show that our approach significantly outperforms\nstate-of-the-art techniques in detecting machine-generated code.\n","authors":["Yuling Shi","Hongyu Zhang","Chengcheng Wan","Xiaodong Gu"],"pdf_url":"https://arxiv.org/pdf/2401.06461v5.pdf","comment":"Accepted by the 47th International Conference on Software Engineering\n (ICSE 2025). 
Code available at https://github.com/YerbaPage/DetectCodeGPT"},{"id":"http://arxiv.org/abs/2407.20673v1","updated":"2024-07-30T09:11:17Z","published":"2024-07-30T09:11:17Z","title":"Label-Guided Prompt for Multi-label Few-shot Aspect Category Detection","summary":" Multi-label few-shot aspect category detection aims at identifying multiple\naspect categories from sentences with a limited number of training instances.\nThe representation of sentences and categories is a key issue in this task.\nMost of current methods extract keywords for the sentence representations and\nthe category representations. Sentences often contain many category-independent\nwords, which leads to suboptimal performance of keyword-based methods. Instead\nof directly extracting keywords, we propose a label-guided prompt method to\nrepresent sentences and categories. To be specific, we design label-specific\nprompts to represent sentences by combining crucial contextual and semantic\ninformation. Further, the label is introduced into a prompt to obtain category\ndescriptions by utilizing a large language model. This kind of category\ndescriptions contain the characteristics of the aspect categories, guiding the\nconstruction of discriminative category prototypes. Experimental results on two\npublic datasets show that our method outperforms current state-of-the-art\nmethods with a 3.86% - 4.75% improvement in the Macro-F1 score.\n","authors":["ChaoFeng Guan","YaoHui Zhu","Yu Bai","LingYun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10167v3","updated":"2024-07-30T08:59:26Z","published":"2024-07-14T11:41:03Z","title":"Key-Point-Driven Mathematical Reasoning Distillation of Large Language\n Model","summary":" Large Language Models (LLMs) have demonstrated exceptional proficiency in\nmathematical reasoning tasks due to their extensive parameter counts and\ntraining on vast datasets. Despite these capabilities, deploying LLMs is\nhindered by their computational demands. Distilling LLM mathematical reasoning\ninto Smaller Language Models (SLMs) has emerged as a solution to this\nchallenge, although these smaller models often suffer from errors in\ncalculation and semantic understanding. Prior work has proposed\nProgram-of-Thought Distillation (PoTD) to avoid calculation error. To further\naddress semantic understanding errors, we propose Key-Point-Driven Mathematical\nReasoning Distillation (KPDD). KPDD enhances the reasoning performance of SLMs\nby breaking down the problem-solving process into three stages: Core Question\nExtraction, Problem-Solving Information Extraction, and Step-by-Step Solution.\nThis method is further divided into KPDD-CoT, which generates Chain-of-Thought\nrationales, and KPDD-PoT, which creates Program-of-Thought rationales. The\nexperiment results show that KPDD-CoT significantly improves reasoning\nabilities, while KPDD-PoT achieves state-of-the-art performance in mathematical\nreasoning tasks. 
Our approach effectively mitigates misunderstanding errors,\nadvancing the deployment of efficient and capable SLMs.\n","authors":["Xunyu Zhu","Jian Li","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2407.10167v3.pdf","comment":"Fixed a description error in the experiment settings, i.e., the\n teacher LLM is changed from GPT-4 to deepseek-v2"},{"id":"http://arxiv.org/abs/2407.20663v1","updated":"2024-07-30T08:57:01Z","published":"2024-07-30T08:57:01Z","title":"ArabicNLU 2024: The First Arabic Natural Language Understanding Shared\n Task","summary":" This paper presents an overview of the Arabic Natural Language Understanding\n(ArabicNLU 2024) shared task, focusing on two subtasks: Word Sense\nDisambiguation (WSD) and Location Mention Disambiguation (LMD). The task aimed\nto evaluate the ability of automated systems to resolve word ambiguity and\nidentify locations mentioned in Arabic text. We provided participants with\nnovel datasets, including a sense-annotated corpus for WSD, called SALMA, with\napproximately 34k annotated tokens, and the IDRISI-DA dataset with 3,893\nannotations and 763 unique location mentions. These are challenging tasks. Out\nof the 38 registered teams, only three teams participated in the final\nevaluation phase, with the highest accuracy being 77.8% for WSD and the highest\nMRR@1 being 95.0% for LMD. The shared task not only facilitated the evaluation\nand comparison of different techniques, but also provided valuable insights and\nresources for the continued advancement of Arabic NLU technologies.\n","authors":["Mohammed Khalilia","Sanad Malaysha","Reem Suwaileh","Mustafa Jarrar","Alaa Aljabari","Tamer Elsayed","Imed Zitouni"],"pdf_url":"https://arxiv.org/pdf/2407.20663v1.pdf","comment":"In Proceedings of the Second Arabic Natural Language Processing\n Conference (ArabicNLP 2024), Bangkok, Thailand. Association for Computational\n Linguistics"},{"id":"http://arxiv.org/abs/2407.20657v1","updated":"2024-07-30T08:52:16Z","published":"2024-07-30T08:52:16Z","title":"Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks","summary":" Recent vision-language foundation models, such as CLIP, have demonstrated\nsuperior capabilities in learning representations that can be transferable\nacross a diverse range of downstream tasks and domains. With the emergence of\nsuch powerful models, it has become crucial to effectively leverage their\ncapabilities in tackling challenging vision tasks. On the other hand, only a\nfew works have focused on devising adversarial examples that transfer well to\nboth unknown domains and model architectures. In this paper, we propose a novel\ntransfer attack method called PDCL-Attack, which leverages the CLIP model to\nenhance the transferability of adversarial perturbations generated by a\ngenerative model-based attack framework. Specifically, we formulate an\neffective prompt-driven feature guidance by harnessing the semantic\nrepresentation power of text, particularly from the ground-truth class labels\nof input images. To the best of our knowledge, we are the first to introduce\nprompt learning to enhance the transferable generative attacks. 
Extensive\nexperiments conducted across various cross-domain and cross-model settings\nempirically validate our approach, demonstrating its superiority over\nstate-of-the-art methods.\n","authors":["Hunmin Yang","Jongoh Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.20657v1.pdf","comment":"Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io"},{"id":"http://arxiv.org/abs/2407.20654v1","updated":"2024-07-30T08:50:16Z","published":"2024-07-30T08:50:16Z","title":"Prompting Encoder Models for Zero-Shot Classification: A Cross-Domain\n Study in Italian","summary":" Addressing the challenge of limited annotated data in specialized fields and\nlow-resource languages is crucial for the effective use of Language Models\n(LMs). While most Large Language Models (LLMs) are trained on general-purpose\nEnglish corpora, there is a notable gap in models specifically tailored for\nItalian, particularly for technical and bureaucratic jargon. This paper\nexplores the feasibility of employing smaller, domain-specific encoder LMs\nalongside prompting techniques to enhance performance in these specialized\ncontexts. Our study concentrates on the Italian bureaucratic and legal\nlanguage, experimenting with both general-purpose and further pre-trained\nencoder-only models. We evaluated the models on downstream tasks such as\ndocument classification and entity typing and conducted intrinsic evaluations\nusing Pseudo-Log-Likelihood. The results indicate that while further\npre-trained models may show diminished robustness in general knowledge, they\nexhibit superior adaptability for domain-specific tasks, even in a zero-shot\nsetting. Furthermore, the application of calibration techniques and in-domain\nverbalizers significantly enhances the efficacy of encoder models. These\ndomain-specialized models prove to be particularly advantageous in scenarios\nwhere in-domain resources or expertise are scarce. In conclusion, our findings\noffer new insights into the use of Italian models in specialized contexts,\nwhich may have a significant impact on both research and industrial\napplications in the digital transformation era.\n","authors":["Serena Auriemma","Martina Miliani","Mauro Madeddu","Alessandro Bondielli","Lucia Passaro","Alessandro Lenci"],"pdf_url":"https://arxiv.org/pdf/2407.20654v1.pdf","comment":"Submitted to 'Language Resource and Evaluation'"},{"id":"http://arxiv.org/abs/2404.12827v2","updated":"2024-07-30T08:38:50Z","published":"2024-04-19T12:04:32Z","title":"CT-ADE: An Evaluation Benchmark for Adverse Drug Event Prediction from\n Clinical Trial Results","summary":" Adverse drug events (ADEs) significantly impact clinical research, causing\nmany clinical trial failures. ADE prediction is key for developing safer\nmedications and enhancing patient outcomes. To support this effort, we\nintroduce CT-ADE, a dataset for multilabel predictive modeling of ADEs in\nmonopharmacy treatments. CT-ADE integrates data from 2,497 unique drugs,\nencompassing 168,984 drug-ADE pairs extracted from clinical trials, annotated\nwith patient and contextual information, and comprehensive ADE concepts\nstandardized across multiple levels of the MedDRA ontology. Preliminary\nanalyses with large language models (LLMs) achieved F1-scores up to 55.90%.\nModels using patient and contextual information showed F1-score improvements of\n21%-38% over models using only chemical structure data. 
Our results highlight\nthe importance of target population and treatment regimens in the predictive\nmodeling of ADEs, offering greater performance gains than LLM domain\nspecialization and scaling. CT-ADE provides an essential tool for researchers\naiming to leverage artificial intelligence and machine learning to enhance\npatient safety and minimize the impact of ADEs on pharmaceutical research and\ndevelopment. The dataset is publicly accessible at\nhttps://github.com/ds4dh/CT-ADE.\n","authors":["Anthony Yazdani","Alban Bornet","Philipp Khlebnikov","Boya Zhang","Hossein Rouhizadeh","Poorya Amini","Douglas Teodoro"],"pdf_url":"https://arxiv.org/pdf/2404.12827v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19705v2","updated":"2024-07-30T08:23:05Z","published":"2024-07-29T05:00:48Z","title":"CollectiveSFT: Scaling Large Language Models for Chinese Medical\n Benchmark with Collective Instructions in Healthcare","summary":" The rapid progress in Large Language Models (LLMs) has prompted the creation\nof numerous benchmarks to evaluate their capabilities. This study focuses on the\nComprehensive Medical Benchmark in Chinese (CMB), showcasing how dataset\ndiversity and distribution in supervised fine-tuning (SFT) may enhance LLM\nperformance. Remarkably, we successfully trained a smaller base model to achieve\nscores comparable to larger models, indicating that a diverse and\nwell-distributed dataset can optimize performance regardless of model size. This\nstudy suggests that even smaller models may reach high performance levels with\ncarefully curated and varied datasets. By integrating a wide range of\ninstructional content, our approach addresses potential issues such as data\nquality inconsistencies. Our results imply that a broader spectrum of training\ndata may enhance a model's ability to generalize and perform effectively across\ndifferent medical scenarios, highlighting the importance of dataset quality and\ndiversity in fine-tuning processes. We open-source the model for future\nresearch at https://github.com/CAS-SIAT-XinHai/CollectiveSFT\n","authors":["Jingwei Zhu","Minghuan Tan","Min Yang","Ruixue Li","Hamid Alinejad-Rokny"],"pdf_url":"https://arxiv.org/pdf/2407.19705v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2407.20622v1","updated":"2024-07-30T07:55:44Z","published":"2024-07-30T07:55:44Z","title":"Decoding Linguistic Representations of Human Brain","summary":" Language, as an information medium created by advanced organisms, has always\nbeen a concern of neuroscience regarding how it is represented in the brain.\nDecoding linguistic representations in the evoked brain has shown\ngroundbreaking achievements, thanks to the rapid improvement of neuroimaging,\nmedical technology, life sciences and artificial intelligence. In this work, we\npresent a taxonomy of brain-to-language decoding of both textual and speech\nformats. This work integrates two types of research: neuroscience focusing on\nlanguage understanding and deep learning-based brain decoding. Generating\ndiscernible language information from brain activity could not only help those\nwith limited articulation, especially amyotrophic lateral sclerosis (ALS)\npatients, but also open up a new way for the next generation's brain-computer\ninterface (BCI). 
This article will help brain scientists and deep-learning\nresearchers to gain a bird's eye view of fine-grained language perception, and\nthus facilitate their further investigation and research of neural process and\nlanguage decoding.\n","authors":["Yu Wang","Heyang Liu","Yuhao Wang","Chuan Xuan","Yixuan Hou","Sheng Feng","Hongcheng Liu","Yusheng Liao","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20608v1","updated":"2024-07-30T07:34:40Z","published":"2024-07-30T07:34:40Z","title":"Questionnaires for Everyone: Streamlining Cross-Cultural Questionnaire\n Adaptation with GPT-Based Translation Quality Evaluation","summary":" Adapting questionnaires to new languages is a resource-intensive process\noften requiring the hiring of multiple independent translators, which limits\nthe ability of researchers to conduct cross-cultural research and effectively\ncreates inequalities in research and society. This work presents a prototype\ntool that can expedite the questionnaire translation process. The tool\nincorporates forward-backward translation using DeepL alongside GPT-4-generated\ntranslation quality evaluations and improvement suggestions. We conducted two\nonline studies in which participants translated questionnaires from English to\neither German (Study 1; n=10) or Portuguese (Study 2; n=20) using our\nprototype. To evaluate the quality of the translations created using the tool,\nevaluation scores between conventionally translated and tool-supported versions\nwere compared. Our results indicate that integrating LLM-generated translation\nquality evaluations and suggestions for improvement can help users\nindependently attain results similar to those provided by conventional,\nnon-NLP-supported translation methods. This is the first step towards more\nequitable questionnaire-based research, powered by AI.\n","authors":["Otso Haavisto","Robin Welsch"],"pdf_url":"https://arxiv.org/pdf/2407.20608v1.pdf","comment":"19 pages, 13 figures"},{"id":"http://arxiv.org/abs/2407.20595v1","updated":"2024-07-30T07:14:04Z","published":"2024-07-30T07:14:04Z","title":"Harvesting Textual and Structured Data from the HAL Publication\n Repository","summary":" HAL (Hyper Articles en Ligne) is the French national publication repository,\nused by most higher education and research organizations for their open science\npolicy. As a digital library, it is a rich repository of scholarly documents,\nbut its potential for advanced research has been underutilized. We present\nHALvest, a unique dataset that bridges the gap between citation networks and\nthe full text of papers submitted on HAL. We craft our dataset by filtering HAL\nfor scholarly publications, resulting in approximately 700,000 documents,\nspanning 34 languages across 13 identified domains, suitable for language model\ntraining, and yielding approximately 16.5 billion tokens (with 8 billion in\nFrench and 7 billion in English, the most represented languages). We transform\nthe metadata of each paper into a citation network, producing a directed\nheterogeneous graph. This graph includes uniquely identified authors on HAL, as\nwell as all open submitted papers, and their citations. 
We provide a baseline\nfor authorship attribution using the dataset, implement a range of\nstate-of-the-art models in graph representation learning for link prediction,\nand discuss the usefulness of our generated knowledge graph structure.\n","authors":["Francis Kulumba","Wissam Antoun","Guillaume Vimont","Laurent Romary"],"pdf_url":"https://arxiv.org/pdf/2407.20595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20588v1","updated":"2024-07-30T06:49:55Z","published":"2024-07-30T06:49:55Z","title":"Enhancing Agricultural Machinery Management through Advanced LLM\n Integration","summary":" The integration of artificial intelligence into agricultural practices,\nspecifically through Consultation on Intelligent Agricultural Machinery\nManagement (CIAMM), has the potential to revolutionize efficiency and\nsustainability in farming. This paper introduces a novel approach that\nleverages large language models (LLMs), particularly GPT-4, combined with\nmulti-round prompt engineering to enhance decision-making processes in\nagricultural machinery management. We systematically developed and refined\nprompts to guide the LLMs in generating precise and contextually relevant\noutputs. Our approach was evaluated using a manually curated dataset from\nvarious online sources, and performance was assessed with accuracy and GPT-4\nScores. Comparative experiments were conducted using LLama-2-70B, ChatGPT, and\nGPT-4 models, alongside baseline and state-of-the-art methods such as Chain of\nThought (CoT) and Thought of Thought (ThoT). The results demonstrate that our\nmethod significantly outperforms these approaches, achieving higher accuracy\nand relevance in generated responses. This paper highlights the potential of\nadvanced prompt engineering techniques in improving the robustness and\napplicability of AI in agricultural contexts.\n","authors":["Emily Johnson","Noah Wilson"],"pdf_url":"https://arxiv.org/pdf/2407.20588v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.15334v2","updated":"2024-07-30T06:36:22Z","published":"2023-08-29T14:29:57Z","title":"The Responsible Development of Automated Student Feedback with\n Generative AI","summary":" Contribution: This paper identifies four critical ethical considerations for\nimplementing generative AI tools to provide automated feedback to students.\n Background: Providing rich feedback to students is essential for supporting\nstudent learning. Recent advances in generative AI, particularly with large\nlanguage models (LLMs), provide the opportunity to deliver repeatable, scalable\nand instant automatically generated feedback to students, making abundant a\npreviously scarce and expensive learning resource. Such an approach is feasible\nfrom a technical perspective due to these recent advances in Artificial\nIntelligence (AI) and Natural Language Processing (NLP); while the potential\nupside is a strong motivator, doing so introduces a range of potential ethical\nissues that must be considered as we apply these technologies.\n Intended Outcomes: The goal of this work is to enable the use of AI systems\nto automate mundane assessment and feedback tasks, without introducing a\n\"tyranny of the majority\", where the needs of minorities in the long tail are\noverlooked because they are difficult to automate.\n Application Design: This paper applies an extant ethical framework used for\nAI and machine learning to the specific challenge of providing automated\nfeedback to student engineers. 
The task is considered from both a development\nand maintenance perspective, considering how automated feedback tools will\nevolve and be used over time.\n Findings: This paper identifies four key ethical considerations for the\nimplementation of automated feedback for students: Participation, Development,\nImpact on Learning and Evolution over Time.\n","authors":["Euan D Lindsay","Mike Zhang","Aditya Johri","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2308.15334v2.pdf","comment":"Under review at IEEE ToE"},{"id":"http://arxiv.org/abs/2407.20584v1","updated":"2024-07-30T06:33:44Z","published":"2024-07-30T06:33:44Z","title":"Pruning Large Language Models with Semi-Structural Adaptive Sparse\n Training","summary":" Transformer-based Large Language Models (LLMs) have demonstrated remarkable\nsuccess across various challenging tasks. However, the deployment of LLMs is\nhindered by their substantial parameter count and memory consumption. Recently,\nnumerous studies have attempted to compress LLMs by pruning them using\ntraining-free methods. However, these pruned models often experience\nsignificant performance degradation on complex tasks. To address this issue, we\npropose a novel training pipeline for semi-structured sparse models, named\nAdaptive Sparse Trainer (AST). By distilling the knowledge stored in its dense\ncounterpart, we prevent the sparse model from overfitting and ensure a stable\ntraining process. Moreover, AST allows the model to adaptively select better\nlottery tickets (e.g., masks) during training. Additionally, we discovered that\nadding extra well-initialized parameters can further enhance model performance\nwith only a small increase in memory footprint. Our method significantly\nnarrows the performance gap between dense and sparse models while maintaining\nlimited computational cost. Furthermore, when combined with existing\nquantization methods, AST can compress language models by up to 16x compared to\ndense FP32 precision models with minimal performance loss. AST outperforms\nprevious state-of-the-art methods by reducing the zero-shot accuracy gap\nbetween dense and semi-structured sparse models to 1.12% across multiple\nzero-shot tasks on Llama2-7B, using less than 0.4% of the pretraining tokens.\n","authors":["Weiyu Huang","Guohao Jian","Yuezhou Hu","Jun Zhu","Jianfei Chen"],"pdf_url":"https://arxiv.org/pdf/2407.20584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20581v1","updated":"2024-07-30T06:29:01Z","published":"2024-07-30T06:29:01Z","title":"Knesset-DictaBERT: A Hebrew Language Model for Parliamentary Proceedings","summary":" We present Knesset-DictaBERT, a large Hebrew language model fine-tuned on the\nKnesset Corpus, which comprises Israeli parliamentary proceedings. The model is\nbased on the DictaBERT architecture and demonstrates significant improvements\nin understanding parliamentary language according to the MLM task. We provide a\ndetailed evaluation of the model's performance, showing improvements in\nperplexity and accuracy over the baseline DictaBERT model.\n","authors":["Gili Goldin","Shuly Wintner"],"pdf_url":"https://arxiv.org/pdf/2407.20581v1.pdf","comment":"3 pages, 1 table"},{"id":"http://arxiv.org/abs/2407.20578v1","updated":"2024-07-30T06:23:59Z","published":"2024-07-30T06:23:59Z","title":"Comparison of Large Language Models for Generating Contextually Relevant\n Questions","summary":" This study explores the effectiveness of Large Language Models (LLMs) for\nAutomatic Question Generation in educational settings. 
Three LLMs are compared\nin their ability to create questions from university slide text without\nfine-tuning. Questions were obtained in a two-step pipeline: first, answer\nphrases were extracted from slides using Llama 2-Chat 13B; then, the three\nmodels generated questions for each answer. To analyze whether the questions\nwould be suitable in educational applications for students, a survey was\nconducted with 46 students who evaluated a total of 246 questions across five\nmetrics: clarity, relevance, difficulty, slide relation, and question-answer\nalignment. Results indicate that GPT-3.5 and Llama 2-Chat 13B outperform Flan\nT5 XXL by a small margin, particularly in terms of clarity and question-answer\nalignment. GPT-3.5 especially excels at tailoring questions to match the input\nanswers. The contribution of this research is the analysis of the capacity of\nLLMs for Automatic Question Generation in education.\n","authors":["Ivo Lodovico Molina","Valdemar Švábenský","Tsubasa Minematsu","Li Chen","Fumiya Okubo","Atsushi Shimada"],"pdf_url":"https://arxiv.org/pdf/2407.20578v1.pdf","comment":"Published in Springer ECTEL 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2402.16654v2","updated":"2024-07-30T06:04:31Z","published":"2024-02-26T15:26:56Z","title":"GigaPevt: Multimodal Medical Assistant","summary":" Building an intelligent and efficient medical assistant is still a\nchallenging AI problem. The major limitation comes from the data modality\nscarceness, which reduces comprehensive patient perception. This demo paper\npresents the GigaPevt, the first multimodal medical assistant that combines the\ndialog capabilities of large language models with specialized medical models.\nSuch an approach shows immediate advantages in dialog quality and metric\nperformance, with a 1.18% accuracy improvement in the question-answering task.\n","authors":["Pavel Blinov","Konstantin Egorov","Ivan Sviridov","Nikolay Ivanov","Stepan Botman","Evgeniy Tagin","Stepan Kudin","Galina Zubkova","Andrey Savchenko"],"pdf_url":"https://arxiv.org/pdf/2402.16654v2.pdf","comment":"IJCAI 2024, 4 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.20564v1","updated":"2024-07-30T05:40:32Z","published":"2024-07-30T05:40:32Z","title":"CLR-Fact: Evaluating the Complex Logical Reasoning Capability of Large\n Language Models over Factual Knowledge","summary":" While large language models (LLMs) have demonstrated impressive capabilities\nacross various natural language processing tasks by acquiring rich factual\nknowledge from their broad training data, their ability to synthesize and\nlogically reason with this knowledge in complex ways remains underexplored. In\nthis work, we present a systematic evaluation of state-of-the-art LLMs' complex\nlogical reasoning abilities through a novel benchmark of automatically\ngenerated complex reasoning questions over general domain and biomedical\nknowledge graphs. Our extensive experiments, employing diverse in-context\nlearning techniques, reveal that LLMs excel at reasoning over general world\nknowledge but face significant challenges with specialized domain-specific\nknowledge. We find that prompting with explicit Chain-of-Thought demonstrations\ncan substantially improve LLM performance on complex logical reasoning tasks\nwith diverse logical operations. 
Interestingly, our controlled evaluations\nuncover an asymmetry where LLMs display proficiency at set union operations,\nbut struggle considerably with set intersections - a key building block of\nlogical reasoning. To foster further work, we will publicly release our\nevaluation benchmark and code.\n","authors":["Tianshi Zheng","Jiaxin Bai","Yicheng Wang","Tianqing Fang","Yue Guo","Yauwai Yim","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2407.20564v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2407.20556v1","updated":"2024-07-30T05:22:31Z","published":"2024-07-30T05:22:31Z","title":"Survey of Design Paradigms for Social Robots","summary":" The demand for social robots in fields like healthcare, education, and\nentertainment increases due to their emotional adaptation features. These\nrobots leverage multimodal communication, incorporating speech, facial\nexpressions, and gestures to enhance user engagement and emotional support. The\nunderstanding of design paradigms of social robots is obstructed by the\ncomplexity of the system and the necessity to tune it to a specific task. This\narticle provides a structured review of social robot design paradigms,\ncategorizing them into cognitive architectures, role design models, linguistic\nmodels, communication flow, activity system models, and integrated design\nmodels. By breaking down the articles on social robot design and application\nbased on these paradigms, we highlight the strengths and areas for improvement\nin current approaches. We further propose our original integrated design model\nthat combines the most important aspects of the design of social robots. Our\napproach shows the importance of integrating operational, communicational, and\nemotional dimensions to create more adaptive and empathetic interactions\nbetween robots and humans.\n","authors":["Rita Frieske","Xiaoyu Mo","Yini Fang","Jay Nieles","Bertram E. Shi"],"pdf_url":"https://arxiv.org/pdf/2407.20556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08199v3","updated":"2024-07-30T05:12:28Z","published":"2022-09-16T23:49:00Z","title":"ScreenQA: Large-Scale Question-Answer Pairs over Mobile App Screenshots","summary":" We present a new benchmark and dataset, ScreenQA, for screen content\nunderstanding via question answering. The existing screen datasets are focused\neither on structure and component-level understanding, or on a much\nhigher-level composite task such as navigation and task completion. We attempt\nto bridge the gap between these two by annotating 86K question-answer pairs\nover the RICO dataset in hope to benchmark the screen reading comprehension\ncapacity. This work is also the first to annotate answers for different\napplication scenarios, including both full sentences and short forms, as well\nas supporting UI contents on screen and their bounding boxes. 
With the rich\nannotation, we discuss and define the evaluation metrics of the benchmark, show\napplications of the dataset, and provide a few baselines using closed and open\nsource models.\n","authors":["Yu-Chung Hsiao","Fedir Zubach","Gilles Baechler","Victor Carbune","Jason Lin","Maria Wang","Srinivas Sunkara","Yun Zhu","Jindong Chen"],"pdf_url":"https://arxiv.org/pdf/2209.08199v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16711v2","updated":"2024-07-30T04:48:26Z","published":"2024-07-22T17:52:12Z","title":"Benchmarks as Microscopes: A Call for Model Metrology","summary":" Modern language models (LMs) pose a new challenge in capability assessment.\nStatic benchmarks inevitably saturate without providing confidence in the\ndeployment tolerances of LM-based systems, but developers nonetheless claim\nthat their models have generalized traits such as reasoning or open-domain\nlanguage understanding based on these flawed metrics. The science and practice\nof LMs requires a new approach to benchmarking which measures specific\ncapabilities with dynamic assessments. To be confident in our metrics, we need\na new discipline of model metrology -- one which focuses on how to generate\nbenchmarks that predict performance under deployment. Motivated by our\nevaluation criteria, we outline how building a community of model metrology\npractitioners -- one focused on building tools and studying how to measure\nsystem capabilities -- is the best way to meet these needs and add clarity\nto the AI discussion.\n","authors":["Michael Saxon","Ari Holtzman","Peter West","William Yang Wang","Naomi Saphra"],"pdf_url":"https://arxiv.org/pdf/2407.16711v2.pdf","comment":"Conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2406.12793v2","updated":"2024-07-30T03:58:11Z","published":"2024-06-18T16:58:21Z","title":"ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All\n Tools","summary":" We introduce ChatGLM, an evolving family of large language models that we\nhave been developing over time. This report primarily focuses on the GLM-4\nlanguage series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent\nour most capable models that are trained with all the insights and lessons\ngained from the preceding three generations of ChatGLM. To date, the GLM-4\nmodels are pre-trained on ten trillion tokens mostly in Chinese and\nEnglish, along with a small set of corpora from 24 languages, and aligned\nprimarily for Chinese and English usage. The high-quality alignment is achieved\nvia a multi-stage post-training process, which involves supervised fine-tuning\nand learning from human feedback. Evaluations show that GLM-4 1) closely rivals\nor outperforms GPT-4 in terms of general metrics such as MMLU, GSM8K, MATH,\nBBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following\nas measured by IFEval, 3) matches GPT-4 Turbo (128K) and Claude 3 for long\ncontext tasks, and 4) outperforms GPT-4 in Chinese alignments as measured by\nAlignBench. The GLM-4 All Tools model is further aligned to understand user\nintent and autonomously decide when and which tool(s) to use -- including web\nbrowser, Python interpreter, text-to-image model, and user-defined functions --\nto effectively complete complex tasks. In practical applications, it matches\nand even surpasses GPT-4 All Tools in tasks like accessing online information\nvia web browsing and solving math problems using the Python interpreter. 
Over the\ncourse, we have open-sourced a series of models, including ChatGLM-6B (three\ngenerations), GLM-4-9B (128K, 1M), GLM-4V-9B, WebGLM, and CodeGeeX, attracting\nover 10 million downloads on Hugging face in the year 2023 alone. The open\nmodels can be accessed through https://github.com/THUDM and\nhttps://huggingface.co/THUDM.\n","authors":["Team GLM"," :","Aohan Zeng","Bin Xu","Bowen Wang","Chenhui Zhang","Da Yin","Dan Zhang","Diego Rojas","Guanyu Feng","Hanlin Zhao","Hanyu Lai","Hao Yu","Hongning Wang","Jiadai Sun","Jiajie Zhang","Jiale Cheng","Jiayi Gui","Jie Tang","Jing Zhang","Jingyu Sun","Juanzi Li","Lei Zhao","Lindong Wu","Lucen Zhong","Mingdao Liu","Minlie Huang","Peng Zhang","Qinkai Zheng","Rui Lu","Shuaiqi Duan","Shudan Zhang","Shulin Cao","Shuxun Yang","Weng Lam Tam","Wenyi Zhao","Xiao Liu","Xiao Xia","Xiaohan Zhang","Xiaotao Gu","Xin Lv","Xinghan Liu","Xinyi Liu","Xinyue Yang","Xixuan Song","Xunkai Zhang","Yifan An","Yifan Xu","Yilin Niu","Yuantao Yang","Yueyan Li","Yushi Bai","Yuxiao Dong","Zehan Qi","Zhaoyu Wang","Zhen Yang","Zhengxiao Du","Zhenyu Hou","Zihan Wang"],"pdf_url":"https://arxiv.org/pdf/2406.12793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20524v1","updated":"2024-07-30T03:50:10Z","published":"2024-07-30T03:50:10Z","title":"Contrastive Feedback Mechanism for Simultaneous Speech Translation","summary":" Recent advances in simultaneous speech translation (SST) focus on the\ndecision policies that enable the use of offline-trained ST models for\nsimultaneous inference. These decision policies not only control the\nquality-latency trade-off in SST but also mitigate the impact of unstable\npredictions on translation quality by delaying translation for more context or\ndiscarding these predictions through stable hypothesis detection. However,\nthese policies often overlook the potential benefits of utilizing unstable\npredictions. We introduce the contrastive feedback mechanism (CFM) for SST, a\nnovel method that leverages these unstable predictions as feedback to improve\ntranslation quality. CFM guides the system to eliminate undesired model\nbehaviors from these predictions through a contrastive objective. The\nexperiments on 3 state-of-the-art decision policies across 8 languages in the\nMuST-C v1.0 dataset show that CFM effectively improves the performance of SST.\n","authors":["Haotian Tan","Sakriani Sakti"],"pdf_url":"https://arxiv.org/pdf/2407.20524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20516v1","updated":"2024-07-30T03:26:09Z","published":"2024-07-30T03:26:09Z","title":"Machine Unlearning in Generative AI: A Survey","summary":" Generative AI technologies have been deployed in many places, such as\n(multimodal) large language models and vision generative models. Their\nremarkable performance should be attributed to massive training data and\nemergent reasoning abilities. However, the models would memorize and generate\nsensitive, biased, or dangerous information originated from the training data\nespecially those from web crawl. New machine unlearning (MU) techniques are\nbeing developed to reduce or eliminate undesirable knowledge and its effects\nfrom the models, because those that were designed for traditional\nclassification tasks could not be applied for Generative AI. We offer a\ncomprehensive survey on many things about MU in Generative AI, such as a new\nproblem formulation, evaluation methods, and a structured discussion on the\nadvantages and limitations of different kinds of MU techniques. 
It also\npresents several critical challenges and promising directions in MU research. A\ncurated list of readings can be found at:\nhttps://github.com/franciscoliu/GenAI-MU-Reading.\n","authors":["Zheyuan Liu","Guangyao Dou","Zhaoxuan Tan","Yijun Tian","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.20516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08407v3","updated":"2024-07-30T03:15:55Z","published":"2024-06-12T16:54:54Z","title":"MMWorld: Towards Multi-discipline Multi-faceted World Model Evaluation\n in Videos","summary":" Multimodal Large Language Models (MLLMs) demonstrate the emerging\nabilities of \"world models\" -- interpreting and reasoning about complex\nreal-world dynamics. To assess these abilities, we posit videos are the ideal\nmedium, as they encapsulate rich representations of real-world dynamics and\ncausalities. To this end, we introduce MMWorld, a new benchmark for\nmulti-discipline, multi-faceted multimodal video understanding. MMWorld\ndistinguishes itself from previous video understanding benchmarks with two\nunique advantages: (1) multi-discipline, covering various disciplines that\noften require domain expertise for comprehensive understanding; (2)\nmulti-faceted reasoning, including explanation, counterfactual thinking, future\nprediction, etc. MMWorld consists of a human-annotated dataset to evaluate\nMLLMs with questions about the whole videos and a synthetic dataset to analyze\nMLLMs within a single modality of perception. Together, MMWorld encompasses\n1,910 videos across seven broad disciplines and 69 subdisciplines, complete\nwith 6,627 question-answer pairs and associated captions. The evaluation\nincludes 2 proprietary and 10 open-source MLLMs, which struggle on MMWorld\n(e.g., GPT-4V performs the best with only 52.3% accuracy), showing large room\nfor improvement. Further ablation studies reveal other interesting findings\nsuch as models' different skill sets from humans. We hope MMWorld can serve as\nan essential step towards world model evaluation in videos.\n","authors":["Xuehai He","Weixi Feng","Kaizhi Zheng","Yujie Lu","Wanrong Zhu","Jiachen Li","Yue Fan","Jianfeng Wang","Linjie Li","Zhengyuan Yang","Kevin Lin","William Yang Wang","Lijuan Wang","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2406.08407v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20513v1","updated":"2024-07-30T03:10:30Z","published":"2024-07-30T03:10:30Z","title":"Prompt2DeModel: Declarative Neuro-Symbolic Modeling with Natural\n Language","summary":" This paper presents a conversational pipeline for crafting domain knowledge\nfor complex neuro-symbolic models through natural language prompts. It\nleverages large language models to generate declarative programs in the\nDomiKnowS framework. The programs in this framework express concepts and their\nrelationships as a graph in addition to logical constraints between them. The\ngraph, later, can be connected to trainable neural models according to those\nspecifications. Our proposed pipeline utilizes techniques like dynamic\nin-context demonstration retrieval, model refinement based on feedback from a\nsymbolic parser, visualization, and user interaction to generate the tasks'\nstructure and formal knowledge representation. 
This approach empowers domain\nexperts, even those not well-versed in ML/AI, to formally declare their\nknowledge to be incorporated in customized neural models in the DomiKnowS\nframework.\n","authors":["Hossein Rajaby Faghihi","Aliakbar Nafar","Andrzej Uszok","Hamid Karimian","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2407.20513v1.pdf","comment":"Accepted in NeSy 2024 Conference"},{"id":"http://arxiv.org/abs/2402.03181v5","updated":"2024-07-30T02:47:47Z","published":"2024-02-05T16:46:16Z","title":"C-RAG: Certified Generation Risks for Retrieval-Augmented Language\n Models","summary":" Despite the impressive capabilities of large language models (LLMs) across\ndiverse applications, they still suffer from trustworthiness issues, such as\nhallucinations and misalignments. Retrieval-augmented language models (RAG)\nhave been proposed to enhance the credibility of generations by grounding\nexternal knowledge, but the theoretical understandings of their generation\nrisks remains unexplored. In this paper, we answer: 1) whether RAG can indeed\nlead to low generation risks, 2) how to provide provable guarantees on the\ngeneration risks of RAG and vanilla LLMs, and 3) what sufficient conditions\nenable RAG models to reduce generation risks. We propose C-RAG, the first\nframework to certify generation risks for RAG models. Specifically, we provide\nconformal risk analysis for RAG models and certify an upper confidence bound of\ngeneration risks, which we refer to as conformal generation risk. We also\nprovide theoretical guarantees on conformal generation risks for general\nbounded risk functions under test distribution shifts. We prove that RAG\nachieves a lower conformal generation risk than that of a single LLM when the\nquality of the retrieval model and transformer is non-trivial. Our intensive\nempirical results demonstrate the soundness and tightness of our conformal\ngeneration risk guarantees across four widely-used NLP datasets on four\nstate-of-the-art retrieval models.\n","authors":["Mintong Kang","Nezihe Merve Gürel","Ning Yu","Dawn Song","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2402.03181v5.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2407.05694v2","updated":"2024-07-30T02:37:56Z","published":"2024-07-08T07:53:06Z","title":"On the Limitations of Compute Thresholds as a Governance Strategy","summary":" At face value, this essay is about understanding a fairly esoteric governance\ntool called compute thresholds. However, in order to grapple with whether these\nthresholds will achieve anything, we must first understand how they came to be.\nTo do so, we need to engage with a decades-old debate at the heart of computer\nscience progress, namely, is bigger always better? Does a certain inflection\npoint of compute result in changes to the risk profile of a model? Hence, this\nessay may be of interest not only to policymakers and the wider public but also\nto computer scientists interested in understanding the role of compute in\nunlocking breakthroughs. This discussion is timely given the wide adoption of\ncompute thresholds in both the White House Executive Orders on AI Safety (EO)\nand the EU AI Act to identify more risky systems. A key conclusion of this\nessay is that compute thresholds, as currently implemented, are shortsighted\nand likely to fail to mitigate risk. The relationship between compute and risk\nis highly uncertain and rapidly changing. 
Relying upon compute thresholds\noverestimates our ability to predict what abilities emerge at different scales.\nThis essay ends with recommendations for a better way forward.\n","authors":["Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2407.05694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12840v2","updated":"2024-07-30T02:19:13Z","published":"2024-02-20T09:07:41Z","title":"ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic","summary":" The focus of language model evaluation has transitioned towards reasoning and\nknowledge-intensive tasks, driven by advancements in pretraining large models.\nWhile state-of-the-art models are partially trained on large Arabic texts,\nevaluating their performance in Arabic remains challenging due to the limited\navailability of relevant datasets. To bridge this gap, we present\n\\datasetname{}, the first multi-task language understanding benchmark for the\nArabic language, sourced from school exams across diverse educational levels in\ndifferent countries spanning North Africa, the Levant, and the Gulf regions.\nOur data comprises 40 tasks and 14,575 multiple-choice questions in Modern\nStandard Arabic (MSA) and is carefully constructed by collaborating with native\nspeakers in the region. Our comprehensive evaluations of 35 models reveal\nsubstantial room for improvement, particularly among the best open-source\nmodels. Notably, BLOOMZ, mT0, LLaMA2, and Falcon struggle to achieve a score of\n50%, while even the top-performing Arabic-centric model only achieves a score\nof 62.3%.\n","authors":["Fajri Koto","Haonan Li","Sara Shatnawi","Jad Doughman","Abdelrahman Boda Sadallah","Aisha Alraeesi","Khalid Almubarak","Zaid Alyafeai","Neha Sengupta","Shady Shehata","Nizar Habash","Preslav Nakov","Timothy Baldwin"],"pdf_url":"https://arxiv.org/pdf/2402.12840v2.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2407.19594v2","updated":"2024-07-30T01:38:06Z","published":"2024-07-28T21:58:28Z","title":"Meta-Rewarding Language Models: Self-Improving Alignment with\n LLM-as-a-Meta-Judge","summary":" Large Language Models (LLMs) are rapidly surpassing human knowledge in many\ndomains. While improving these models traditionally relies on costly human\ndata, recent self-rewarding mechanisms (Yuan et al., 2024) have shown that LLMs\ncan improve by judging their own responses instead of relying on human\nlabelers. However, existing methods have primarily focused on improving model\nresponses rather than judgment capabilities, resulting in rapid saturation\nduring iterative training. To address this issue, we introduce a novel\nMeta-Rewarding step to the self-improvement process, where the model judges its\nown judgements and uses that feedback to refine its judgment skills.\nSurprisingly, this unsupervised approach improves the model's ability to judge\n{\\em and} follow instructions, as demonstrated by a win rate improvement of\nLlama-3-8B-Instruct from 22.9% to 39.4% on AlpacaEval 2, and 20.6% to 29.1% on\nArena-Hard. 
These results strongly suggest the potential for self-improving\nmodels without human supervision.\n","authors":["Tianhao Wu","Weizhe Yuan","Olga Golovneva","Jing Xu","Yuandong Tian","Jiantao Jiao","Jason Weston","Sainbayar Sukhbaatar"],"pdf_url":"https://arxiv.org/pdf/2407.19594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20485v1","updated":"2024-07-30T01:13:42Z","published":"2024-07-30T01:13:42Z","title":"A2SF: Accumulative Attention Scoring with Forgetting Factor for Token\n Pruning in Transformer Decoder","summary":" Recently, large language models (LLM) based on transformers are facing memory\nbottleneck issues due to KV cache, especially in long sequence handling.\nPrevious researches proposed KV cache compression techniques that identify\ninsignificant tokens based on Accumulative Attention Scores and removes their\nitems from KV cache, noting that only few tokens play an important role in\nattention operations. However, we have observed that the existing Accumulative\nAttention Score is not suitable for the transformer decoder structure. In the\ndecoder model, the number of times the Attention Score accumulates varies\ndepending on the order of token appearance due to the effect of masking,\ncausing an uneven comparison between tokens. To solve this, we propose\nAccumulative Attention Score with Forgetting Factor (A2SF) technique, which\nintroduces a Forgetting Factor in the Attention Score accumulation process.\nA2SF applies a penalty to the past Attention Score generated from old tokens by\nrepeatedly multiplying the Forgetting Factor to the Attention Score over time.\nTherefore, older tokens receive a larger penalty, providing fairness among\ndifferent ages of tokens. Through the fair comparison among tokens, we can more\neffectively select important tokens. We have verified the accuracy improvement\nthrough A2SF in the OPT and LLaMA models and A2SF improves the accuracy of\nLLaMA 2 by up to 7.8% and 5.1% on 1-shot and 0-shot.\n","authors":["Hyun Rae Jo","Dong Kun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.20485v1.pdf","comment":"11 pages(9 pages + reference 2 pages), 6 figures"},{"id":"http://arxiv.org/abs/2407.21248v1","updated":"2024-07-30T23:43:59Z","published":"2024-07-30T23:43:59Z","title":"Adaptive Pre-training Data Detection for Large Language Models via\n Surprising Tokens","summary":" While large language models (LLMs) are extensively used, there are raising\nconcerns regarding privacy, security, and copyright due to their opaque\ntraining data, which brings the problem of detecting pre-training data on the\ntable. Current solutions to this problem leverage techniques explored in\nmachine learning privacy such as Membership Inference Attacks (MIAs), which\nheavily depend on LLMs' capability of verbatim memorization. However, this\nreliance presents challenges, especially given the vast amount of training data\nand the restricted number of effective training epochs. In this paper, we\npropose an adaptive pre-training data detection method which alleviates this\nreliance and effectively amplify the identification. Our method adaptively\nlocates \\textit{surprising tokens} of the input. A token is surprising to a LLM\nif the prediction on the token is \"certain but wrong\", which refers to low\nShannon entropy of the probability distribution and low probability of the\nground truth token at the same time. 
By using the prediction probability of\nsurprising tokens to measure \\textit{surprising}, the detection method is\nachieved based on the simple hypothesis that seeing seen data is less\nsurprising for the model compared with seeing unseen data. The method can be\napplied without any access to the the pre-training data corpus or additional\ntraining like reference models. Our approach exhibits a consistent enhancement\ncompared to existing methods in diverse experiments conducted on various\nbenchmarks and models, achieving a maximum improvement of 29.5\\%. We also\nintroduce a new benchmark Dolma-Book developed upon a novel framework, which\nemploys book data collected both before and after model training to provide\nfurther evaluation.\n","authors":["Anqi Zhang","Chaofeng Wu"],"pdf_url":"https://arxiv.org/pdf/2407.21248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11514v3","updated":"2024-07-30T23:37:20Z","published":"2023-12-12T18:57:08Z","title":"LLM in a flash: Efficient Large Language Model Inference with Limited\n Memory","summary":" Large language models (LLMs) are central to modern natural language\nprocessing, delivering exceptional performance in various tasks. However, their\nsubstantial computational and memory requirements present challenges,\nespecially for devices with limited DRAM capacity. This paper tackles the\nchallenge of efficiently running LLMs that exceed the available DRAM capacity\nby storing the model parameters in flash memory, but bringing them on demand to\nDRAM. Our method involves constructing an inference cost model that takes into\naccount the characteristics of flash memory, guiding us to optimize in two\ncritical areas: reducing the volume of data transferred from flash and reading\ndata in larger, more contiguous chunks. Within this hardware-informed\nframework, we introduce two principal techniques. First, \"windowing\"\nstrategically reduces data transfer by reusing previously activated neurons,\nand second, \"row-column bundling\", tailored to the sequential data access\nstrengths of flash memory, increases the size of data chunks read from flash\nmemory. These methods collectively enable running models up to twice the size\nof the available DRAM, with a 4-5x and 20-25x increase in inference speed\ncompared to naive loading approaches in CPU and GPU, respectively. Our\nintegration of sparsity awareness, context-adaptive loading, and a\nhardware-oriented design paves the way for effective inference of LLMs on\ndevices with limited memory.\n","authors":["Keivan Alizadeh","Iman Mirzadeh","Dmitry Belenko","Karen Khatamifard","Minsik Cho","Carlo C Del Mundo","Mohammad Rastegari","Mehrdad Farajtabar"],"pdf_url":"https://arxiv.org/pdf/2312.11514v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.21229v1","updated":"2024-07-30T22:32:50Z","published":"2024-07-30T22:32:50Z","title":"Advancing Vietnamese Visual Question Answering with Transformer and\n Convolutional Integration","summary":" Visual Question Answering (VQA) has recently emerged as a potential research\ndomain, captivating the interest of many in the field of artificial\nintelligence and computer vision. Despite the prevalence of approaches in\nEnglish, there is a notable lack of systems specifically developed for certain\nlanguages, particularly Vietnamese. This study aims to bridge this gap by\nconducting comprehensive experiments on the Vietnamese Visual Question\nAnswering (ViVQA) dataset, demonstrating the effectiveness of our proposed\nmodel. 
In response to community interest, we have developed a model that\nenhances image representation capabilities, thereby improving overall\nperformance in the ViVQA system. Specifically, our model integrates the\nBootstrapping Language-Image Pre-training with frozen unimodal models (BLIP-2)\nand the convolutional neural network EfficientNet to extract and process both\nlocal and global features from images. This integration leverages the strengths\nof transformer-based architectures for capturing comprehensive contextual\ninformation and convolutional networks for detailed local features. By freezing\nthe parameters of these pre-trained models, we significantly reduce the\ncomputational cost and training time, while maintaining high performance. This\napproach significantly improves image representation and enhances the\nperformance of existing VQA systems. We then leverage a multi-modal fusion\nmodule based on a general-purpose multi-modal foundation model (BEiT-3) to fuse\nthe information between visual and textual features. Our experimental findings\ndemonstrate that our model surpasses competing baselines, achieving promising\nperformance. This is particularly evident in its accuracy of $71.04\\%$ on the\ntest set of the ViVQA dataset, marking a significant advancement in our\nresearch area. The code is available at https://github.com/nngocson2002/ViVQA.\n","authors":["Ngoc Son Nguyen","Van Son Nguyen","Tung Le"],"pdf_url":"https://arxiv.org/pdf/2407.21229v1.pdf","comment":"Accepted at the journal of Computers & Electrical Engineering\n (Received 8 March 2024, Revised 8 June 2024, Accepted 10 July 2024)"},{"id":"http://arxiv.org/abs/2403.03861v3","updated":"2024-07-30T22:19:20Z","published":"2024-03-06T17:11:38Z","title":"Designing Informative Metrics for Few-Shot Example Selection","summary":" Pretrained language models (PLMs) have shown remarkable few-shot learning\ncapabilities when provided with properly formatted examples. However, selecting\nthe \"best\" examples remains an open challenge. We propose a complexity-based\nprompt selection approach for sequence tagging tasks. This approach avoids the\ntraining of a dedicated model for selection of examples, and instead uses\ncertain metrics to align the syntactico-semantic complexity of test sentences\nand examples. We use both sentence- and word-level metrics to match the\ncomplexity of examples to the (test) sentence being considered. Our results\ndemonstrate that our approach extracts greater performance from PLMs: it\nachieves state-of-the-art performance on few-shot NER, achieving a 5% absolute\nimprovement in F1 score on the CoNLL2003 dataset for GPT-4. We also see large\ngains of upto 28.85 points (F1/Acc.) in smaller models like GPT-j-6B.\n","authors":["Rishabh Adiga","Lakshminarayanan Subramanian","Varun Chandrasekaran"],"pdf_url":"https://arxiv.org/pdf/2403.03861v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15419v2","updated":"2024-07-30T21:28:36Z","published":"2023-08-29T16:24:09Z","title":"Characterizing Learning Curves During Language Model Pre-Training:\n Learning, Forgetting, and Stability","summary":" How do language models learn to make predictions during pre-training? To\nstudy this, we extract learning curves from five autoregressive English\nlanguage model pre-training runs, for 1M unseen tokens in context. We observe\nthat the language models generate short repetitive phrases before learning to\ngenerate longer and more coherent text. 
We also find that individual tokens\noften exhibit sudden increases or decreases in loss that are surprisingly\nconsistent across pre-training runs. To better understand these fluctuations,\nwe quantify the final surprisal, within-run variability, age of acquisition,\nforgettability, and cross-run variability of learning curves for individual\ntokens in context. More frequent tokens reach lower final surprisals, exhibit\nless variability within and across pre-training runs, are learned earlier, and\nare less likely to be \"forgotten\" during pre-training. Higher n-gram\nprobabilities further accentuate these effects. Independent of the target\ntoken, shorter and more frequent contexts correlate with marginally more stable\nand quickly acquired predictions. Based on our results, we argue for the\nexistence of sequential learning dependencies between different model\ncapabilities, and we characterize language model learning as early n-gram\nlearning before gradual refinement of tail n-gram predictions.\n","authors":["Tyler A. Chang","Zhuowen Tu","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2308.15419v2.pdf","comment":"Accepted to TACL (pre-MIT Press version)"},{"id":"http://arxiv.org/abs/2407.21191v1","updated":"2024-07-30T20:58:36Z","published":"2024-07-30T20:58:36Z","title":"GenRec: Generative Personalized Sequential Recommendation","summary":" Sequential recommendation is a task to capture hidden user preferences from\nhistorical user item interaction data. Significant progress has been made in\nthis domain by leveraging classification based learning methods. Inspired by\nthe recent paradigm of 'pretrain, prompt and predict' in NLP, we consider\nsequential recommendation as a sequence to sequence generation task and propose\na novel model named Generative Recommendation (GenRec). Unlike classification\nbased models that learn explicit user and item representations, GenRec utilizes\nthe sequence modeling capability of Transformer and adopts the masked item\nprediction objective to effectively learn the hidden bidirectional sequential\npatterns. Different from existing generative sequential recommendation models,\nGenRec does not rely on manually designed hard prompts. The input to GenRec is\ntextual user item sequence and the output is top ranked next items. Moreover,\nGenRec is lightweight and requires only a few hours to train effectively in\nlow-resource settings, making it highly applicable to real-world scenarios and\nhelping to democratize large language models in the sequential recommendation\ndomain. Our extensive experiments have demonstrated that GenRec generalizes on\nvarious public real-world datasets and achieves state-of-the-art results. Our\nexperiments also validate the effectiveness of the the proposed masked item\nprediction objective that improves the model performance by a large margin.\n","authors":["Panfeng Cao","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2407.21191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21170v1","updated":"2024-07-30T20:24:44Z","published":"2024-07-30T20:24:44Z","title":"Decomposed Prompting to Answer Questions on a Course Discussion Board","summary":" We propose and evaluate a question-answering system that uses decomposed\nprompting to classify and answer student questions on a course discussion\nboard. Our system uses a large language model (LLM) to classify questions into\none of four types: conceptual, homework, logistics, and not answerable. 
This\nenables us to employ a different strategy for answering questions that fall\nunder different types. Using a variant of GPT-3, we achieve $81\\%$\nclassification accuracy. We discuss our system's performance on answering\nconceptual questions from a machine learning course and various failure modes.\n","authors":["Brandon Jaipersaud","Paul Zhang","Jimmy Ba","Andrew Petersen","Lisa Zhang","Michael R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21170v1.pdf","comment":"6 pages. Published at International Conference on Artificial\n Intelligence in Education 2023. Code repository:\n https://github.com/brandonjaipersaud/piazza-qabot-gpt"},{"id":"http://arxiv.org/abs/2407.21153v1","updated":"2024-07-30T19:32:22Z","published":"2024-07-30T19:32:22Z","title":"Event-Arguments Extraction Corpus and Modeling using BERT for Arabic","summary":" Event-argument extraction is a challenging task, particularly in Arabic due\nto sparse linguistic resources. To fill this gap, we introduce the \\hadath\ncorpus ($550$k tokens) as an extension of Wojood, enriched with event-argument\nannotations. We used three types of event arguments: $agent$, $location$, and\n$date$, which we annotated as relation types. Our inter-annotator agreement\nevaluation resulted in $82.23\\%$ $Kappa$ score and $87.2\\%$ $F_1$-score.\nAdditionally, we propose a novel method for event relation extraction using\nBERT, in which we treat the task as text entailment. This method achieves an\n$F_1$-score of $94.01\\%$. To further evaluate the generalization of our\nproposed method, we collected and annotated another out-of-domain corpus (about\n$80$k tokens) called \\testNLI and used it as a second test set, on which our\napproach achieved promising results ($83.59\\%$ $F_1$-score). Last but not\nleast, we propose an end-to-end system for event-arguments extraction. This\nsystem is implemented as part of SinaTools, and both corpora are publicly\navailable at {\\small \\url{https://sina.birzeit.edu/wojood}}\n","authors":["Alaa Aljabari","Lina Duaibes","Mustafa Jarrar","Mohammed Khalilia"],"pdf_url":"https://arxiv.org/pdf/2407.21153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21139v1","updated":"2024-07-30T19:03:03Z","published":"2024-07-30T19:03:03Z","title":"Enhancing Semantic Similarity Understanding in Arabic NLP with Nested\n Embedding Learning","summary":" This work presents a novel framework for training Arabic nested embedding\nmodels through Matryoshka Embedding Learning, leveraging multilingual,\nArabic-specific, and English-based models, to highlight the power of nested\nembeddings models in various Arabic NLP downstream tasks. Our innovative\ncontribution includes the translation of various sentence similarity datasets\ninto Arabic, enabling a comprehensive evaluation framework to compare these\nmodels across different dimensions. We trained several nested embedding models\non the Arabic Natural Language Inference triplet dataset and assessed their\nperformance using multiple evaluation metrics, including Pearson and Spearman\ncorrelations for cosine similarity, Manhattan distance, Euclidean distance, and\ndot product similarity. The results demonstrate the superior performance of the\nMatryoshka embedding models, particularly in capturing semantic nuances unique\nto the Arabic language. 
Results demonstrated that Arabic Matryoshka embedding\nmodels have superior performance in capturing semantic nuances unique to the\nArabic language, significantly outperforming traditional models by up to\n20-25\\% across various similarity metrics. These results underscore the\neffectiveness of language-specific training and highlight the potential of\nMatryoshka models in enhancing semantic textual similarity tasks for Arabic\nNLP.\n","authors":["Omer Nacar","Anis Koubaa"],"pdf_url":"https://arxiv.org/pdf/2407.21139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08753v4","updated":"2024-07-30T18:58:01Z","published":"2023-10-12T22:43:38Z","title":"CompA: Addressing the Gap in Compositional Reasoning in Audio-Language\n Models","summary":" A fundamental characteristic of audio is its compositional nature.\nAudio-language models (ALMs) trained using a contrastive approach (e.g., CLAP)\nthat learns a shared representation between audio and language modalities have\nimproved performance in many downstream applications, including zero-shot audio\nclassification, audio retrieval, etc. However, the ability of these models to\neffectively perform compositional reasoning remains largely unexplored and\nnecessitates additional research. In this paper, we propose CompA, a collection\nof two expert-annotated benchmarks with a majority of real-world audio samples,\nto evaluate compositional reasoning in ALMs. Our proposed CompA-order evaluates\nhow well an ALM understands the order or occurrence of acoustic events in\naudio, and CompA-attribute evaluates attribute-binding of acoustic events. An\ninstance from either benchmark consists of two audio-caption pairs, where both\naudios have the same acoustic events but with different compositions. An ALM is\nevaluated on how well it matches the right audio to the right caption. Using\nthis benchmark, we first show that current ALMs perform only marginally better\nthan random chance, thereby struggling with compositional reasoning. Next, we\npropose CompA-CLAP, where we fine-tune CLAP using a novel learning method to\nimprove its compositional reasoning abilities. To train CompA-CLAP, we first\npropose improvements to contrastive training with composition-aware hard\nnegatives, allowing for more focused training. Next, we propose a novel modular\ncontrastive loss that helps the model learn fine-grained compositional\nunderstanding and overcomes the acute scarcity of openly available\ncompositional audios. CompA-CLAP significantly improves over all our baseline\nmodels on the CompA benchmark, indicating its superior compositional reasoning\ncapabilities.\n","authors":["Sreyan Ghosh","Ashish Seth","Sonal Kumar","Utkarsh Tyagi","Chandra Kiran Evuru","S. Ramaneswaran","S. Sakshi","Oriol Nieto","Ramani Duraiswami","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2310.08753v4.pdf","comment":"ICLR 2024. Project Page: https://sreyan88.github.io/compa_iclr/"},{"id":"http://arxiv.org/abs/2407.17477v2","updated":"2024-07-30T18:47:31Z","published":"2024-07-01T17:20:37Z","title":"Toward Automated Detection of Biased Social Signals from the Content of\n Clinical Conversations","summary":" Implicit bias can impede patient-provider interactions and lead to inequities\nin care. Raising awareness is key to reducing such bias, but its manifestations\nin the social dynamics of patient-provider communication are difficult to\ndetect. 
In this study, we used automated speech recognition (ASR) and natural\nlanguage processing (NLP) to identify social signals in patient-provider\ninteractions. We built an automated pipeline to predict social signals from\naudio recordings of 782 primary care visits that achieved 90.1% average\naccuracy across codes, and exhibited fairness in its predictions for white and\nnon-white patients. Applying this pipeline, we identified statistically\nsignificant differences in provider communication behavior toward white versus\nnon-white patients. In particular, providers expressed more patient-centered\nbehaviors towards white patients including more warmth, engagement, and\nattentiveness. Our study underscores the potential of automated tools in\nidentifying subtle communication signals that may be linked with bias and\nimpact healthcare quality and equity.\n","authors":["Feng Chen","Manas Satish Bedmutha","Ray-Yuan Chung","Janice Sabin","Wanda Pratt","Brian R. Wood","Nadir Weibel","Andrea L. Hartzler","Trevor Cohen"],"pdf_url":"https://arxiv.org/pdf/2407.17477v2.pdf","comment":"Accepted by AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2402.11359v4","updated":"2024-07-30T18:22:00Z","published":"2024-02-17T18:31:21Z","title":"Offline Training of Language Model Agents with Functions as Learnable\n Weights","summary":" Researchers and practitioners have recently reframed powerful Large Language\nModels (LLMs) as agents, enabling them to automate complex tasks largely via\nthe use of specialized functions. To facilitate the development of LLM agents,\nwe present a novel paradigm of training LLM agents without modifying the LLM\nweights, which is particularly useful when the LLMs are difficult or\ninaccessible for modifications. Inspired by how humans continuously forge tools\nto adapt to real-world tasks, rather than change our biological structure to\nfit a static set of tools, we propose to progressively forge agent's functions\nto better solve the downstream tasks instead of modifying the LLM weights. By\ntreating the functions as learnable `agent parameters' and leveraging the\nfundamental idea of model training in artificial intelligence, we develop\nAgentOptimizer that employs the LLM to update agents' functions and devise an\nagent training algorithm with two strategies, roll-back, and early-stop, to\nstreamline the training process. With extensive experiments, we showcase that\nthe agent training paradigm could significantly improve the performance of\nrepresentative LLM agents in various downstream tasks. We also study the\nbehavior of the agent training regarding aspects like the learning curve and\ndomain transferability.\n","authors":["Shaokun Zhang","Jieyu Zhang","Jiale Liu","Linxin Song","Chi Wang","Ranjay Krishna","Qingyun Wu"],"pdf_url":"https://arxiv.org/pdf/2402.11359v4.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.10777v2","updated":"2024-07-30T18:19:09Z","published":"2024-06-16T02:08:49Z","title":"RoseLoRA: Row and Column-wise Sparse Low-rank Adaptation of Pre-trained\n Language Model for Knowledge Editing and Fine-tuning","summary":" Pre-trained language models, trained on large-scale corpora, demonstrate\nstrong generalizability across various NLP tasks. Fine-tuning these models for\nspecific tasks typically involves updating all parameters, which is\nresource-intensive. Parameter-efficient fine-tuning (PEFT) methods, such as the\npopular LoRA family, introduce low-rank matrices to learn only a few parameters\nefficiently. 
However, during inference, the product of these matrices updates\nall pre-trained parameters, complicating tasks like knowledge editing that\nrequire selective updates. We propose a novel PEFT method, which conducts\n\\textbf{r}ow and c\\textbf{o}lumn-wise spar\\textbf{se}\n\\textbf{lo}w-\\textbf{r}ank \\textbf{a}daptation (RoseLoRA), to address this\nchallenge. RoseLoRA identifies and updates only the most important parameters\nfor a specific task, maintaining efficiency while preserving other model\nknowledge. By adding a sparsity constraint on the product of low-rank matrices\nand converting it to row and column-wise sparsity, we ensure efficient and\nprecise model updates. Our theoretical analysis guarantees the lower bound of\nthe sparsity with respective to the matrix product. Extensive experiments on\nfive benchmarks across twenty datasets demonstrate that RoseLoRA outperforms\nbaselines in both general fine-tuning and knowledge editing tasks.\n","authors":["Haoyu Wang","Tianci Liu","Ruirui Li","Monica Cheng","Tuo Zhao","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2406.10777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21092v1","updated":"2024-07-30T17:11:15Z","published":"2024-07-30T17:11:15Z","title":"Entropy, Thermodynamics and the Geometrization of the Language Model","summary":" In this paper, we discuss how pure mathematics and theoretical physics can be\napplied to the study of language models. Using set theory and analysis, we\nformulate mathematically rigorous definitions of language models, and introduce\nthe concept of the moduli space of distributions for a language model. We\nformulate a generalized distributional hypothesis using functional analysis and\ntopology. We define the entropy function associated with a language model and\nshow how it allows us to understand many interesting phenomena in languages. We\nargue that the zero points of the entropy function and the points where the\nentropy is close to 0 are the key obstacles for an LLM to approximate an\nintelligent language model, which explains why good LLMs need billions of\nparameters. Using the entropy function, we formulate a conjecture about AGI.\n Then, we show how thermodynamics gives us an immediate interpretation to\nlanguage models. In particular we will define the concepts of partition\nfunction, internal energy and free energy for a language model, which offer\ninsights into how language models work. Based on these results, we introduce a\ngeneral concept of the geometrization of language models and define what is\ncalled the Boltzmann manifold. While the current LLMs are the special cases of\nthe Boltzmann manifold.\n","authors":["Wenzhe Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21092v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2407.21082v1","updated":"2024-07-30T07:58:28Z","published":"2024-07-30T07:58:28Z","title":"Accelerating Large Language Model Inference with Self-Supervised Early\n Exits","summary":" This paper presents a novel technique for accelerating inference in large,\npre-trained language models (LLMs) by introducing early exits during inference.\nThe computational demands of these models, used across a wide range of\napplications, can be substantial. By capitalizing on the inherent variability\nin token complexity, our approach enables selective acceleration of the\ninference process. Specifically, we propose the integration of early exit\n''heads'' atop existing transformer layers, which facilitate conditional\nterminations based on a confidence metric. 
These heads are trained in a\nself-supervised manner using the model's own predictions as training data,\nthereby eliminating the need for additional annotated data. The confidence\nmetric, established using a calibration set, ensures a desired level of\naccuracy while enabling early termination when confidence exceeds a\npredetermined threshold. Notably, our method preserves the original accuracy\nand reduces computational time on certain tasks, leveraging the existing\nknowledge of pre-trained LLMs without requiring extensive retraining. This\nlightweight, modular modification has the potential to greatly enhance the\npractical usability of LLMs, particularly in applications like real-time\nlanguage processing in resource-constrained environments.\n","authors":["Florian Valade"],"pdf_url":"https://arxiv.org/pdf/2407.21082v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.21017v1","updated":"2024-07-30T17:58:52Z","published":"2024-07-30T17:58:52Z","title":"Matting by Generation","summary":" This paper introduces an innovative approach for image matting that redefines\nthe traditional regression-based task as a generative modeling challenge. Our\nmethod harnesses the capabilities of latent diffusion models, enriched with\nextensive pre-trained knowledge, to regularize the matting process. We present\nnovel architectural innovations that empower our model to produce mattes with\nsuperior resolution and detail. The proposed method is versatile and can\nperform both guidance-free and guidance-based image matting, accommodating a\nvariety of additional cues. Our comprehensive evaluation across three benchmark\ndatasets demonstrates the superior performance of our approach, both\nquantitatively and qualitatively. The results not only reflect our method's\nrobust effectiveness but also highlight its ability to generate visually\ncompelling mattes that approach photorealistic quality. The project page for\nthis paper is available at\nhttps://lightchaserx.github.io/matting-by-generation/\n","authors":["Zhixiang Wang","Baiang Li","Jian Wang","Yu-Lun Liu","Jinwei Gu","Yung-Yu Chuang","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2407.21017v1.pdf","comment":"SIGGRAPH'24, Project page:\n https://lightchaserx.github.io/matting-by-generation/"},{"id":"http://arxiv.org/abs/2407.21016v1","updated":"2024-07-30T17:58:13Z","published":"2024-07-30T17:58:13Z","title":"Add-SD: Rational Generation without Manual Reference","summary":" Diffusion models have exhibited remarkable prowess in visual generalization.\nBuilding on this success, we introduce an instruction-based object addition\npipeline, named Add-SD, which automatically inserts objects into realistic\nscenes with rational sizes and positions. Different from layout-conditioned\nmethods, Add-SD is solely conditioned on simple text prompts rather than any\nother human-costly references like bounding boxes. Our work contributes in\nthree aspects: proposing a dataset containing numerous instructed image pairs;\nfine-tuning a diffusion model for rational generation; and generating synthetic\ndata to boost downstream tasks. The first aspect involves creating a\nRemovalDataset consisting of original-edited image pairs with textual\ninstructions, where an object has been removed from the original image while\nmaintaining strong pixel consistency in the background. These data pairs are\nthen used for fine-tuning the Stable Diffusion (SD) model. 
Subsequently, the\npretrained Add-SD model allows for the insertion of expected objects into an\nimage with good rationale. Additionally, we generate synthetic instances for\ndownstream task datasets at scale, particularly for tail classes, to alleviate\nthe long-tailed problem. Downstream tasks benefit from the enriched dataset\nwith enhanced diversity and rationale. Experiments on LVIS val demonstrate that\nAdd-SD yields an improvement of 4.3 mAP on rare classes over the baseline. Code\nand models are available at https://github.com/ylingfeng/Add-SD.\n","authors":["Lingfeng Yang","Xinyu Zhang","Xiang Li","Jinwen Chen","Kun Yao","Gang Zhang","Errui Ding","Lingqiao Liu","Jingdong Wang","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21011v1","updated":"2024-07-30T17:57:32Z","published":"2024-07-30T17:57:32Z","title":"CLEFT: Language-Image Contrastive Learning with Efficient Large Language\n Model and Prompt Fine-Tuning","summary":" Recent advancements in Contrastive Language-Image Pre-training (CLIP) have\ndemonstrated notable success in self-supervised representation learning across\nvarious tasks. However, the existing CLIP-like approaches often demand\nextensive GPU resources and prolonged training times due to the considerable\nsize of the model and dataset, making them poor for medical applications, in\nwhich large datasets are not always common. Meanwhile, the language model\nprompts are mainly manually derived from labels tied to images, potentially\noverlooking the richness of information within training samples. We introduce a\nnovel language-image Contrastive Learning method with an Efficient large\nlanguage model and prompt Fine-Tuning (CLEFT) that harnesses the strengths of\nthe extensive pre-trained language and visual models. Furthermore, we present\nan efficient strategy for learning context-based prompts that mitigates the gap\nbetween informative clinical diagnostic data and simple class labels. Our\nmethod demonstrates state-of-the-art performance on multiple chest X-ray and\nmammography datasets compared with various baselines. The proposed parameter\nefficient framework can reduce the total trainable model size by 39% and reduce\nthe trainable language model to only 4% compared with the current BERT encoder.\n","authors":["Yuexi Du","Brian Chang","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2407.21011v1.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.21004v1","updated":"2024-07-30T17:51:44Z","published":"2024-07-30T17:51:44Z","title":"Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models\n for Hateful Meme Detection","summary":" Recent advances show that two-stream approaches have achieved outstanding\nperformance in hateful meme detection. However, hateful memes constantly evolve\nas new memes emerge by fusing progressive cultural ideas, making existing\nmethods obsolete or ineffective. In this work, we explore the potential of\nLarge Multimodal Models (LMMs) for hateful meme detection. To this end, we\npropose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE)\nPrompting, by integrating the evolution attribute and in-context information of\nmemes. Specifically, Evolver simulates the evolving and expressing process of\nmemes and reasons through LMMs in a step-by-step manner. First, an evolutionary\npair mining module retrieves the top-k most similar memes in the external\ncurated meme set with the input meme. 
Second, an evolutionary information\nextractor is designed to summarize the semantic regularities between the paired\nmemes for prompting. Finally, a contextual relevance amplifier enhances the\nin-context hatefulness information to boost the search for evolutionary\nprocesses. Extensive experiments on public FHM, MAMI, and HarM datasets show\nthat CoE prompting can be incorporated into existing LMMs to improve their\nperformance. More encouragingly, it can serve as an interpretive tool to\npromote the understanding of the evolution of social memes.\n","authors":["Jinfa Huang","Jinsheng Pan","Zhongwei Wan","Hanjia Lyu","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.21004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21002v1","updated":"2024-07-30T17:49:21Z","published":"2024-07-30T17:49:21Z","title":"XHand: Real-time Expressive Hand Avatar","summary":" Hand avatars play a pivotal role in a wide array of digital interfaces,\nenhancing user immersion and facilitating natural interaction within virtual\nenvironments. While previous studies have focused on photo-realistic hand\nrendering, little attention has been paid to reconstruct the hand geometry with\nfine details, which is essential to rendering quality. In the realms of\nextended reality and gaming, on-the-fly rendering becomes imperative. To this\nend, we introduce an expressive hand avatar, named XHand, that is designed to\ncomprehensively generate hand shape, appearance, and deformations in real-time.\nTo obtain fine-grained hand meshes, we make use of three feature embedding\nmodules to predict hand deformation displacements, albedo, and linear blending\nskinning weights, respectively. To achieve photo-realistic hand rendering on\nfine-grained meshes, our method employs a mesh-based neural renderer by\nleveraging mesh topological consistency and latent codes from embedding\nmodules. During training, a part-aware Laplace smoothing strategy is proposed\nby incorporating the distinct levels of regularization to effectively maintain\nthe necessary details and eliminate the undesired artifacts. The experimental\nevaluations on InterHand2.6M and DeepHandMesh datasets demonstrate the efficacy\nof XHand, which is able to recover high-fidelity geometry and texture for hand\nanimations across diverse poses in real-time. To reproduce our results, we will\nmake the full implementation publicly available at\nhttps://github.com/agnJason/XHand.\n","authors":["Qijun Gan","Zijie Zhou","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.21002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21001v1","updated":"2024-07-30T17:46:06Z","published":"2024-07-30T17:46:06Z","title":"GABInsight: Exploring Gender-Activity Binding Bias in Vision-Language\n Models","summary":" Vision-language models (VLMs) are intensively used in many downstream tasks,\nincluding those requiring assessments of individuals appearing in the images.\nWhile VLMs perform well in simple single-person scenarios, in real-world\napplications, we often face complex situations in which there are persons of\ndifferent genders doing different activities. We show that in such cases, VLMs\nare biased towards identifying the individual with the expected gender\n(according to ingrained gender stereotypes in the model or other forms of\nsample selection bias) as the performer of the activity. 
We refer to this bias\nin associating an activity with the gender of its actual performer in an image\nor text as the Gender-Activity Binding (GAB) bias and analyze how this bias is\ninternalized in VLMs. To assess this bias, we have introduced the GAB dataset\nwith approximately 5500 AI-generated images that represent a variety of\nactivities, addressing the scarcity of real-world images for some scenarios. To\nhave extensive quality control, the generated images are evaluated for their\ndiversity, quality, and realism. We have tested 12 renowned pre-trained VLMs on\nthis dataset in the context of text-to-image and image-to-text retrieval to\nmeasure the effect of this bias on their predictions. Additionally, we have\ncarried out supplementary experiments to quantify the bias in VLMs' text\nencoders and to evaluate VLMs' capability to recognize activities. Our\nexperiments indicate that VLMs experience an average performance decline of\nabout 13.2% when confronted with gender-activity binding bias.\n","authors":["Ali Abdollahi","Mahdi Ghaznavi","Mohammad Reza Karimi Nejad","Arash Mari Oriyad","Reza Abbasi","Ali Salesi","Melika Behjati","Mohammad Hossein Rohban","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2407.21001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20990v1","updated":"2024-07-30T17:27:20Z","published":"2024-07-30T17:27:20Z","title":"From Feature Importance to Natural Language Explanations Using LLMs with\n RAG","summary":" As machine learning becomes increasingly integral to autonomous\ndecision-making processes involving human interaction, the necessity of\ncomprehending the model's outputs through conversational means increases. Most\nrecently, foundation models are being explored for their potential as post hoc\nexplainers, providing a pathway to elucidate the decision-making mechanisms of\npredictive models. In this work, we introduce traceable question-answering,\nleveraging an external knowledge repository to inform the responses of Large\nLanguage Models (LLMs) to user queries within a scene understanding task. This\nknowledge repository comprises contextual details regarding the model's output,\ncontaining high-level features, feature importance, and alternative\nprobabilities. We employ subtractive counterfactual reasoning to compute\nfeature importance, a method that entails analysing output variations resulting\nfrom decomposing semantic features. Furthermore, to maintain a seamless\nconversational flow, we integrate four key characteristics - social, causal,\nselective, and contrastive - drawn from social science research on human\nexplanations into a single-shot prompt, guiding the response generation\nprocess. Our evaluation demonstrates that explanations generated by the LLMs\nencompassed these elements, indicating its potential to bridge the gap between\ncomplex model outputs and natural language expressions.\n","authors":["Sule Tekkesinoglu","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2407.20990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19985v2","updated":"2024-07-30T17:26:22Z","published":"2024-07-29T13:19:31Z","title":"Mixture of Nested Experts: Adaptive Processing of Visual Tokens","summary":" The visual medium (images and videos) naturally contains a large amount of\ninformation redundancy, thereby providing a great opportunity for leveraging\nefficiency in processing. 
While Vision Transformer (ViT) based models scale\neffectively to large data regimes, they fail to capitalize on this inherent\nredundancy, leading to higher computational costs. Mixture of Experts (MoE)\nnetworks demonstrate scalability while maintaining same inference-time costs,\nbut they come with a larger parameter footprint. We present Mixture of Nested\nExperts (MoNE), which utilizes a nested structure for experts, wherein\nindividual experts fall on an increasing compute-accuracy curve. Given a\ncompute budget, MoNE learns to dynamically choose tokens in a priority order,\nand thus redundant tokens are processed through cheaper nested experts. Using\nthis framework, we achieve equivalent performance as the baseline models, while\nreducing inference time compute by over two-fold. We validate our approach on\nstandard image and video datasets - ImageNet-21K, Kinetics400, and\nSomething-Something-v2. We further highlight MoNE$'$s adaptability by\nshowcasing its ability to maintain strong performance across different\ninference-time compute budgets on videos, using only a single trained model.\n","authors":["Gagan Jain","Nidhi Hegde","Aditya Kusupati","Arsha Nagrani","Shyamal Buch","Prateek Jain","Anurag Arnab","Sujoy Paul"],"pdf_url":"https://arxiv.org/pdf/2407.19985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20987v1","updated":"2024-07-30T17:21:32Z","published":"2024-07-30T17:21:32Z","title":"PIXELMOD: Improving Soft Moderation of Visual Misleading Information on\n Twitter","summary":" Images are a powerful and immediate vehicle to carry misleading or outright\nfalse messages, yet identifying image-based misinformation at scale poses\nunique challenges. In this paper, we present PIXELMOD, a system that leverages\nperceptual hashes, vector databases, and optical character recognition (OCR) to\nefficiently identify images that are candidates to receive soft moderation\nlabels on Twitter. We show that PIXELMOD outperforms existing image similarity\napproaches when applied to soft moderation, with negligible performance\noverhead. We then test PIXELMOD on a dataset of tweets surrounding the 2020 US\nPresidential Election, and find that it is able to identify visually misleading\nimages that are candidates for soft moderation with 0.99% false detection and\n2.06% false negatives.\n","authors":["Pujan Paudel","Chen Ling","Jeremy Blackburn","Gianluca Stringhini"],"pdf_url":"https://arxiv.org/pdf/2407.20987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11272v2","updated":"2024-07-30T16:50:09Z","published":"2024-07-15T22:39:03Z","title":"Differentiable Voxelization and Mesh Morphing","summary":" In this paper, we propose the differentiable voxelization of 3D meshes via\nthe winding number and solid angles. The proposed approach achieves fast,\nflexible, and accurate voxelization of 3D meshes, admitting the computation of\ngradients with respect to the input mesh and GPU acceleration. We further\ndemonstrate the application of the proposed voxelization in mesh morphing,\nwhere the voxelized mesh is deformed by a neural network. 
The proposed method\nis evaluated on the ShapeNet dataset and achieves state-of-the-art performance\nin terms of both accuracy and efficiency.\n","authors":["Yihao Luo","Yikai Wang","Zhengrui Xiang","Yuliang Xiu","Guang Yang","ChoonHwai Yap"],"pdf_url":"https://arxiv.org/pdf/2407.11272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20962v1","updated":"2024-07-30T16:43:24Z","published":"2024-07-30T16:43:24Z","title":"MMTrail: A Multimodal Trailer Video Dataset with Language and Music\n Descriptions","summary":" Massive multi-modality datasets play a significant role in facilitating the\nsuccess of large video-language models. However, current video-language\ndatasets primarily provide text descriptions for visual frames, considering\naudio to be weakly related information. They usually overlook exploring the\npotential of inherent audio-visual correlation, leading to monotonous\nannotation within each modality instead of comprehensive and precise\ndescriptions. Such ignorance results in the difficulty of multiple\ncross-modality studies. To fulfill this gap, we present MMTrail, a large-scale\nmulti-modality video-language dataset incorporating more than 20M trailer clips\nwith visual captions, and 2M high-quality clips with multimodal captions.\nTrailers preview full-length video works and integrate context, visual frames,\nand background music. In particular, the trailer has two main advantages: (1)\nthe topics are diverse, and the content characters are of various types, e.g.,\nfilm, news, and gaming. (2) the corresponding background music is\ncustom-designed, making it more coherent with the visual context. Upon these\ninsights, we propose a systemic captioning framework, achieving various\nmodality annotations with more than 27.1k hours of trailer videos. Here, to\nensure the caption retains music perspective while preserving the authority of\nvisual context, we leverage the advanced LLM to merge all annotations\nadaptively. In this fashion, our MMtrail dataset potentially paves the path for\nfine-grained large multimodal-language model training. In experiments, we\nprovide evaluation metrics and benchmark results on our dataset, demonstrating\nthe high quality of our annotation and its effectiveness for model training.\n","authors":["Xiaowei Chi","Yatian Wang","Aosong Cheng","Pengjun Fang","Zeyue Tian","Yingqing He","Zhaoyang Liu","Xingqun Qi","Jiahao Pan","Rongyu Zhang","Mengfei Li","Ruibin Yuan","Yanbing Jiang","Wei Xue","Wenhan Luo","Qifeng Chen","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2407.20962v1.pdf","comment":"15 Pages. Dataset report"},{"id":"http://arxiv.org/abs/2407.09230v2","updated":"2024-07-30T16:40:23Z","published":"2024-07-12T12:49:11Z","title":"Surgical Text-to-Image Generation","summary":" Acquiring surgical data for research and development is significantly\nhindered by high annotation costs and practical and ethical constraints.\nUtilizing synthetically generated images could offer a valuable alternative. In\nthis work, we explore adapting text-to-image generative models for the surgical\ndomain using the CholecT50 dataset, which provides surgical images annotated\nwith action triplets (instrument, verb, target). We investigate several\nlanguage models and find T5 to offer more distinct features for differentiating\nsurgical actions on triplet-based textual inputs, and showcasing stronger\nalignment between long and triplet-based captions. 
To address challenges in\ntraining text-to-image models solely on triplet-based captions without\nadditional inputs and supervisory signals, we discover that triplet text\nembeddings are instrument-centric in the latent space. Leveraging this insight,\nwe design an instrument-based class balancing technique to counteract data\nimbalance and skewness, improving training convergence. Extending Imagen, a\ndiffusion-based generative model, we develop Surgical Imagen to generate\nphotorealistic and activity-aligned surgical images from triplet-based textual\nprompts. We assess the model on quality, alignment, reasoning, and knowledge,\nachieving FID and CLIP scores of 3.7 and 26.8% respectively. Human expert\nsurvey shows that participants were highly challenged by the realistic\ncharacteristics of the generated samples, demonstrating Surgical Imagen's\neffectiveness as a practical alternative to real data collection.\n","authors":["Chinedu Innocent Nwoye","Rupak Bose","Kareem Elgohary","Lorenzo Arboit","Giorgio Carlino","Joël L. Lavanchy","Pietro Mascagni","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2407.09230v2.pdf","comment":"11 pages, 11 figures, 3 tables, project page at\n https://camma-public.github.io/endogen/"},{"id":"http://arxiv.org/abs/2407.20959v1","updated":"2024-07-30T16:36:15Z","published":"2024-07-30T16:36:15Z","title":"Learning Ordinality in Semantic Segmentation","summary":" Semantic segmentation consists of predicting a semantic label for each image\npixel. Conventional deep learning models do not take advantage of ordinal\nrelations that might exist in the domain at hand. For example, it is known that\nthe pupil is inside the iris, and the lane markings are inside the road. Such\ndomain knowledge can be employed as constraints to make the model more robust.\nThe current literature on this topic has explored pixel-wise ordinal\nsegmentation methods, which treat each pixel as an independent observation and\npromote ordinality in its representation. This paper proposes novel spatial\nordinal segmentation methods, which take advantage of the structured image\nspace by considering each pixel as an observation dependent on its neighborhood\ncontext to also promote ordinal spatial consistency. When evaluated with five\nbiomedical datasets and multiple configurations of autonomous driving datasets,\nordinal methods resulted in more ordinally-consistent models, with substantial\nimprovements in ordinal metrics and some increase in the Dice coefficient. It\nwas also shown that the incorporation of ordinal consistency results in models\nwith better generalization abilities.\n","authors":["Rafael Cristino","Ricardo P. M. Cruz","Jaime S. Cardoso"],"pdf_url":"https://arxiv.org/pdf/2407.20959v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2407.20950v1","updated":"2024-07-30T16:27:51Z","published":"2024-07-30T16:27:51Z","title":"dopanim: A Dataset of Doppelganger Animals with Noisy Annotations from\n Multiple Humans","summary":" Human annotators typically provide annotated data for training machine\nlearning models, such as neural networks. Yet, human annotations are subject to\nnoise, impairing generalization performances. Methodological research on\napproaches counteracting noisy annotations requires corresponding datasets for\na meaningful empirical evaluation. Consequently, we introduce a novel benchmark\ndataset, dopanim, consisting of about 15,750 animal images of 15 classes with\nground truth labels. 
For approximately 10,500 of these images, 20 humans\nprovided over 52,000 annotations with an accuracy of circa 67%. Its key\nattributes include (1) the challenging task of classifying doppelganger\nanimals, (2) human-estimated likelihoods as annotations, and (3) annotator\nmetadata. We benchmark well-known multi-annotator learning approaches using\nseven variants of this dataset and outline further evaluation use cases such as\nlearning beyond hard class labels and active learning. Our dataset and a\ncomprehensive codebase are publicly available to emulate the data collection\nprocess and to reproduce all empirical results.\n","authors":["Marek Herde","Denis Huseljic","Lukas Rauch","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2407.20950v1.pdf","comment":"Under review @ NeurIPS 2024 (Datasets and Benchmarks Track)"},{"id":"http://arxiv.org/abs/2407.20937v1","updated":"2024-07-30T16:19:14Z","published":"2024-07-30T16:19:14Z","title":"EAR: Edge-Aware Reconstruction of 3-D vertebrae structures from\n bi-planar X-ray images","summary":" X-ray images ease the diagnosis and treatment process due to their rapid\nimaging speed and high resolution. However, due to the projection process of\nX-ray imaging, much spatial information has been lost. To accurately provide\nefficient spinal morphological and structural information, reconstructing the\n3-D structures of the spine from the 2-D X-ray images is essential. It is\nchallenging for current reconstruction methods to preserve the edge information\nand local shapes of the asymmetrical vertebrae structures. In this study, we\npropose a new Edge-Aware Reconstruction network (EAR) to focus on the\nperformance improvement of the edge information and vertebrae shapes. In our\nnetwork, by using the auto-encoder architecture as the backbone, the edge\nattention module and frequency enhancement module are proposed to strengthen\nthe perception of the edge reconstruction. Meanwhile, we also combine four loss\nterms, including reconstruction loss, edge loss, frequency loss and projection\nloss. The proposed method is evaluated using three publicly accessible datasets\nand compared with four state-of-the-art models. The proposed method is superior\nto other methods and achieves 25.32%, 15.32%, 86.44%, 80.13%, 23.7612 and\n0.3014 with regard to MSE, MAE, Dice, SSIM, PSNR and frequency distance. Due to\nthe end-to-end and accurate reconstruction process, EAR can provide sufficient\n3-D spatial information and precise preoperative surgical planning guidance.\n","authors":["Lixing Tan","Shuang Song","Yaofeng He","Kangneng Zhou","Tong Lu","Ruoxiu Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.20937v1.pdf","comment":"13 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.02209v3","updated":"2024-07-30T16:16:45Z","published":"2024-02-03T16:45:31Z","title":"On the Exploitation of DCT-Traces in the Generative-AI Domain","summary":" Deepfakes represent one of the toughest challenges in the world of\nCybersecurity and Digital Forensics, especially considering the high-quality\nresults obtained with recent generative AI-based solutions. Almost all\ngenerative models leave unique traces in synthetic data that, if analyzed and\nidentified in detail, can be exploited to improve the generalization\nlimitations of existing deepfake detectors. 
In this paper we analyzed deepfake\nimages in the frequency domain generated by both GAN and Diffusion Model\nengines, examining in detail the underlying statistical distribution of\nDiscrete Cosine Transform (DCT) coefficients. Recognizing that not all\ncoefficients contribute equally to image detection, we hypothesize the\nexistence of a unique ``discriminative fingerprint\", embedded in specific\ncombinations of coefficients. To identify them, Machine Learning classifiers\nwere trained on various combinations of coefficients. In addition, the\nExplainable AI (XAI) LIME algorithm was used to search for intrinsic\ndiscriminative combinations of coefficients. Finally, we performed a robustness\ntest to analyze the persistence of traces by applying JPEG compression. The\nexperimental results reveal the existence of traces left by the generative\nmodels that are more discriminative and persistent at JPEG attacks. Code and\ndataset are available at https://github.com/opontorno/dcts_analysis_deepfakes.\n","authors":["Orazio Pontorno","Luca Guarnera","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2402.02209v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13307v3","updated":"2024-07-30T16:11:48Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Inspired\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as a vision-and-language\ninteraction of biomedical fields. Previous works inherited the ideology of\ntraditional language generation tasks, aiming to generate paragraphs with high\nreadability as reports. Despite significant progress, the independence between\ndiseases-a specific property of RRG-was neglected, yielding the models being\nconfused by the co-occurrence of diseases brought on by the biased data\ndistribution, thus generating inaccurate reports. In this paper, to rethink\nthis issue, we first model the causal effects between the variables from a\ncausal perspective, through which we prove that the co-occurrence relationships\nbetween diseases on the biased distribution function as confounders, confusing\nthe accuracy through two backdoor paths, i.e. the Joint Vision Coupling and the\nConditional Sequential Coupling. Then, we proposed a novel model-agnostic\ncounterfactual augmentation method that contains two strategies, i.e. the\nPrototype-based Counterfactual Sample Synthesis (P-CSS) and the Magic-Cube-like\nCounterfactual Report Reconstruction (Cube), to intervene the backdoor paths,\nthus enhancing the accuracy and generalization of RRG models. Experimental\nresults on the widely used MIMIC-CXR dataset demonstrate the effectiveness of\nour proposed method. Additionally, a generalization performance is evaluated on\nIU X-Ray dataset, which verifies our work can effectively reduce the impact of\nco-occurrences caused by different distributions on the results.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Yan Liu","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v3.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2407.20928v1","updated":"2024-07-30T16:06:39Z","published":"2024-07-30T16:06:39Z","title":"UniProcessor: A Text-induced Unified Low-level Image Processor","summary":" Image processing, including image restoration, image enhancement, etc.,\ninvolves generating a high-quality clean image from a degraded input. 
Deep\nlearning-based methods have shown superior performance for various image\nprocessing tasks in terms of single-task conditions. However, they require to\ntrain separate models for different degradations and levels, which limits the\ngeneralization abilities of these models and restricts their applications in\nreal-world. In this paper, we propose a text-induced unified image processor\nfor low-level vision tasks, termed UniProcessor, which can effectively process\nvarious degradation types and levels, and support multimodal control.\nSpecifically, our UniProcessor encodes degradation-specific information with\nthe subject prompt and process degradations with the manipulation prompt. These\ncontext control features are injected into the UniProcessor backbone via\ncross-attention to control the processing procedure. For automatic\nsubject-prompt generation, we further build a vision-language model for\ngeneral-purpose low-level degradation perception via instruction tuning\ntechniques. Our UniProcessor covers 30 degradation types, and extensive\nexperiments demonstrate that our UniProcessor can well process these\ndegradations without additional training or tuning and outperforms other\ncompeting methods. Moreover, with the help of degradation-aware context\ncontrol, our UniProcessor first shows the ability to individually handle a\nsingle distortion in an image with multiple degradations.\n","authors":["Huiyu Duan","Xiongkuo Min","Sijing Wu","Wei Shen","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.20928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20920v1","updated":"2024-07-30T15:58:25Z","published":"2024-07-30T15:58:25Z","title":"SSPA: Split-and-Synthesize Prompting with Gated Alignments for\n Multi-Label Image Recognition","summary":" Multi-label image recognition is a fundamental task in computer vision.\nRecently, Vision-Language Models (VLMs) have made notable advancements in this\narea. However, previous methods fail to effectively leverage the rich knowledge\nin language models and often incorporate label semantics into visual features\nunidirectionally. To overcome these problems, we propose a Split-and-Synthesize\nPrompting with Gated Alignments (SSPA) framework to amplify the potential of\nVLMs. Specifically, we develop an in-context learning approach to associate the\ninherent knowledge from LLMs. Then we propose a novel Split-and-Synthesize\nPrompting (SSP) strategy to first model the generic knowledge and downstream\nlabel semantics individually and then aggregate them carefully through the\nquaternion network. Moreover, we present Gated Dual-Modal Alignments (GDMA) to\nbidirectionally interact visual and linguistic modalities while eliminating\nredundant cross-modal information, enabling more efficient region-level\nalignments. Rather than making the final prediction by a sharp manner in\nprevious works, we propose a soft aggregator to jointly consider results from\nall image regions. With the help of flexible prompting and gated alignments,\nSSPA is generalizable to specific domains. Extensive experiments on nine\ndatasets from three domains (i.e., natural, pedestrian attributes and remote\nsensing) demonstrate the state-of-the-art performance of SSPA. Further analyses\nverify the effectiveness of SSP and the interpretability of GDMA. The code will\nbe made public.\n","authors":["Hao Tan","Zichang Tan","Jun Li","Jun Wan","Zhen Lei","Stan Z. 
Li"],"pdf_url":"https://arxiv.org/pdf/2407.20920v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.09954v2","updated":"2024-07-30T15:54:49Z","published":"2023-09-18T17:26:22Z","title":"vSHARP: variable Splitting Half-quadratic Admm algorithm for\n Reconstruction of inverse-Problems","summary":" Medical Imaging (MI) tasks, such as accelerated parallel Magnetic Resonance\nImaging (MRI), often involve reconstructing an image from noisy or incomplete\nmeasurements. This amounts to solving ill-posed inverse problems, where a\nsatisfactory closed-form analytical solution is not available. Traditional\nmethods such as Compressed Sensing (CS) in MRI reconstruction can be\ntime-consuming or prone to obtaining low-fidelity images. Recently, a plethora\nof Deep Learning (DL) approaches have demonstrated superior performance in\ninverse-problem solving, surpassing conventional methods. In this study, we\npropose vSHARP (variable Splitting Half-quadratic ADMM algorithm for\nReconstruction of inverse Problems), a novel DL-based method for solving\nill-posed inverse problems arising in MI. vSHARP utilizes the Half-Quadratic\nVariable Splitting method and employs the Alternating Direction Method of\nMultipliers (ADMM) to unroll the optimization process. For data consistency,\nvSHARP unrolls a differentiable gradient descent process in the image domain,\nwhile a DL-based denoiser, such as a U-Net architecture, is applied to enhance\nimage quality. vSHARP also employs a dilated-convolution DL-based model to\npredict the Lagrange multipliers for the ADMM initialization. We evaluate\nvSHARP on tasks of accelerated parallel MRI Reconstruction using two distinct\ndatasets and on accelerated parallel dynamic MRI Reconstruction using another\ndataset. Our comparative analysis with state-of-the-art methods demonstrates\nthe superior performance of vSHARP in these applications.\n","authors":["George Yiasemis","Nikita Moriakov","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2309.09954v2.pdf","comment":"22 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.20917v1","updated":"2024-07-30T15:54:18Z","published":"2024-07-30T15:54:18Z","title":"How to Choose a Reinforcement-Learning Algorithm","summary":" The field of reinforcement learning offers a large variety of concepts and\nmethods to tackle sequential decision-making problems. This variety has become\nso large that choosing an algorithm for a task at hand can be challenging. In\nthis work, we streamline the process of choosing reinforcement-learning\nalgorithms and action-distribution families. We provide a structured overview\nof existing methods and their properties, as well as guidelines for when to\nchoose which methods. An interactive version of these guidelines is available\nonline at https://rl-picker.github.io/.\n","authors":["Fabian Bongratz","Vladimir Golkov","Lukas Mautner","Luca Della Libera","Frederik Heetmeyer","Felix Czaja","Julian Rodemann","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2407.20917v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2312.13604v2","updated":"2024-07-30T15:49:16Z","published":"2023-12-21T06:44:18Z","title":"Ponymation: Learning 3D Animal Motions from Unlabeled Online Videos","summary":" We introduce Ponymation, a new method for learning a generative model of\narticulated 3D animal motions from raw, unlabeled online videos. 
Unlike\nexisting approaches for motion synthesis, our model does not require any pose\nannotations or parametric shape models for training, and is learned purely from\na collection of raw video clips obtained from the Internet. We build upon a\nrecent work, MagicPony, which learns articulated 3D animal shapes purely from\nsingle image collections, and extend it on two fronts. First, instead of\ntraining on static images, we augment the framework with a video training\npipeline that incorporates temporal regularizations, achieving more accurate\nand temporally consistent reconstructions. Second, we learn a generative model\nof the underlying articulated 3D motion sequences via a spatio-temporal\ntransformer VAE, simply using 2D reconstruction losses without relying on any\nexplicit pose annotations. At inference time, given a single 2D image of a new\nanimal instance, our model reconstructs an articulated, textured 3D mesh, and\ngenerates plausible 3D animations by sampling from the learned motion latent\nspace.\n","authors":["Keqiang Sun","Dor Litvak","Yunzhi Zhang","Hongsheng Li","Jiajun Wu","Shangzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2312.13604v2.pdf","comment":"Project page: https://keqiangsun.github.io/projects/ponymation. The\n first two authors contributed equally to this work. The last two authors\n contributed equally"},{"id":"http://arxiv.org/abs/2403.15698v2","updated":"2024-07-30T15:41:41Z","published":"2024-03-23T03:23:29Z","title":"SceneX:Procedural Controllable Large-scale Scene Generation via\n Large-language Models","summary":" Due to its great application potential, large-scale scene generation has\ndrawn extensive attention in academia and industry. Recent research employs\npowerful generative models to create desired scenes and achieves promising\nresults. However, most of these methods represent the scene using 3D primitives\n(e.g. point cloud or radiance field) incompatible with the industrial pipeline,\nwhich leads to a substantial gap between academic research and industrial\ndeployment. Procedural Controllable Generation (PCG) is an efficient technique\nfor creating scalable and high-quality assets, but it is unfriendly for\nordinary users as it demands profound domain expertise. To address these\nissues, we resort to using the large language model (LLM) to drive the\nprocedural modeling. In this paper, we introduce a large-scale scene generation\nframework, SceneX, which can automatically produce high-quality procedural\nmodels according to designers' textual descriptions.Specifically, the proposed\nmethod comprises two components, PCGBench and PCGPlanner. The former\nencompasses an extensive collection of accessible procedural assets and\nthousands of hand-craft API documents. The latter aims to generate executable\nactions for Blender to produce controllable and precise 3D assets guided by the\nuser's instructions. Our SceneX can generate a city spanning 2.5 km times 2.5\nkm with delicate layout and geometric structures, drastically reducing the time\ncost from several weeks for professional PCG engineers to just a few hours for\nan ordinary user. 
Extensive experiments demonstrated the capability of our\nmethod in controllable large-scale scene generation and editing, including\nasset placement and season translation.\n","authors":["Mengqi Zhou","Yuxi Wang","Jun Hou","Chuanchen Luo","Zhaoxiang Zhang","Junran Peng"],"pdf_url":"https://arxiv.org/pdf/2403.15698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20908v1","updated":"2024-07-30T15:33:58Z","published":"2024-07-30T15:33:58Z","title":"Dynamic Scene Understanding through Object-Centric Voxelization and\n Neural Rendering","summary":" Learning object-centric representations from unsupervised videos is\nchallenging. Unlike most previous approaches that focus on decomposing 2D\nimages, we present a 3D generative model named DynaVol-S for dynamic scenes\nthat enables object-centric learning within a differentiable volume rendering\nframework. The key idea is to perform object-centric voxelization to capture\nthe 3D nature of the scene, which infers per-object occupancy probabilities at\nindividual spatial locations. These voxel features evolve through a\ncanonical-space deformation function and are optimized in an inverse rendering\npipeline with a compositional NeRF. Additionally, our approach integrates 2D\nsemantic features to create 3D semantic grids, representing the scene through\nmultiple disentangled voxel grids. DynaVol-S significantly outperforms existing\nmodels in both novel view synthesis and unsupervised decomposition tasks for\ndynamic scenes. By jointly considering geometric structures and semantic\nfeatures, it effectively addresses challenging real-world scenarios involving\ncomplex object interactions. Furthermore, once trained, the explicitly\nmeaningful voxel features enable additional capabilities that 2D scene\ndecomposition methods cannot achieve, such as novel scene generation through\nediting geometric shapes or manipulating the motion trajectories of objects.\n","authors":["Yanpeng Zhao","Yiwei Hao","Siyu Gao","Yunbo Wang","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.20908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20198v2","updated":"2024-07-30T15:33:41Z","published":"2024-07-29T17:24:52Z","title":"SpaER: Learning Spatio-temporal Equivariant Representations for Fetal\n Brain Motion Tracking","summary":" In this paper, we introduce SpaER, a pioneering method for fetal motion\ntracking that leverages equivariant filters and self-attention mechanisms to\neffectively learn spatio-temporal representations. Different from conventional\napproaches that statically estimate fetal brain motions from pairs of images,\nour method dynamically tracks the rigid movement patterns of the fetal head\nacross temporal and spatial dimensions. Specifically, we first develop an\nequivariant neural network that efficiently learns rigid motion sequences\nthrough low-dimensional spatial representations of images. Subsequently, we\nlearn spatio-temporal representations by incorporating time encoding and\nself-attention neural network layers. This approach allows for the capture of\nlong-term dependencies of fetal brain motion and addresses alignment errors due\nto contrast changes and severe motion artifacts. Our model also provides a\ngeometric deformation estimation that properly addresses image distortions\namong all time frames. To the best of our knowledge, our approach is the first\nto learn spatial-temporal representations via deep neural networks for fetal\nmotion tracking without data augmentation. 
We validated our model using real\nfetal echo-planar images with simulated and real motions. Our method carries\nsignificant potential value in accurately measuring, tracking, and correcting\nfetal motion in fetal MRI sequences.\n","authors":["Jian Wang","Razieh Faghihpirayesh","Polina Golland","Ali Ghoulipour"],"pdf_url":"https://arxiv.org/pdf/2407.20198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20892v1","updated":"2024-07-30T15:09:45Z","published":"2024-07-30T15:09:45Z","title":"What is YOLOv5: A deep look into the internal features of the popular\n object detector","summary":" This study presents a comprehensive analysis of the YOLOv5 object detection\nmodel, examining its architecture, training methodologies, and performance. Key\ncomponents, including the Cross Stage Partial backbone and Path\nAggregation-Network, are explored in detail. The paper reviews the model's\nperformance across various metrics and hardware platforms. Additionally, the\nstudy discusses the transition from Darknet to PyTorch and its impact on model\ndevelopment. Overall, this research provides insights into YOLOv5's\ncapabilities and its position within the broader landscape of object detection\nand why it is a popular choice for constrained edge deployment scenarios.\n","authors":["Rahima Khanam","Muhammad Hussain"],"pdf_url":"https://arxiv.org/pdf/2407.20892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20891v1","updated":"2024-07-30T15:07:13Z","published":"2024-07-30T15:07:13Z","title":"Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian\n Neural Networks","summary":" Computational complexity of Bayesian learning is impeding its adoption in\npractical, large-scale tasks. Despite demonstrations of significant merits such\nas improved robustness and resilience to unseen or out-of-distribution inputs\nover their non- Bayesian counterparts, their practical use has faded to near\ninsignificance. In this study, we introduce an innovative framework to mitigate\nthe computational burden of Bayesian neural networks (BNNs). Our approach\nfollows the principle of Bayesian techniques based on deep ensembles, but\nsignificantly reduces their cost via multiple low-rank perturbations of\nparameters arising from a pre-trained neural network. Both vanilla version of\nensembles as well as more sophisticated schemes such as Bayesian learning with\nStein Variational Gradient Descent (SVGD), previously deemed impractical for\nlarge models, can be seamlessly implemented within the proposed framework,\ncalled Bayesian Low-Rank LeArning (Bella). In a nutshell, i) Bella achieves a\ndramatic reduction in the number of trainable parameters required to\napproximate a Bayesian posterior; and ii) it not only maintains, but in some\ninstances, surpasses the performance of conventional Bayesian learning methods\nand non-Bayesian baselines. Our results with large-scale tasks such as\nImageNet, CAMELYON17, DomainNet, VQA with CLIP, LLaVA demonstrate the\neffectiveness and versatility of Bella in building highly scalable and\npractical Bayesian deep models for real-world applications.\n","authors":["Bao Gia Doan","Afshar Shamsi","Xiao-Yu Guo","Arash Mohammadi","Hamid Alinejad-Rokny","Dino Sejdinovic","Damith C. 
Ranasinghe","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.20891v1.pdf","comment":"25 pages, 14 figures, 11 tables"},{"id":"http://arxiv.org/abs/2407.20878v1","updated":"2024-07-30T14:56:06Z","published":"2024-07-30T14:56:06Z","title":"S3PET: Semi-supervised Standard-dose PET Image Reconstruction via\n Dose-aware Token Swap","summary":" To acquire high-quality positron emission tomography (PET) images while\nreducing the radiation tracer dose, numerous efforts have been devoted to\nreconstructing standard-dose PET (SPET) images from low-dose PET (LPET).\nHowever, the success of current fully-supervised approaches relies on abundant\npaired LPET and SPET images, which are often unavailable in clinic. Moreover,\nthese methods often mix the dose-invariant content with dose level-related\ndose-specific details during reconstruction, resulting in distorted images. To\nalleviate these problems, in this paper, we propose a two-stage Semi-Supervised\nSPET reconstruction framework, namely S3PET, to accommodate the training of\nabundant unpaired and limited paired SPET and LPET images. Our S3PET involves\nan un-supervised pre-training stage (Stage I) to extract representations from\nunpaired images, and a supervised dose-aware reconstruction stage (Stage II) to\nachieve LPET-to-SPET reconstruction by transferring the dose-specific knowledge\nbetween paired images. Specifically, in stage I, two independent dose-specific\nmasked autoencoders (DsMAEs) are adopted to comprehensively understand the\nunpaired SPET and LPET images. Then, in Stage II, the pre-trained DsMAEs are\nfurther finetuned using paired images. To prevent distortions in both content\nand details, we introduce two elaborate modules, i.e., a dose knowledge\ndecouple module to disentangle the respective dose-specific and dose-invariant\nknowledge of LPET and SPET, and a dose-specific knowledge learning module to\ntransfer the dose-specific information from SPET to LPET, thereby achieving\nhigh-quality SPET reconstruction from LPET images. Experiments on two datasets\ndemonstrate that our S3PET achieves state-of-the-art performance quantitatively\nand qualitatively.\n","authors":["Jiaqi Cui","Pinxian Zeng","Yuanyuan Xu","Xi Wu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20876v1","updated":"2024-07-30T14:54:54Z","published":"2024-07-30T14:54:54Z","title":"Automatic Die Studies for Ancient Numismatics","summary":" Die studies are fundamental to quantifying ancient monetary production,\nproviding insights into the relationship between coinage, politics, and\nhistory. The process requires tedious manual work, which limits the size of the\ncorpora that can be studied. Few works have attempted to automate this task,\nand none have been properly released and evaluated from a computer vision\nperspective. We propose a fully automatic approach that introduces several\ninnovations compared to previous methods. We rely on fast and robust local\ndescriptors matching that is set automatically. Second, the core of our\nproposal is a clustering-based approach that uses an intrinsic metric (that\ndoes not need the ground truth labels) to determine its critical\nhyper-parameters. 
We validate the approach on two corpora of Greek coins,\npropose an automatic implementation and evaluation of previous baselines, and\nshow that our approach significantly outperforms them.\n","authors":["Clément Cornet","Héloïse Aumaître","Romaric Besançon","Julien Olivier","Thomas Faucher","Hervé Le Borgne"],"pdf_url":"https://arxiv.org/pdf/2407.20876v1.pdf","comment":"code: https://cea-list-lasti.github.io/projects/studies/studies.html"},{"id":"http://arxiv.org/abs/2407.20872v1","updated":"2024-07-30T14:50:49Z","published":"2024-07-30T14:50:49Z","title":"A Comparative Analysis of YOLOv5, YOLOv8, and YOLOv10 in Kitchen Safety","summary":" Knife safety in the kitchen is essential for preventing accidents or injuries\nwith an emphasis on proper handling, maintenance, and storage methods. This\nresearch presents a comparative analysis of three YOLO models, YOLOv5, YOLOv8,\nand YOLOv10, to detect the hazards involved in handling knife, concentrating\nmainly on ensuring fingers are curled while holding items to be cut and that\nhands should only be in contact with knife handle avoiding the blade.\nPrecision, recall, F-score, and normalized confusion matrix are used to\nevaluate the performance of the models. The results indicate that YOLOv5\nperformed better than the other two models in identifying the hazard of\nensuring hands only touch the blade, while YOLOv8 excelled in detecting the\nhazard of curled fingers while holding items. YOLOv5 and YOLOv8 performed\nalmost identically in recognizing classes such as hand, knife, and vegetable,\nwhereas YOLOv5, YOLOv8, and YOLOv10 accurately identified the cutting board.\nThis paper provides insights into the advantages and shortcomings of these\nmodels in real-world settings. Moreover, by detailing the optimization of YOLO\narchitectures for safe knife handling, this study promotes the development of\nincreased accuracy and efficiency in safety surveillance systems.\n","authors":["Athulya Sundaresan Geetha","Muhammad Hussain"],"pdf_url":"https://arxiv.org/pdf/2407.20872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15856v2","updated":"2024-07-30T14:49:15Z","published":"2023-11-27T14:23:36Z","title":"JSSL: Joint Supervised and Self-supervised Learning for MRI\n Reconstruction","summary":" Purpose: MRI represents an important diagnostic modality; however, its\ninherently slow acquisition process poses challenges in obtaining fully-sampled\nk-space data under motion. In the absence of fully-sampled acquisitions,\nserving as ground truths, training deep learning algorithms in a supervised\nmanner to predict the underlying ground truth image becomes challenging. To\naddress this limitation, self-supervised methods have emerged as a viable\nalternative, leveraging available subsampled k-space data to train deep neural\nnetworks for MRI reconstruction. Nevertheless, these approaches often fall\nshort when compared to supervised methods.\n Methods: We propose Joint Supervised and Self-supervised Learning (JSSL), a\nnovel training approach for deep learning-based MRI reconstruction algorithms\naimed at enhancing reconstruction quality in cases where target datasets\ncontaining fully-sampled k-space measurements are unavailable. JSSL operates by\nsimultaneously training a model in a self-supervised learning setting, using\nsubsampled data from the target dataset(s), and in a supervised learning\nmanner, utilizing datasets with fully-sampled k-space data, referred to as\nproxy datasets. 
We demonstrate JSSL's efficacy using subsampled prostate or\ncardiac MRI data as the target datasets, with fully-sampled brain and knee, or\nbrain, knee and prostate k-space acquisitions, respectively, as proxy datasets.\n Results: Our results showcase substantial improvements over conventional\nself-supervised methods, validated using common image quality metrics.\nFurthermore, we provide theoretical motivations for JSSL and establish\nrule-of-thumb guidelines for training MRI reconstruction models.\n Conclusion: JSSL effectively enhances MRI reconstruction quality in scenarios\nwhere fully-sampled k-space data is not available, leveraging the strengths of\nsupervised learning by incorporating proxy datasets.\n","authors":["George Yiasemis","Nikita Moriakov","Clara I. Sánchez","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2311.15856v2.pdf","comment":"pages, 14 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.20870v1","updated":"2024-07-30T14:45:31Z","published":"2024-07-30T14:45:31Z","title":"Mean of Means: A 10-dollar Solution for Human Localization with\n Calibration-free and Unconstrained Camera Settings","summary":" Accurate human localization is crucial for various applications, especially\nin the Metaverse era. Existing high precision solutions rely on expensive,\ntag-dependent hardware, while vision-based methods offer a cheaper, tag-free\nalternative. However, current vision solutions based on stereo vision face\nlimitations due to rigid perspective transformation principles and error\npropagation in multi-stage SVD solvers. These solutions also require multiple\nhigh-resolution cameras with strict setup constraints. To address these\nlimitations, we propose a probabilistic approach that considers all points on\nthe human body as observations generated by a distribution centered around the\nbody's geometric center. This enables us to improve sampling significantly,\nincreasing the number of samples for each point of interest from hundreds to\nbillions. By modeling the relation between the means of the distributions of\nworld coordinates and pixel coordinates, leveraging the Central Limit Theorem,\nwe ensure normality and facilitate the learning process. Experimental results\ndemonstrate human localization accuracy of 95% within a 0.3m range and nearly\n100% accuracy within a 0.5m range, achieved at a low cost of only 10 USD using\ntwo web cameras with a resolution of 640x480 pixels.\n","authors":["Tianyi Zhang","Wengyu Zhang","Xulu Zhang","Jiaxin Wu","Xiao-Yong Wei","Jiannong Cao","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.20870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20868v1","updated":"2024-07-30T14:43:54Z","published":"2024-07-30T14:43:54Z","title":"A Comparative Study of Neural Surface Reconstruction for Scientific\n Visualization","summary":" This comparative study evaluates various neural surface reconstruction\nmethods, particularly focusing on their implications for scientific\nvisualization through reconstructing 3D surfaces via multi-view rendering\nimages. We categorize ten methods into neural radiance fields and neural\nimplicit surfaces, uncovering the benefits of leveraging distance functions\n(i.e., SDFs and UDFs) to enhance the accuracy and smoothness of the\nreconstructed surfaces. Our findings highlight the efficiency and quality of\nNeuS2 for reconstructing closed surfaces and identify NeUDF as a promising\ncandidate for reconstructing open surfaces despite some limitations. 
By sharing\nour benchmark dataset, we invite researchers to test the performance of their\nmethods, contributing to the advancement of surface reconstruction solutions\nfor scientific visualization.\n","authors":["Siyuan Yao","Weixi Song","Chaoli Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20855v1","updated":"2024-07-30T14:31:33Z","published":"2024-07-30T14:31:33Z","title":"DeTurb: Atmospheric Turbulence Mitigation with Deformable 3D\n Convolutions and 3D Swin Transformers","summary":" Atmospheric turbulence in long-range imaging significantly degrades the\nquality and fidelity of captured scenes due to random variations in both\nspatial and temporal dimensions. These distortions present a formidable\nchallenge across various applications, from surveillance to astronomy,\nnecessitating robust mitigation strategies. While model-based approaches\nachieve good results, they are very slow. Deep learning approaches show promise\nin image and video restoration but have struggled to address these\nspatiotemporal variant distortions effectively. This paper proposes a new\nframework that combines geometric restoration with an enhancement module.\nRandom perturbations and geometric distortion are removed using a pyramid\narchitecture with deformable 3D convolutions, resulting in aligned frames.\nThese frames are then used to reconstruct a sharp, clear image via a\nmulti-scale architecture of 3D Swin Transformers. The proposed framework\ndemonstrates superior performance over the state of the art for both synthetic\nand real atmospheric turbulence effects, with reasonable speed and model size.\n","authors":["Zhicheng Zou","Nantheera Anantrasirichai"],"pdf_url":"https://arxiv.org/pdf/2407.20855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20853v1","updated":"2024-07-30T14:27:59Z","published":"2024-07-30T14:27:59Z","title":"NIS-SLAM: Neural Implicit Semantic RGB-D SLAM for 3D Consistent Scene\n Understanding","summary":" In recent years, the paradigm of neural implicit representations has gained\nsubstantial attention in the field of Simultaneous Localization and Mapping\n(SLAM). However, a notable gap exists in the existing approaches when it comes\nto scene understanding. In this paper, we introduce NIS-SLAM, an efficient\nneural implicit semantic RGB-D SLAM system, that leverages a pre-trained 2D\nsegmentation network to learn consistent semantic representations.\nSpecifically, for high-fidelity surface reconstruction and spatial consistent\nscene understanding, we combine high-frequency multi-resolution\ntetrahedron-based features and low-frequency positional encoding as the\nimplicit scene representations. Besides, to address the inconsistency of 2D\nsegmentation results from multiple views, we propose a fusion strategy that\nintegrates the semantic probabilities from previous non-keyframes into\nkeyframes to achieve consistent semantic learning. Furthermore, we implement a\nconfidence-based pixel sampling and progressive optimization weight function\nfor robust camera tracking. Extensive experimental results on various datasets\nshow the better or more competitive performance of our system when compared to\nother existing neural dense implicit RGB-D SLAM approaches. Finally, we also\nshow that our approach can be used in augmented reality applications. 
Project\npage:\n\\href{https://zju3dv.github.io/nis_slam}{https://zju3dv.github.io/nis\\_slam}.\n","authors":["Hongjia Zhai","Gan Huang","Qirui Hu","Guanglin Li","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20853v1.pdf","comment":"Accept by TVCG (ISMAR 2024 Journal Track)"},{"id":"http://arxiv.org/abs/2407.20845v1","updated":"2024-07-30T14:22:13Z","published":"2024-07-30T14:22:13Z","title":"Assessing Graphical Perception of Image Embedding Models using Channel\n Effectiveness","summary":" Recent advancements in vision models have greatly improved their ability to\nhandle complex chart understanding tasks, like chart captioning and question\nanswering. However, it remains challenging to assess how these models process\ncharts. Existing benchmarks only roughly evaluate model performance without\nevaluating the underlying mechanisms, such as how models extract image\nembeddings. This limits our understanding of the model's ability to perceive\nfundamental graphical components. To address this, we introduce a novel\nevaluation framework to assess the graphical perception of image embedding\nmodels. For chart comprehension, we examine two main aspects of channel\neffectiveness: accuracy and discriminability of various visual channels.\nChannel accuracy is assessed through the linearity of embeddings, measuring how\nwell the perceived magnitude aligns with the size of the stimulus.\nDiscriminability is evaluated based on the distances between embeddings,\nindicating their distinctness. Our experiments with the CLIP model show that it\nperceives channel accuracy differently from humans and shows unique\ndiscriminability in channels like length, tilt, and curvature. We aim to\ndevelop this work into a broader benchmark for reliable visual encoders,\nenhancing models for precise chart comprehension and human-like perception in\nfuture applications.\n","authors":["Soohyun Lee","Minsuk Chang","Seokhyeon Park","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2407.20845v1.pdf","comment":"In Proceedings of the 2024 IEEE Visualization and Visual Analytics\n (VIS)"},{"id":"http://arxiv.org/abs/2407.20843v1","updated":"2024-07-30T14:16:09Z","published":"2024-07-30T14:16:09Z","title":"DFE-IANet: A Method for Polyp Image Classification Based on Dual-domain\n Feature Extraction and Interaction Attention","summary":" It is helpful in preventing colorectal cancer to detect and treat polyps in\nthe gastrointestinal tract early. However, there have been few studies to date\non designing polyp image classification networks that balance efficiency and\naccuracy. This challenge is mainly attributed to the fact that polyps are\nsimilar to other pathologies and have complex features influenced by texture,\ncolor, and morphology. In this paper, we propose a novel network DFE-IANet\nbased on both spectral transformation and feature interaction. Firstly, to\nextract detailed features and multi-scale features, the features are\ntransformed by the multi-scale frequency domain feature extraction (MSFD) block\nto extract texture details at the fine-grained level in the frequency domain.\nSecondly, the multi-scale interaction attention (MSIA) block is designed to\nenhance the network's capability of extracting critical features. This block\nintroduces multi-scale features into self-attention, aiming to adaptively guide\nthe network to concentrate on vital regions. Finally, with a compact parameter\nof only 4M, DFE-IANet outperforms the latest and classical networks in terms of\nefficiency. 
Furthermore, DFE-IANet achieves state-of-the-art (SOTA) results on\nthe challenging Kvasir dataset, demonstrating a remarkable Top-1 accuracy of\n93.94%. This outstanding accuracy surpasses ViT by 8.94%, ResNet50 by 1.69%,\nand VMamba by 1.88%. Our code is publicly available at\nhttps://github.com/PURSUETHESUN/DFE-IANet.\n","authors":["Wei Wang","Jixing He","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20843v1.pdf","comment":"This paper has been accepted by 2024 International Conference on\n Intelligent Computing.It can be accessed at http://poster-openaccess.com"},{"id":"http://arxiv.org/abs/2407.20836v1","updated":"2024-07-30T14:07:17Z","published":"2024-07-30T14:07:17Z","title":"Vulnerabilities in AI-generated Image Detection: The Challenge of\n Adversarial Attacks","summary":" Recent advancements in image synthesis, particularly with the advent of GAN\nand Diffusion models, have amplified public concerns regarding the\ndissemination of disinformation. To address such concerns, numerous\nAI-generated Image (AIGI) Detectors have been proposed and achieved promising\nperformance in identifying fake images. However, there still lacks a systematic\nunderstanding of the adversarial robustness of these AIGI detectors. In this\npaper, we examine the vulnerability of state-of-the-art AIGI detectors against\nadversarial attack under white-box and black-box settings, which has been\nrarely investigated so far. For the task of AIGI detection, we propose a new\nattack containing two main parts. First, inspired by the obvious difference\nbetween real images and fake images in the frequency domain, we add\nperturbations under the frequency domain to push the image away from its\noriginal frequency distribution. Second, we explore the full posterior\ndistribution of the surrogate model to further narrow this gap between\nheterogeneous models, e.g. transferring adversarial examples across CNNs and\nViTs. This is achieved by introducing a novel post-train Bayesian strategy that\nturns a single surrogate into a Bayesian one, capable of simulating diverse\nvictim models using one pre-trained surrogate, without the need for\nre-training. We name our method as frequency-based post-train Bayesian attack,\nor FPBA. Through FPBA, we show that adversarial attack is truly a real threat\nto AIGI detectors, because FPBA can deliver successful black-box attacks across\nmodels, generators, defense methods, and even evade cross-generator detection,\nwhich is a crucial real-world detection scenario.\n","authors":["Yunfeng Diao","Naixin Zhai","Changtao Miao","Xun Yang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20830v1","updated":"2024-07-30T13:56:26Z","published":"2024-07-30T13:56:26Z","title":"Federated Knowledge Recycling: Privacy-Preserving Synthetic Data Sharing","summary":" Federated learning has emerged as a paradigm for collaborative learning,\nenabling the development of robust models without the need to centralise\nsensitive data. However, conventional federated learning techniques have\nprivacy and security vulnerabilities due to the exposure of models, parameters\nor updates, which can be exploited as an attack surface. This paper presents\nFederated Knowledge Recycling (FedKR), a cross-silo federated learning approach\nthat uses locally generated synthetic data to facilitate collaboration between\ninstitutions. 
FedKR combines advanced data generation techniques with a dynamic\naggregation process to provide greater security against privacy attacks than\nexisting methods, significantly reducing the attack surface. Experimental\nresults on generic and medical datasets show that FedKR achieves competitive\nperformance, with an average improvement in accuracy of 4.24% compared to\ntraining models from local data, demonstrating particular effectiveness in data\nscarcity scenarios.\n","authors":["Eugenio Lomurno","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2407.20830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18375v2","updated":"2024-07-30T13:43:54Z","published":"2024-06-26T14:19:31Z","title":"From Majority to Minority: A Diffusion-based Augmentation for\n Underrepresented Groups in Skin Lesion Analysis","summary":" AI-based diagnoses have demonstrated dermatologist-level performance in\nclassifying skin cancer. However, such systems are prone to under-performing\nwhen tested on data from minority groups that lack sufficient representation in\nthe training sets. Although data collection and annotation offer the best means\nfor promoting minority groups, these processes are costly and time-consuming.\nPrior works have suggested that data from majority groups may serve as a\nvaluable information source to supplement the training of diagnosis tools for\nminority groups. In this work, we propose an effective diffusion-based\naugmentation framework that maximizes the use of rich information from majority\ngroups to benefit minority groups. Using groups with different skin types as a\ncase study, our results show that the proposed framework can generate synthetic\nimages that improve diagnostic results for the minority groups, even when there\nis little or no reference data from these target groups. The practical value of\nour work is evident in medical imaging analysis, where under-diagnosis persists\nas a problem for certain groups due to insufficient representation.\n","authors":["Janet Wang","Yunsung Chung","Zhengming Ding","Jihun Hamm"],"pdf_url":"https://arxiv.org/pdf/2406.18375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12007v3","updated":"2024-07-30T13:35:51Z","published":"2023-10-18T14:40:52Z","title":"KI-PMF: Knowledge Integrated Plausible Motion Forecasting","summary":" Accurately forecasting the motion of traffic actors is crucial for the\ndeployment of autonomous vehicles at a large scale. Current trajectory\nforecasting approaches primarily concentrate on optimizing a loss function with\na specific metric, which can result in predictions that do not adhere to\nphysical laws or violate external constraints. Our objective is to incorporate\nexplicit knowledge priors that allow a network to forecast future trajectories\nin compliance with both the kinematic constraints of a vehicle and the geometry\nof the driving environment. To achieve this, we introduce a non-parametric\npruning layer and attention layers to integrate the defined knowledge priors.\nOur proposed method is designed to ensure reachability guarantees for traffic\nactors in both complex and dynamic situations. 
By conditioning the network to\nfollow physical laws, we can obtain accurate and safe predictions, essential\nfor maintaining autonomous vehicles' safety and efficiency in real-world\nsettings.In summary, this paper presents concepts that prevent off-road\npredictions for safe and reliable motion forecasting by incorporating knowledge\npriors into the training process.\n","authors":["Abhishek Vivekanandan","Ahmed Abouelazm","Philip Schörner","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2310.12007v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20818v1","updated":"2024-07-30T13:32:34Z","published":"2024-07-30T13:32:34Z","title":"WARM-3D: A Weakly-Supervised Sim2Real Domain Adaptation Framework for\n Roadside Monocular 3D Object Detection","summary":" Existing roadside perception systems are limited by the absence of publicly\navailable, large-scale, high-quality 3D datasets. Exploring the use of\ncost-effective, extensive synthetic datasets offers a viable solution to tackle\nthis challenge and enhance the performance of roadside monocular 3D detection.\nIn this study, we introduce the TUMTraf Synthetic Dataset, offering a diverse\nand substantial collection of high-quality 3D data to augment scarce real-world\ndatasets. Besides, we present WARM-3D, a concise yet effective framework to aid\nthe Sim2Real domain transfer for roadside monocular 3D detection. Our method\nleverages cheap synthetic datasets and 2D labels from an off-the-shelf 2D\ndetector for weak supervision. We show that WARM-3D significantly enhances\nperformance, achieving a +12.40% increase in mAP 3D over the baseline with only\npseudo-2D supervision. With 2D GT as weak labels, WARM-3D even reaches\nperformance close to the Oracle baseline. Moreover, WARM-3D improves the\nability of 3D detectors to unseen sample recognition across various real-world\nenvironments, highlighting its potential for practical applications.\n","authors":["Xingcheng Zhou","Deyu Fu","Walter Zimmer","Mingyu Liu","Venkatnarayanan Lakshminarasimhan","Leah Strand","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2407.20818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12857v4","updated":"2024-07-30T13:29:25Z","published":"2022-05-25T15:33:06Z","title":"Structure Unbiased Adversarial Model for Medical Image Segmentation","summary":" Generative models have been widely proposed in image recognition to generate\nmore images where the distribution is similar to that of the real ones. It\noften introduces a discriminator network to differentiate the real data from\nthe generated ones. Such models utilise a discriminator network tasked with\ndifferentiating style transferred data from data contained in the target\ndataset. However in doing so the network focuses on discrepancies in the\nintensity distribution and may overlook structural differences between the\ndatasets. In this paper we formulate a new image-to-image translation problem\nto ensure that the structure of the generated images is similar to that in the\ntarget dataset. We propose a simple, yet powerful Structure-Unbiased\nAdversarial (SUA) network which accounts for both intensity and structural\ndifferences between the training and test sets when performing image\nsegmentation. It consists of a spatial transformation block followed by an\nintensity distribution rendering module. The spatial transformation block is\nproposed to reduce the structure gap between the two images, and also produce\nan inverse deformation field to warp the final segmented image back. 
The\nintensity distribution rendering module then renders the deformed structure to\nan image with the target intensity distribution. Experimental results show that\nthe proposed SUA method has the capability to transfer both intensity\ndistribution and structural content between multiple datasets.\n","authors":["Tianyang Zhang","Shaoming Zheng","Jun Cheng","Xi Jia","Joseph Bartlett","Xinxing Cheng","Huazhu Fu","Zhaowen Qiu","Jiang Liu","Jinming Duan"],"pdf_url":"https://arxiv.org/pdf/2205.12857v4.pdf","comment":"Will revise the paper and resubmit"},{"id":"http://arxiv.org/abs/2302.03566v4","updated":"2024-07-30T13:18:32Z","published":"2023-02-07T16:26:45Z","title":"Look Around and Learn: Self-Training Object Detection by Exploration","summary":" When an object detector is deployed in a novel setting it often experiences a\ndrop in performance. This paper studies how an embodied agent can automatically\nfine-tune a pre-existing object detector while exploring and acquiring images\nin a new environment without relying on human intervention, i.e., a fully\nself-supervised approach. In our setting, an agent initially learns to explore\nthe environment using a pre-trained off-the-shelf detector to locate objects\nand associate pseudo-labels. By assuming that pseudo-labels for the same object\nmust be consistent across different views, we learn the exploration policy Look\nAround to mine hard samples, and we devise a novel mechanism called\nDisagreement Reconciliation for producing refined pseudo-labels from the\nconsensus among observations. We implement a unified benchmark of the current\nstate-of-the-art and compare our approach with pre-existing exploration\npolicies and perception mechanisms. Our method is shown to outperform existing\napproaches, improving the object detector by 6.2% in a simulated scenario, a\n3.59% advancement over other state-of-the-art methods, and by 9.97% in the real\nrobotic test without relying on ground-truth. Code for the proposed approach\nand baselines are available at\nhttps://iit-pavis.github.io/Look_Around_And_Learn/.\n","authors":["Gianluca Scarpellini","Stefano Rosa","Pietro Morerio","Lorenzo Natale","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2302.03566v4.pdf","comment":"Paper accepted at ECCV2024"},{"id":"http://arxiv.org/abs/2308.16071v3","updated":"2024-07-30T13:09:47Z","published":"2023-08-30T14:49:34Z","title":"Semantic Image Synthesis via Class-Adaptive Cross-Attention","summary":" In semantic image synthesis the state of the art is dominated by methods that\nuse customized variants of the SPatially-Adaptive DE-normalization (SPADE)\nlayers, which allow for good visual generation quality and editing versatility.\nBy design, such layers learn pixel-wise modulation parameters to de-normalize\nthe generator activations based on the semantic class each pixel belongs to.\nThus, they tend to overlook global image statistics, ultimately leading to\nunconvincing local style editing and causing global inconsistencies such as\ncolor or illumination distribution shifts. Also, SPADE layers require the\nsemantic segmentation mask for mapping styles in the generator, preventing\nshape manipulations without manual intervention. In response, we designed a\nnovel architecture where cross-attention layers are used in place of SPADE for\nlearning shape-style correlations and so conditioning the image generation\nprocess. 
Our model inherits the versatility of SPADE, at the same time\nobtaining state-of-the-art generation quality, as well as improved global and\nlocal style transfer. Code and models available at\nhttps://github.com/TFonta/CA2SIS.\n","authors":["Tomaso Fontanini","Claudio Ferrari","Giuseppe Lisanti","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2308.16071v3.pdf","comment":"Code and models available at https://github.com/TFonta/CA2SIS The\n paper is under consideration at Computer Vision and Image Understanding"},{"id":"http://arxiv.org/abs/2407.15526v2","updated":"2024-07-30T13:03:36Z","published":"2024-07-22T10:31:07Z","title":"Synthetic Image Learning: Preserving Performance and Preventing\n Membership Inference Attacks","summary":" Generative artificial intelligence has transformed the generation of\nsynthetic data, providing innovative solutions to challenges like data scarcity\nand privacy, which are particularly critical in fields such as medicine.\nHowever, the effective use of this synthetic data to train high-performance\nmodels remains a significant challenge. This paper addresses this issue by\nintroducing Knowledge Recycling (KR), a pipeline designed to optimise the\ngeneration and use of synthetic data for training downstream classifiers. At\nthe heart of this pipeline is Generative Knowledge Distillation (GKD), the\nproposed technique that significantly improves the quality and usefulness of\nthe information provided to classifiers through a synthetic dataset\nregeneration and soft labelling mechanism. The KR pipeline has been tested on a\nvariety of datasets, with a focus on six highly heterogeneous medical image\ndatasets, ranging from retinal images to organ scans. The results show a\nsignificant reduction in the performance gap between models trained on real and\nsynthetic data, with models based on synthetic data outperforming those trained\non real data in some cases. Furthermore, the resulting models show almost\ncomplete immunity to Membership Inference Attacks, manifesting privacy\nproperties missing in models trained with conventional techniques.\n","authors":["Eugenio Lomurno","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2407.15526v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20799v1","updated":"2024-07-30T13:02:08Z","published":"2024-07-30T13:02:08Z","title":"SpotFormer: Multi-Scale Spatio-Temporal Transformer for Facial\n Expression Spotting","summary":" Facial expression spotting, identifying periods where facial expressions\noccur in a video, is a significant yet challenging task in facial expression\nanalysis. The issues of irrelevant facial movements and the challenge of\ndetecting subtle motions in micro-expressions remain unresolved, hindering\naccurate expression spotting. In this paper, we propose an efficient framework\nfor facial expression spotting. First, we propose a Sliding Window-based\nMulti-Resolution Optical flow (SW-MRO) feature, which calculates\nmulti-resolution optical flow of the input image sequence within compact\nsliding windows. The window length is tailored to perceive complete\nmicro-expressions and distinguish between general macro- and micro-expressions.\nSW-MRO can effectively reveal subtle motions while avoiding severe head\nmovement problems. Second, we propose SpotFormer, a multi-scale spatio-temporal\nTransformer that simultaneously encodes spatio-temporal relationships of the\nSW-MRO features for accurate frame-level probability estimation. 
In SpotFormer,\nour proposed Facial Local Graph Pooling (FLGP) and convolutional layers are\napplied for multi-scale spatio-temporal feature extraction. We show the\nvalidity of the architecture of SpotFormer by comparing it with several model\nvariants. Third, we introduce supervised contrastive learning into SpotFormer\nto enhance the discriminability between different types of expressions.\nExtensive experiments on SAMM-LV and CAS(ME)^2 show that our method outperforms\nstate-of-the-art models, particularly in micro-expression spotting.\n","authors":["Yicheng Deng","Hideaki Hayashi","Hajime Nagahara"],"pdf_url":"https://arxiv.org/pdf/2407.20799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08578v2","updated":"2024-07-30T12:57:12Z","published":"2024-05-14T13:19:43Z","title":"Local-peak scale-invariant feature transform for fast and random image\n stitching","summary":" Image stitching aims to construct a wide field of view with high spatial\nresolution, which cannot be achieved in a single exposure. Typically,\nconventional image stitching techniques, other than deep learning, require\ncomplex computation and thus computational pricy, especially for stitching\nlarge raw images. In this study, inspired by the multiscale feature of fluid\nturbulence, we developed a fast feature point detection algorithm named\nlocal-peak scale-invariant feature transform (LP-SIFT), based on the multiscale\nlocal peaks and scale-invariant feature transform method. By combining LP-SIFT\nand RANSAC in image stitching, the stitching speed can be improved by orders,\ncompared with the original SIFT method. Nine large images (over 2600*1600\npixels), arranged randomly without prior knowledge, can be stitched within\n158.94 s. The algorithm is highly practical for applications requiring a wide\nfield of view in diverse application scenes, e.g., terrain mapping, biological\nanalysis, and even criminal investigation.\n","authors":["Hao Li","Lipo Wang","Tianyun Zhao","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.08578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08260v2","updated":"2024-07-30T12:54:59Z","published":"2024-07-11T08:00:19Z","title":"SALSA: Swift Adaptive Lightweight Self-Attention for Enhanced LiDAR\n Place Recognition","summary":" Large-scale LiDAR mappings and localization leverage place recognition\ntechniques to mitigate odometry drifts, ensuring accurate mapping. These\ntechniques utilize scene representations from LiDAR point clouds to identify\npreviously visited sites within a database. Local descriptors, assigned to each\npoint within a point cloud, are aggregated to form a scene representation for\nthe point cloud. These descriptors are also used to re-rank the retrieved point\nclouds based on geometric fitness scores. We propose SALSA, a novel,\nlightweight, and efficient framework for LiDAR place recognition. It consists\nof a Sphereformer backbone that uses radial window attention to enable\ninformation aggregation for sparse distant points, an adaptive self-attention\nlayer to pool local descriptors into tokens, and a multi-layer-perceptron Mixer\nlayer for aggregating the tokens to generate a scene descriptor. 
The proposed\nframework outperforms existing methods on various LiDAR place recognition\ndatasets in terms of both retrieval and metric localization while operating in\nreal-time.\n","authors":["Raktim Gautam Goswami","Naman Patel","Prashanth Krishnamurthy","Farshad Khorrami"],"pdf_url":"https://arxiv.org/pdf/2407.08260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16937v2","updated":"2024-07-30T12:46:04Z","published":"2023-11-28T16:39:49Z","title":"The Sky's the Limit: Re-lightable Outdoor Scenes via a Sky-pixel\n Constrained Illumination Prior and Outside-In Visibility","summary":" Inverse rendering of outdoor scenes from unconstrained image collections is a\nchallenging task, particularly illumination/albedo ambiguities and occlusion of\nthe illumination environment (shadowing) caused by geometry. However, there are\nmany cues in an image that can aid in the disentanglement of geometry, albedo\nand shadows. Whilst sky is frequently masked out in state-of-the-art methods,\nwe exploit the fact that any sky pixel provides a direct observation of distant\nlighting in the corresponding direction and, via a neural illumination prior, a\nstatistical cue to derive the remaining illumination environment. The\nincorporation of our illumination prior is enabled by a novel `outside-in'\nmethod for computing differentiable sky visibility based on a neural\ndirectional distance function. This is highly efficient and can be trained in\nparallel with the neural scene representation, allowing gradients from\nappearance loss to flow from shadows to influence the estimation of\nillumination and geometry. Our method estimates high-quality albedo, geometry,\nillumination and sky visibility, achieving state-of-the-art results on the\nNeRF-OSR relighting benchmark. Our code and models can be found at\nhttps://github.com/JADGardner/neusky\n","authors":["James A. D. Gardner","Evgenii Kashin","Bernhard Egger","William A. P. Smith"],"pdf_url":"https://arxiv.org/pdf/2311.16937v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2403.16292v2","updated":"2024-07-30T12:12:00Z","published":"2024-03-24T20:48:36Z","title":"latentSplat: Autoencoding Variational Gaussians for Fast Generalizable\n 3D Reconstruction","summary":" We present latentSplat, a method to predict semantic Gaussians in a 3D latent\nspace that can be splatted and decoded by a light-weight generative 2D\narchitecture. Existing methods for generalizable 3D reconstruction either do\nnot scale to large scenes and resolutions, or are limited to interpolation of\nclose input views. latentSplat combines the strengths of regression-based and\ngenerative approaches while being trained purely on readily available real\nvideo data. The core of our method are variational 3D Gaussians, a\nrepresentation that efficiently encodes varying uncertainty within a latent\nspace consisting of 3D feature Gaussians. From these Gaussians, specific\ninstances can be sampled and rendered via efficient splatting and a fast,\ngenerative decoder. 
We show that latentSplat outperforms previous works in\nreconstruction quality and generalization, while being fast and scalable to\nhigh-resolution data.\n","authors":["Christopher Wewer","Kevin Raj","Eddy Ilg","Bernt Schiele","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2403.16292v2.pdf","comment":"Project website: https://geometric-rl.mpi-inf.mpg.de/latentsplat/"},{"id":"http://arxiv.org/abs/2407.20766v1","updated":"2024-07-30T12:10:33Z","published":"2024-07-30T12:10:33Z","title":"Highly Efficient No-reference 4K Video Quality Assessment with\n Full-Pixel Covering Sampling and Training Strategy","summary":" Deep Video Quality Assessment (VQA) methods have shown impressive\nhigh-performance capabilities. Notably, no-reference (NR) VQA methods play a\nvital role in situations where obtaining reference videos is restricted or not\nfeasible. Nevertheless, as more streaming videos are being created in\nultra-high definition (e.g., 4K) to enrich viewers' experiences, the current\ndeep VQA methods face unacceptable computational costs. Furthermore, the\nresizing, cropping, and local sampling techniques employed in these methods can\ncompromise the details and content of original 4K videos, thereby negatively\nimpacting quality assessment. In this paper, we propose a highly efficient and\nnovel NR 4K VQA technology. Specifically, first, a novel data sampling and\ntraining strategy is proposed to tackle the problem of excessive resolution.\nThis strategy allows the VQA Swin Transformer-based model to effectively train\nand make inferences using the full data of 4K videos on standard consumer-grade\nGPUs without compromising content or details. Second, a weighting and scoring\nscheme is developed to mimic the human subjective perception mode, which is\nachieved by considering the distinct impact of each sub-region within a 4K\nframe on the overall perception. Third, we incorporate the frequency domain\ninformation of video frames to better capture the details that affect video\nquality, consequently further improving the model's generalizability. To our\nknowledge, this is the first technology for the NR 4K VQA task. Thorough\nempirical studies demonstrate it not only significantly outperforms existing\nmethods on a specialized 4K VQA dataset but also achieves state-of-the-art\nperformance across multiple open-source NR video quality datasets.\n","authors":["Xiaoheng Tan","Jiabin Zhang","Yuhui Quan","Jing Li","Yajing Wu","Zilin Bian"],"pdf_url":"https://arxiv.org/pdf/2407.20766v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.19768v2","updated":"2024-07-30T12:07:57Z","published":"2024-07-29T08:03:33Z","title":"Efficient Face Super-Resolution via Wavelet-based Feature Enhancement\n Network","summary":" Face super-resolution aims to reconstruct a high-resolution face image from a\nlow-resolution face image. Previous methods typically employ an encoder-decoder\nstructure to extract facial structural features, where the direct downsampling\ninevitably introduces distortions, especially to high-frequency features such\nas edges. To address this issue, we propose a wavelet-based feature enhancement\nnetwork, which mitigates feature distortion by losslessly decomposing the input\nfeature into high and low-frequency components using the wavelet transform and\nprocessing them separately. To improve the efficiency of facial feature\nextraction, a full domain Transformer is further proposed to enhance local,\nregional, and global facial features. 
Such designs allow our method to perform\nbetter without stacking many modules as previous methods did. Experiments show\nthat our method effectively balances performance, model size, and speed. Code\nlink: https://github.com/PRIS-CV/WFEN.\n","authors":["Wenjie Li","Heng Guo","Xuannan Liu","Kongming Liang","Jiani Hu","Zhanyu Ma","Jun Guo"],"pdf_url":"https://arxiv.org/pdf/2407.19768v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20756v1","updated":"2024-07-30T11:57:40Z","published":"2024-07-30T11:57:40Z","title":"SynthVLM: High-Efficiency and High-Quality Synthetic Data for Vision\n Language Models","summary":" Recently, with the rise of web images, managing and understanding large-scale\nimage datasets has become increasingly important. Vision Large Language Models\n(VLLMs) have recently emerged due to their robust vision-understanding\ncapabilities. However, training these models requires vast amounts of data,\nposing challenges to efficiency, effectiveness, data quality, and privacy. In\nthis paper, we introduce SynthVLM, a novel data synthesis pipeline for VLLMs.\nUnlike existing methods that generate captions from images, SynthVLM employs\nadvanced diffusion models and high-quality captions to automatically generate\nand select high-resolution images from captions, creating precisely aligned\nimage-text pairs. Leveraging these pairs, we achieve state-of-the-art (SoTA)\nperformance on various vision question answering tasks, maintaining high\nalignment quality and preserving advanced language abilities. Moreover,\nSynthVLM surpasses traditional GPT-4 Vision-based caption generation methods in\nperformance while significantly reducing computational overhead. Crucially, our\nmethod's reliance on purely generated data ensures the preservation of privacy,\nachieving SoTA performance with just 100k data points (only 18% of the official\ndataset size).\n","authors":["Zheng Liu","Hao Liang","Wentao Xiong","Qinhan Yu","Conghui He","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03056v2","updated":"2024-07-30T11:56:43Z","published":"2024-07-03T12:24:40Z","title":"Improving Zero-shot Generalization of Learned Prompts via Unsupervised\n Knowledge Distillation","summary":" Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization\nto unseen tasks, but fall short of the performance of supervised methods in\ngeneralizing to downstream tasks with limited data. Prompt learning is emerging\nas a parameter-efficient method for adapting VLMs, but state-of-the-art\napproaches require annotated samples. In this paper we propose a novel approach\nto prompt learning based on unsupervised knowledge distillation from more\npowerful models. Our approach, which we call Knowledge Distillation Prompt\nLearning (KDPL), can be integrated into existing prompt learning techniques and\neliminates the need for labeled examples during adaptation. Our experiments on\nmore than ten standard benchmark datasets demonstrate that KDPL is very\neffective at improving generalization of learned prompts for zero-shot domain\ngeneralization, zero-shot cross-dataset generalization, and zero-shot\nbase-to-novel class generalization problems. KDPL requires no ground-truth\nlabels for adaptation, and moreover we show that even in the absence of any\nknowledge of training class names it can be used to effectively transfer\nknowledge. 
The code is publicly available at https://github.com/miccunifi/KDPL.\n","authors":["Marco Mistretta","Alberto Baldrati","Marco Bertini","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2407.03056v2.pdf","comment":"Accepted for publication at ECCV24"},{"id":"http://arxiv.org/abs/2407.20749v1","updated":"2024-07-30T11:34:04Z","published":"2024-07-30T11:34:04Z","title":"Re-localization acceleration with Medoid Silhouette Clustering","summary":" Two crucial performance criteria for the deployment of visual localization\nare speed and accuracy. Current research on visual localization with neural\nnetworks is limited to examining methods for enhancing the accuracy of networks\nacross various datasets. How to expedite the re-localization process within\ndeep neural network architectures still needs further investigation. In this\npaper, we present a novel approach for accelerating visual re-localization in\npractice. A tree-like search strategy, built on the keyframes extracted by a\nvisual clustering algorithm, is designed for matching acceleration. Our method\nhas been validated on two tasks across three public datasets, allowing for 50\nup to 90 percent time saving over the baseline while not reducing location\naccuracy.\n","authors":["Hongyi Zhang","Walterio Mayol-Cuevas"],"pdf_url":"https://arxiv.org/pdf/2407.20749v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.07263v2","updated":"2024-07-30T11:31:31Z","published":"2024-03-12T02:45:24Z","title":"Adaptive Bounding Box Uncertainties via Two-Step Conformal Prediction","summary":" Quantifying a model's predictive uncertainty is essential for safety-critical\napplications such as autonomous driving. We consider quantifying such\nuncertainty for multi-object detection. In particular, we leverage conformal\nprediction to obtain uncertainty intervals with guaranteed coverage for object\nbounding boxes. One challenge in doing so is that bounding box predictions are\nconditioned on the object's class label. Thus, we develop a novel two-step\nconformal approach that propagates uncertainty in predicted class labels into\nthe uncertainty intervals of bounding boxes. This broadens the validity of our\nconformal coverage guarantees to include incorrectly classified objects, thus\noffering more actionable safety assurances. Moreover, we investigate novel\nensemble and quantile regression formulations to ensure the bounding box\nintervals are adaptive to object size, leading to a more balanced coverage.\nValidating our two-step approach on real-world datasets for 2D bounding box\nlocalization, we find that desired coverage levels are satisfied with\npractically tight predictive uncertainty intervals.\n","authors":["Alexander Timans","Christoph-Nikolas Straehle","Kaspar Sakmann","Eric Nalisnick"],"pdf_url":"https://arxiv.org/pdf/2403.07263v2.pdf","comment":"European Conference on Computer Vision (ECCV) 2024; 37 pages, 14\n figures, 6 tables (incl. appendix)"},{"id":"http://arxiv.org/abs/2402.01393v3","updated":"2024-07-30T11:20:47Z","published":"2024-02-02T13:17:19Z","title":"ALERT-Transformer: Bridging Asynchronous and Synchronous Machine\n Learning for Real-Time Event-based Spatio-Temporal Data","summary":" We seek to enable classic processing of continuous ultra-sparse\nspatiotemporal data generated by event-based sensors with dense machine\nlearning models. 
We propose a novel hybrid pipeline composed of asynchronous\nsensing and synchronous processing that combines several ideas: (1) an\nembedding based on PointNet models -- the ALERT module -- that can continuously\nintegrate new and dismiss old events thanks to a leakage mechanism, (2) a\nflexible readout of the embedded data that allows to feed any downstream model\nwith always up-to-date features at any sampling rate, (3) exploiting the input\nsparsity in a patch-based approach inspired by Vision Transformer to optimize\nthe efficiency of the method. These embeddings are then processed by a\ntransformer model trained for object and gesture recognition. Using this\napproach, we achieve performances at the state-of-the-art with a lower latency\nthan competitors. We also demonstrate that our asynchronous model can operate\nat any desired sampling rate.\n","authors":["Carmen Martin-Turrero","Maxence Bouvier","Manuel Breitenstein","Pietro Zanuttigh","Vincent Parret"],"pdf_url":"https://arxiv.org/pdf/2402.01393v3.pdf","comment":"Originally published in the Proceedings of Machine Learning Research\n ICML 2024"},{"id":"http://arxiv.org/abs/2407.20732v1","updated":"2024-07-30T11:06:39Z","published":"2024-07-30T11:06:39Z","title":"Scene-Specific Trajectory Sets: Maximizing Representation in Motion\n Forecasting","summary":" Representing diverse and plausible future trajectories of actors is crucial\nfor motion forecasting in autonomous driving. However, efficiently capturing\nthe true trajectory distribution with a compact set is challenging. In this\nwork, we propose a novel approach for generating scene-specific trajectory sets\nthat better represent the diversity and admissibility of future actor behavior.\nOur method constructs multiple trajectory sets tailored to different scene\ncontexts, such as intersections and non-intersections, by leveraging map\ninformation and actor dynamics. We introduce a deterministic goal sampling\nalgorithm that identifies relevant map regions and generates trajectories\nconditioned on the scene layout. Furthermore, we empirically investigate\nvarious sampling strategies and set sizes to optimize the trade-off between\ncoverage and diversity. Experiments on the Argoverse 2 dataset demonstrate that\nour scene-specific sets achieve higher plausibility while maintaining diversity\ncompared to traditional single-set approaches. The proposed Recursive\nIn-Distribution Subsampling (RIDS) method effectively condenses the\nrepresentation space and outperforms metric-driven sampling in terms of\ntrajectory admissibility. Our work highlights the benefits of scene-aware\ntrajectory set generation for capturing the complex and heterogeneous nature of\nactor behavior in real-world driving scenarios.\n","authors":["Abhishek Vivekanandan","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2407.20732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05688v2","updated":"2024-07-30T11:02:51Z","published":"2024-07-08T07:43:06Z","title":"Learning with Alignments: Tackling the Inter- and Intra-domain Shifts\n for Cross-multidomain Facial Expression Recognition","summary":" Facial Expression Recognition (FER) holds significant importance in\nhuman-computer interactions. 
Existing cross-domain FER methods often transfer\nknowledge solely from a single labeled source domain to an unlabeled target\ndomain, neglecting the comprehensive information across multiple sources.\nNevertheless, cross-multidomain FER (CMFER) is very challenging for (i) the\ninherent inter-domain shifts across multiple domains and (ii) the intra-domain\nshifts stemming from the ambiguous expressions and low inter-class\ndistinctions. In this paper, we propose a novel Learning with Alignments CMFER\nframework, named LA-CMFER, to handle both inter- and intra-domain shifts.\nSpecifically, LA-CMFER is constructed with a global branch and a local branch\nto extract features from the full images and local subtle expressions,\nrespectively. Based on this, LA-CMFER presents a dual-level inter-domain\nalignment method to force the model to prioritize hard-to-align samples in\nknowledge transfer at a sample level while gradually generating a\nwell-clustered feature space with the guidance of class attributes at a cluster\nlevel, thus narrowing the inter-domain shifts. To address the intra-domain\nshifts, LA-CMFER introduces a multi-view intra-domain alignment method with a\nmulti-view clustering consistency constraint where a prediction similarity\nmatrix is built to pursue consistency between the global and local views, thus\nrefining pseudo labels and eliminating latent noise. Extensive experiments on\nsix benchmark datasets have validated the superiority of our LA-CMFER.\n","authors":["Yuxiang Yang","Lu Wen","Xinyi Zeng","Yuanyuan Xu","Xi Wu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.05688v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.03634v2","updated":"2024-07-30T11:02:50Z","published":"2024-07-04T04:54:03Z","title":"SOWA: Adapting Hierarchical Frozen Window Self-Attention to\n Visual-Language Models for Better Anomaly Detection","summary":" Visual anomaly detection is critical in industrial manufacturing, but\ntraditional methods often rely on extensive normal datasets and custom models,\nlimiting scalability. Recent advancements in large-scale visual-language models\nhave significantly improved zero/few-shot anomaly detection. However, these\napproaches may not fully utilize hierarchical features, potentially missing\nnuanced details. We introduce a window self-attention mechanism based on the\nCLIP model, combined with learnable prompts to process multi-level features\nwithin a Soldier-Offier Window self-Attention (SOWA) framework. Our method has\nbeen tested on five benchmark datasets, demonstrating superior performance by\nleading in 18 out of 20 metrics compared to existing state-of-the-art\ntechniques.\n","authors":["Zongxiang Hu","Zhaosheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03634v2.pdf","comment":"8 pages, 9 figures, conference"},{"id":"http://arxiv.org/abs/2407.20730v1","updated":"2024-07-30T11:02:45Z","published":"2024-07-30T11:02:45Z","title":"Autogenic Language Embedding for Coherent Point Tracking","summary":" Point tracking is a challenging task in computer vision, aiming to establish\npoint-wise correspondence across long video sequences. Recent advancements have\nprimarily focused on temporal modeling techniques to improve local feature\nsimilarity, often overlooking the valuable semantic consistency inherent in\ntracked points. In this paper, we introduce a novel approach leveraging\nlanguage embeddings to enhance the coherence of frame-wise visual features\nrelated to the same object. 
Our proposed method, termed autogenic language\nembedding for visual feature enhancement, strengthens point correspondence in\nlong-term sequences. Unlike existing visual-language schemes, our approach\nlearns text embeddings from visual features through a dedicated mapping\nnetwork, enabling seamless adaptation to various tracking tasks without\nexplicit text annotations. Additionally, we introduce a consistency decoder\nthat efficiently integrates text tokens into visual features with minimal\ncomputational overhead. Through enhanced visual consistency, our approach\nsignificantly improves tracking trajectories in lengthy videos with substantial\nappearance variations. Extensive experiments on widely-used tracking benchmarks\ndemonstrate the superior performance of our method, showcasing notable\nenhancements compared to trackers relying solely on visual cues.\n","authors":["Zikai Song","Ying Tang","Run Luo","Lintao Ma","Junqing Yu","Yi-Ping Phoebe Chen","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2407.20730v1.pdf","comment":"accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.20728v1","updated":"2024-07-30T10:50:51Z","published":"2024-07-30T10:50:51Z","title":"Neural Fields for Continuous Periodic Motion Estimation in 4D\n Cardiovascular Imaging","summary":" Time-resolved three-dimensional flow MRI (4D flow MRI) provides a unique\nnon-invasive solution to visualize and quantify hemodynamics in blood vessels\nsuch as the aortic arch. However, most current analysis methods for arterial 4D\nflow MRI use static artery walls because of the difficulty in obtaining a full\ncycle segmentation. To overcome this limitation, we propose a neural\nfields-based method that directly estimates continuous periodic wall\ndeformations throughout the cardiac cycle. For a 3D + time imaging dataset, we\noptimize an implicit neural representation (INR) that represents a\ntime-dependent velocity vector field (VVF). An ODE solver is used to integrate\nthe VVF into a deformation vector field (DVF), that can deform images,\nsegmentation masks, or meshes over time, thereby visualizing and quantifying\nlocal wall motion patterns. To properly reflect the periodic nature of 3D +\ntime cardiovascular data, we impose periodicity in two ways. First, by\nperiodically encoding the time input to the INR, and hence VVF. Second, by\nregularizing the DVF. We demonstrate the effectiveness of this approach on\nsynthetic data with different periodic patterns, ECG-gated CT, and 4D flow MRI\ndata. The obtained method could be used to improve 4D flow MRI analysis.\n","authors":["Simone Garzia","Patryk Rygiel","Sven Dummer","Filippo Cademartiri","Simona Celi","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2407.20728v1.pdf","comment":"10 pages, 5 figures, STACOM 2024"},{"id":"http://arxiv.org/abs/2407.20727v1","updated":"2024-07-30T10:45:28Z","published":"2024-07-30T10:45:28Z","title":"SceneTeller: Language-to-3D Scene Generation","summary":" Designing high-quality indoor 3D scenes is important in many practical\napplications, such as room planning or game development. Conventionally, this\nhas been a time-consuming process which requires both artistic skill and\nfamiliarity with professional software, making it hardly accessible for layman\nusers. However, recent advances in generative AI have established solid\nfoundation for democratizing 3D design. In this paper, we propose a pioneering\napproach for text-based 3D room design. 
Given a prompt in natural language\ndescribing the object placement in the room, our method produces a high-quality\n3D scene corresponding to it. With an additional text prompt the users can\nchange the appearance of the entire scene or of individual objects in it. Built\nusing in-context learning, CAD model retrieval and 3D-Gaussian-Splatting-based\nstylization, our turnkey pipeline produces state-of-the-art 3D scenes, while\nbeing easy to use even for novices. Our project page is available at\nhttps://sceneteller.github.io/.\n","authors":["Başak Melis Öcal","Maxim Tatarchenko","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2407.20727v1.pdf","comment":"ECCV'24 camera-ready version"},{"id":"http://arxiv.org/abs/2401.03836v5","updated":"2024-07-30T10:36:51Z","published":"2024-01-08T11:50:23Z","title":"WidthFormer: Toward Efficient Transformer-based BEV View Transformation","summary":" We present WidthFormer, a novel transformer-based module to compute\nBird's-Eye-View (BEV) representations from multi-view cameras for real-time\nautonomous-driving applications. WidthFormer is computationally efficient,\nrobust and does not require any special engineering effort to deploy. We first\nintroduce a novel 3D positional encoding mechanism capable of accurately\nencapsulating 3D geometric information, which enables our model to compute\nhigh-quality BEV representations with only a single transformer decoder layer.\nThis mechanism is also beneficial for existing sparse 3D object detectors.\nInspired by the recently proposed works, we further improve our model's\nefficiency by vertically compressing the image features when serving as\nattention keys and values, and then we develop two modules to compensate for\npotential information loss due to feature compression. Experimental evaluation\non the widely-used nuScenes 3D object detection benchmark demonstrates that our\nmethod outperforms previous approaches across different 3D detection\narchitectures. More importantly, our model is highly efficient. For example,\nwhen using $256\\times 704$ input images, it achieves 1.5 ms and 2.8 ms latency\non NVIDIA 3090 GPU and Horizon Journey-5 computation solutions. Furthermore,\nWidthFormer also exhibits strong robustness to different degrees of camera\nperturbations. Our study offers valuable insights into the deployment of BEV\ntransformation methods in real-world, complex road environments. Code is\navailable at https://github.com/ChenhongyiYang/WidthFormer .\n","authors":["Chenhongyi Yang","Tianwei Lin","Lichao Huang","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2401.03836v5.pdf","comment":"IROS 2024 Oral Presentation"},{"id":"http://arxiv.org/abs/2403.08156v2","updated":"2024-07-30T10:32:30Z","published":"2024-03-13T00:43:10Z","title":"NeRF-Supervised Feature Point Detection and Description","summary":" Feature point detection and description is the backbone for various computer\nvision applications, such as Structure-from-Motion, visual SLAM, and visual\nplace recognition. While learning-based methods have surpassed traditional\nhandcrafted techniques, their training often relies on simplistic\nhomography-based simulations of multi-view perspectives, limiting model\ngeneralisability. This paper presents a novel approach leveraging Neural\nRadiance Fields (NeRFs) to generate a diverse and realistic dataset consisting\nof indoor and outdoor scenes. 
Our proposed methodology adapts state-of-the-art\nfeature detectors and descriptors for training on multi-view NeRF-synthesised\ndata, with supervision achieved through perspective projective geometry.\nExperiments demonstrate that the proposed methodology achieves competitive or\nsuperior performance on standard benchmarks for relative pose estimation, point\ncloud registration, and homography estimation while requiring significantly\nless training data and time compared to existing approaches.\n","authors":["Ali Youssef","Francisco Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2403.08156v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20705v1","updated":"2024-07-30T10:00:16Z","published":"2024-07-30T10:00:16Z","title":"PIP: Prototypes-Injected Prompt for Federated Class Incremental Learning","summary":" Federated Class Incremental Learning (FCIL) is a new direction in continual\nlearning (CL) for addressing catastrophic forgetting and non-IID data\ndistribution simultaneously. Existing FCIL methods call for high communication\ncosts and exemplars from previous classes. We propose a novel rehearsal-free\nmethod for FCIL named prototypes-injected prompt (PIP) that involves 3 main\nideas: a) prototype injection on prompt learning, b) prototype augmentation,\nand c) weighted Gaussian aggregation on the server side. Our experiment result\nshows that the proposed method outperforms the current state of the arts\n(SOTAs) with a significant improvement (up to 33%) in CIFAR100, MiniImageNet\nand TinyImageNet datasets. Our extensive analysis demonstrates the robustness\nof PIP in different task sizes, and the advantage of requiring smaller\nparticipating local clients, and smaller global rounds. For further study,\nsource codes of PIP, baseline, and experimental logs are shared publicly in\nhttps://github.com/anwarmaxsum/PIP.\n","authors":["Muhammad Anwar Ma'sum","Mahardhika Pratama","Savitha Ramasamy","Lin Liu","Habibullah Habibullah","Ryszard Kowalczyk"],"pdf_url":"https://arxiv.org/pdf/2407.20705v1.pdf","comment":"Conference on Information and Knowledge Management (CIKM) 2024\n (Accepted)"},{"id":"http://arxiv.org/abs/2403.14279v2","updated":"2024-07-30T09:56:26Z","published":"2024-03-21T10:38:18Z","title":"Zero123-6D: Zero-shot Novel View Synthesis for RGB Category-level 6D\n Pose Estimation","summary":" Estimating the pose of objects through vision is essential to make robotic\nplatforms interact with the environment. Yet, it presents many challenges,\noften related to the lack of flexibility and generalizability of\nstate-of-the-art solutions. Diffusion models are a cutting-edge neural\narchitecture transforming 2D and 3D computer vision, outlining remarkable\nperformances in zero-shot novel-view synthesis. Such a use case is particularly\nintriguing for reconstructing 3D objects. However, localizing objects in\nunstructured environments is rather unexplored. To this end, this work presents\nZero123-6D, the first work to demonstrate the utility of Diffusion Model-based\nnovel-view-synthesizers in enhancing RGB 6D pose estimation at category-level,\nby integrating them with feature extraction techniques. Novel View Synthesis\nallows to obtain a coarse pose that is refined through an online optimization\nmethod introduced in this work to deal with intra-category geometric\ndifferences. 
In such a way, the outlined method shows reduction in data\nrequirements, removal of the necessity of depth information in zero-shot\ncategory-level 6D pose estimation task, and increased performance,\nquantitatively demonstrated through experiments on the CO3D dataset.\n","authors":["Francesco Di Felice","Alberto Remus","Stefano Gasperini","Benjamin Busam","Lionel Ott","Federico Tombari","Roland Siegwart","Carlo Alberto Avizzano"],"pdf_url":"https://arxiv.org/pdf/2403.14279v2.pdf","comment":"6 pages, 2 reference pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.00912v2","updated":"2024-07-30T09:49:51Z","published":"2024-02-01T10:18:43Z","title":"Can we Constrain Concept Bottleneck Models to Learn Semantically\n Meaningful Input Features?","summary":" Concept Bottleneck Models (CBMs) are regarded as inherently interpretable\nbecause they first predict a set of human-defined concepts which are used to\npredict a task label. For inherent interpretability to be fully realised, and\nensure trust in a model's output, it's desirable for concept predictions to use\nsemantically meaningful input features. For instance, in an image, pixels\nrepresenting a broken bone should contribute to predicting a fracture. However,\ncurrent literature suggests that concept predictions often rely on irrelevant\ninput features. We hypothesise that this occurs when dataset labels include\ninaccurate concept annotations, or the relationship between input features and\nconcepts is unclear. In general, the effect of dataset labelling on concept\nrepresentations remains an understudied area. In this paper, we demonstrate\nthat CBMs can learn to map concepts to semantically meaningful input features,\nby utilising datasets with a clear link between the input features and the\ndesired concept predictions. This is achieved, for instance, by ensuring\nmultiple concepts do not always co-occur and, therefore provide a clear\ntraining signal for the CBM to distinguish the relevant input features for each\nconcept. We validate our hypothesis on both synthetic and real-world image\ndatasets, and demonstrate under the correct conditions, CBMs can learn to\nattribute semantically meaningful input features to the correct concept\npredictions.\n","authors":["Jack Furby","Daniel Cunnington","Dave Braines","Alun Preece"],"pdf_url":"https://arxiv.org/pdf/2402.00912v2.pdf","comment":"Main paper: 8 pages, 9 figures, Appendix: 14 pages, 21 figures. This\n paper is a preprint"},{"id":"http://arxiv.org/abs/2407.20695v1","updated":"2024-07-30T09:43:42Z","published":"2024-07-30T09:43:42Z","title":"Time Series Anomaly Detection with CNN for Environmental Sensors in\n Healthcare-IoT","summary":" This research develops a new method to detect anomalies in time series data\nusing Convolutional Neural Networks (CNNs) in healthcare-IoT. The proposed\nmethod creates a Distributed Denial of Service (DDoS) attack using an IoT\nnetwork simulator, Cooja, which emulates environmental sensors such as\ntemperature and humidity. 
CNNs detect anomalies in time series data, resulting\nin a 92\\% accuracy in identifying possible attacks.\n","authors":["Mirza Akhi Khatun","Mangolika Bhattacharya","Ciarán Eising","Lubna Luxmi Dhirani"],"pdf_url":"https://arxiv.org/pdf/2407.20695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20693v1","updated":"2024-07-30T09:41:37Z","published":"2024-07-30T09:41:37Z","title":"Boosting Audio Visual Question Answering via Key Semantic-Aware Cues","summary":" The Audio Visual Question Answering (AVQA) task aims to answer questions\nrelated to various visual objects, sounds, and their interactions in videos.\nSuch naturally multimodal videos contain rich and complex dynamic audio-visual\ncomponents, with only a portion of them closely related to the given questions.\nHence, effectively perceiving audio-visual cues relevant to the given questions\nis crucial for correctly answering them. In this paper, we propose a\nTemporal-Spatial Perception Model (TSPM), which aims to empower the model to\nperceive key visual and auditory cues related to the questions. Specifically,\nconsidering the challenge of aligning non-declarative questions and visual\nrepresentations into the same semantic space using visual-language pretrained\nmodels, we construct declarative sentence prompts derived from the question\ntemplate, to assist the temporal perception module in better identifying\ncritical segments relevant to the questions. Subsequently, a spatial perception\nmodule is designed to merge visual tokens from selected segments to highlight\nkey latent targets, followed by cross-modal interaction with audio to perceive\npotential sound-aware areas. Finally, the significant temporal-spatial cues\nfrom these modules are integrated to answer the question. Extensive experiments\non multiple AVQA benchmarks demonstrate that our framework excels not only in\nunderstanding audio-visual scenes but also in answering complex questions\neffectively. Code is available at https://github.com/GeWu-Lab/TSPM.\n","authors":["Guangyao Li","Henghui Du","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2407.20693v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2406.08379v3","updated":"2024-07-30T09:32:17Z","published":"2024-06-12T16:29:45Z","title":"Eyes Wide Unshut: Unsupervised Mistake Detection in Egocentric\n Procedural Video by Detecting Unpredictable Gaze","summary":" In this paper, we address the challenge of unsupervised mistake detection in\negocentric procedural video through the analysis of gaze signals. Traditional\nsupervised mistake detection methods rely on manually labeled mistakes, and\nhence suffer from domain-dependence and scalability issues. We introduce an\nunsupervised method for detecting mistakes in videos of human activities,\novercoming the challenges of domain-specific requirements and the need for\nannotated data. We postulate that, when a subject is making a mistake in the\nexecution of a procedure, their attention patterns will deviate from normality.\nWe hence propose to detect mistakes by comparing gaze trajectories predicted\nfrom input video with ground truth gaze signals collected through a gaze\ntracker. Since predicting gaze in video is characterized by high uncertainty,\nwe propose a novel \\textit{gaze completion task}, which aims to predict gaze\nfrom visual observations and partial gaze trajectories. 
We further contribute a\n\\textit{gaze completion approach} based on a Gaze-Frame Correlation module to\nexplicitly model the correlation between gaze information and each local visual\ntoken. Inconsistencies between the predicted and observed gaze trajectories act\nas an indicator for identifying mistakes. Experiments on the EPIC-Tent,\nHoloAssist and IndustReal datasets showcase the effectiveness of the proposed\napproach as compared to unsupervised and one-class techniques. Our method is\nranked first on the HoloAssist Mistake Detection challenge.\n","authors":["Michele Mazzamuto","Antonino Furnari","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2406.08379v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08383v2","updated":"2024-07-30T09:27:43Z","published":"2024-03-13T09:48:04Z","title":"AFGI: Towards Accurate and Fast-convergent Gradient Inversion Attack in\n Federated Learning","summary":" Federated learning (FL) empowers privacypreservation in model training by\nonly exposing users' model gradients. Yet, FL users are susceptible to gradient\ninversion attacks (GIAs) which can reconstruct ground-truth training data such\nas images based on model gradients. However, reconstructing high-resolution\nimages by existing GIAs faces two challenges: inferior accuracy and\nslow-convergence, especially when duplicating labels exist in the training\nbatch. To address these challenges, we present an Accurate and Fast-convergent\nGradient Inversion attack algorithm, called AFGI, with two components: Label\nRecovery Block (LRB) which can accurately restore duplicating labels of private\nimages based on exposed gradients; VME Regularization Term, which includes the\ntotal variance of reconstructed images, the discrepancy between three-channel\nmeans and edges, between values from exposed gradients and reconstructed\nimages, respectively. The AFGI can be regarded as a white-box attack strategy\nto reconstruct images by leveraging labels recovered by LRB. In particular,\nAFGI is efficient that accurately reconstruct ground-truth images when users'\ntraining batch size is up to 48. Our experimental results manifest that AFGI\ncan diminish 85% time costs while achieving superb inversion quality in the\nImageNet dataset. At last, our study unveils the shortcomings of FL in\nprivacy-preservation, prompting the development of more advanced countermeasure\nstrategies.\n","authors":["Can Liu","Jin Wang","and Yipeng Zhou","Yachao Yuan","Quanzheng Sheng","Kejie Lu"],"pdf_url":"https://arxiv.org/pdf/2403.08383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14352v3","updated":"2024-07-30T09:01:26Z","published":"2024-07-19T14:34:25Z","title":"Vision-Based Power Line Cables and Pylons Detection for Low Flying\n Aircraft","summary":" Power lines are dangerous for low-flying aircraft, especially in\nlow-visibility conditions. Thus, a vision-based system able to analyze the\naircraft's surroundings and to provide the pilots with a \"second pair of eyes\"\ncan contribute to enhancing their safety. To this end, we have developed a deep\nlearning approach to jointly detect power line cables and pylons from images\ncaptured at distances of several hundred meters by aircraft-mounted cameras. In\ndoing so, we have combined a modern convolutional architecture with transfer\nlearning and a loss function adapted to curvilinear structure delineation. We\nuse a single network for both detection tasks and demonstrated its performance\non two benchmarking datasets. 
We have integrated it within an onboard system\nand run it in flight, and have demonstrated with our experiments that it\noutperforms the prior distant cable detection method on both datasets, while\nalso successfully detecting pylons, given their annotations are available for\nthe data.\n","authors":["Jakub Gwizdała","Doruk Oner","Soumava Kumar Roy","Mian Akbar Shah","Ad Eberhard","Ivan Egorov","Philipp Krüsi","Grigory Yakushev","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2407.14352v3.pdf","comment":"Added several declarations at the end of the publication"},{"id":"http://arxiv.org/abs/2407.20664v1","updated":"2024-07-30T08:59:05Z","published":"2024-07-30T08:59:05Z","title":"3D-GRES: Generalized 3D Referring Expression Segmentation","summary":" 3D Referring Expression Segmentation (3D-RES) is dedicated to segmenting a\nspecific instance within a 3D space based on a natural language description.\nHowever, current approaches are limited to segmenting a single target,\nrestricting the versatility of the task. To overcome this limitation, we\nintroduce Generalized 3D Referring Expression Segmentation (3D-GRES), which\nextends the capability to segment any number of instances based on natural\nlanguage instructions. In addressing this broader task, we propose the\nMulti-Query Decoupled Interaction Network (MDIN), designed to break down\nmulti-object segmentation tasks into simpler, individual segmentations. MDIN\ncomprises two fundamental components: Text-driven Sparse Queries (TSQ) and\nMulti-object Decoupling Optimization (MDO). TSQ generates sparse point cloud\nfeatures distributed over key targets as the initialization for queries.\nMeanwhile, MDO is tasked with assigning each target in multi-object scenarios\nto different queries while maintaining their semantic consistency. To adapt to\nthis new task, we build a new dataset, namely Multi3DRes. Our comprehensive\nevaluations on this dataset demonstrate substantial enhancements over existing\nmodels, thus charting a new path for intricate multi-object 3D scene\ncomprehension. The benchmark and code are available at\nhttps://github.com/sosppxo/3D-GRES.\n","authors":["Changli Wu","Yihang Liu","Jiayi Ji","Yiwei Ma","Haowei Wang","Gen Luo","Henghui Ding","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.20664v1.pdf","comment":"Accepted by ACM MM 2024 (Oral), Code:\n https://github.com/sosppxo/3D-GRES"},{"id":"http://arxiv.org/abs/2407.20013v2","updated":"2024-07-30T08:56:25Z","published":"2024-07-29T13:45:23Z","title":"Classification of freshwater snails of the genus Radomaniola with\n multimodal triplet networks","summary":" In this paper, we present our first proposal of a machine learning system for\nthe classification of freshwater snails of the genus Radomaniola. We elaborate\non the specific challenges encountered during system design, and how we tackled\nthem; namely a small, very imbalanced dataset with a high number of classes and\nhigh visual similarity between classes. We then show how we employed triplet\nnetworks and the multiple input modalities of images, measurements, and genetic\ninformation to overcome these challenges and reach a performance comparable to\nthat of a trained domain expert.\n","authors":["Dennis Vetter","Muhammad Ahsan","Diana Delicado","Thomas A. 
Neubauer","Thomas Wilke","Gemma Roig"],"pdf_url":"https://arxiv.org/pdf/2407.20013v2.pdf","comment":"Spotlight at ICML 2024 AI for Science workshop"},{"id":"http://arxiv.org/abs/2407.20662v1","updated":"2024-07-30T08:55:27Z","published":"2024-07-30T08:55:27Z","title":"DocXPand-25k: a large and diverse benchmark dataset for identity\n documents analysis","summary":" Identity document (ID) image analysis has become essential for many online\nservices, like bank account opening or insurance subscription. In recent years,\nmuch research has been conducted on subjects like document localization, text\nrecognition and fraud detection, to achieve a level of accuracy reliable enough\nto automatize identity verification. However, there are only a few available\ndatasets to benchmark ID analysis methods, mainly because of privacy\nrestrictions, security requirements and legal reasons.\n In this paper, we present the DocXPand-25k dataset, which consists of 24,994\nrichly labeled IDs images, generated using custom-made vectorial templates\nrepresenting nine fictitious ID designs, including four identity cards, two\nresidence permits and three passports designs. These synthetic IDs feature\nartificially generated personal information (names, dates, identifiers, faces,\nbarcodes, ...), and present a rich diversity in the visual layouts and textual\ncontents.\n We collected about 5.8k diverse backgrounds coming from real-world photos,\nscans and screenshots of IDs to guarantee the variety of the backgrounds. The\nsoftware we wrote to generate these images has been published\n(https://github.com/QuickSign/docxpand/) under the terms of the MIT license,\nand our dataset has been published\n(https://github.com/QuickSign/docxpand/releases/tag/v1.0.0) under the terms of\nthe CC-BY-NC-SA 4.0 License.\n","authors":["Julien Lerouge","Guillaume Betmont","Thomas Bres","Evgeny Stepankevich","Alexis Bergès"],"pdf_url":"https://arxiv.org/pdf/2407.20662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20660v1","updated":"2024-07-30T08:52:51Z","published":"2024-07-30T08:52:51Z","title":"What makes for good morphology representations for spatial omics?","summary":" Spatial omics has transformed our understanding of tissue architecture by\npreserving spatial context of gene expression patterns. Simultaneously,\nadvances in imaging AI have enabled extraction of morphological features\ndescribing the tissue. The intersection of spatial omics and imaging AI\npresents opportunities for a more holistic understanding. In this review we\nintroduce a framework for categorizing spatial omics-morphology combination\nmethods, focusing on how morphological features can be translated or integrated\ninto spatial omics analyses. By translation we mean finding morphological\nfeatures that spatially correlate with gene expression patterns with the\npurpose of predicting gene expression. Such features can be used to generate\nsuper-resolution gene expression maps or infer genetic information from\nclinical H&E-stained samples. By integration we mean finding morphological\nfeatures that spatially complement gene expression patterns with the purpose of\nenriching information. Such features can be used to define spatial domains,\nespecially where gene expression has preceded morphological changes and where\nmorphology remains after gene expression. 
We discuss learning strategies and\ndirections for further development of the field.\n","authors":["Eduard Chelebian","Christophe Avenel","Carolina Wählby"],"pdf_url":"https://arxiv.org/pdf/2407.20660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20657v1","updated":"2024-07-30T08:52:16Z","published":"2024-07-30T08:52:16Z","title":"Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks","summary":" Recent vision-language foundation models, such as CLIP, have demonstrated\nsuperior capabilities in learning representations that can be transferable\nacross diverse range of downstream tasks and domains. With the emergence of\nsuch powerful models, it has become crucial to effectively leverage their\ncapabilities in tackling challenging vision tasks. On the other hand, only a\nfew works have focused on devising adversarial examples that transfer well to\nboth unknown domains and model architectures. In this paper, we propose a novel\ntransfer attack method called PDCL-Attack, which leverages the CLIP model to\nenhance the transferability of adversarial perturbations generated by a\ngenerative model-based attack framework. Specifically, we formulate an\neffective prompt-driven feature guidance by harnessing the semantic\nrepresentation power of text, particularly from the ground-truth class labels\nof input images. To the best of our knowledge, we are the first to introduce\nprompt learning to enhance the transferable generative attacks. Extensive\nexperiments conducted across various cross-domain and cross-model settings\nempirically validate our approach, demonstrating its superiority over\nstate-of-the-art methods.\n","authors":["Hunmin Yang","Jongoh Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.20657v1.pdf","comment":"Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io"},{"id":"http://arxiv.org/abs/2407.20653v1","updated":"2024-07-30T08:50:06Z","published":"2024-07-30T08:50:06Z","title":"FACL-Attack: Frequency-Aware Contrastive Learning for Transferable\n Adversarial Attacks","summary":" Deep neural networks are known to be vulnerable to security risks due to the\ninherent transferable nature of adversarial examples. Despite the success of\nrecent generative model-based attacks demonstrating strong transferability, it\nstill remains a challenge to design an efficient attack strategy in a\nreal-world strict black-box setting, where both the target domain and model\narchitectures are unknown. In this paper, we seek to explore a feature\ncontrastive approach in the frequency domain to generate adversarial examples\nthat are robust in both cross-domain and cross-model settings. With that goal\nin mind, we propose two modules that are only employed during the training\nphase: a Frequency-Aware Domain Randomization (FADR) module to randomize\ndomain-variant low- and high-range frequency components and a\nFrequency-Augmented Contrastive Learning (FACL) module to effectively separate\ndomain-invariant mid-frequency features of clean and perturbed image. 
We\ndemonstrate strong transferability of our generated adversarial perturbations\nthrough extensive cross-domain and cross-model experiments, while keeping the\ninference time complexity.\n","authors":["Hunmin Yang","Jongoh Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.20653v1.pdf","comment":"Accepted to AAAI 2024, Project Page: https://FACL-Attack.github.io"},{"id":"http://arxiv.org/abs/2307.13717v5","updated":"2024-07-30T08:47:54Z","published":"2023-07-25T17:29:32Z","title":"Exploit the Leak: Understanding Risks in Biometric Matchers","summary":" In a biometric authentication or identification system, the matcher compares\na stored and a fresh template to determine whether there is a match. This\nassessment is based on both a similarity score and a predefined threshold. For\nbetter compliance with privacy legislation, the matcher can be built upon a\nprivacy-preserving distance. Beyond the binary output (`yes' or `no'), most\nschemes may perform more precise computations, e.g., the value of the distance.\nSuch precise information is prone to leakage even when not returned by the\nsystem. This can occur due to a malware infection or the use of a weakly\nprivacy-preserving distance, exemplified by side channel attacks or partially\nobfuscated designs. This paper provides an analysis of information leakage\nduring distance evaluation. We provide a catalog of information leakage\nscenarios with their impacts on data privacy. Each scenario gives rise to\nunique attacks with impacts quantified in terms of computational costs, thereby\nproviding a better understanding of the security level.\n","authors":["Axel Durbet","Kevin Thiry-Atighehchi","Dorine Chagnon","Paul-Marie Grollemund"],"pdf_url":"https://arxiv.org/pdf/2307.13717v5.pdf","comment":"Minor corrections"},{"id":"http://arxiv.org/abs/2407.20647v1","updated":"2024-07-30T08:43:53Z","published":"2024-07-30T08:43:53Z","title":"Image Re-Identification: Where Self-supervision Meets Vision-Language\n Learning","summary":" Recently, large-scale vision-language pre-trained models like CLIP have shown\nimpressive performance in image re-identification (ReID). In this work, we\nexplore whether self-supervision can aid in the use of CLIP for image ReID\ntasks. Specifically, we propose SVLL-ReID, the first attempt to integrate\nself-supervision and pre-trained CLIP via two training stages to facilitate the\nimage ReID. We observe that: 1) incorporating language self-supervision in the\nfirst training stage can make the learnable text prompts more distinguishable,\nand 2) incorporating vision self-supervision in the second training stage can\nmake the image features learned by the image encoder more discriminative. These\nobservations imply that: 1) the text prompt learning in the first stage can\nbenefit from the language self-supervision, and 2) the image feature learning\nin the second stage can benefit from the vision self-supervision. These\nbenefits jointly facilitate the performance gain of the proposed SVLL-ReID. By\nconducting experiments on six image ReID benchmark datasets without any\nconcrete text labels, we find that the proposed SVLL-ReID achieves the overall\nbest performances compared with state-of-the-arts. 
Codes will be publicly\navailable at https://github.com/BinWangGzhu/SVLL-ReID.\n","authors":["Bin Wang","Yuying Liang","Lei Cai","Huakun Huang","Huanqiang Zeng"],"pdf_url":"https://arxiv.org/pdf/2407.20647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14928v3","updated":"2024-07-30T08:39:52Z","published":"2023-09-26T13:35:31Z","title":"Noise-Tolerant Few-Shot Unsupervised Adapter for Vision-Language Models","summary":" Recent advances in large-scale vision-language models have achieved\nimpressive performance in various zero-shot image classification tasks. While\nprior studies have demonstrated significant improvements by introducing\nfew-shot labelled target samples, they still require labelling of target\nsamples, which greatly degrades their scalability and generalizability while\nhandling various visual recognition tasks. We design NtUA, a Noise-tolerant\nUnsupervised Adapter that allows the learning of effective target models with\nfew unlabelled target samples. NtUA works as a key-value cache that formulates\nvisual features and predicted pseudo-labels of the few unlabelled target\nsamples as key-value pairs. It consists of two complementary designs. The first\nis adaptive cache formation that combats pseudo-label noises by weighting the\nkey-value pairs according to their prediction confidence. The second is\nknowledge-guided cache refinement, which refines pair values (i.e.,\npseudo-labels) and cache weights by leveraging knowledge distillation from\nlarge-scale vision language models. Extensive experiments show that NtUA\nachieves superior performance consistently across multiple widely adopted\nbenchmarks.\n","authors":["Eman Ali","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2309.14928v3.pdf","comment":"Accepted at BMVC 2024"},{"id":"http://arxiv.org/abs/2407.20643v1","updated":"2024-07-30T08:39:30Z","published":"2024-07-30T08:39:30Z","title":"Generalizing AI-driven Assessment of Immunohistochemistry across\n Immunostains and Cancer Types: A Universal Immunohistochemistry Analyzer","summary":" Despite advancements in methodologies, immunohistochemistry (IHC) remains the\nmost utilized ancillary test for histopathologic and companion diagnostics in\ntargeted therapies. However, objective IHC assessment poses challenges.\nArtificial intelligence (AI) has emerged as a potential solution, yet its\ndevelopment requires extensive training for each cancer and IHC type, limiting\nversatility. We developed a Universal IHC (UIHC) analyzer, an AI model for\ninterpreting IHC images regardless of tumor or IHC types, using training\ndatasets from various cancers stained for PD-L1 and/or HER2. This multi-cohort\ntrained model outperforms conventional single-cohort models in interpreting\nunseen IHCs (Kappa score 0.578 vs. up to 0.509) and consistently shows superior\nperformance across different positive staining cutoff values. Qualitative\nanalysis reveals that UIHC effectively clusters patches based on expression\nlevels. The UIHC model also quantitatively assesses c-MET expression with MET\nmutations, representing a significant advancement in AI application in the era\nof personalized medicine and accumulating novel biomarkers.\n","authors":["Biagio Brattoli","Mohammad Mostafavi","Taebum Lee","Wonkyung Jung","Jeongun Ryu","Seonwook Park","Jongchan Park","Sergio Pereira","Seunghwan Shin","Sangjoon Choi","Hyojin Kim","Donggeun Yoo","Siraj M. 
Ali","Kyunghyun Paeng","Chan-Young Ock","Soo Ick Cho","Seokhwi Kim"],"pdf_url":"https://arxiv.org/pdf/2407.20643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20642v1","updated":"2024-07-30T08:39:20Z","published":"2024-07-30T08:39:20Z","title":"Effectively Leveraging CLIP for Generating Situational Summaries of\n Images and Videos","summary":" Situation recognition refers to the ability of an agent to identify and\nunderstand various situations or contexts based on available information and\nsensory inputs. It involves the cognitive process of interpreting data from the\nenvironment to determine what is happening, what factors are involved, and what\nactions caused those situations. This interpretation of situations is\nformulated as a semantic role labeling problem in computer vision-based\nsituation recognition. Situations depicted in images and videos hold pivotal\ninformation, essential for various applications like image and video\ncaptioning, multimedia retrieval, autonomous systems and event monitoring.\nHowever, existing methods often struggle with ambiguity and lack of context in\ngenerating meaningful and accurate predictions. Leveraging multimodal models\nsuch as CLIP, we propose ClipSitu, which sidesteps the need for full\nfine-tuning and achieves state-of-the-art results in situation recognition and\nlocalization tasks. ClipSitu harnesses CLIP-based image, verb, and role\nembeddings to predict nouns fulfilling all the roles associated with a verb,\nproviding a comprehensive understanding of depicted scenarios. Through a\ncross-attention Transformer, ClipSitu XTF enhances the connection between\nsemantic role queries and visual token representations, leading to superior\nperformance in situation recognition. We also propose a verb-wise role\nprediction model with near-perfect accuracy to create an end-to-end framework\nfor producing situational summaries for out-of-domain images. We show that\nsituational summaries empower our ClipSitu models to produce structured\ndescriptions with reduced ambiguity compared to generic captions. Finally, we\nextend ClipSitu to video situation recognition to showcase its versatility and\nproduce comparable performance to state-of-the-art methods.\n","authors":["Dhruv Verma","Debaditya Roy","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2407.20642v1.pdf","comment":"38 pages, 12 figures. arXiv admin note: text overlap with\n arXiv:2307.00586"},{"id":"http://arxiv.org/abs/2407.20633v1","updated":"2024-07-30T08:23:47Z","published":"2024-07-30T08:23:47Z","title":"Spiking-DD: Neuromorphic Event Camera based Driver Distraction Detection\n with Spiking Neural Network","summary":" Event camera-based driver monitoring is emerging as a pivotal area of\nresearch, driven by its significant advantages such as rapid response, low\nlatency, power efficiency, enhanced privacy, and prevention of undersampling.\nEffective detection of driver distraction is crucial in driver monitoring\nsystems to enhance road safety and reduce accident rates. The integration of an\noptimized sensor such as Event Camera with an optimized network is essential\nfor maximizing these benefits. This paper introduces the innovative concept of\nsensing without seeing to detect driver distraction, leveraging computationally\nefficient spiking neural networks (SNN). To the best of our knowledge, this\nstudy is the first to utilize event camera data with spiking neural networks\nfor driver distraction. 
The proposed Spiking-DD network not only achieves state\nof the art performance but also exhibits fewer parameters and provides greater\naccuracy than current event-based methodologies.\n","authors":["Waseem Shariff","Paul Kielty","Joseph Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2407.20633v1.pdf","comment":"Irish Machine Vision and Image Processing Conference (IMVIP) 2024"},{"id":"http://arxiv.org/abs/2309.06680v2","updated":"2024-07-30T08:17:44Z","published":"2023-09-13T02:35:59Z","title":"STUPD: A Synthetic Dataset for Spatial and Temporal Relation Reasoning","summary":" Understanding relations between objects is crucial for understanding the\nsemantics of a visual scene. It is also an essential step in order to bridge\nvisual and language models. However, current state-of-the-art computer vision\nmodels still lack the ability to perform spatial reasoning well. Existing\ndatasets mostly cover a relatively small number of spatial relations, all of\nwhich are static relations that do not intrinsically involve motion. In this\npaper, we propose the Spatial and Temporal Understanding of Prepositions\nDataset (STUPD) -- a large-scale video dataset for understanding static and\ndynamic spatial relationships derived from prepositions of the English\nlanguage. The dataset contains 150K visual depictions (videos and images),\nconsisting of 30 distinct spatial prepositional senses, in the form of object\ninteraction simulations generated synthetically using Unity3D. In addition to\nspatial relations, we also propose 50K visual depictions across 10 temporal\nrelations, consisting of videos depicting event/time-point interactions. To our\nknowledge, no dataset exists that represents temporal relations through visual\nsettings. In this dataset, we also provide 3D information about object\ninteractions such as frame-wise coordinates, and descriptions of the objects\nused. The goal of this synthetic dataset is to help models perform better in\nvisual relationship detection in real-world settings. We demonstrate an\nincrease in the performance of various models over 2 real-world datasets\n(ImageNet-VidVRD and Spatial Senses) when pretrained on the STUPD dataset, in\ncomparison to other pretraining datasets.\n","authors":["Palaash Agrawal","Haidi Azaman","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2309.06680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20623v1","updated":"2024-07-30T07:59:28Z","published":"2024-07-30T07:59:28Z","title":"SharkTrack: an accurate, generalisable software for streamlining shark\n and ray underwater video analysis","summary":" Elasmobranchs (sharks and rays) can be important components of marine\necosystems but are experiencing global population declines. Effective\nmonitoring of these populations is essential to their protection. Baited Remote\nUnderwater Video Stations (BRUVS) have been a key tool for monitoring, but\nrequire time-consuming manual analysis. To address these challenges, we\ndeveloped SharkTrack, an AI-enhanced BRUVS analysis software. SharkTrack uses\nConvolutional Neural Networks and Multi-Object Tracking to detect and track\nelasmobranchs and provides an annotation pipeline to manually classify\nelasmobranch species and compute MaxN, the standard metric of relative\nabundance. We tested SharkTrack on BRUVS footage from locations unseen by the\nmodel during training. SharkTrack computed MaxN with 89% accuracy over 207\nhours of footage. 
The semi-automatic SharkTrack pipeline required two minutes\nof manual classification per hour of video, a 97% reduction of manual BRUVS\nanalysis time compared to traditional methods, estimated conservatively at one\nhour per hour of video. Furthermore, we demonstrate SharkTrack application\nacross diverse marine ecosystems and elasmobranch species, an advancement\ncompared to previous models, which were limited to specific species or\nlocations. SharkTrack applications extend beyond BRUVS analysis, facilitating\nrapid annotation of unlabeled videos, aiding the development of further models\nto classify elasmobranch species. We provide public access to the software and\nan unprecedentedly diverse dataset, facilitating future research in an\nimportant area of marine conservation.\n","authors":["Filippo Varini","Francesco Ferretti","Jeremy Jenrette","Joel H. Gayford","Mark E. Bond","Matthew J. Witt","Michael R. Heithaus","Sophie Wilday","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2407.20623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12743v2","updated":"2024-07-30T07:54:01Z","published":"2024-03-19T14:02:13Z","title":"Controllable Face Synthesis with Semantic Latent Diffusion Models","summary":" Semantic Image Synthesis (SIS) is among the most popular and effective\ntechniques in the field of face generation and editing, thanks to its good\ngeneration quality and the versatility it brings along. Recent works attempted\nto go beyond the standard GAN-based framework, and started to explore Diffusion\nModels (DMs) for this task as these stand out with respect to GANs in terms of\nboth quality and diversity. On the other hand, DMs lack fine-grained\ncontrollability and reproducibility. To address that, in this paper we propose\na SIS framework based on a novel Latent Diffusion Model architecture for human\nface generation and editing that is both able to reproduce and manipulate a\nreal reference image and generate diversity-driven results. The proposed system\nutilizes both SPADE normalization and cross-attention layers to merge shape and\nstyle information and, by doing so, allows for a precise control over each of\nthe semantic parts of the human face. This was not possible with previous\nmethods in the state of the art. Finally, we performed an extensive set of\nexperiments to prove that our model surpasses current state of the art, both\nqualitatively and quantitatively.\n","authors":["Alex Ergasti","Claudio Ferrari","Tomaso Fontanini","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2403.12743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13505v4","updated":"2024-07-30T07:52:34Z","published":"2024-02-21T03:39:04Z","title":"SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed\n Semi-Supervised Learning","summary":" Recent advancements in semi-supervised learning have focused on a more\nrealistic yet challenging task: addressing imbalances in labeled data while the\nclass distribution of unlabeled data remains both unknown and potentially\nmismatched. Current approaches in this sphere often presuppose rigid\nassumptions regarding the class distribution of unlabeled data, thereby\nlimiting the adaptability of models to only certain distribution ranges. In\nthis study, we propose a novel approach, introducing a highly adaptable\nframework, designated as SimPro, which does not rely on any predefined\nassumptions about the distribution of unlabeled data. 
Our framework, grounded\nin a probabilistic model, innovatively refines the expectation-maximization\n(EM) algorithm by explicitly decoupling the modeling of conditional and\nmarginal class distributions. This separation facilitates a closed-form\nsolution for class distribution estimation during the maximization phase,\nleading to the formulation of a Bayes classifier. The Bayes classifier, in\nturn, enhances the quality of pseudo-labels in the expectation phase.\nRemarkably, the SimPro framework not only comes with theoretical guarantees but\nalso is straightforward to implement. Moreover, we introduce two novel class\ndistributions broadening the scope of the evaluation. Our method showcases\nconsistent state-of-the-art performance across diverse benchmarks and data\ndistribution scenarios. Our code is available at\nhttps://github.com/LeapLabTHU/SimPro.\n","authors":["Chaoqun Du","Yizeng Han","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.13505v4.pdf","comment":"ICML2024 camera-ready version"},{"id":"http://arxiv.org/abs/2406.14050v2","updated":"2024-07-30T07:41:44Z","published":"2024-06-20T07:16:41Z","title":"Gaze-directed Vision GNN for Mitigating Shortcut Learning in Medical\n Image","summary":" Deep neural networks have demonstrated remarkable performance in medical\nimage analysis. However, its susceptibility to spurious correlations due to\nshortcut learning raises concerns about network interpretability and\nreliability. Furthermore, shortcut learning is exacerbated in medical contexts\nwhere disease indicators are often subtle and sparse. In this paper, we propose\na novel gaze-directed Vision GNN (called GD-ViG) to leverage the visual\npatterns of radiologists from gaze as expert knowledge, directing the network\ntoward disease-relevant regions, and thereby mitigating shortcut learning.\nGD-ViG consists of a gaze map generator (GMG) and a gaze-directed classifier\n(GDC). Combining the global modelling ability of GNNs with the locality of\nCNNs, GMG generates the gaze map based on radiologists' visual patterns.\nNotably, it eliminates the need for real gaze data during inference, enhancing\nthe network's practical applicability. Utilizing gaze as the expert knowledge,\nthe GDC directs the construction of graph structures by incorporating both\nfeature distances and gaze distances, enabling the network to focus on\ndisease-relevant foregrounds. Thereby avoiding shortcut learning and improving\nthe network's interpretability. The experiments on two public medical image\ndatasets demonstrate that GD-ViG outperforms the state-of-the-art methods, and\neffectively mitigates shortcut learning. Our code is available at\nhttps://github.com/SX-SS/GD-ViG.\n","authors":["Shaoxuan Wu","Xiao Zhang","Bin Wang","Zhuo Jin","Hansheng Li","Jun Feng"],"pdf_url":"https://arxiv.org/pdf/2406.14050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12854v3","updated":"2024-07-30T07:30:44Z","published":"2023-05-22T09:27:17Z","title":"RDA-INR: Riemannian Diffeomorphic Autoencoding via Implicit Neural\n Representations","summary":" Diffeomorphic registration frameworks such as Large Deformation Diffeomorphic\nMetric Mapping (LDDMM) are used in computer graphics and the medical domain for\natlas building, statistical latent modeling, and pairwise and groupwise\nregistration. In recent years, researchers have developed neural network-based\napproaches regarding diffeomorphic registration to improve the accuracy and\ncomputational efficiency of traditional methods. 
In this work, we focus on a\nlimitation of neural network-based atlas building and statistical latent\nmodeling methods, namely that they either are (i) resolution dependent or (ii)\ndisregard any data- or problem-specific geometry needed for proper\nmean-variance analysis. In particular, we overcome this limitation by designing\na novel encoder based on resolution-independent implicit neural\nrepresentations. The encoder achieves resolution invariance for LDDMM-based\nstatistical latent modeling. Additionally, the encoder adds LDDMM Riemannian\ngeometry to resolution-independent deep learning models for statistical latent\nmodeling. We investigate how the Riemannian geometry improves latent modeling\nand is required for a proper mean-variance analysis. To highlight the benefit\nof resolution independence for LDDMM-based data variability modeling, we show\nthat our approach outperforms current neural network-based LDDMM latent code\nmodels. Our work paves the way for more research into how Riemannian geometry,\nshape respectively image analysis, and deep learning can be combined.\n","authors":["Sven Dummer","Nicola Strisciuglio","Christoph Brune"],"pdf_url":"https://arxiv.org/pdf/2305.12854v3.pdf","comment":"41 pages, 27 figures (including subfigures), revised version, to be\n published in SIAM Journal on Imaging Sciences"},{"id":"http://arxiv.org/abs/2407.20600v1","updated":"2024-07-30T07:24:33Z","published":"2024-07-30T07:24:33Z","title":"Knowledge Fused Recognition: Fusing Hierarchical Knowledge for Image\n Recognition through Quantitative Relativity Modeling and Deep Metric Learning","summary":" Image recognition is an essential baseline for deep metric learning.\nHierarchical knowledge about image classes depicts inter-class similarities or\ndissimilarities. Effective fusion of hierarchical knowledge about image classes\nto enhance image recognition remains a challenging topic to advance. In this\npaper, we propose a novel deep metric learning based method to effectively fuse\nhierarchical prior knowledge about image classes and enhance image recognition\nperformances in an end-to-end supervised regression manner. Existing deep\nmetric learning incorporated image classification mainly exploits qualitative\nrelativity between image classes, i.e., whether sampled images are from the\nsame class. A new triplet loss function term that exploits quantitative\nrelativity and aligns distances in model latent space with those in knowledge\nspace is also proposed and incorporated in the proposed dual-modality fusion\nmethod. Experimental results indicate that the proposed method enhanced image\nrecognition performances and outperformed baseline and existing methods on\nCIFAR-10, CIFAR-100, Mini-ImageNet, and ImageNet-1K datasets.\n","authors":["Yunfeng Zhao","Huiyu Zhou","Fei Wu","Xifeng Wu"],"pdf_url":"https://arxiv.org/pdf/2407.20600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05873v2","updated":"2024-07-30T07:21:51Z","published":"2023-05-10T03:40:25Z","title":"Learning Signed Hyper Surfaces for Oriented Point Cloud Normal\n Estimation","summary":" We propose a novel method called SHS-Net for oriented normal estimation of\npoint clouds by learning signed hyper surfaces, which can accurately predict\nnormals with global consistent orientation from various point clouds. Almost\nall existing methods estimate oriented normals through a two-stage pipeline,\ni.e., unoriented normal estimation and normal orientation, and each step is\nimplemented by a separate algorithm. 
However, previous methods are sensitive to\nparameter settings, resulting in poor results from point clouds with noise,\ndensity variations and complex geometries. In this work, we introduce signed\nhyper surfaces (SHS), which are parameterized by multi-layer perceptron (MLP)\nlayers, to learn to estimate oriented normals from point clouds in an\nend-to-end manner. The signed hyper surfaces are implicitly learned in a\nhigh-dimensional feature space where the local and global information is\naggregated. Specifically, we introduce a patch encoding module and a shape\nencoding module to encode a 3D point cloud into a local latent code and a\nglobal latent code, respectively. Then, an attention-weighted normal prediction\nmodule is proposed as a decoder, which takes the local and global latent codes\nas input to predict oriented normals. Experimental results show that our\nSHS-Net outperforms the state-of-the-art methods in both unoriented and\noriented normal estimation on the widely used benchmarks.\n","authors":["Qing Li","Huifang Feng","Kanle Shi","Yue Gao","Yi Fang","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2305.05873v2.pdf","comment":"Accepted by TPAMI 2024 (extension) and CVPR 2023. Project page:\n https://leoqli.github.io/SHS-Net/. Code: https://github.com/LeoQLi/SHS-Net"},{"id":"http://arxiv.org/abs/2407.20596v1","updated":"2024-07-30T07:15:39Z","published":"2024-07-30T07:15:39Z","title":"Benchmarking Histopathology Foundation Models for Ovarian Cancer\n Bevacizumab Treatment Response Prediction from Whole Slide Images","summary":" Bevacizumab is a widely studied targeted therapeutic drug used in conjunction\nwith standard chemotherapy for the treatment of recurrent ovarian cancer. While\nits administration has shown to increase the progression-free survival (PFS) in\npatients with advanced stage ovarian cancer, the lack of identifiable\nbiomarkers for predicting patient response has been a major roadblock in its\neffective adoption towards personalized medicine. In this work, we leverage the\nlatest histopathology foundation models trained on large-scale whole slide\nimage (WSI) datasets to extract ovarian tumor tissue features for predicting\nbevacizumab response from WSIs. Our extensive experiments across a combination\nof different histopathology foundation models and multiple instance learning\n(MIL) strategies demonstrate capability of these large models in predicting\nbevacizumab response in ovarian cancer patients with the best models achieving\nan AUC score of 0.86 and an accuracy score of 72.5%. Furthermore, our survival\nmodels are able to stratify high- and low-risk cases with statistical\nsignificance (p < 0.05) even among the patients with the aggressive subtype of\nhigh-grade serous ovarian carcinoma. This work highlights the utility of\nhistopathology foundation models for the task of ovarian bevacizumab response\nprediction from WSIs. 
The high-attention regions of the WSIs highlighted by\nthese models not only aid the model explainability but also serve as promising\nimaging biomarkers for treatment prognosis.\n","authors":["Mayur Mallya","Ali Khajegili Mirabadi","Hossein Farahani","Ali Bashashati"],"pdf_url":"https://arxiv.org/pdf/2407.20596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20592v1","updated":"2024-07-30T06:57:00Z","published":"2024-07-30T06:57:00Z","title":"EgoSonics: Generating Synchronized Audio for Silent Egocentric Videos","summary":" We introduce EgoSonics, a method to generate semantically meaningful and\nsynchronized audio tracks conditioned on silent egocentric videos. Generating\naudio for silent egocentric videos could open new applications in virtual\nreality, assistive technologies, or for augmenting existing datasets. Existing\nwork has been limited to domains like speech, music, or impact sounds and\ncannot easily capture the broad range of audio frequencies found in egocentric\nvideos. EgoSonics addresses these limitations by building on the strength of\nlatent diffusion models for conditioned audio synthesis. We first encode and\nprocess audio and video data into a form that is suitable for generation. The\nencoded data is used to train our model to generate audio tracks that capture\nthe semantics of the input video. Our proposed SyncroNet builds on top of\nControlNet to provide control signals that enables temporal synchronization to\nthe synthesized audio. Extensive evaluations show that our model outperforms\nexisting work in audio quality, and in our newly proposed synchronization\nevaluation method. Furthermore, we demonstrate downstream applications of our\nmodel in improving video summarization.\n","authors":["Aashish Rai","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2407.20592v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2407.20582v1","updated":"2024-07-30T06:29:40Z","published":"2024-07-30T06:29:40Z","title":"Image-based Detection of Segment Misalignment in Multi-mirror Satellites\n using Transfer Learning","summary":" In this paper, we introduce a system based on transfer learning for detecting\nsegment misalignment in multimirror satellites, such as future CubeSat designs\nand the James Webb Space Telescope (JWST), using image-based methods. When a\nmirror segment becomes misaligned due to various environmental factors, such as\nspace debris, the images can become distorted with a shifted copy of itself\ncalled a \"ghost image\". To detect whether segments are misaligned, we use\npre-trained, large-scale image models trained on the Fast Fourier Transform\n(FFT) of patches of satellite images in grayscale. Multi-mirror designs can use\nany arbitrary number of mirrors. For our purposes, the tests were performed on\nsimulated CubeSats with 4, 6, and 8 segments. For system design, we took this\ninto account when we want to know when a satellite has a misaligned segment and\nhow many segments are misaligned. The intensity of the ghost image is directly\nproportional to the number of segments misaligned. Models trained for intensity\nclassification attempted to classify N-1 segments. Across eight classes, binary\nmodels were able to achieve a classification accuracy of 98.75%, and models for\nintensity classification were able to achieve an accuracy of 98.05%.\n","authors":["C. 
Tanner Fredieu","Jonathan Tesch","Andrew Kee","David Redding"],"pdf_url":"https://arxiv.org/pdf/2407.20582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00085v3","updated":"2024-07-30T06:17:08Z","published":"2023-11-30T07:23:00Z","title":"X-Dreamer: Creating High-quality 3D Content by Bridging the Domain Gap\n Between Text-to-2D and Text-to-3D Generation","summary":" In recent times, automatic text-to-3D content creation has made significant\nprogress, driven by the development of pretrained 2D diffusion models. Existing\ntext-to-3D methods typically optimize the 3D representation to ensure that the\nrendered image aligns well with the given text, as evaluated by the pretrained\n2D diffusion model. Nevertheless, a substantial domain gap exists between 2D\nimages and 3D assets, primarily attributed to variations in camera-related\nattributes and the exclusive presence of foreground objects. Consequently,\nemploying 2D diffusion models directly for optimizing 3D representations may\nlead to suboptimal outcomes. To address this issue, we present X-Dreamer, a\nnovel approach for high-quality text-to-3D content creation that effectively\nbridges the gap between text-to-2D and text-to-3D synthesis. The key components\nof X-Dreamer are two innovative designs: Camera-Guided Low-Rank Adaptation\n(CG-LoRA) and Attention-Mask Alignment (AMA) Loss. CG-LoRA dynamically\nincorporates camera information into the pretrained diffusion models by\nemploying camera-dependent generation for trainable parameters. This\nintegration enhances the alignment between the generated 3D assets and the\ncamera's perspective. AMA loss guides the attention map of the pretrained\ndiffusion model using the binary mask of the 3D object, prioritizing the\ncreation of the foreground object. This module ensures that the model focuses\non generating accurate and detailed foreground objects. Extensive evaluations\ndemonstrate the effectiveness of our proposed method compared to existing\ntext-to-3D approaches. Our project webpage:\nhttps://xmu-xiaoma666.github.io/Projects/X-Dreamer/ .\n","authors":["Yiwei Ma","Yijun Fan","Jiayi Ji","Haowei Wang","Xiaoshuai Sun","Guannan Jiang","Annan Shu","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.00085v3.pdf","comment":"ToMM24"},{"id":"http://arxiv.org/abs/2402.16174v3","updated":"2024-07-30T06:05:31Z","published":"2024-02-25T18:59:29Z","title":"GenNBV: Generalizable Next-Best-View Policy for Active 3D Reconstruction","summary":" While recent advances in neural radiance field enable realistic digitization\nfor large-scale scenes, the image-capturing process is still time-consuming and\nlabor-intensive. Previous works attempt to automate this process using the\nNext-Best-View (NBV) policy for active 3D reconstruction. However, the existing\nNBV policies heavily rely on hand-crafted criteria, limited action space, or\nper-scene optimized representations. These constraints limit their\ncross-dataset generalizability. To overcome them, we propose GenNBV, an\nend-to-end generalizable NBV policy. Our policy adopts a reinforcement learning\n(RL)-based framework and extends typical limited action space to 5D free space.\nIt empowers our agent drone to scan from any viewpoint, and even interact with\nunseen geometries during training. To boost the cross-dataset generalizability,\nwe also propose a novel multi-source state embedding, including geometric,\nsemantic, and action representations. 
We establish a benchmark using the Isaac\nGym simulator with the Houses3K and OmniObject3D datasets to evaluate this NBV\npolicy. Experiments demonstrate that our policy achieves a 98.26% and 97.12%\ncoverage ratio on unseen building-scale objects from these datasets,\nrespectively, outperforming prior solutions.\n","authors":["Xiao Chen","Quanyi Li","Tai Wang","Tianfan Xue","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2402.16174v3.pdf","comment":"CVPR 2024 accepted paper. Project page: http://gennbv.github.io/"},{"id":"http://arxiv.org/abs/2312.07504v2","updated":"2024-07-30T05:47:30Z","published":"2023-12-12T18:39:52Z","title":"COLMAP-Free 3D Gaussian Splatting","summary":" While neural rendering has led to impressive advances in scene reconstruction\nand novel view synthesis, it relies heavily on accurately pre-computed camera\nposes. To relax this constraint, multiple efforts have been made to train\nNeural Radiance Fields (NeRFs) without pre-processed camera poses. However, the\nimplicit representations of NeRFs provide extra challenges to optimize the 3D\nstructure and camera poses at the same time. On the other hand, the recently\nproposed 3D Gaussian Splatting provides new opportunities given its explicit\npoint cloud representations. This paper leverages both the explicit geometric\nrepresentation and the continuity of the input video stream to perform novel\nview synthesis without any SfM preprocessing. We process the input frames in a\nsequential manner and progressively grow the 3D Gaussians set by taking one\ninput frame at a time, without the need to pre-compute the camera poses. Our\nmethod significantly improves over previous approaches in view synthesis and\ncamera pose estimation under large motion changes. Our project page is\nhttps://oasisyang.github.io/colmap-free-3dgs\n","authors":["Yang Fu","Sifei Liu","Amey Kulkarni","Jan Kautz","Alexei A. Efros","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.07504v2.pdf","comment":"Project Page: https://oasisyang.github.io/colmap-free-3dgs"},{"id":"http://arxiv.org/abs/2311.11273v2","updated":"2024-07-30T05:45:50Z","published":"2023-11-19T09:05:52Z","title":"Chain of Visual Perception: Harnessing Multimodal Large Language Models\n for Zero-shot Camouflaged Object Detection","summary":" In this paper, we introduce a novel multimodal camo-perceptive framework\n(MMCPF) aimed at handling zero-shot Camouflaged Object Detection (COD) by\nleveraging the powerful capabilities of Multimodal Large Language Models\n(MLLMs). Recognizing the inherent limitations of current COD methodologies,\nwhich predominantly rely on supervised learning models demanding extensive and\naccurately annotated datasets, resulting in weak generalization, our research\nproposes a zero-shot MMCPF that circumvents these challenges. Although MLLMs\nhold significant potential for broad applications, their effectiveness in COD\nis hindered and they would make misinterpretations of camouflaged objects. To\naddress this challenge, we further propose a strategic enhancement called the\nChain of Visual Perception (CoVP), which significantly improves the perceptual\ncapabilities of MLLMs in camouflaged scenes by leveraging both linguistic and\nvisual cues more effectively. 
We validate the effectiveness of MMCPF on five\nwidely used COD datasets, containing CAMO, COD10K, NC4K, MoCA-Mask and OVCamo.\nExperiments show that MMCPF can outperform all existing state-of-the-art\nzero-shot COD methods, and achieve competitive performance compared to\nweakly-supervised and fully-supervised methods, which demonstrates the\npotential of MMCPF. The Github link of this paper is\n\\url{https://github.com/luckybird1994/MMCPF}.\n","authors":["Lv Tang","Peng-Tao Jiang","Zhihao Shen","Hao Zhang","Jinwei Chen","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2311.11273v2.pdf","comment":"Accepted by ACMMM2024"},{"id":"http://arxiv.org/abs/2407.20566v1","updated":"2024-07-30T05:45:06Z","published":"2024-07-30T05:45:06Z","title":"Monocular Human-Object Reconstruction in the Wild","summary":" Learning the prior knowledge of the 3D human-object spatial relation is\ncrucial for reconstructing human-object interaction from images and\nunderstanding how humans interact with objects in 3D space. Previous works\nlearn this prior from datasets collected in controlled environments, but due to\nthe diversity of domains, they struggle to generalize to real-world scenarios.\nTo overcome this limitation, we present a 2D-supervised method that learns the\n3D human-object spatial relation prior purely from 2D images in the wild. Our\nmethod utilizes a flow-based neural network to learn the prior distribution of\nthe 2D human-object keypoint layout and viewports for each image in the\ndataset. The effectiveness of the prior learned from 2D images is demonstrated\non the human-object reconstruction task by applying the prior to tune the\nrelative pose between the human and the object during the post-optimization\nstage. To validate and benchmark our method on in-the-wild images, we collect\nthe WildHOI dataset from the YouTube website, which consists of various\ninteractions with 8 objects in real-world scenarios. We conduct the experiments\non the indoor BEHAVE dataset and the outdoor WildHOI dataset. The results show\nthat our method achieves almost comparable performance with fully 3D supervised\nmethods on the BEHAVE dataset, even if we have only utilized the 2D layout\ninformation, and outperforms previous methods in terms of generality and\ninteraction diversity on in-the-wild images.\n","authors":["Chaofan Huo","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20566v1.pdf","comment":"Accepted by MM '24"},{"id":"http://arxiv.org/abs/2407.20563v1","updated":"2024-07-30T05:36:43Z","published":"2024-07-30T05:36:43Z","title":"Pyramid Coder: Hierarchical Code Generator for Compositional Visual\n Question Answering","summary":" Visual question answering (VQA) is the task of providing accurate answers to\nnatural language questions based on visual input. Programmatic VQA (PVQA)\nmodels have been gaining attention recently. These use large language models\n(LLMs) to formulate executable programs that address questions requiring\ncomplex visual reasoning. However, there are challenges in enabling LLMs to\ncomprehend the usage of image processing modules and generate relevant code. To\novercome these challenges, this paper introduces PyramidCoder, a novel\nprompting framework for PVQA models. PyramidCoder consists of three\nhierarchical levels, each serving a distinct purpose: query rephrasing, code\ngeneration, and answer aggregation. 
Notably, PyramidCoder utilizes a single\nfrozen LLM and pre-defined prompts at each level, eliminating the need for\nadditional training and ensuring flexibility across various LLM architectures.\nCompared to the state-of-the-art PVQA model, our approach improves accuracy by\nat least 0.5% on the GQA dataset, 1.4% on the VQAv2 dataset, and 2.9% on the\nNLVR2 dataset.\n","authors":["Ruoyue Shen","Nakamasa Inoue","Koichi Shinoda"],"pdf_url":"https://arxiv.org/pdf/2407.20563v1.pdf","comment":"Accepted to the IEEE International Conference on Image Processing\n (IEEE ICIP) 2024"},{"id":"http://arxiv.org/abs/2407.19674v2","updated":"2024-07-30T05:26:13Z","published":"2024-07-29T03:30:09Z","title":"Advancing Prompt Learning through an External Layer","summary":" Prompt learning represents a promising method for adapting pre-trained\nvision-language models (VLMs) to various downstream tasks by learning a set of\ntext embeddings. One challenge inherent to these methods is the poor\ngeneralization performance due to the invalidity of the learned text embeddings\nfor unseen tasks. A straightforward approach to bridge this gap is to freeze\nthe text embeddings in prompts, which results in a lack of capacity to adapt\nVLMs for downstream tasks. To address this dilemma, we propose a paradigm\ncalled EnPrompt with a novel External Layer (EnLa). Specifically, we propose a\ntextual external layer and learnable visual embeddings for adapting VLMs to\ndownstream tasks. The learnable external layer is built upon valid embeddings\nof pre-trained CLIP. This design considers the balance of learning capabilities\nbetween the two branches. To align the textual and visual features, we propose\na novel two-pronged approach: i) we introduce the optimal transport as the\ndiscrepancy metric to align the vision and text modalities, and ii) we\nintroduce a novel strengthening feature to enhance the interaction between\nthese two modalities. Four representative experiments (i.e., base-to-novel\ngeneralization, few-shot learning, cross-dataset generalization, domain shifts\ngeneralization) across 15 datasets demonstrate that our method outperforms the\nexisting prompt learning method.\n","authors":["Fangming Cui","Xun Yang","Chao Wu","Liang Xiao","Xinmei Tian"],"pdf_url":"https://arxiv.org/pdf/2407.19674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08199v3","updated":"2024-07-30T05:12:28Z","published":"2022-09-16T23:49:00Z","title":"ScreenQA: Large-Scale Question-Answer Pairs over Mobile App Screenshots","summary":" We present a new benchmark and dataset, ScreenQA, for screen content\nunderstanding via question answering. The existing screen datasets are focused\neither on structure and component-level understanding, or on a much\nhigher-level composite task such as navigation and task completion. We attempt\nto bridge the gap between these two by annotating 86K question-answer pairs\nover the RICO dataset in hope to benchmark the screen reading comprehension\ncapacity. This work is also the first to annotate answers for different\napplication scenarios, including both full sentences and short forms, as well\nas supporting UI contents on screen and their bounding boxes. 
With the rich\nannotation, we discuss and define the evaluation metrics of the benchmark, show\napplications of the dataset, and provide a few baselines using closed and open\nsource models.\n","authors":["Yu-Chung Hsiao","Fedir Zubach","Gilles Baechler","Victor Carbune","Jason Lin","Maria Wang","Srinivas Sunkara","Yun Zhu","Jindong Chen"],"pdf_url":"https://arxiv.org/pdf/2209.08199v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20545v1","updated":"2024-07-30T04:57:21Z","published":"2024-07-30T04:57:21Z","title":"StackFLOW: Monocular Human-Object Reconstruction by Stacked Normalizing\n Flow with Offset","summary":" Modeling and capturing the 3D spatial arrangement of the human and the object\nis the key to perceiving 3D human-object interaction from monocular images. In\nthis work, we propose to use the Human-Object Offset between anchors which are\ndensely sampled from the surface of human mesh and object mesh to represent\nhuman-object spatial relation. Compared with previous works which use contact\nmap or implicit distance field to encode 3D human-object spatial relations, our\nmethod is a simple and efficient way to encode the highly detailed spatial\ncorrelation between the human and object. Based on this representation, we\npropose Stacked Normalizing Flow (StackFLOW) to infer the posterior\ndistribution of human-object spatial relations from the image. During the\noptimization stage, we finetune the human body pose and object 6D pose by\nmaximizing the likelihood of samples based on this posterior distribution and\nminimizing the 2D-3D corresponding reprojection loss. Extensive experimental\nresults show that our method achieves impressive results on two challenging\nbenchmarks, BEHAVE and InterCap datasets.\n","authors":["Chaofan Huo","Ye Shi","Yuexin Ma","Lan Xu","Jingyi Yu","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20545v1.pdf","comment":"Accepted by IJCAI-23"},{"id":"http://arxiv.org/abs/2407.20542v1","updated":"2024-07-30T04:53:35Z","published":"2024-07-30T04:53:35Z","title":"HandDAGT: A Denoising Adaptive Graph Transformer for 3D Hand Pose\n Estimation","summary":" The extraction of keypoint positions from input hand frames, known as 3D hand\npose estimation, is crucial for various human-computer interaction\napplications. However, current approaches often struggle with the dynamic\nnature of self-occlusion of hands and intra-occlusion with interacting objects.\nTo address this challenge, this paper proposes the Denoising Adaptive Graph\nTransformer, HandDAGT, for hand pose estimation. The proposed HandDAGT\nleverages a transformer structure to thoroughly explore effective geometric\nfeatures from input patches. Additionally, it incorporates a novel attention\nmechanism to adaptively weigh the contribution of kinematic correspondence and\nlocal geometric features for the estimation of specific keypoints. This\nattribute enables the model to adaptively employ kinematic and local\ninformation based on the occlusion situation, enhancing its robustness and\naccuracy. Furthermore, we introduce a novel denoising training strategy aimed\nat improving the model's robust performance in the face of occlusion\nchallenges. Experimental results show that the proposed model significantly\noutperforms the existing methods on four challenging hand pose benchmark\ndatasets. 
Codes and pre-trained models are publicly available at\nhttps://github.com/cwc1260/HandDAGT.\n","authors":["Wencan Cheng","Eunji Kim","Jong Hwan Ko"],"pdf_url":"https://arxiv.org/pdf/2407.20542v1.pdf","comment":"Accepted as a conference paper to European Conference on Computer\n Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2404.11929v2","updated":"2024-07-30T04:50:58Z","published":"2024-04-18T06:18:48Z","title":"A Symmetric Regressor for MRI-Based Assessment of Striatal Dopamine\n Transporter Uptake in Parkinson's Disease","summary":" Dopamine transporter (DAT) imaging is commonly used for monitoring\nParkinson's disease (PD), where striatal DAT uptake amount is computed to\nassess PD severity. However, DAT imaging has a high cost and the risk of\nradiation exposure and is not available in general clinics. Recently, an MRI patch\nof the nigral region has been proposed as a safer and easier alternative. This\npaper proposes a symmetric regressor for predicting the DAT uptake amount from\nthe nigral MRI patch. Acknowledging the symmetry between the right and left\nnigrae, the proposed regressor incorporates a paired input-output model that\nsimultaneously predicts the DAT uptake amounts for both the right and left\nstriata. Moreover, it employs a symmetric loss that imposes a constraint on the\ndifference between right-to-left predictions, resembling the high correlation\nin DAT uptake amounts in the two lateral sides. Additionally, we propose a\nsymmetric Monte-Carlo (MC) dropout method for providing a fruitful uncertainty\nestimate of the DAT uptake prediction, which utilizes the above symmetry. We\nevaluated the proposed approach on 734 nigral patches, which demonstrated\nsignificantly improved performance of the symmetric regressor compared with the\nstandard regressors while giving better explainability and feature\nrepresentation. The symmetric MC dropout also gave precise uncertainty ranges\nwith a high probability of including the true DAT uptake amounts within the\nrange.\n","authors":["Walid Abdullah Al","Il Dong Yun","Yun Jung Bae"],"pdf_url":"https://arxiv.org/pdf/2404.11929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15318v2","updated":"2024-07-30T04:27:26Z","published":"2023-07-28T05:35:37Z","title":"DocDeshadower: Frequency-Aware Transformer for Document Shadow Removal","summary":" Shadows in scanned documents pose significant challenges for document\nanalysis and recognition tasks due to their negative impact on visual quality\nand readability. Current shadow removal techniques, including traditional\nmethods and deep learning approaches, face limitations in handling varying\nshadow intensities and preserving document details. To address these issues, we\npropose DocDeshadower, a novel multi-frequency Transformer-based model built\nupon the Laplacian Pyramid. By decomposing the shadow image into multiple\nfrequency bands and employing two critical modules: the Attention-Aggregation\nNetwork for low-frequency shadow removal and the Gated Multi-scale Fusion\nTransformer for global refinement, DocDeshadower effectively removes shadows at\ndifferent scales while preserving document content. Extensive experiments\ndemonstrate DocDeshadower's superior performance compared to state-of-the-art\nmethods, highlighting its potential to significantly improve document shadow\nremoval techniques. 
The code is available at\nhttps://github.com/leiyingtie/DocDeshadower.\n","authors":["Ziyang Zhou","Yingtie Lei","Xuhang Chen","Shenghong Luo","Wenjun Zhang","Chi-Man Pun","Zhen Wang"],"pdf_url":"https://arxiv.org/pdf/2307.15318v2.pdf","comment":"Accepted by IEEE International Conference on Systems, Man, and\n Cybernetics 2024"},{"id":"http://arxiv.org/abs/2406.10787v3","updated":"2024-07-30T04:00:44Z","published":"2024-06-16T03:00:16Z","title":"Evidential Uncertainty Sets in Deep Classifiers Using Conformal\n Prediction","summary":" In this paper, we propose Evidential Conformal Prediction (ECP) method for\nimage classifiers to generate the conformal prediction sets. Our method is\ndesigned based on a non-conformity score function that has its roots in\nEvidential Deep Learning (EDL) as a method of quantifying model (epistemic)\nuncertainty in DNN classifiers. We use evidence that are derived from the logit\nvalues of target labels to compute the components of our non-conformity score\nfunction: the heuristic notion of uncertainty in CP, uncertainty surprisal, and\nexpected utility. Our extensive experimental evaluation demonstrates that ECP\noutperforms three state-of-the-art methods for generating CP sets, in terms of\ntheir set sizes and adaptivity while maintaining the coverage of true labels.\n","authors":["Hamed Karimi","Reza Samavi"],"pdf_url":"https://arxiv.org/pdf/2406.10787v3.pdf","comment":"Accepted in 13th Symposium on Conformal and Probabilistic Prediction\n with Applications (COPA2024). To be published in the Proceedings of Machine\n Learning Research (PMLR), vol. 230, 2024 (25 Pages)"},{"id":"http://arxiv.org/abs/2403.13443v2","updated":"2024-07-30T03:44:19Z","published":"2024-03-20T09:39:39Z","title":"Fast-Poly: A Fast Polyhedral Framework For 3D Multi-Object Tracking","summary":" 3D Multi-Object Tracking (MOT) captures stable and comprehensive motion\nstates of surrounding obstacles, essential for robotic perception. However,\ncurrent 3D trackers face issues with accuracy and latency consistency. In this\npaper, we propose Fast-Poly, a fast and effective filter-based method for 3D\nMOT. Building upon our previous work Poly-MOT, Fast-Poly addresses object\nrotational anisotropy in 3D space, enhances local computation densification,\nand leverages parallelization technique, improving inference speed and\nprecision. Fast-Poly is extensively tested on two large-scale tracking\nbenchmarks with Python implementation. On the nuScenes dataset, Fast-Poly\nachieves new state-of-the-art performance with 75.8% AMOTA among all methods\nand can run at 34.2 FPS on a personal CPU. On the Waymo dataset, Fast-Poly\nexhibits competitive accuracy with 63.6% MOTA and impressive inference speed\n(35.5 FPS). The source code is publicly available at\nhttps://github.com/lixiaoyu2000/FastPoly.\n","authors":["Xiaoyu Li","Dedong Liu","Yitao Wu","Xian Wu","Lijun Zhao","Jinghan Gao"],"pdf_url":"https://arxiv.org/pdf/2403.13443v2.pdf","comment":"1st on the NuScenes Tracking benchmark with 75.8 AMOTA and 34.2 FPS"},{"id":"http://arxiv.org/abs/2405.08780v2","updated":"2024-07-30T03:42:00Z","published":"2024-05-14T17:15:28Z","title":"Harnessing the power of longitudinal medical imaging for eye disease\n prognosis using Transformer-based sequence modeling","summary":" Deep learning has enabled breakthroughs in automated diagnosis from medical\nimaging, with many successful applications in ophthalmology. 
However, standard\nmedical image classification approaches only assess disease presence at the\ntime of acquisition, neglecting the common clinical setting of longitudinal\nimaging. For slow, progressive eye diseases like age-related macular\ndegeneration (AMD) and primary open-angle glaucoma (POAG), patients undergo\nrepeated imaging over time to track disease progression and forecasting the\nfuture risk of developing disease is critical to properly plan treatment. Our\nproposed Longitudinal Transformer for Survival Analysis (LTSA) enables dynamic\ndisease prognosis from longitudinal medical imaging, modeling the time to\ndisease from sequences of fundus photography images captured over long,\nirregular time periods. Using longitudinal imaging data from the Age-Related\nEye Disease Study (AREDS) and Ocular Hypertension Treatment Study (OHTS), LTSA\nsignificantly outperformed a single-image baseline in 19/20 head-to-head\ncomparisons on late AMD prognosis and 18/20 comparisons on POAG prognosis. A\ntemporal attention analysis also suggested that, while the most recent image is\ntypically the most influential, prior imaging still provides additional\nprognostic value.\n","authors":["Gregory Holste","Mingquan Lin","Ruiwen Zhou","Fei Wang","Lei Liu","Qi Yan","Sarah H. Van Tassel","Kyle Kovacs","Emily Y. Chew","Zhiyong Lu","Zhangyang Wang","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2405.08780v2.pdf","comment":"Accepted to npj Digital Medicine"},{"id":"http://arxiv.org/abs/2407.20518v1","updated":"2024-07-30T03:29:57Z","published":"2024-07-30T03:29:57Z","title":"High-Resolution Spatial Transcriptomics from Histology Images using\n HisToSGE","summary":" Spatial transcriptomics (ST) is a groundbreaking genomic technology that\nenables spatial localization analysis of gene expression within tissue\nsections. However, it is significantly limited by high costs and sparse spatial\nresolution. An alternative, more cost-effective strategy is to use deep\nlearning methods to predict high-density gene expression profiles from\nhistological images. However, existing methods struggle to capture rich image\nfeatures effectively or rely on low-dimensional positional coordinates, making\nit difficult to accurately predict high-resolution gene expression profiles. To\naddress these limitations, we developed HisToSGE, a method that employs a\nPathology Image Large Model (PILM) to extract rich image features from\nhistological images and utilizes a feature learning module to robustly generate\nhigh-resolution gene expression profiles. We evaluated HisToSGE on four ST\ndatasets, comparing its performance with five state-of-the-art baseline\nmethods. The results demonstrate that HisToSGE excels in generating\nhigh-resolution gene expression profiles and performing downstream tasks such\nas spatial domain identification. 
All code and public datasets used in this\npaper are available at https://github.com/wenwenmin/HisToSGE and\nhttps://zenodo.org/records/12792163.\n","authors":["Zhiceng Shi","Shuailin Xue","Fangfang Zhu","Wenwen Min"],"pdf_url":"https://arxiv.org/pdf/2407.20518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20515v1","updated":"2024-07-30T03:20:54Z","published":"2024-07-30T03:20:54Z","title":"Markers Identification for Relative Pose Estimation of an Uncooperative\n Target","summary":" This paper introduces a novel method using chaser spacecraft image processing\nand Convolutional Neural Networks (CNNs) to detect structural markers on the\nEuropean Space Agency's (ESA) Environmental Satellite (ENVISAT) for safe\nde-orbiting. Advanced image pre-processing techniques, including noise addition\nand blurring, are employed to improve marker detection accuracy and robustness.\nInitial results show promising potential for autonomous space debris removal,\nsupporting proactive strategies for space sustainability. The effectiveness of\nour approach suggests that our estimation method could significantly enhance\nthe safety and efficiency of debris removal operations by implementing more\nrobust and autonomous systems in actual space missions.\n","authors":["Batu Candan","Simone Servadio"],"pdf_url":"https://arxiv.org/pdf/2407.20515v1.pdf","comment":"2024 AAS/AIAA Astrodynamics Specialist Conference"},{"id":"http://arxiv.org/abs/2406.13642v4","updated":"2024-07-30T03:18:54Z","published":"2024-06-19T15:41:30Z","title":"SpatialBot: Precise Spatial Understanding with Vision Language Models","summary":" Vision Language Models (VLMs) have achieved impressive performance in 2D\nimage understanding; however, they are still struggling with spatial\nunderstanding, which is the foundation of Embodied AI. In this paper, we propose\nSpatialBot for better spatial understanding by feeding both RGB and depth\nimages. Additionally, we have constructed the SpatialQA dataset, which involves\nmulti-level depth-related questions to train VLMs for depth understanding.\nFinally, we present SpatialBench to comprehensively evaluate VLMs' capabilities\nin spatial understanding at different levels. Extensive experiments on our\nspatial-understanding benchmark, general VLM benchmarks and Embodied AI tasks,\ndemonstrate the remarkable improvements of SpatialBot trained on SpatialQA. The\nmodel, code and data are available at https://github.com/BAAI-DCAI/SpatialBot.\n","authors":["Wenxiao Cai","Yaroslav Ponomarenko","Jianhao Yuan","Xiaoqi Li","Wankou Yang","Hao Dong","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.13642v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08407v3","updated":"2024-07-30T03:15:55Z","published":"2024-06-12T16:54:54Z","title":"MMWorld: Towards Multi-discipline Multi-faceted World Model Evaluation\n in Videos","summary":" Multimodal Large Language Models (MLLMs) demonstrate the emerging\nabilities of \"world models\" -- interpreting and reasoning about complex\nreal-world dynamics. To assess these abilities, we posit videos are the ideal\nmedium, as they encapsulate rich representations of real-world dynamics and\ncausalities. To this end, we introduce MMWorld, a new benchmark for\nmulti-discipline, multi-faceted multimodal video understanding. 
MMWorld\ndistinguishes itself from previous video understanding benchmarks with two\nunique advantages: (1) multi-discipline, covering various disciplines that\noften require domain expertise for comprehensive understanding; (2)\nmulti-faceted reasoning, including explanation, counterfactual thinking, future\nprediction, etc. MMWorld consists of a human-annotated dataset to evaluate\nMLLMs with questions about the whole videos and a synthetic dataset to analyze\nMLLMs within a single modality of perception. Together, MMWorld encompasses\n1,910 videos across seven broad disciplines and 69 subdisciplines, complete\nwith 6,627 question-answer pairs and associated captions. The evaluation\nincludes 2 proprietary and 10 open-source MLLMs, which struggle on MMWorld\n(e.g., GPT-4V performs the best with only 52.3\\% accuracy), showing large room\nfor improvement. Further ablation studies reveal other interesting findings\nsuch as models' different skill sets from humans. We hope MMWorld can serve as\nan essential step towards world model evaluation in videos.\n","authors":["Xuehai He","Weixi Feng","Kaizhi Zheng","Yujie Lu","Wanrong Zhu","Jiachen Li","Yue Fan","Jianfeng Wang","Linjie Li","Zhengyuan Yang","Kevin Lin","William Yang Wang","Lijuan Wang","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2406.08407v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15706v4","updated":"2024-07-30T03:13:16Z","published":"2024-07-22T15:16:47Z","title":"Multi-Modality Co-Learning for Efficient Skeleton-based Action\n Recognition","summary":" Skeleton-based action recognition has garnered significant attention due to\nthe utilization of concise and resilient skeletons. Nevertheless, the absence\nof detailed body information in skeletons restricts performance, while other\nmultimodal methods require substantial inference resources and are inefficient\nwhen using multimodal data during both training and inference stages. To\naddress this and fully harness the complementary multimodal features, we\npropose a novel multi-modality co-learning (MMCL) framework by leveraging the\nmultimodal large language models (LLMs) as auxiliary networks for efficient\nskeleton-based action recognition, which engages in multi-modality co-learning\nduring the training stage and keeps efficiency by employing only concise\nskeletons in inference. Our MMCL framework primarily consists of two modules.\nFirst, the Feature Alignment Module (FAM) extracts rich RGB features from video\nframes and aligns them with global skeleton features via contrastive learning.\nSecond, the Feature Refinement Module (FRM) uses RGB images with temporal\ninformation and text instruction to generate instructive features based on the\npowerful generalization of multimodal LLMs. These instructive text features\nwill further refine the classification scores and the refined scores will\nenhance the model's robustness and generalization in a manner similar to soft\nlabels. Extensive experiments on NTU RGB+D, NTU RGB+D 120 and Northwestern-UCLA\nbenchmarks consistently verify the effectiveness of our MMCL, which outperforms\nthe existing skeleton-based action recognition methods. Meanwhile, experiments\non UTD-MHAD and SYSU-Action datasets demonstrate the commendable generalization\nof our MMCL in zero-shot and domain-adaptive action recognition. 
Our code is\npublicly available at: https://github.com/liujf69/MMCL-Action.\n","authors":["Jinfu Liu","Chen Chen","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.15706v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20505v1","updated":"2024-07-30T02:41:32Z","published":"2024-07-30T02:41:32Z","title":"Interpreting and Mitigating Hallucination in MLLMs through Multi-agent\n Debate","summary":" MLLMs often generate outputs that are inconsistent with the visual content, a\nchallenge known as hallucination. Previous methods focus on determining whether\na generated output is hallucinated, without identifying which image region\nleads to the hallucination or interpreting why such hallucinations occur. In\nthis paper, we argue that hallucination in MLLMs is partially due to a lack of\nslow-thinking and divergent-thinking in these models. To address this, we\npropose adopting a self-reflection scheme to promote slow-thinking.\nFurthermore, we consider eliminating hallucination as a complex reasoning task\nand propose a multi-agent debate approach to encourage divergent-thinking.\nConsequently, our approach can not only mitigate hallucinations but also\ninterpret why they occur and detail the specifics of hallucination. In\naddition, we propose to distinguish creativity from hallucination in the\ncontext of MLLMs, and illustrate how to evaluate MLLMs' creativity capability.\nExtensive experiments on various benchmarks demonstrate that our approach\nexhibits generalized hallucinations-mitigating performance across several\nMLLMs.\n","authors":["Zheng Lin","Zhenxing Niu","Zhibin Wang","Yinghui Xu"],"pdf_url":"https://arxiv.org/pdf/2407.20505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11057v3","updated":"2024-07-30T02:35:52Z","published":"2024-03-17T02:06:49Z","title":"Large Language Models Powered Context-aware Motion Prediction in\n Autonomous Driving","summary":" Motion prediction is among the most fundamental tasks in autonomous driving.\nTraditional methods of motion forecasting primarily encode vector information\nof maps and historical trajectory data of traffic participants, lacking a\ncomprehensive understanding of overall traffic semantics, which in turn affects\nthe performance of prediction tasks. In this paper, we utilized Large Language\nModels (LLMs) to enhance the global traffic context understanding for motion\nprediction tasks. We first conducted systematic prompt engineering, visualizing\ncomplex traffic environments and historical trajectory information of traffic\nparticipants into image prompts -- Transportation Context Map (TC-Map),\naccompanied by corresponding text prompts. Through this approach, we obtained\nrich traffic context information from the LLM. By integrating this information\ninto the motion prediction model, we demonstrate that such context can enhance\nthe accuracy of motion predictions. Furthermore, considering the cost\nassociated with LLMs, we propose a cost-effective deployment strategy:\nenhancing the accuracy of motion prediction tasks at scale with 0.7\\%\nLLM-augmented datasets. Our research offers valuable insights into enhancing\nthe understanding of traffic scenes of LLMs and the motion prediction\nperformance of autonomous driving. 
The source code is available at\n\\url{https://github.com/AIR-DISCOVER/LLM-Augmented-MTR} and\n\\url{https://aistudio.baidu.com/projectdetail/7809548}.\n","authors":["Xiaoji Zheng","Lixiu Wu","Zhijie Yan","Yuanrong Tang","Hao Zhao","Chen Zhong","Bokui Chen","Jiangtao Gong"],"pdf_url":"https://arxiv.org/pdf/2403.11057v3.pdf","comment":"6 pages,4 figures"},{"id":"http://arxiv.org/abs/2407.20502v1","updated":"2024-07-30T02:29:59Z","published":"2024-07-30T02:29:59Z","title":"Restoring Real-World Degraded Events Improves Deblurring Quality","summary":" Due to its high speed and low latency, DVS is frequently employed in motion\ndeblurring. Ideally, high-quality events would adeptly capture intricate motion\ninformation. However, real-world events are generally degraded, thereby\nintroducing significant artifacts into the deblurred results. In response to\nthis challenge, we model the degradation of events and propose RDNet to improve\nthe quality of image deblurring. Specifically, we first analyze the mechanisms\nunderlying degradation and simulate paired events based on that. These paired\nevents are then fed into the first stage of the RDNet for training the\nrestoration model. The events restored in this stage serve as a guide for the\nsecond-stage deblurring process. To better assess the deblurring performance of\ndifferent methods on real-world degraded events, we present a new real-world\ndataset named DavisMCR. This dataset incorporates events with diverse\ndegradation levels, collected by manipulating environmental brightness and\ntarget object contrast. Our experiments are conducted on synthetic datasets\n(GOPRO), real-world datasets (REBlur), and the proposed dataset (DavisMCR). The\nresults demonstrate that RDNet outperforms classical event denoising methods in\nevent restoration. Furthermore, RDNet exhibits better performance in deblurring\ntasks compared to state-of-the-art methods. DavisMCR are available at\nhttps://github.com/Yeeesir/DVS_RDNet.\n","authors":["Yeqing Shen","Shang Li","Kun Song"],"pdf_url":"https://arxiv.org/pdf/2407.20502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19593v2","updated":"2024-07-30T02:20:28Z","published":"2024-07-28T21:26:33Z","title":"Bridging the Gap: Studio-like Avatar Creation from a Monocular Phone\n Capture","summary":" Creating photorealistic avatars for individuals traditionally involves\nextensive capture sessions with complex and expensive studio devices like the\nLightStage system. While recent strides in neural representations have enabled\nthe generation of photorealistic and animatable 3D avatars from quick phone\nscans, they have the capture-time lighting baked-in, lack facial details and\nhave missing regions in areas such as the back of the ears. Thus, they lag in\nquality compared to studio-captured avatars. In this paper, we propose a method\nthat bridges this gap by generating studio-like illuminated texture maps from\nshort, monocular phone captures. We do this by parameterizing the phone texture\nmaps using the $W^+$ space of a StyleGAN2, enabling near-perfect\nreconstruction. Then, we finetune a StyleGAN2 by sampling in the $W^+$\nparameterized space using a very small set of studio-captured textures as an\nadversarial training signal. To further enhance the realism and accuracy of\nfacial details, we super-resolve the output of the StyleGAN2 using carefully\ndesigned diffusion model that is guided by image gradients of the\nphone-captured texture map. 
Once trained, our method excels at producing\nstudio-like facial texture maps from casual monocular smartphone videos.\nDemonstrating its capabilities, we showcase the generation of photorealistic,\nuniformly lit, complete avatars from monocular phone captures. The project page\ncan be found at http://shahrukhathar.github.io/2024/07/22/Bridging.html\n","authors":["ShahRukh Athar","Shunsuke Saito","Zhengyu Yang","Stanislav Pidhorsky","Chen Cao"],"pdf_url":"https://arxiv.org/pdf/2407.19593v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.16684v3","updated":"2024-07-30T02:15:03Z","published":"2024-07-23T17:50:00Z","title":"AutoRG-Brain: Grounded Report Generation for Brain MRI","summary":" Radiologists are tasked with interpreting a large number of images on a daily\nbasis, with the responsibility of generating corresponding reports. This\ndemanding workload elevates the risk of human error, potentially leading to\ntreatment delays, increased healthcare costs, revenue loss, and operational\ninefficiencies. To address these challenges, we initiate a series of work on\ngrounded Automatic Report Generation (AutoRG), starting from the brain MRI\ninterpretation system, which supports the delineation of brain structures, the\nlocalization of anomalies, and the generation of well-organized findings. We\nmake contributions from the following aspects. First, on dataset construction,\nwe release a comprehensive dataset encompassing segmentation masks of anomaly\nregions and manually authored reports, termed RadGenome-Brain MRI. This data\nresource is intended to catalyze ongoing research and development in the field\nof AI-assisted report generation systems. Second, on system design, we propose\nAutoRG-Brain, the first brain MRI report generation system with pixel-level\ngrounded visual clues. Third, for evaluation, we conduct quantitative\nassessments and human evaluations of brain structure segmentation, anomaly\nlocalization, and report generation tasks to provide evidence of its\nreliability and accuracy. This system has been integrated into real clinical\nscenarios, where radiologists were instructed to write reports based on our\ngenerated findings and anomaly segmentation masks. The results demonstrate that\nour system enhances the report-writing skills of junior doctors, aligning their\nperformance more closely with senior doctors, thereby boosting overall\nproductivity.\n","authors":["Jiayu Lei","Xiaoman Zhang","Chaoyi Wu","Lisong Dai","Ya Zhang","Yanyong Zhang","Yanfeng Wang","Weidi Xie","Yuehua Li"],"pdf_url":"https://arxiv.org/pdf/2407.16684v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20021v2","updated":"2024-07-30T02:03:06Z","published":"2024-07-29T13:57:40Z","title":"MimiQ: Low-Bit Data-Free Quantization of Vision Transformers with\n Encouraging Inter-Head Attention Similarity","summary":" Data-free quantization (DFQ) is a technique that creates a lightweight\nnetwork from its full-precision counterpart without the original training data,\noften through a synthetic dataset. Although several DFQ methods have been\nproposed for vision transformer (ViT) architectures, they fail to achieve\nefficacy in low-bit settings. Examining the existing methods, we identify that\ntheir synthetic data produce misaligned attention maps, while those of the real\nsamples are highly aligned. From the observation of aligned attention, we find\nthat aligning attention maps of synthetic data helps to improve the overall\nperformance of quantized ViTs. 
Motivated by this finding, we devise MimiQ, a\nnovel DFQ method designed for ViTs that focuses on inter-head attention\nsimilarity. First, we generate synthetic data by aligning head-wise attention\nresponses in relation to spatial query patches. Then, we apply head-wise\nstructural attention distillation to align the attention maps of the quantized\nnetwork to those of the full-precision teacher. The experimental results show\nthat the proposed method significantly outperforms baselines, setting a new\nstate-of-the-art performance for data-free ViT quantization.\n","authors":["Kanghyun Choi","Hye Yoon Lee","Dain Kwon","SunJong Park","Kyuyeun Kim","Noseong Park","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2407.20021v2.pdf","comment":"Author Preprint"},{"id":"http://arxiv.org/abs/2407.20495v1","updated":"2024-07-30T01:39:30Z","published":"2024-07-30T01:39:30Z","title":"Enhancing Quantitative Image Synthesis through Pretraining and\n Resolution Scaling for Bone Mineral Density Estimation from a Plain X-ray\n Image","summary":" While most vision tasks are essentially visual in nature (for recognition),\nsome important tasks, especially in the medical field, also require\nquantitative analysis (for quantification) using quantitative images. Unlike in\nvisual analysis, pixel values in quantitative images correspond to physical\nmetrics measured by specific devices (e.g., a depth image). However, recent\nwork has shown that it is sometimes possible to synthesize accurate\nquantitative values from visual ones (e.g., depth from visual cues or defocus).\nThis research aims to improve quantitative image synthesis (QIS) by exploring\npretraining and image resolution scaling. We propose a benchmark for evaluating\npretraining performance using the task of QIS-based bone mineral density (BMD)\nestimation from plain X-ray images, where the synthesized quantitative image is\nused to derive BMD. Our results show that appropriate pretraining can improve\nQIS performance, significantly raising the correlation of BMD estimation from\n0.820 to 0.898, while others do not help or even hinder it. Scaling up the\nresolution can further boost the correlation up to 0.923, a significant\nenhancement over conventional methods. Future work will include exploring more\npretraining strategies and validating them on other image synthesis tasks.\n","authors":["Yi Gu","Yoshito Otake","Keisuke Uemura","Masaki Takao","Mazen Soufi","Seiji Okada","Nobuhiko Sugano","Hugues Talbot","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2407.20495v1.pdf","comment":"SASHIMI, 2024 (MICCAI workshop). 13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.16988v2","updated":"2024-07-30T01:16:47Z","published":"2024-07-24T04:13:43Z","title":"DreamCar: Leveraging Car-specific Prior for in-the-wild 3D Car\n Reconstruction","summary":" Self-driving industries usually employ professional artists to build\nexquisite 3D cars. However, it is expensive to craft large-scale digital\nassets. Since there are already numerous datasets available that contain a vast\nnumber of images of cars, we focus on reconstructing high-quality 3D car models\nfrom these datasets. However, these datasets only contain one side of cars in\nthe forward-moving scene. We try to use the existing generative models to\nprovide more supervision information, but they struggle to generalize well to\ncars since they are trained on synthetic datasets that are not car-specific. 
In\naddition, the reconstructed 3D car texture misaligns due to a large error in\ncamera pose estimation when dealing with in-the-wild images. These restrictions\nmake it challenging for previous methods to reconstruct complete 3D cars. To\naddress these problems, we propose a novel method, named DreamCar, which can\nreconstruct high-quality 3D cars given a few images or even a single image. To\ngeneralize the generative model, we collect a car dataset, named Car360, with\nover 5,600 vehicles. With this dataset, we make the generative model more\nrobust to cars. We use this generative prior specific to the car to guide its\nreconstruction via Score Distillation Sampling. To further complement the\nsupervision information, we utilize the geometric and appearance symmetry of\ncars. Finally, we propose a pose optimization method that rectifies poses to\ntackle texture misalignment. Extensive experiments demonstrate that our method\nsignificantly outperforms existing methods in reconstructing high-quality 3D\ncars. \\href{https://xiaobiaodu.github.io/dreamcar-project/}{Our code is\navailable.}\n","authors":["Xiaobiao Du","Haiyang Sun","Ming Lu","Tianqing Zhu","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16988v2.pdf","comment":"Project Page: https://xiaobiaodu.github.io/dreamcar-project/"},{"id":"http://arxiv.org/abs/2404.11889v2","updated":"2024-07-30T23:50:55Z","published":"2024-04-18T04:25:56Z","title":"Multi-view X-ray Image Synthesis with Multiple Domain Disentanglement\n from CT Scans","summary":" X-ray images play a vital role in the intraoperative processes due to their\nhigh resolution and fast imaging speed and greatly promote the subsequent\nsegmentation, registration and reconstruction. However, over-dosed X-rays\npose potential risks to human health to some extent. Data-driven\nalgorithms from volume scans to X-ray images are restricted by the scarcity of\npaired X-ray and volume data. Existing methods are mainly realized by modelling\nthe whole X-ray imaging procedure. In this study, we propose a learning-based\napproach termed CT2X-GAN to synthesize the X-ray images in an end-to-end manner\nusing the content and style disentanglement from three different image domains.\nOur method decouples the anatomical structure information from CT scans and\nstyle information from unpaired real X-ray images/digital reconstructed\nradiography (DRR) images via a series of decoupling encoders. Additionally, we\nintroduce a novel consistency regularization term to improve the stylistic\nresemblance between synthesized X-ray images and real X-ray images. Meanwhile,\nwe also impose a supervised process by computing the similarity of computed\nreal DRR and synthesized DRR images. We further develop a pose attention module\nto fully strengthen the comprehensive information in the decoupled content code\nfrom CT scans, facilitating high-quality multi-view image synthesis in the\nlower 2D space. Extensive experiments were conducted on the publicly available\nCTSpine1K dataset and achieved 97.8350, 0.0842 and 3.0938 in terms of FID, KID\nand defined user-scored X-ray similarity, respectively. 
In comparison with\n3D-aware methods ($\\pi$-GAN, EG3D), CT2X-GAN is superior in improving the\nsynthesis quality and realism with respect to real X-ray images.\n","authors":["Lixing Tan","Shuang Song","Kangneng Zhou","Chengbo Duan","Lanying Wang","Huayang Ren","Linlin Liu","Wei Zhang","Ruoxiu Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.11889v2.pdf","comment":"13 pages, 10 figures, ACM MM2024"},{"id":"http://arxiv.org/abs/2407.21244v1","updated":"2024-07-30T23:29:47Z","published":"2024-07-30T23:29:47Z","title":"VITAL: Visual Teleoperation to Enhance Robot Learning through\n Human-in-the-Loop Corrections","summary":" Imitation Learning (IL) has emerged as a powerful approach in robotics,\nallowing robots to acquire new skills by mimicking human actions. Despite its\npotential, the data collection process for IL remains a significant challenge\ndue to the logistical difficulties and high costs associated with obtaining\nhigh-quality demonstrations. To address these issues, we propose a low-cost\nvisual teleoperation system for bimanual manipulation tasks, called VITAL. Our\napproach leverages affordable hardware and visual processing techniques to\ncollect demonstrations, which are then augmented to create extensive training\ndatasets for imitation learning. We enhance the generalizability and robustness\nof the learned policies by utilizing both real and simulated environments and\nhuman-in-the-loop corrections. We evaluated our method through several rounds\nof experiments in simulated and real-robot settings, focusing on tasks of\nvarying complexity, including bottle collecting, stacking objects, and\nhammering. Our experimental results validate the effectiveness of our approach\nin learning robust robot policies from simulated data, significantly improved\nby human-in-the-loop corrections and real-world data integration. Additionally,\nwe demonstrate the framework's capability to generalize to new tasks, such as\nsetting a drink tray, showcasing its adaptability and potential for handling a\nwide range of real-world bimanual manipulation tasks. A video of the\nexperiments can be found at: https://youtu.be/YeVAMRqRe64?si=R179xDlEGc7nPu8i\n","authors":["Hamidreza Kasaei","Mohammadreza Kasaei"],"pdf_url":"https://arxiv.org/pdf/2407.21244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21233v1","updated":"2024-07-30T22:40:32Z","published":"2024-07-30T22:40:32Z","title":"TMA-Grid: An open-source, zero-footprint web application for FAIR Tissue\n MicroArray De-arraying","summary":" Background:\n Tissue Microarrays (TMAs) significantly increase analytical efficiency in\nhistopathology and large-scale epidemiologic studies by allowing multiple\ntissue cores to be scanned on a single slide. The individual cores can be\ndigitally extracted and then linked to metadata for analysis in a process known\nas de-arraying. However, TMAs often contain core misalignments and artifacts\ndue to assembly errors, which can adversely affect the reliability of the\nextracted cores during the de-arraying process. Moreover, conventional\napproaches for TMA de-arraying rely on desktop solutions. Therefore, a robust\nyet flexible de-arraying method is crucial to account for these inaccuracies\nand ensure effective downstream analyses.\n Results:\n We developed TMA-Grid, an in-browser, zero-footprint, interactive web\napplication for TMA de-arraying. 
This web application integrates a\nconvolutional neural network for precise tissue segmentation and a grid\nestimation algorithm to match each identified core to its expected location.\nThe application emphasizes interactivity, allowing users to easily adjust\nsegmentation and gridding results. Operating entirely in the web-browser,\nTMA-Grid eliminates the need for downloads or installations and ensures data\nprivacy. Adhering to FAIR principles (Findable, Accessible, Interoperable, and\nReusable), the application and its components are designed for seamless\nintegration into TMA research workflows.\n Conclusions:\n TMA-Grid provides a robust, user-friendly solution for TMA dearraying on the\nweb. As an open, freely accessible platform, it lays the foundation for\ncollaborative analyses of TMAs and similar histopathology imaging data.\nAvailability: Web application: https://episphere.github.io/tma-grid Code:\nhttps://github.com/episphere/tma-grid Tutorial: https://youtu.be/miajqyw4BVk\n","authors":["Aaron Ge","Monjoy Saha","Maire A. Duggan","Petra Lenz","Mustapha Abubakar","Montserrat García-Closas","Jeya Balasubramanian","Jonas S. Almeida","Praphulla MS Bhawsar"],"pdf_url":"https://arxiv.org/pdf/2407.21233v1.pdf","comment":"NA"},{"id":"http://arxiv.org/abs/2407.21229v1","updated":"2024-07-30T22:32:50Z","published":"2024-07-30T22:32:50Z","title":"Advancing Vietnamese Visual Question Answering with Transformer and\n Convolutional Integration","summary":" Visual Question Answering (VQA) has recently emerged as a potential research\ndomain, captivating the interest of many in the field of artificial\nintelligence and computer vision. Despite the prevalence of approaches in\nEnglish, there is a notable lack of systems specifically developed for certain\nlanguages, particularly Vietnamese. This study aims to bridge this gap by\nconducting comprehensive experiments on the Vietnamese Visual Question\nAnswering (ViVQA) dataset, demonstrating the effectiveness of our proposed\nmodel. In response to community interest, we have developed a model that\nenhances image representation capabilities, thereby improving overall\nperformance in the ViVQA system. Specifically, our model integrates the\nBootstrapping Language-Image Pre-training with frozen unimodal models (BLIP-2)\nand the convolutional neural network EfficientNet to extract and process both\nlocal and global features from images. This integration leverages the strengths\nof transformer-based architectures for capturing comprehensive contextual\ninformation and convolutional networks for detailed local features. By freezing\nthe parameters of these pre-trained models, we significantly reduce the\ncomputational cost and training time, while maintaining high performance. This\napproach significantly improves image representation and enhances the\nperformance of existing VQA systems. We then leverage a multi-modal fusion\nmodule based on a general-purpose multi-modal foundation model (BEiT-3) to fuse\nthe information between visual and textual features. Our experimental findings\ndemonstrate that our model surpasses competing baselines, achieving promising\nperformance. This is particularly evident in its accuracy of $71.04\\%$ on the\ntest set of the ViVQA dataset, marking a significant advancement in our\nresearch area. 
The code is available at https://github.com/nngocson2002/ViVQA.\n","authors":["Ngoc Son Nguyen","Van Son Nguyen","Tung Le"],"pdf_url":"https://arxiv.org/pdf/2407.21229v1.pdf","comment":"Accepted at the journal of Computers & Electrical Engineering\n (Received 8 March 2024, Revised 8 June 2024, Accepted 10 July 2024)"},{"id":"http://arxiv.org/abs/2407.21220v1","updated":"2024-07-30T22:14:47Z","published":"2024-07-30T22:14:47Z","title":"DeepBaR: Fault Backdoor Attack on Deep Neural Network Layers","summary":" Machine Learning using neural networks has received prominent attention\nrecently because of its success in solving a wide variety of computational\ntasks, in particular in the field of computer vision. However, several works\nhave drawn attention to potential security risks involved with the training and\nimplementation of such networks. In this work, we introduce DeepBaR, a novel\napproach that implants backdoors on neural networks by faulting their behavior\nat training, especially during fine-tuning. Our technique aims to generate\nadversarial samples by optimizing a custom loss function that mimics the\nimplanted backdoors while adding an almost non-visible trigger in the image. We\nattack three popular convolutional neural network architectures and show that\nDeepBaR attacks have a success rate of up to 98.30\\%. Furthermore, DeepBaR does\nnot significantly affect the accuracy of the attacked networks after deployment\nwhen non-malicious inputs are given. Remarkably, DeepBaR allows attackers to\nchoose an input that looks similar to a given class, from a human perspective,\nbut that will be classified as belonging to an arbitrary target class.\n","authors":["C. A. Martínez-Mejía","J. Solano","J. Breier","D. Bucko","X. Hou"],"pdf_url":"https://arxiv.org/pdf/2407.21220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21216v1","updated":"2024-07-30T21:59:02Z","published":"2024-07-30T21:59:02Z","title":"Distribution-Aware Replay for Continual MRI Segmentation","summary":" Medical image distributions shift constantly due to changes in patient\npopulation and discrepancies in image acquisition. These distribution changes\nresult in performance deterioration; deterioration that continual learning aims\nto alleviate. However, only adaptation with data rehearsal strategies yields\npractically desirable performance for medical image segmentation. Such\nrehearsal violates patient privacy and, as most continual learning approaches,\noverlooks unexpected changes from out-of-distribution instances. To transcend\nboth of these challenges, we introduce a distribution-aware replay strategy\nthat mitigates forgetting through auto-encoding of features, while\nsimultaneously leveraging the learned distribution of features to detect model\nfailure. 
We provide empirical corroboration on hippocampus and prostate MRI\nsegmentation.\n","authors":["Nick Lemke","Camila González","Anirban Mukhopadhyay","Martin Mundt"],"pdf_url":"https://arxiv.org/pdf/2407.21216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21174v1","updated":"2024-07-30T20:28:31Z","published":"2024-07-30T20:28:31Z","title":"AI Safety in Practice: Enhancing Adversarial Robustness in Multimodal\n Image Captioning","summary":" Multimodal machine learning models that combine visual and textual data are\nincreasingly being deployed in critical applications, raising significant\nsafety and security concerns due to their vulnerability to adversarial attacks.\nThis paper presents an effective strategy to enhance the robustness of\nmultimodal image captioning models against such attacks. By leveraging the Fast\nGradient Sign Method (FGSM) to generate adversarial examples and incorporating\nadversarial training techniques, we demonstrate improved model robustness on\ntwo benchmark datasets: Flickr8k and COCO. Our findings indicate that\nselectively training only the text decoder of the multimodal architecture shows\nperformance comparable to full adversarial training while offering increased\ncomputational efficiency. This targeted approach suggests a balance between\nrobustness and training costs, facilitating the ethical deployment of\nmultimodal AI systems across various domains.\n","authors":["Maisha Binte Rashid","Pablo Rivas"],"pdf_url":"https://arxiv.org/pdf/2407.21174v1.pdf","comment":"Accepted into KDD 2024 workshop on Ethical AI"},{"id":"http://arxiv.org/abs/2406.15946v2","updated":"2024-07-30T20:15:10Z","published":"2024-06-22T21:49:12Z","title":"Optimizing LaneSegNet for Real-Time Lane Topology Prediction in\n Autonomous Vehicles","summary":" With the increasing prevalence of autonomous vehicles, it is essential for\ncomputer vision algorithms to accurately assess road features in real-time.\nThis study explores the LaneSegNet architecture, a new approach to lane\ntopology prediction which integrates topological information with lane-line\ndata to provide a more contextual understanding of road environments. The\nLaneSegNet architecture includes a feature extractor, lane encoder, lane\ndecoder, and prediction head, leveraging components from ResNet-50, BEVFormer,\nand various attention mechanisms. We experimented with optimizations to the\nLaneSegNet architecture through feature extractor modification and transformer\nencoder-decoder stack modification. We found that modifying the encoder and\ndecoder stacks offered an interesting tradeoff between training time and\nprediction accuracy, with certain combinations showing promising results. Our\nimplementation, trained on a single NVIDIA Tesla A100 GPU, found that a 2:4\nratio reduced training time by 22.3% with only a 7.1% drop in mean average\nprecision, while a 4:8 ratio increased training time by only 11.1% but improved\nmean average precision by a significant 23.7%. These results indicate that\nstrategic hyperparameter tuning can yield substantial improvements depending on\nthe resources of the user. 
This study provides valuable insights for optimizing\nLaneSegNet according to available computation power, making it more accessible\nfor users with limited resources and increasing the capabilities for users with\nmore powerful resources.\n","authors":["William Stevens","Vishal Urs","Karthik Selvaraj","Gabriel Torres","Gaurish Lakhanpal"],"pdf_url":"https://arxiv.org/pdf/2406.15946v2.pdf","comment":"18 pages, 16 figures"},{"id":"http://arxiv.org/abs/2407.21159v1","updated":"2024-07-30T19:52:49Z","published":"2024-07-30T19:52:49Z","title":"Embedding Space Selection for Detecting Memorization and Fingerprinting\n in Generative Models","summary":" In the rapidly evolving landscape of artificial intelligence, generative\nmodels such as Generative Adversarial Networks (GANs) and Diffusion Models have\nbecome cornerstone technologies, driving innovation in diverse fields from art\ncreation to healthcare. Despite their potential, these models face the\nsignificant challenge of data memorization, which poses risks to privacy and\nthe integrity of generated content. Among various metrics of memorization\ndetection, our study delves into the memorization scores calculated from\nencoder layer embeddings, which involves measuring distances between samples in\nthe embedding spaces. Particularly, we find that the memorization scores\ncalculated from layer embeddings of Vision Transformers (ViTs) show a notable\ntrend: the deeper the layer, the less memorization is measured. It\nhas been found that the memorization scores from the early layers' embeddings\nare more sensitive to low-level memorization (e.g. colors and simple patterns\nfor an image), while those from the later layers are more sensitive to\nhigh-level memorization (e.g. semantic meaning of an image). We also observe\nthat, for a specific model architecture, its degree of memorization on\ndifferent levels of information is unique. It can be viewed as an inherent\nproperty of the architecture. Building upon this insight, we introduce a unique\nfingerprinting methodology. This method capitalizes on the unique distributions\nof the memorization score across different layers of ViTs, providing a novel\napproach to identifying models involved in generating deepfakes and malicious\ncontent. Our approach demonstrates a marked 30% enhancement in identification\naccuracy over existing baseline methods, offering a more effective tool for\ncombating digital misinformation.\n","authors":["Jack He","Jianxing Zhao","Andrew Bai","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2407.21159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21150v1","updated":"2024-07-30T19:27:37Z","published":"2024-07-30T19:27:37Z","title":"PLANesT-3D: A new annotated dataset for segmentation of 3D plant point\n clouds","summary":" Creation of new annotated public datasets is crucial in helping advances in\n3D computer vision and machine learning meet their full potential for automatic\ninterpretation of 3D plant models. In this paper, we introduce PLANesT-3D; a\nnew annotated dataset of 3D color point clouds of plants. PLANesT-3D is\ncomposed of 34 point cloud models representing 34 real plants from three\ndifferent plant species: \\textit{Capsicum annuum}, \\textit{Rosa kordana}, and\n\\textit{Ribes rubrum}. Both semantic labels in terms of \"leaf\" and \"stem\", and\norgan instance labels were manually annotated for the full point clouds. 
As an\nadditional contribution, SP-LSCnet, a novel semantic segmentation method that\nis a combination of unsupervised superpoint extraction and a 3D point-based\ndeep learning approach is introduced and evaluated on the new dataset. Two\nexisting deep neural network architectures, PointNet++ and RoseSegNet were also\ntested on the point clouds of PLANesT-3D for semantic segmentation.\n","authors":["Kerem Mertoğlu","Yusuf Şalk","Server Karahan Sarıkaya","Kaya Turgut","Yasemin Evrenesoğlu","Hakan Çevikalp","Ömer Nezih Gerek","Helin Dutağacı","David Rousseau"],"pdf_url":"https://arxiv.org/pdf/2407.21150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21149v1","updated":"2024-07-30T19:23:29Z","published":"2024-07-30T19:23:29Z","title":"Domain Shift Analysis in Chest Radiographs Classification in a Veterans\n Healthcare Administration Population","summary":" Objectives: This study aims to assess the impact of domain shift on chest\nX-ray classification accuracy and to analyze the influence of ground truth\nlabel quality and demographic factors such as age group, sex, and study year.\nMaterials and Methods: We used a DenseNet121 model pretrained MIMIC-CXR dataset\nfor deep learning-based multilabel classification using ground truth labels\nfrom radiology reports extracted using the CheXpert and CheXbert Labeler. We\ncompared the performance of the 14 chest X-ray labels on the MIMIC-CXR and\nVeterans Healthcare Administration chest X-ray dataset (VA-CXR). The VA-CXR\ndataset comprises over 259k chest X-ray images spanning between the years 2010\nand 2022. Results: The validation of ground truth and the assessment of\nmulti-label classification performance across various NLP extraction tools\nrevealed that the VA-CXR dataset exhibited lower disagreement rates than the\nMIMIC-CXR datasets. Additionally, there were notable differences in AUC scores\nbetween models utilizing CheXpert and CheXbert. When evaluating multi-label\nclassification performance across different datasets, minimal domain shift was\nobserved in unseen datasets, except for the label \"Enlarged Cardiomediastinum.\"\nThe study year's subgroup analyses exhibited the most significant variations in\nmulti-label classification model performance. These findings underscore the\nimportance of considering domain shifts in chest X-ray classification tasks,\nparticularly concerning study years. Conclusion: Our study reveals the\nsignificant impact of domain shift and demographic factors on chest X-ray\nclassification, emphasizing the need for improved transfer learning and\nequitable model development. Addressing these challenges is crucial for\nadvancing medical imaging and enhancing patient care.\n","authors":["Mayanka Chandrashekar","Ian Goethert","Md Inzamam Ul Haque","Benjamin McMahon","Sayera Dhaubhadel","Kathryn Knight","Joseph Erdos","Donna Reagan","Caroline Taylor","Peter Kuzmak","John Michael Gaziano","Eileen McAllister","Lauren Costa","Yuk-Lam Ho","Kelly Cho","Suzanne Tamang","Samah Fodeh-Jarad","Olga S. Ovchinnikova","Amy C. Justice","Jacob Hinkle","Ioana Danciu"],"pdf_url":"https://arxiv.org/pdf/2407.21149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16972v2","updated":"2024-07-30T19:07:49Z","published":"2024-04-25T18:50:26Z","title":"CriSp: Leveraging Tread Depth Maps for Enhanced Crime-Scene Shoeprint\n Matching","summary":" Shoeprints are a common type of evidence found at crime scenes and are used\nregularly in forensic investigations. 
However, existing methods cannot\neffectively employ deep learning techniques to match noisy and occluded\ncrime-scene shoeprints to a shoe database due to a lack of training data.\nMoreover, all existing methods match crime-scene shoeprints to clean reference\nprints, yet our analysis shows matching to more informative tread depth maps\nyields better retrieval results. The matching task is further complicated by\nthe necessity to identify similarities only in corresponding regions (heels,\ntoes, etc) of prints and shoe treads. To overcome these challenges, we leverage\nshoe tread images from online retailers and utilize an off-the-shelf predictor\nto estimate depth maps and clean prints. Our method, named CriSp, matches\ncrime-scene shoeprints to tread depth maps by training on this data. CriSp\nincorporates data augmentation to simulate crime-scene shoeprints, an encoder\nto learn spatially-aware features, and a masking module to ensure only visible\nregions of crime-scene prints affect retrieval results. To validate our\napproach, we introduce two validation sets by reprocessing existing datasets of\ncrime-scene shoeprints and establish a benchmarking protocol for comparison. On\nthis benchmark, CriSp significantly outperforms state-of-the-art methods in\nboth automated shoeprint matching and image retrieval tailored to this task.\n","authors":["Samia Shafique","Shu Kong","Charless Fowlkes"],"pdf_url":"https://arxiv.org/pdf/2404.16972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21136v1","updated":"2024-07-30T18:57:06Z","published":"2024-07-30T18:57:06Z","title":"Adding Multi-modal Controls to Whole-body Human Motion Generation","summary":" Whole-body multi-modal motion generation, controlled by text, speech, or\nmusic, has numerous applications including video generation and character\nanimation. However, employing a unified model to accomplish various generation\ntasks with different condition modalities presents two main challenges: motion\ndistribution drifts across different generation scenarios and the complex\noptimization of mixed conditions with varying granularity. Furthermore,\ninconsistent motion formats in existing datasets further hinder effective\nmulti-modal motion generation. In this paper, we propose ControlMM, a unified\nframework to Control whole-body Multi-modal Motion generation in a\nplug-and-play manner. To effectively learn and transfer motion knowledge across\ndifferent motion distributions, we propose ControlMM-Attn, for parallel\nmodeling of static and dynamic human topology graphs. To handle conditions with\nvarying granularity, ControlMM employs a coarse-to-fine training strategy,\nincluding stage-1 text-to-motion pre-training for semantic generation and\nstage-2 multi-modal control adaptation for conditions of varying low-level\ngranularity. To address existing benchmarks' varying motion format limitations,\nwe introduce ControlMM-Bench, the first publicly available multi-modal\nwhole-body human motion generation benchmark based on the unified whole-body\nSMPL-X format. 
Extensive experiments show that ControlMM achieves\nstate-of-the-art performance across various standard motion generation tasks.\nOur website is at https://yxbian23.github.io/ControlMM.\n","authors":["Yuxuan Bian","Ailing Zeng","Xuan Ju","Xian Liu","Zhaoyang Zhang","Wei Liu","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2407.21136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21126v1","updated":"2024-07-30T18:37:59Z","published":"2024-07-30T18:37:59Z","title":"Self-supervised Multi-future Occupancy Forecasting for Autonomous\n Driving","summary":" Environment prediction frameworks are critical for the safe navigation of\nautonomous vehicles (AVs) in dynamic settings. LiDAR-generated occupancy grid\nmaps (L-OGMs) offer a robust bird's-eye view for the scene representation,\nenabling self-supervised joint scene predictions while exhibiting resilience to\npartial observability and perception detection failures. Prior approaches have\nfocused on deterministic L-OGM prediction architectures within the grid cell\nspace. While these methods have seen some success, they frequently produce\nunrealistic predictions and fail to capture the stochastic nature of the\nenvironment. Additionally, they do not effectively integrate additional sensor\nmodalities present in AVs. Our proposed framework performs stochastic L-OGM\nprediction in the latent space of a generative architecture and allows for\nconditioning on RGB cameras, maps, and planned trajectories. We decode\npredictions using either a single-step decoder, which provides high-quality\npredictions in real-time, or a diffusion-based batch decoder, which can further\nrefine the decoded frames to address temporal consistency issues and reduce\ncompression losses. Our experiments on the nuScenes and Waymo Open datasets\nshow that all variants of our approach qualitatively and quantitatively\noutperform prior approaches.\n","authors":["Bernard Lange","Masha Itkina","Jiachen Li","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2407.21126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21121v1","updated":"2024-07-30T18:24:46Z","published":"2024-07-30T18:24:46Z","title":"Taming the Frequency Factory of Sinusoidal Networks","summary":" This work investigates the structure and representation capacity of\n$sinusoidal$ MLPs, which have recently shown promising results in encoding\nlow-dimensional signals. This success can be attributed to its smoothness and\nhigh representation capacity. The first allows the use of the network's\nderivatives during training, enabling regularization. However, defining the\narchitecture and initializing its parameters to achieve a desired capacity\nremains an empirical task. This work provides theoretical and experimental\nresults justifying the capacity property of sinusoidal MLPs and offers control\nmechanisms for their initialization and training.\n We approach this from a Fourier series perspective and link the training with\nthe model's spectrum. Our analysis is based on a $harmonic$ expansion of the\nsinusoidal MLP, which says that the composition of sinusoidal layers produces a\nlarge number of new frequencies expressed as integer linear combinations of the\ninput frequencies (weights of the input layer). We use this novel $identity$ to\ninitialize the input neurons which work as a sampling in the signal spectrum.\nWe also note that each hidden neuron produces the same frequencies with\namplitudes completely determined by the hidden weights. 
Finally, we give an\nupper bound for these amplitudes, which results in a $bounding$ scheme for the\nnetwork's spectrum during training.\n","authors":["Tiago Novello","Diana Aldana","Luiz Velho"],"pdf_url":"https://arxiv.org/pdf/2407.21121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09429v3","updated":"2024-07-30T18:07:33Z","published":"2023-02-18T21:48:18Z","title":"NU-AIR -- A Neuromorphic Urban Aerial Dataset for Detection and\n Localization of Pedestrians and Vehicles","summary":" This paper presents an open-source aerial neuromorphic dataset that captures\npedestrians and vehicles moving in an urban environment. The dataset, titled\nNU-AIR, features 70.75 minutes of event footage acquired with a 640 x 480\nresolution neuromorphic sensor mounted on a quadrotor operating in an urban\nenvironment. Crowds of pedestrians, different types of vehicles, and street\nscenes featuring busy urban environments are captured at different elevations\nand illumination conditions. Manual bounding box annotations of vehicles and\npedestrians contained in the recordings are provided at a frequency of 30 Hz,\nyielding 93,204 labels in total. Evaluation of the dataset's fidelity is\nperformed through comprehensive ablation study for three Spiking Neural\nNetworks (SNNs) and training ten Deep Neural Networks (DNNs) to validate the\nquality and reliability of both the dataset and corresponding annotations. All\ndata and Python code to voxelize the data and subsequently train SNNs/DNNs has\nbeen open-sourced.\n","authors":["Craig Iaboni","Thomas Kelly","Pramod Abichandani"],"pdf_url":"https://arxiv.org/pdf/2302.09429v3.pdf","comment":"24 pages, 8 figures"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.20856v1","updated":"2024-07-30T14:31:53Z","published":"2024-07-30T14:31:53Z","title":"Learn by Selling: Equipping Large Language Models with Product Knowledge\n for Context-Driven Recommendations","summary":" The rapid evolution of large language models (LLMs) has opened up new\npossibilities for applications such as context-driven product recommendations.\nHowever, the effectiveness of these models in this context is heavily reliant\non their comprehensive understanding of the product inventory. This paper\npresents a novel approach to equipping LLMs with product knowledge by training\nthem to respond contextually to synthetic search queries that include product\nIDs. We delve into an extensive analysis of this method, evaluating its\neffectiveness, outlining its benefits, and highlighting its constraints. The\npaper also discusses the potential improvements and future directions for this\napproach, providing a comprehensive understanding of the role of LLMs in\nproduct recommendations.\n","authors":["Sarthak Anand","Yutong Jiang","Giorgi Kokaia"],"pdf_url":"https://arxiv.org/pdf/2407.20856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15599v2","updated":"2024-07-30T13:40:08Z","published":"2023-12-25T03:29:39Z","title":"Preliminary Study on Incremental Learning for Large Language Model-based\n Recommender Systems","summary":" Adapting Large Language Models for Recommendation (LLM4Rec) has shown\npromising results. However, the challenges of deploying LLM4Rec in real-world\nscenarios remain largely unexplored. 
In particular, recommender models need\nincremental adaptation to evolving user preferences, while the suitability of\ntraditional incremental learning methods within LLM4Rec remains ambiguous due\nto the unique characteristics of Large Language Models (LLMs).\n In this study, we empirically evaluate two commonly employed incremental\nlearning strategies (full retraining and fine-tuning) for LLM4Rec.\nSurprisingly, neither approach shows significant improvements in the\nperformance of LLM4Rec. Instead of dismissing the role of incremental learning,\nwe attribute the lack of anticipated performance enhancement to a mismatch\nbetween the LLM4Rec architecture and incremental learning: LLM4Rec employs a\nsingle adaptation module for learning recommendations, limiting its ability to\nsimultaneously capture long-term and short-term user preferences in the\nincremental learning context. To test this speculation, we introduce a Long-\nand Short-term Adaptation-aware Tuning (LSAT) framework for incremental\nlearning in LLM4Rec. Unlike the single adaptation module approach, LSAT\nutilizes two distinct adaptation modules to independently learn long-term and\nshort-term user preferences. Empirical results verify that LSAT enhances\nperformance, thereby validating our speculation. We release our code at:\nhttps://github.com/TianhaoShi2001/LSAT.\n","authors":["Tianhao Shi","Yang Zhang","Zhijian Xu","Chong Chen","Fuli Feng","Xiangnan He","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2312.15599v2.pdf","comment":"accepted in the short paper track of the 2024 ACM International\n Conference on Information and Knowledge Management (CIKM 2024)"},{"id":"http://arxiv.org/abs/2407.20750v1","updated":"2024-07-30T11:42:19Z","published":"2024-07-30T11:42:19Z","title":"JaColBERTv2.5: Optimising Multi-Vector Retrievers to Create\n State-of-the-Art Japanese Retrievers with Constrained Resources","summary":" Neural Information Retrieval has advanced rapidly in high-resource languages,\nbut progress in lower-resource ones such as Japanese has been hindered by data\nscarcity, among other challenges. Consequently, multilingual models have\ndominated Japanese retrieval, despite their computational inefficiencies and\ninability to capture linguistic nuances. While recent multi-vector monolingual\nmodels like JaColBERT have narrowed this gap, they still lag behind\nmultilingual methods in large-scale evaluations. This work addresses the\nsuboptimal training methods of multi-vector retrievers in lower-resource\nsettings, focusing on Japanese. We systematically evaluate and improve key\naspects of the inference and training settings of JaColBERT, and more broadly,\nmulti-vector models. We further enhance performance through a novel checkpoint\nmerging step, showcasing it to be an effective way of combining the benefits of\nfine-tuning with the generalization capabilities of the original checkpoint.\nBuilding on our analysis, we introduce a novel training recipe, resulting in\nthe JaColBERTv2.5 model. JaColBERTv2.5, with only 110 million parameters and\ntrained in under 15 hours on 4 A100 GPUs, significantly outperforms all\nexisting methods across all common benchmarks, reaching an average score of\n0.754, significantly above the previous best of 0.720. 
To support future\nresearch, we make our final models, intermediate checkpoints and all data used\npublicly available.\n","authors":["Benjamin Clavié"],"pdf_url":"https://arxiv.org/pdf/2407.20750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00019v3","updated":"2024-07-30T10:09:13Z","published":"2024-05-23T07:14:21Z","title":"EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively\n Exploring Electronic Health Records","summary":" In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL\ndataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to\naddress critical yet underexplored aspects in text-to-SQL parsing:\ninteractivity, compositionality, and efficiency. To the best of our knowledge,\nEHR-SeqSQL is not only the largest but also the first medical text-to-SQL\ndataset benchmark to include sequential and contextual questions. We provide a\ndata split and the new test set designed to assess compositional generalization\nability. Our experiments demonstrate the superiority of a multi-turn approach\nover a single-turn approach in learning compositionality. Additionally, our\ndataset integrates specially crafted tokens into SQL queries to improve\nexecution efficiency. With EHR-SeqSQL, we aim to bridge the gap between\npractical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is\navailable at https://github.com/seonhee99/EHR-SeqSQL.\n","authors":["Jaehee Ryu","Seonhee Cho","Gyubok Lee","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2406.00019v3.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2407.20684v1","updated":"2024-07-30T09:25:40Z","published":"2024-07-30T09:25:40Z","title":"RevGNN: Negative Sampling Enhanced Contrastive Graph Learning for\n Academic Reviewer Recommendation","summary":" Acquiring reviewers for academic submissions is a challenging recommendation\nscenario. Recent graph learning-driven models have made remarkable progress in\nthe field of recommendation, but their performance in the academic reviewer\nrecommendation task may suffer from a significant false negative issue. This\narises from the assumption that unobserved edges represent negative samples. In\nfact, the mechanism of anonymous review results in inadequate exposure of\ninteractions between reviewers and submissions, leading to a higher number of\nunobserved interactions compared to those caused by reviewers declining to\nparticipate. Therefore, investigating how to better comprehend the negative\nlabeling of unobserved interactions in academic reviewer recommendations is a\nsignificant challenge. This study aims to tackle the ambiguous nature of\nunobserved interactions in academic reviewer recommendations. Specifically, we\npropose an unsupervised Pseudo Neg-Label strategy to enhance graph contrastive\nlearning (GCL) for recommending reviewers for academic submissions, which we\ncall RevGNN. RevGNN utilizes a two-stage encoder structure that encodes both\nscientific knowledge and behavior using Pseudo Neg-Label to approximate review\npreference. Extensive experiments on three real-world datasets demonstrate that\nRevGNN outperforms all baselines across four metrics. 
Additionally, detailed\nfurther analyses confirm the effectiveness of each component in RevGNN.\n","authors":["Weibin Liao","Yifan Zhu","Yanyan Li","Qi Zhang","Zhonghong Ou","Xuesong Li"],"pdf_url":"https://arxiv.org/pdf/2407.20684v1.pdf","comment":"Accepted by ACM Transactions on Information Systems (TOIS)"},{"id":"http://arxiv.org/abs/2407.20665v1","updated":"2024-07-30T08:59:50Z","published":"2024-07-30T08:59:50Z","title":"Powerful A/B-Testing Metrics and Where to Find Them","summary":" Online controlled experiments, colloquially known as A/B-tests, are the bread\nand butter of real-world recommender system evaluation. Typically, end-users\nare randomly assigned some system variant, and a plethora of metrics are then\ntracked, collected, and aggregated throughout the experiment. A North Star\nmetric (e.g. long-term growth or revenue) is used to assess which system\nvariant should be deemed superior. As a result, most collected metrics are\nsupporting in nature, and serve to either (i) provide an understanding of how\nthe experiment impacts user experience, or (ii) allow for confident\ndecision-making when the North Star metric moves insignificantly (i.e. a false\nnegative or type-II error). The latter is not straightforward: suppose a\ntreatment variant leads to fewer but longer sessions, with more views but fewer\nengagements; should this be considered a positive or negative outcome?\n The question then becomes: how do we assess a supporting metric's utility\nwhen it comes to decision-making using A/B-testing? Online platforms typically\nrun dozens of experiments at any given time. This provides a wealth of\ninformation about interventions and treatment effects that can be used to\nevaluate metrics' utility for online evaluation. We propose to collect this\ninformation and leverage it to quantify type-I, type-II, and type-III errors\nfor the metrics of interest, alongside a distribution of measurements of their\nstatistical power (e.g. $z$-scores and $p$-values). We present results and\ninsights from building this pipeline at scale for two large-scale short-video\nplatforms: ShareChat and Moj; leveraging hundreds of past experiments to find\nonline metrics with high statistical power.\n","authors":["Olivier Jeunen","Shubham Baweja","Neeti Pokharna","Aleksei Ustimenko"],"pdf_url":"https://arxiv.org/pdf/2407.20665v1.pdf","comment":"Accepted to the Industry Track of the 2024 ACM Conference on\n Recommender Systems (RecSys '24)"},{"id":"http://arxiv.org/abs/2403.06372v2","updated":"2024-07-30T04:24:05Z","published":"2024-03-11T01:50:41Z","title":"Repeated Padding for Sequential Recommendation","summary":" Sequential recommendation aims to provide users with personalized suggestions\nbased on their historical interactions. 
When training sequential models,\npadding is a widely adopted technique for two main reasons: 1) The vast\nmajority of models can only handle fixed-length sequences; 2) Batching-based\ntraining needs to ensure that the sequences in each batch have the same length.\nThe special value \\emph{0} is usually used as the padding content, which does\nnot contain the actual information and is ignored in the model calculations.\nThis common-sense padding strategy leads us to a problem that has never been\nexplored before: \\emph{Can we fully utilize this idle input space by padding\nother content to further improve model performance and training efficiency?}\n In this paper, we propose a simple yet effective padding method called\n\\textbf{Rep}eated \\textbf{Pad}ding (\\textbf{RepPad}). Specifically, we use the\noriginal interaction sequences as the padding content and fill it to the\npadding positions during model training. This operation can be performed a\nfinite number of times or repeated until the input sequences' length reaches\nthe maximum limit. Our RepPad can be viewed as a sequence-level data\naugmentation strategy. Unlike most existing works, our method contains no\ntrainable parameters or hyperparameters and is a plug-and-play data\naugmentation operation. Extensive experiments on various categories of\nsequential models and five real-world datasets demonstrate the effectiveness\nand efficiency of our approach. The average recommendation performance\nimprovement is up to 60.3\\% on GRU4Rec and 24.3\\% on SASRec. We also provide\nin-depth analysis and explanation of what makes RepPad effective from multiple\nperspectives. Our datasets and codes are available at\n\\url{https://github.com/KingGugu/RepPad}.\n","authors":["Yizhou Dang","Yuting Liu","Enneng Yang","Guibing Guo","Linying Jiang","Xingwei Wang","Jianzhe Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.06372v2.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2402.03181v5","updated":"2024-07-30T02:47:47Z","published":"2024-02-05T16:46:16Z","title":"C-RAG: Certified Generation Risks for Retrieval-Augmented Language\n Models","summary":" Despite the impressive capabilities of large language models (LLMs) across\ndiverse applications, they still suffer from trustworthiness issues, such as\nhallucinations and misalignments. Retrieval-augmented language models (RAG)\nhave been proposed to enhance the credibility of generations by grounding\nexternal knowledge, but the theoretical understandings of their generation\nrisks remains unexplored. In this paper, we answer: 1) whether RAG can indeed\nlead to low generation risks, 2) how to provide provable guarantees on the\ngeneration risks of RAG and vanilla LLMs, and 3) what sufficient conditions\nenable RAG models to reduce generation risks. We propose C-RAG, the first\nframework to certify generation risks for RAG models. Specifically, we provide\nconformal risk analysis for RAG models and certify an upper confidence bound of\ngeneration risks, which we refer to as conformal generation risk. We also\nprovide theoretical guarantees on conformal generation risks for general\nbounded risk functions under test distribution shifts. We prove that RAG\nachieves a lower conformal generation risk than that of a single LLM when the\nquality of the retrieval model and transformer is non-trivial. 
Our intensive\nempirical results demonstrate the soundness and tightness of our conformal\ngeneration risk guarantees across four widely-used NLP datasets on four\nstate-of-the-art retrieval models.\n","authors":["Mintong Kang","Nezihe Merve Gürel","Ning Yu","Dawn Song","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2402.03181v5.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2405.16871v2","updated":"2024-07-30T00:59:51Z","published":"2024-05-27T06:39:36Z","title":"Multi-Behavior Generative Recommendation","summary":" Multi-behavior sequential recommendation (MBSR) aims to incorporate behavior\ntypes of interactions for better recommendations. Existing approaches focus on\nthe next-item prediction objective, neglecting the value of integrating the\ntarget behavior type into the learning objective. In this paper, we propose\nMBGen, a novel Multi-Behavior sequential Generative recommendation framework.\nWe formulate the MBSR task into a consecutive two-step process: (1) given item\nsequences, MBGen first predicts the next behavior type to frame the user\nintention, (2) given item sequences and a target behavior type, MBGen then\npredicts the next items. To model such a two-step process, we tokenize both\nbehaviors and items into tokens and construct one single token sequence with\nboth behaviors and items placed interleaved. Furthermore, MBGen learns to\nautoregressively generate the next behavior and item tokens in a unified\ngenerative recommendation paradigm, naturally enabling a multi-task capability.\nAdditionally, we exploit the heterogeneous nature of token sequences in the\ngenerative recommendation and propose a position-routed sparse architecture to\nefficiently and effectively scale up models. Extensive experiments on public\ndatasets demonstrate that MBGen significantly outperforms existing MBSR models\nacross multiple tasks.\n","authors":["Zihan Liu","Yupeng Hou","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2405.16871v2.pdf","comment":"Camera ready; accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2407.18553v2","updated":"2024-07-30T23:41:32Z","published":"2024-07-26T07:05:54Z","title":"REAPER: Reasoning based Retrieval Planning for Complex RAG Systems","summary":" Complex dialog systems often use retrieved evidence to facilitate factual\nresponses. Such RAG (Retrieval Augmented Generation) systems retrieve from\nmassive heterogeneous data stores that are usually architected as multiple\nindexes or APIs instead of a single monolithic source. For a given query,\nrelevant evidence needs to be retrieved from one or a small subset of possible\nretrieval sources. Complex queries can even require multi-step retrieval. For\nexample, a conversational agent on a retail site answering customer questions\nabout past orders will need to retrieve the appropriate customer order first\nand then the evidence relevant to the customer's question in the context of the\nordered product. Most RAG Agents handle such Chain-of-Thought (CoT) tasks by\ninterleaving reasoning and retrieval steps. However, each reasoning step\ndirectly adds to the latency of the system. For large models this latency cost\nis significant -- in the order of multiple seconds. Multi-agent systems may\nclassify the query to a single Agent associated with a retrieval source, though\nthis means that a (small) classification model dictates the performance of a\nlarge language model. 
In this work we present REAPER (REAsoning-based PlannER)\n- an LLM based planner to generate retrieval plans in conversational systems.\nWe show significant gains in latency over Agent-based systems and are able to\nscale easily to new and unseen use cases as compared to classification-based\nplanning. Though our method can be applied to any RAG system, we show our\nresults in the context of a conversational shopping assistant.\n","authors":["Ashutosh Joshi","Sheikh Muhammad Sarwar","Samarth Varshney","Sreyashi Nag","Shrivats Agrawal","Juhi Naik"],"pdf_url":"https://arxiv.org/pdf/2407.18553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21191v1","updated":"2024-07-30T20:58:36Z","published":"2024-07-30T20:58:36Z","title":"GenRec: Generative Personalized Sequential Recommendation","summary":" Sequential recommendation is a task to capture hidden user preferences from\nhistorical user item interaction data. Significant progress has been made in\nthis domain by leveraging classification based learning methods. Inspired by\nthe recent paradigm of 'pretrain, prompt and predict' in NLP, we consider\nsequential recommendation as a sequence to sequence generation task and propose\na novel model named Generative Recommendation (GenRec). Unlike classification\nbased models that learn explicit user and item representations, GenRec utilizes\nthe sequence modeling capability of Transformer and adopts the masked item\nprediction objective to effectively learn the hidden bidirectional sequential\npatterns. Different from existing generative sequential recommendation models,\nGenRec does not rely on manually designed hard prompts. The input to GenRec is\ntextual user item sequence and the output is top ranked next items. Moreover,\nGenRec is lightweight and requires only a few hours to train effectively in\nlow-resource settings, making it highly applicable to real-world scenarios and\nhelping to democratize large language models in the sequential recommendation\ndomain. Our extensive experiments have demonstrated that GenRec generalizes on\nvarious public real-world datasets and achieves state-of-the-art results. Our\nexperiments also validate the effectiveness of the the proposed masked item\nprediction objective that improves the model performance by a large margin.\n","authors":["Panfeng Cao","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2407.21191v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.21011v1","updated":"2024-07-30T17:57:32Z","published":"2024-07-30T17:57:32Z","title":"CLEFT: Language-Image Contrastive Learning with Efficient Large Language\n Model and Prompt Fine-Tuning","summary":" Recent advancements in Contrastive Language-Image Pre-training (CLIP) have\ndemonstrated notable success in self-supervised representation learning across\nvarious tasks. However, the existing CLIP-like approaches often demand\nextensive GPU resources and prolonged training times due to the considerable\nsize of the model and dataset, making them poor for medical applications, in\nwhich large datasets are not always common. Meanwhile, the language model\nprompts are mainly manually derived from labels tied to images, potentially\noverlooking the richness of information within training samples. We introduce a\nnovel language-image Contrastive Learning method with an Efficient large\nlanguage model and prompt Fine-Tuning (CLEFT) that harnesses the strengths of\nthe extensive pre-trained language and visual models. 
Furthermore, we present\nan efficient strategy for learning context-based prompts that mitigates the gap\nbetween informative clinical diagnostic data and simple class labels. Our\nmethod demonstrates state-of-the-art performance on multiple chest X-ray and\nmammography datasets compared with various baselines. The proposed parameter\nefficient framework can reduce the total trainable model size by 39% and reduce\nthe trainable language model to only 4% compared with the current BERT encoder.\n","authors":["Yuexi Du","Brian Chang","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2407.21011v1.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.21001v1","updated":"2024-07-30T17:46:06Z","published":"2024-07-30T17:46:06Z","title":"GABInsight: Exploring Gender-Activity Binding Bias in Vision-Language\n Models","summary":" Vision-language models (VLMs) are intensively used in many downstream tasks,\nincluding those requiring assessments of individuals appearing in the images.\nWhile VLMs perform well in simple single-person scenarios, in real-world\napplications, we often face complex situations in which there are persons of\ndifferent genders doing different activities. We show that in such cases, VLMs\nare biased towards identifying the individual with the expected gender\n(according to ingrained gender stereotypes in the model or other forms of\nsample selection bias) as the performer of the activity. We refer to this bias\nin associating an activity with the gender of its actual performer in an image\nor text as the Gender-Activity Binding (GAB) bias and analyze how this bias is\ninternalized in VLMs. To assess this bias, we have introduced the GAB dataset\nwith approximately 5500 AI-generated images that represent a variety of\nactivities, addressing the scarcity of real-world images for some scenarios. To\nhave extensive quality control, the generated images are evaluated for their\ndiversity, quality, and realism. We have tested 12 renowned pre-trained VLMs on\nthis dataset in the context of text-to-image and image-to-text retrieval to\nmeasure the effect of this bias on their predictions. Additionally, we have\ncarried out supplementary experiments to quantify the bias in VLMs' text\nencoders and to evaluate VLMs' capability to recognize activities. Our\nexperiments indicate that VLMs experience an average performance decline of\nabout 13.2% when confronted with gender-activity binding bias.\n","authors":["Ali Abdollahi","Mahdi Ghaznavi","Mohammad Reza Karimi Nejad","Arash Mari Oriyad","Reza Abbasi","Ali Salesi","Melika Behjati","Mohammad Hossein Rohban","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2407.21001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20999v1","updated":"2024-07-30T17:38:24Z","published":"2024-07-30T17:38:24Z","title":"MoFO: Momentum-Filtered Optimizer for Mitigating Forgetting in LLM\n Fine-Tuning","summary":" Recently, large language models (LLMs) have demonstrated remarkable\ncapabilities in a wide range of tasks. Typically, an LLM is pre-trained on\nlarge corpora and subsequently fine-tuned on task-specific datasets. However,\nduring finetuning, LLMs may forget the knowledge acquired in the pretraining\nstage, leading to a decline in general capabilities. To address this issue, we\npropose a new fine-tuning algorithm termed Momentum-Filtered Optimizer (MoFO).\nThe key idea of MoFO is to iteratively select and update the model parameters\nwith the largest momentum magnitudes. 
Compared to full-parameter training, MoFO\nachieves similar fine-tuning performance while keeping parameters closer to the\npre-trained model, thereby mitigating knowledge forgetting. Unlike most\nexisting methods for forgetting mitigation, MoFO combines the following two\nadvantages. First, MoFO does not require access to pre-training data. This\nmakes MoFO particularly suitable for fine-tuning scenarios where pre-training\ndata is unavailable, such as fine-tuning checkpoint-only open-source LLMs.\nSecond, MoFO does not alter the original loss function. This could avoid\nimpairing the model performance on the fine-tuning tasks. We validate MoFO\nthrough rigorous convergence analysis and extensive experiments, demonstrating\nits superiority over existing methods in mitigating forgetting and enhancing\nfine-tuning performance.\n","authors":["Yupeng Chen","Senmiao Wang","Zhihang Lin","Zeyu Qin","Yushun Zhang","Tian Ding","Ruoyu Sun"],"pdf_url":"https://arxiv.org/pdf/2407.20999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20990v1","updated":"2024-07-30T17:27:20Z","published":"2024-07-30T17:27:20Z","title":"From Feature Importance to Natural Language Explanations Using LLMs with\n RAG","summary":" As machine learning becomes increasingly integral to autonomous\ndecision-making processes involving human interaction, the necessity of\ncomprehending the model's outputs through conversational means increases. Most\nrecently, foundation models are being explored for their potential as post hoc\nexplainers, providing a pathway to elucidate the decision-making mechanisms of\npredictive models. In this work, we introduce traceable question-answering,\nleveraging an external knowledge repository to inform the responses of Large\nLanguage Models (LLMs) to user queries within a scene understanding task. This\nknowledge repository comprises contextual details regarding the model's output,\ncontaining high-level features, feature importance, and alternative\nprobabilities. We employ subtractive counterfactual reasoning to compute\nfeature importance, a method that entails analysing output variations resulting\nfrom decomposing semantic features. Furthermore, to maintain a seamless\nconversational flow, we integrate four key characteristics - social, causal,\nselective, and contrastive - drawn from social science research on human\nexplanations into a single-shot prompt, guiding the response generation\nprocess. Our evaluation demonstrates that explanations generated by the LLMs\nencompassed these elements, indicating its potential to bridge the gap between\ncomplex model outputs and natural language expressions.\n","authors":["Sule Tekkesinoglu","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2407.20990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19985v2","updated":"2024-07-30T17:26:22Z","published":"2024-07-29T13:19:31Z","title":"Mixture of Nested Experts: Adaptive Processing of Visual Tokens","summary":" The visual medium (images and videos) naturally contains a large amount of\ninformation redundancy, thereby providing a great opportunity for leveraging\nefficiency in processing. While Vision Transformer (ViT) based models scale\neffectively to large data regimes, they fail to capitalize on this inherent\nredundancy, leading to higher computational costs. Mixture of Experts (MoE)\nnetworks demonstrate scalability while maintaining same inference-time costs,\nbut they come with a larger parameter footprint. 
We present Mixture of Nested\nExperts (MoNE), which utilizes a nested structure for experts, wherein\nindividual experts fall on an increasing compute-accuracy curve. Given a\ncompute budget, MoNE learns to dynamically choose tokens in a priority order,\nand thus redundant tokens are processed through cheaper nested experts. Using\nthis framework, we achieve equivalent performance as the baseline models, while\nreducing inference time compute by over two-fold. We validate our approach on\nstandard image and video datasets - ImageNet-21K, Kinetics400, and\nSomething-Something-v2. We further highlight MoNE$'$s adaptability by\nshowcasing its ability to maintain strong performance across different\ninference-time compute budgets on videos, using only a single trained model.\n","authors":["Gagan Jain","Nidhi Hegde","Aditya Kusupati","Arsha Nagrani","Shyamal Buch","Prateek Jain","Anurag Arnab","Sujoy Paul"],"pdf_url":"https://arxiv.org/pdf/2407.19985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20989v1","updated":"2024-07-30T17:26:16Z","published":"2024-07-30T17:26:16Z","title":"Contrasting Deep Learning Models for Direct Respiratory Insufficiency\n Detection Versus Blood Oxygen Saturation Estimation","summary":" We contrast high effectiveness of state of the art deep learning\narchitectures designed for general audio classification tasks, refined for\nrespiratory insufficiency (RI) detection and blood oxygen saturation (SpO2)\nestimation and classification through automated audio analysis. Recently,\nmultiple deep learning architectures have been proposed to detect RI in COVID\npatients through audio analysis, achieving accuracy above 95% and F1-score\nabove 0.93. RI is a condition associated with low SpO2 levels, commonly defined\nas the threshold SpO2 <92%. While SpO2 serves as a crucial determinant of RI, a\nmedical doctor's diagnosis typically relies on multiple factors. These include\nrespiratory frequency, heart rate, SpO2 levels, among others. Here we study\npretrained audio neural networks (CNN6, CNN10 and CNN14) and the Masked\nAutoencoder (Audio-MAE) for RI detection, where these models achieve near\nperfect accuracy, surpassing previous results. Yet, for the regression task of\nestimating SpO2 levels, the models achieve root mean square error values\nexceeding the accepted clinical range of 3.5% for finger oximeters.\nAdditionally, Pearson correlation coefficients fail to surpass 0.3. As deep\nlearning models perform better in classification than regression, we transform\nSpO2-regression into a SpO2-threshold binary classification problem, with a\nthreshold of 92%. However, this task still yields an F1-score below 0.65. 
Thus,\naudio analysis offers valuable insights into a patient's RI status, but does\nnot provide accurate information about actual SpO2 levels, indicating a\nseparation of domains in which voice and speech biomarkers may and may not be\nuseful in medical diagnostics under current technologies.\n","authors":["Marcelo Matheus Gauy","Natalia Hitomi Koza","Ricardo Mikio Morita","Gabriel Rocha Stanzione","Arnaldo Candido Junior","Larissa Cristina Berti","Anna Sara Shafferman Levin","Ester Cerdeira Sabino","Flaviane Romani Fernandes Svartman","Marcelo Finger"],"pdf_url":"https://arxiv.org/pdf/2407.20989v1.pdf","comment":"23 pages, 4 figures, in review at Journal of Biomedical Signal\n Processing and Control"},{"id":"http://arxiv.org/abs/2407.20959v1","updated":"2024-07-30T16:36:15Z","published":"2024-07-30T16:36:15Z","title":"Learning Ordinality in Semantic Segmentation","summary":" Semantic segmentation consists of predicting a semantic label for each image\npixel. Conventional deep learning models do not take advantage of ordinal\nrelations that might exist in the domain at hand. For example, it is known that\nthe pupil is inside the iris, and the lane markings are inside the road. Such\ndomain knowledge can be employed as constraints to make the model more robust.\nThe current literature on this topic has explored pixel-wise ordinal\nsegmentation methods, which treat each pixel as an independent observation and\npromote ordinality in its representation. This paper proposes novel spatial\nordinal segmentation methods, which take advantage of the structured image\nspace by considering each pixel as an observation dependent on its neighborhood\ncontext to also promote ordinal spatial consistency. When evaluated with five\nbiomedical datasets and multiple configurations of autonomous driving datasets,\nordinal methods resulted in more ordinally-consistent models, with substantial\nimprovements in ordinal metrics and some increase in the Dice coefficient. It\nwas also shown that the incorporation of ordinal consistency results in models\nwith better generalization abilities.\n","authors":["Rafael Cristino","Ricardo P. M. Cruz","Jaime S. Cardoso"],"pdf_url":"https://arxiv.org/pdf/2407.20959v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2301.03709v2","updated":"2024-07-30T16:31:46Z","published":"2023-01-09T22:47:12Z","title":"Transfer learning for conflict and duplicate detection in software\n requirement pairs","summary":" Consistent and holistic expression of software requirements is important for\nthe success of software projects. In this study, we aim to enhance the\nefficiency of the software development processes by automatically identifying\nconflicting and duplicate software requirement specifications. We formulate the\nconflict and duplicate detection problem as a requirement pair classification\ntask. We design a novel transformers-based architecture, SR-BERT, which\nincorporates Sentence-BERT and Bi-encoders for the conflict and duplicate\nidentification task. Furthermore, we apply supervised multi-stage fine-tuning\nto the pre-trained transformer models. We test the performance of different\ntransfer models using four different datasets. We find that sequentially\ntrained and fine-tuned transformer models perform well across the datasets with\nSR-BERT achieving the best performance for larger datasets. We also explore the\ncross-domain performance of conflict detection models and adopt a rule-based\nfiltering approach to validate the model classifications. 
Our analysis\nindicates that the sentence pair classification approach and the proposed\ntransformer-based natural language processing strategies can contribute\nsignificantly to achieving automation in conflict and duplicate detection\n","authors":["Garima Malik","Savas Yildirim","Mucahit Cevik","Ayse Bener","Devang Parikh"],"pdf_url":"https://arxiv.org/pdf/2301.03709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20956v1","updated":"2024-07-30T16:30:09Z","published":"2024-07-30T16:30:09Z","title":"An Effective Dynamic Gradient Calibration Method for Continual Learning","summary":" Continual learning (CL) is a fundamental topic in machine learning, where the\ngoal is to train a model with continuously incoming data and tasks. Due to the\nmemory limit, we cannot store all the historical data, and therefore confront\nthe ``catastrophic forgetting'' problem, i.e., the performance on the previous\ntasks can substantially decrease because of the missing information in the\nlatter period. Though a number of elegant methods have been proposed, the\ncatastrophic forgetting phenomenon still cannot be well avoided in practice. In\nthis paper, we study the problem from the gradient perspective, where our aim\nis to develop an effective algorithm to calibrate the gradient in each updating\nstep of the model; namely, our goal is to guide the model to be updated in the\nright direction under the situation that a large amount of historical data are\nunavailable. Our idea is partly inspired by the seminal stochastic variance\nreduction methods (e.g., SVRG and SAGA) for reducing the variance of gradient\nestimation in stochastic gradient descent algorithms. Another benefit is that\nour approach can be used as a general tool, which is able to be incorporated\nwith several existing popular CL methods to achieve better performance. We also\nconduct a set of experiments on several benchmark datasets to evaluate the\nperformance in practice.\n","authors":["Weichen Lin","Jiaxiang Chen","Ruomin Huang","Hu Ding"],"pdf_url":"https://arxiv.org/pdf/2407.20956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02209v3","updated":"2024-07-30T16:16:45Z","published":"2024-02-03T16:45:31Z","title":"On the Exploitation of DCT-Traces in the Generative-AI Domain","summary":" Deepfakes represent one of the toughest challenges in the world of\nCybersecurity and Digital Forensics, especially considering the high-quality\nresults obtained with recent generative AI-based solutions. Almost all\ngenerative models leave unique traces in synthetic data that, if analyzed and\nidentified in detail, can be exploited to improve the generalization\nlimitations of existing deepfake detectors. In this paper we analyzed deepfake\nimages in the frequency domain generated by both GAN and Diffusion Model\nengines, examining in detail the underlying statistical distribution of\nDiscrete Cosine Transform (DCT) coefficients. Recognizing that not all\ncoefficients contribute equally to image detection, we hypothesize the\nexistence of a unique ``discriminative fingerprint\", embedded in specific\ncombinations of coefficients. To identify them, Machine Learning classifiers\nwere trained on various combinations of coefficients. In addition, the\nExplainable AI (XAI) LIME algorithm was used to search for intrinsic\ndiscriminative combinations of coefficients. Finally, we performed a robustness\ntest to analyze the persistence of traces by applying JPEG compression. 
The\nexperimental results reveal the existence of traces left by the generative\nmodels that are more discriminative and persistent at JPEG attacks. Code and\ndataset are available at https://github.com/opontorno/dcts_analysis_deepfakes.\n","authors":["Orazio Pontorno","Luca Guarnera","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2402.02209v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20917v1","updated":"2024-07-30T15:54:18Z","published":"2024-07-30T15:54:18Z","title":"How to Choose a Reinforcement-Learning Algorithm","summary":" The field of reinforcement learning offers a large variety of concepts and\nmethods to tackle sequential decision-making problems. This variety has become\nso large that choosing an algorithm for a task at hand can be challenging. In\nthis work, we streamline the process of choosing reinforcement-learning\nalgorithms and action-distribution families. We provide a structured overview\nof existing methods and their properties, as well as guidelines for when to\nchoose which methods. An interactive version of these guidelines is available\nonline at https://rl-picker.github.io/.\n","authors":["Fabian Bongratz","Vladimir Golkov","Lukas Mautner","Luca Della Libera","Frederik Heetmeyer","Felix Czaja","Julian Rodemann","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2407.20917v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2303.14281v2","updated":"2024-07-30T15:42:59Z","published":"2023-03-24T21:39:06Z","title":"Sequential Knockoffs for Variable Selection in Reinforcement Learning","summary":" In real-world applications of reinforcement learning, it is often challenging\nto obtain a state representation that is parsimonious and satisfies the Markov\nproperty without prior knowledge. Consequently, it is common practice to\nconstruct a state larger than necessary, e.g., by concatenating measurements\nover contiguous time points. However, needlessly increasing the dimension of\nthe state may slow learning and obfuscate the learned policy. We introduce the\nnotion of a minimal sufficient state in a Markov decision process (MDP) as the\nsubvector of the original state under which the process remains an MDP and\nshares the same reward function as the original process. We propose a novel\nSEquEntial Knockoffs (SEEK) algorithm that estimates the minimal sufficient\nstate in a system with high-dimensional complex nonlinear dynamics. In large\nsamples, the proposed method achieves selection consistency. As the method is\nagnostic to the reinforcement learning algorithm being applied, it benefits\ndownstream tasks such as policy learning. Empirical experiments verify\ntheoretical results and show the proposed approach outperforms several\ncompeting methods regarding variable selection accuracy and regret.\n","authors":["Tao Ma","Jin Zhu","Hengrui Cai","Zhengling Qi","Yunxiao Chen","Chengchun Shi","Eric B. Laber"],"pdf_url":"https://arxiv.org/pdf/2303.14281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20912v1","updated":"2024-07-30T15:38:14Z","published":"2024-07-30T15:38:14Z","title":"What Are Good Positional Encodings for Directed Graphs?","summary":" Positional encodings (PE) for graphs are essential in constructing powerful\nand expressive graph neural networks and graph transformers as they effectively\ncapture relative spatial relations between nodes. 
While PEs for undirected\ngraphs have been extensively studied, those for directed graphs remain largely\nunexplored, despite the fundamental role of directed graphs in representing\nentities with strong logical dependencies, such as those in program analysis\nand circuit designs. This work studies the design of PEs for directed graphs\nthat are expressive enough to represent desired directed spatial relations. We first\npropose walk profile, a generalization of walk counting sequence to directed\ngraphs. We identify limitations in existing PE methods, including symmetrized\nLaplacian PE, Singular Value Decomposition PE, and Magnetic Laplacian PE, in\ntheir ability to express walk profiles. To address these limitations, we\npropose the Multi-q Magnetic Laplacian PE, which extends Magnetic Laplacian PE\nwith multiple potential factors. This simple variant turns out to be capable of\nprovably expressing walk profiles. Furthermore, we generalize previous\nbasis-invariant and stable networks to handle complex-domain PEs decomposed\nfrom Magnetic Laplacians. Our numerical experiments demonstrate the\neffectiveness of Multi-q Magnetic Laplacian PE with a stable neural\narchitecture, outperforming previous PE methods (with stable networks) on\npredicting directed distances/walk profiles, sorting network satisfiability,\nand on general circuit benchmarks. Our code is available at\nhttps://github.com/Graph-COM/Multi-q-Maglap.\n","authors":["Yinan Huang","Haoyu Wang","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2407.20912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20902v1","updated":"2024-07-30T15:24:27Z","published":"2024-07-30T15:24:27Z","title":"Machine learning surrogates for efficient hydrologic modeling: Insights\n from stochastic simulations of managed aquifer recharge","summary":" Process-based hydrologic models are invaluable tools for understanding the\nterrestrial water cycle and addressing modern water resources problems.\nHowever, many hydrologic models are computationally expensive and, depending on\nthe resolution and scale, simulations can take on the order of hours to days to\ncomplete. While techniques such as uncertainty quantification and optimization\nhave become valuable tools for supporting management decisions, these analyses\ntypically require hundreds of model simulations, which are too computationally\nexpensive to perform with a process-based hydrologic model. To address this\ngap, we propose a hybrid modeling workflow in which a process-based model is\nused to generate an initial set of simulations and a machine learning (ML)\nsurrogate model is then trained to perform the remaining simulations required\nfor downstream analysis. As a case study, we apply this workflow to simulations\nof variably saturated groundwater flow at a prospective managed aquifer\nrecharge (MAR) site. We compare the accuracy and computational efficiency of\nseveral ML architectures, including deep convolutional networks, recurrent\nneural networks, vision transformers, and networks with Fourier transforms. Our\nresults demonstrate that ML surrogate models can achieve under 10% mean\nabsolute percentage error and yield order-of-magnitude runtime savings over\nprocess-based models. 
We also offer practical recommendations for training\nhydrologic surrogate models, including implementing data normalization to\nimprove accuracy, using a normalized loss function to improve training\nstability, and downsampling input features to decrease memory requirements.\n","authors":["Timothy Dai","Kate Maher","Zach Perzan"],"pdf_url":"https://arxiv.org/pdf/2407.20902v1.pdf","comment":"32 pages, 14 figures, 11 tables"},{"id":"http://arxiv.org/abs/2407.04522v2","updated":"2024-07-30T15:14:37Z","published":"2024-07-05T14:07:15Z","title":"Graph Reinforcement Learning in Power Grids: A Survey","summary":" The challenges posed by renewable energy and distributed electricity\ngeneration motivate the development of deep learning approaches to overcome the\nlack of flexibility of traditional methods in power grid use cases. The\napplication of GNNs is particularly promising due to their ability to learn\nfrom graph-structured data present in power grids. Combined with RL, they can\nserve as control approaches to determine remedial grid actions. This review\nanalyses the ability of GRL to capture the inherent graph structure of power\ngrids to improve representation learning and decision making in different power\ngrid use cases. It distinguishes between common problems in transmission and\ndistribution grids and explores the synergy between RL and GNNs. In\ntransmission grids, GRL typically addresses automated grid management and\ntopology control, whereas on the distribution side, GRL concentrates more on\nvoltage regulation. We analyzed the selected papers based on their graph\nstructure and GNN model, the applied RL algorithm, and their overall\ncontributions. Although GRL demonstrates adaptability in the face of\nunpredictable events and noisy or incomplete data, it primarily serves as a\nproof of concept at this stage. There are multiple open challenges and\nlimitations that need to be addressed when considering the application of RL to\nreal power grid operation.\n","authors":["Mohamed Hassouna","Clara Holzhüter","Pawel Lytaev","Josephine Thomas","Bernhard Sick","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2407.04522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20893v1","updated":"2024-07-30T15:12:29Z","published":"2024-07-30T15:12:29Z","title":"MambaCapsule: Towards Transparent Cardiac Disease Diagnosis with\n Electrocardiography Using Mamba Capsule Network","summary":" Cardiac arrhythmia, a condition characterized by irregular heartbeats, often\nserves as an early indication of various heart ailments. With the advent of\ndeep learning, numerous innovative models have been introduced for diagnosing\narrhythmias using Electrocardiogram (ECG) signals. However, recent studies\nsolely focus on the performance of models, neglecting the interpretation of\ntheir results. This leads to a considerable lack of transparency, posing a\nsignificant risk in the actual diagnostic process. To solve this problem, this\npaper introduces MambaCapsule, a deep neural network for ECG arrhythmia\nclassification, which increases the explainability of the model while enhancing\nthe accuracy. Our model utilizes Mamba for feature extraction and Capsule\nnetworks for prediction, providing not only a confidence score but also signal\nfeatures. Akin to the processing mechanism of the human brain, the model learns\nsignal features and the relationships between them by reconstructing ECG\nsignals in the predicted selection. 
The model evaluation was conducted on the\nMIT-BIH and PTB datasets, following the AAMI standard. MambaCapsule has achieved\na total accuracy of 99.54% and 99.59% on the test sets, respectively. These\nresults demonstrate the model's promising performance under the standard test\nprotocol.\n","authors":["Yinlong Xu","Xiaoqiang Liu","Zitai Kong","Yixuan Wu","Yue Wang","Yingzhou Lu","Honghao Gao","Jian Wu","Hongxia Xu"],"pdf_url":"https://arxiv.org/pdf/2407.20893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20891v1","updated":"2024-07-30T15:07:13Z","published":"2024-07-30T15:07:13Z","title":"Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian\n Neural Networks","summary":" Computational complexity of Bayesian learning is impeding its adoption in\npractical, large-scale tasks. Despite demonstrations of significant merits such\nas improved robustness and resilience to unseen or out-of-distribution inputs\nover their non-Bayesian counterparts, their practical use has faded to near\ninsignificance. In this study, we introduce an innovative framework to mitigate\nthe computational burden of Bayesian neural networks (BNNs). Our approach\nfollows the principle of Bayesian techniques based on deep ensembles, but\nsignificantly reduces their cost via multiple low-rank perturbations of\nparameters arising from a pre-trained neural network. Both the vanilla version of\nensembles as well as more sophisticated schemes such as Bayesian learning with\nStein Variational Gradient Descent (SVGD), previously deemed impractical for\nlarge models, can be seamlessly implemented within the proposed framework,\ncalled Bayesian Low-Rank LeArning (Bella). In a nutshell, i) Bella achieves a\ndramatic reduction in the number of trainable parameters required to\napproximate a Bayesian posterior; and ii) it not only maintains, but in some\ninstances, surpasses the performance of conventional Bayesian learning methods\nand non-Bayesian baselines. Our results with large-scale tasks such as\nImageNet, CAMELYON17, DomainNet, VQA with CLIP, LLaVA demonstrate the\neffectiveness and versatility of Bella in building highly scalable and\npractical Bayesian deep models for real-world applications.\n","authors":["Bao Gia Doan","Afshar Shamsi","Xiao-Yu Guo","Arash Mohammadi","Hamid Alinejad-Rokny","Dino Sejdinovic","Damith C. Ranasinghe","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.20891v1.pdf","comment":"25 pages, 14 figures, 11 tables"},{"id":"http://arxiv.org/abs/2402.17698v2","updated":"2024-07-30T15:06:44Z","published":"2024-02-27T17:21:10Z","title":"Learning reduced-order Quadratic-Linear models in Process Engineering\n using Operator Inference","summary":" In this work, we address the challenge of efficiently modeling dynamical\nsystems in process engineering. We use reduced-order model learning,\nspecifically operator inference. This is a non-intrusive, data-driven method\nfor learning dynamical systems from time-domain data. The application in our\nstudy is carbon dioxide methanation, an important reaction within the\nPower-to-X framework, to demonstrate its potential. The numerical results show\nthe ability of the reduced-order models constructed with operator inference to\nprovide a reduced yet accurate surrogate solution. 
This represents an important\nmilestone towards the implementation of fast and reliable digital twin\narchitectures.\n","authors":["Ion Victor Gosea","Luisa Peterson","Pawan Goyal","Jens Bremer","Kai Sundmacher","Peter Benner"],"pdf_url":"https://arxiv.org/pdf/2402.17698v2.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.11960v3","updated":"2024-07-30T15:02:51Z","published":"2023-10-18T13:40:41Z","title":"Fast Multipole Attention: A Divide-and-Conquer Attention Mechanism for\n Long Sequences","summary":" Transformer-based models have achieved state-of-the-art performance in many\nareas. However, the quadratic complexity of self-attention with respect to the\ninput length hinders the applicability of Transformer-based models to long\nsequences. To address this, we present Fast Multipole Attention, a new\nattention mechanism that uses a divide-and-conquer strategy to reduce the time\nand memory complexity of attention for sequences of length $n$ from\n$\\mathcal{O}(n^2)$ to $\\mathcal{O}(n \\log n)$ or $O(n)$, while retaining a\nglobal receptive field. The hierarchical approach groups queries, keys, and\nvalues into $\\mathcal{O}( \\log n)$ levels of resolution, where groups at\ngreater distances are increasingly larger in size and the weights to compute\ngroup quantities are learned. As such, the interaction between tokens far from\neach other is considered in lower resolution in an efficient hierarchical\nmanner. The overall complexity of Fast Multipole Attention is $\\mathcal{O}(n)$\nor $\\mathcal{O}(n \\log n)$, depending on whether the queries are down-sampled\nor not. This multi-level divide-and-conquer strategy is inspired by fast\nsummation methods from $n$-body physics and the Fast Multipole Method. We\nperform evaluation on autoregressive and bidirectional language modeling tasks\nand compare our Fast Multipole Attention model with other efficient attention\nvariants on medium-size datasets. We find empirically that the Fast Multipole\nTransformer performs much better than other efficient transformers in terms of\nmemory size and accuracy. The Fast Multipole Attention mechanism has the\npotential to empower large language models with much greater sequence lengths,\ntaking the full context into account in an efficient, naturally hierarchical\nmanner during training and when generating long sequences.\n","authors":["Yanming Kang","Giang Tran","Hans De Sterck"],"pdf_url":"https://arxiv.org/pdf/2310.11960v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03151v2","updated":"2024-07-30T14:54:05Z","published":"2024-01-06T08:04:13Z","title":"Semi-supervised learning via DQN for log anomaly detection","summary":" Log anomaly detection is a critical component in modern software system\nsecurity and maintenance, serving as a crucial support and basis for system\nmonitoring, operation, and troubleshooting. It aids operations personnel in\ntimely identification and resolution of issues. However, current methods in log\nanomaly detection still face challenges such as underutilization of unlabeled\ndata, imbalance between normal and anomaly class data, and high rates of false\npositives and false negatives, leading to insufficient effectiveness in anomaly\nrecognition. In this study, we propose a semi-supervised log anomaly detection\nmethod named DQNLog, which integrates deep reinforcement learning to enhance\nanomaly detection performance by leveraging a small amount of labeled data and\nlarge-scale unlabeled data. 
To address issues of imbalanced data and\ninsufficient labeling, we design a state transition function biased towards\nanomalies based on cosine similarity, aiming to capture semantic-similar\nanomalies rather than favoring the majority class. To enhance the model's\ncapability in learning anomalies, we devise a joint reward function that\nencourages the model to utilize labeled anomalies and explore unlabeled\nanomalies, thereby reducing false positives and false negatives. Additionally,\nto prevent the model from deviating from normal trajectories due to\nmisestimation, we introduce a regularization term in the loss function to\nensure the model retains prior knowledge during updates. We evaluate DQNLog on\nthree widely used datasets, demonstrating its ability to effectively utilize\nlarge-scale unlabeled data and achieve promising results across all\nexperimental datasets.\n","authors":["Yingying He","Xiaobing Pei"],"pdf_url":"https://arxiv.org/pdf/2401.03151v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05441v2","updated":"2024-07-30T14:46:12Z","published":"2024-03-08T16:51:27Z","title":"Bayesian Hierarchical Probabilistic Forecasting of Intraday Electricity\n Prices","summary":" We present a first study of Bayesian forecasting of electricity prices traded\non the German continuous intraday market which fully incorporates parameter\nuncertainty. A particularly large set of endogenous and exogenous covariables\nis used, handled through feature selection with Orthogonal Matching Pursuit\n(OMP) and regularising priors. Our target variable is the IDFull price index,\nforecasts are given in terms of posterior predictive distributions. For\nvalidation we use the exceedingly volatile electricity prices of 2022, which\nhave hardly been the subject of forecasting studies before. As a benchmark\nmodel, we use all available intraday transactions at the time of forecast\ncreation to compute a current value for the IDFull. According to the weak-form\nefficiency hypothesis, it would not be possible to significantly improve this\nbenchmark built from last price information. We do, however, observe\nstatistically significant improvement in terms of both point measures and\nprobability scores. Finally, we challenge the declared gold standard of using\nLASSO for feature selection in electricity price forecasting by presenting\nstrong statistical evidence that OMP leads to better forecasting performance.\n","authors":["Daniel Nickelsen","Gernot Müller"],"pdf_url":"https://arxiv.org/pdf/2403.05441v2.pdf","comment":"22 pages, 14 figures, 4 tables. Revised version with an added\n schematic figure. Under review for Applied Energy"},{"id":"http://arxiv.org/abs/2407.20871v1","updated":"2024-07-30T14:45:40Z","published":"2024-07-30T14:45:40Z","title":"Co-Neighbor Encoding Schema: A Light-cost Structure Encoding Method for\n Dynamic Link Prediction","summary":" Structure encoding has proven to be the key feature to distinguishing links\nin a graph. However, Structure encoding in the temporal graph keeps changing as\nthe graph evolves, repeatedly computing such features can be time-consuming due\nto the high-order subgraph construction. We develop the Co-Neighbor Encoding\nSchema (CNES) to address this issue. 
Instead of recomputing the feature by the\nlink, CNES stores information in the memory to avoid redundant calculations.\nBesides, unlike the existing memory-based dynamic graph learning method that\nstores node hidden states, we introduce a hashtable-based memory to compress\nthe adjacency matrix for efficient structure feature construction and updating\nwith vector computation in parallel. Furthermore, CNES introduces a\nTemporal-Diverse Memory to generate long-term and short-term structure encoding\nfor neighbors with different structural information. A dynamic graph learning\nframework, Co-Neighbor Encoding Network (CNE-N), is proposed using the\naforementioned techniques. Extensive experiments on thirteen public datasets\nverify the effectiveness and efficiency of the proposed method.\n","authors":["Ke Cheng","Linzhi Peng","Junchen Ye","Leilei Sun","Bowen Du"],"pdf_url":"https://arxiv.org/pdf/2407.20871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.07636v7","updated":"2024-07-30T14:40:07Z","published":"2021-08-17T13:58:44Z","title":"Accounting for shared covariates in semi-parametric Bayesian additive\n regression trees","summary":" We propose some extensions to semi-parametric models based on Bayesian\nadditive regression trees (BART). In the semi-parametric BART paradigm, the\nresponse variable is approximated by a linear predictor and a BART model, where\nthe linear component is responsible for estimating the main effects and BART\naccounts for non-specified interactions and non-linearities. Previous\nsemi-parametric models based on BART have assumed that the set of covariates in\nthe linear predictor and the BART model are mutually exclusive in an attempt to\navoid poor coverage properties and reduce bias in the estimates of the\nparameters in the linear predictor. The main novelty in our approach lies in\nthe way we change the tree-generation moves in BART to deal with this bias and\nresolve non-identifiability issues between the parametric and non-parametric\ncomponents, even when they have covariates in common. This allows us to model\ncomplex interactions involving the covariates of primary interest, both among\nthemselves and with those in the BART component. Our novel method is developed\nwith a view to analysing data from an international education assessment, where\ncertain predictors of students' achievements in mathematics are of particular\ninterpretational interest. Through additional simulation studies and another\napplication to a well-known benchmark dataset, we also show competitive\nperformance when compared to regression models, alternative formulations of\nsemi-parametric BART, and other tree-based methods. The implementation of the\nproposed method is available at \\url{https://github.com/ebprado/CSP-BART}.\n","authors":["Estevão B. Prado","Andrew C. Parnell","Keefe Murphy","Nathan McJames","Ann O'Shea","Rafael A. Moral"],"pdf_url":"https://arxiv.org/pdf/2108.07636v7.pdf","comment":"48 pages, 8 tables, 10 figures"},{"id":"http://arxiv.org/abs/2305.07216v2","updated":"2024-07-30T14:36:26Z","published":"2023-05-12T03:13:37Z","title":"Versatile audio-visual learning for emotion recognition","summary":" Most current audio-visual emotion recognition models lack the flexibility\nneeded for deployment in practical applications. We envision a multimodal\nsystem that works even when only one modality is available and can be\nimplemented interchangeably for either predicting emotional attributes or\nrecognizing categorical emotions. 
Achieving such flexibility in a multimodal\nemotion recognition system is difficult due to the inherent challenges in\naccurately interpreting and integrating varied data sources. It is also a\nchallenge to robustly handle missing or partial information while allowing\ndirect switch between regression or classification tasks. This study proposes a\nversatile audio-visual learning (VAVL) framework for handling unimodal and\nmultimodal systems for emotion regression or emotion classification tasks. We\nimplement an audio-visual framework that can be trained even when audio and\nvisual paired data is not available for part of the training set (i.e., audio\nonly or only video is present). We achieve this effective representation\nlearning with audio-visual shared layers, residual connections over shared\nlayers, and a unimodal reconstruction task. Our experimental results reveal\nthat our architecture significantly outperforms strong baselines on the\nCREMA-D, MSP-IMPROV, and CMU-MOSEI corpora. Notably, VAVL attains a new\nstate-of-the-art performance in the emotional attribute prediction task on the\nMSP-IMPROV corpus.\n","authors":["Lucas Goncalves","Seong-Gyun Leem","Wei-Cheng Lin","Berrak Sisman","Carlos Busso"],"pdf_url":"https://arxiv.org/pdf/2305.07216v2.pdf","comment":"18 pages, 4 Figures, 3 tables (published at IEEE Transactions on\n Affective Computing)"},{"id":"http://arxiv.org/abs/2407.20859v1","updated":"2024-07-30T14:35:31Z","published":"2024-07-30T14:35:31Z","title":"Breaking Agents: Compromising Autonomous LLM Agents Through Malfunction\n Amplification","summary":" Recently, autonomous agents built on large language models (LLMs) have\nexperienced significant development and are being deployed in real-world\napplications. These agents can extend the base LLM's capabilities in multiple\nways. For example, a well-built agent using GPT-3.5-Turbo as its core can\noutperform the more advanced GPT-4 model by leveraging external components.\nMore importantly, the usage of tools enables these systems to perform actions\nin the real world, moving from merely generating text to actively interacting\nwith their environment. Given the agents' practical applications and their\nability to execute consequential actions, it is crucial to assess potential\nvulnerabilities. Such autonomous systems can cause more severe damage than a\nstandalone language model if compromised. While some existing research has\nexplored harmful actions by LLM agents, our study approaches the vulnerability\nfrom a different perspective. We introduce a new type of attack that causes\nmalfunctions by misleading the agent into executing repetitive or irrelevant\nactions. We conduct comprehensive evaluations using various attack methods,\nsurfaces, and properties to pinpoint areas of susceptibility. Our experiments\nreveal that these attacks can induce failure rates exceeding 80\\% in multiple\nscenarios. Through attacks on implemented and deployable agents in multi-agent\nscenarios, we accentuate the realistic risks associated with these\nvulnerabilities. To mitigate such attacks, we propose self-examination\ndetection methods. 
However, our findings indicate these attacks are difficult\nto detect effectively using LLMs alone, highlighting the substantial risks\nassociated with this vulnerability.\n","authors":["Boyang Zhang","Yicong Tan","Yun Shen","Ahmed Salem","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17055v3","updated":"2024-07-30T14:22:26Z","published":"2024-06-24T18:15:27Z","title":"Large Language Models Assume People are More Rational than We Really are","summary":" In order for AI systems to communicate effectively with people, they must\nunderstand how we make decisions. However, people's decisions are not always\nrational, so the implicit internal models of human decision-making in Large\nLanguage Models (LLMs) must account for this. Previous empirical evidence seems\nto suggest that these implicit models are accurate -- LLMs offer believable\nproxies of human behavior, acting how we expect humans would in everyday\ninteractions. However, by comparing LLM behavior and predictions to a large\ndataset of human decisions, we find that this is actually not the case: when\nboth simulating and predicting people's choices, a suite of cutting-edge LLMs\n(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more\nrational than we really are. Specifically, these models deviate from human\nbehavior and align more closely with a classic model of rational choice --\nexpected value theory. Interestingly, people also tend to assume that other\npeople are rational when interpreting their behavior. As a consequence, when we\ncompare the inferences that LLMs and people draw from the decisions of others\nusing another psychological dataset, we find that these inferences are highly\ncorrelated. Thus, the implicit decision-making models of LLMs appear to be\naligned with the human expectation that other people will act rationally,\nrather than with how people actually act.\n","authors":["Ryan Liu","Jiayi Geng","Joshua C. Peterson","Ilia Sucholutsky","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2406.17055v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20845v1","updated":"2024-07-30T14:22:13Z","published":"2024-07-30T14:22:13Z","title":"Assessing Graphical Perception of Image Embedding Models using Channel\n Effectiveness","summary":" Recent advancements in vision models have greatly improved their ability to\nhandle complex chart understanding tasks, like chart captioning and question\nanswering. However, it remains challenging to assess how these models process\ncharts. Existing benchmarks only roughly evaluate model performance without\nevaluating the underlying mechanisms, such as how models extract image\nembeddings. This limits our understanding of the model's ability to perceive\nfundamental graphical components. To address this, we introduce a novel\nevaluation framework to assess the graphical perception of image embedding\nmodels. For chart comprehension, we examine two main aspects of channel\neffectiveness: accuracy and discriminability of various visual channels.\nChannel accuracy is assessed through the linearity of embeddings, measuring how\nwell the perceived magnitude aligns with the size of the stimulus.\nDiscriminability is evaluated based on the distances between embeddings,\nindicating their distinctness. 
Our experiments with the CLIP model show that it\nperceives channel accuracy differently from humans and shows unique\ndiscriminability in channels like length, tilt, and curvature. We aim to\ndevelop this work into a broader benchmark for reliable visual encoders,\nenhancing models for precise chart comprehension and human-like perception in\nfuture applications.\n","authors":["Soohyun Lee","Minsuk Chang","Seokhyeon Park","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2407.20845v1.pdf","comment":"In Proceedings of the 2024 IEEE Visualization and Visual Analytics\n (VIS)"},{"id":"http://arxiv.org/abs/2407.19707v2","updated":"2024-07-30T14:08:43Z","published":"2024-07-29T05:05:13Z","title":"Neural networks for bifurcation and linear stability analysis of steady\n states in partial differential equations","summary":" This research introduces an extended application of neural networks for\nsolving nonlinear partial differential equations (PDEs). A neural network,\ncombined with a pseudo-arclength continuation, is proposed to construct\nbifurcation diagrams from parameterized nonlinear PDEs. Additionally, a neural\nnetwork approach is also presented for solving eigenvalue problems to analyze\nsolution linear stability, focusing on identifying the largest eigenvalue. The\neffectiveness of the proposed neural network is examined through experiments on\nthe Bratu equation and the Burgers equation. Results from a finite difference\nmethod are also presented as comparison. Varying numbers of grid points are\nemployed in each case to assess the behavior and accuracy of both the neural\nnetwork and the finite difference method. The experimental results demonstrate\nthat the proposed neural network produces better solutions, generates more\naccurate bifurcation diagrams, has reasonable computational times, and proves\neffective for linear stability analysis.\n","authors":["Muhammad Luthfi Shahab","Hadi Susanto"],"pdf_url":"https://arxiv.org/pdf/2407.19707v2.pdf","comment":"Accepted for publication in Applied Mathematics and Computation"},{"id":"http://arxiv.org/abs/2407.20830v1","updated":"2024-07-30T13:56:26Z","published":"2024-07-30T13:56:26Z","title":"Federated Knowledge Recycling: Privacy-Preserving Synthetic Data Sharing","summary":" Federated learning has emerged as a paradigm for collaborative learning,\nenabling the development of robust models without the need to centralise\nsensitive data. However, conventional federated learning techniques have\nprivacy and security vulnerabilities due to the exposure of models, parameters\nor updates, which can be exploited as an attack surface. This paper presents\nFederated Knowledge Recycling (FedKR), a cross-silo federated learning approach\nthat uses locally generated synthetic data to facilitate collaboration between\ninstitutions. FedKR combines advanced data generation techniques with a dynamic\naggregation process to provide greater security against privacy attacks than\nexisting methods, significantly reducing the attack surface. 
Experimental\nresults on generic and medical datasets show that FedKR achieves competitive\nperformance, with an average improvement in accuracy of 4.24% compared to\ntraining models from local data, demonstrating particular effectiveness in data\nscarcity scenarios.\n","authors":["Eugenio Lomurno","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2407.20830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20828v1","updated":"2024-07-30T13:53:48Z","published":"2024-07-30T13:53:48Z","title":"How to Measure the Intelligence of Large Language Models?","summary":" With the release of ChatGPT and other large language models (LLMs) the\ndiscussion about the intelligence, possibilities, and risks, of current and\nfuture models have seen large attention. This discussion included much debated\nscenarios about the imminent rise of so-called \"super-human\" AI, i.e., AI\nsystems that are orders of magnitude smarter than humans. In the spirit of Alan\nTuring, there is no doubt that current state-of-the-art language models already\npass his famous test. Moreover, current models outperform humans in several\nbenchmark tests, so that publicly available LLMs have already become versatile\ncompanions that connect everyday life, industry and science. Despite their\nimpressive capabilities, LLMs sometimes fail completely at tasks that are\nthought to be trivial for humans. In other cases, the trustworthiness of LLMs\nbecomes much more elusive and difficult to evaluate. Taking the example of\nacademia, language models are capable of writing convincing research articles\non a given topic with only little input. Yet, the lack of trustworthiness in\nterms of factual consistency or the existence of persistent hallucinations in\nAI-generated text bodies has led to a range of restrictions for AI-based\ncontent in many scientific journals. In view of these observations, the\nquestion arises as to whether the same metrics that apply to human intelligence\ncan also be applied to computational methods and has been discussed\nextensively. In fact, the choice of metrics has already been shown to\ndramatically influence assessments on potential intelligence emergence. Here,\nwe argue that the intelligence of LLMs should not only be assessed by\ntask-specific statistical metrics, but separately in terms of qualitative and\nquantitative measures.\n","authors":["Nils Körber","Silvan Wehrli","Christopher Irrgang"],"pdf_url":"https://arxiv.org/pdf/2407.20828v1.pdf","comment":"3 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.03207v2","updated":"2024-07-30T13:43:36Z","published":"2024-02-05T17:17:57Z","title":"Light and Optimal Schrödinger Bridge Matching","summary":" Schr\\\"odinger Bridges (SB) have recently gained the attention of the ML\ncommunity as a promising extension of classic diffusion models which is also\ninterconnected to the Entropic Optimal Transport (EOT). Recent solvers for SB\nexploit the pervasive bridge matching procedures. Such procedures aim to\nrecover a stochastic process transporting the mass between distributions given\nonly a transport plan between them. In particular, given the EOT plan, these\nprocedures can be adapted to solve SB. This fact is heavily exploited by recent\nworks giving rise to matching-based SB solvers. The cornerstone here is\nrecovering the EOT plan: recent works either use heuristical approximations\n(e.g., the minibatch OT) or establish iterative matching procedures which by\nthe design accumulate the error during the training. 
We address these\nlimitations and propose a novel procedure to learn SB which we call the\n\\textbf{optimal Schr\\\"odinger bridge matching}. It exploits the optimal\nparameterization of the diffusion process and provably recovers the SB process\n\\textbf{(a)} with a single bridge matching step and \\textbf{(b)} with arbitrary\ntransport plan as the input. Furthermore, we show that the optimal bridge\nmatching objective coincides with the recently discovered energy-based modeling\n(EBM) objectives to learn EOT/SB. Inspired by this observation, we develop a\nlight solver (which we call LightSB-M) to implement optimal matching in\npractice using the Gaussian mixture parameterization of the adjusted\nSchr\\\"odinger potential. We experimentally showcase the performance of our\nsolver in a range of practical tasks. The code for our solver can be found at\nhttps://github.com/SKholkin/LightSB-Matching.\n","authors":["Nikita Gushchin","Sergei Kholkin","Evgeny Burnaev","Alexander Korotin"],"pdf_url":"https://arxiv.org/pdf/2402.03207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20824v1","updated":"2024-07-30T13:43:32Z","published":"2024-07-30T13:43:32Z","title":"DyGKT: Dynamic Graph Learning for Knowledge Tracing","summary":" Knowledge Tracing aims to assess student learning states by predicting their\nperformance in answering questions. Different from the existing research which\nutilizes fixed-length learning sequence to obtain the student states and\nregards KT as a static problem, this work is motivated by three dynamical\ncharacteristics: 1) The scales of students answering records are constantly\ngrowing; 2) The semantics of time intervals between the records vary; 3) The\nrelationships between students, questions and concepts are evolving. The three\ndynamical characteristics above contain the great potential to revolutionize\nthe existing knowledge tracing methods. Along this line, we propose a Dynamic\nGraph-based Knowledge Tracing model, namely DyGKT. In particular, a\ncontinuous-time dynamic question-answering graph for knowledge tracing is\nconstructed to deal with the infinitely growing answering behaviors, and it is\nworth mentioning that it is the first time dynamic graph learning technology is\nused in this field. Then, a dual time encoder is proposed to capture long-term\nand short-term semantics among the different time intervals. Finally, a\nmultiset indicator is utilized to model the evolving relationships between\nstudents, questions, and concepts via the graph structural feature. Numerous\nexperiments are conducted on five real-world datasets, and the results\ndemonstrate the superiority of our model. All the used resources are publicly\navailable at https://github.com/PengLinzhi/DyGKT.\n","authors":["Ke Cheng","Linzhi Peng","Pengyang Wang","Junchen Ye","Leilei Sun","Bowen Du"],"pdf_url":"https://arxiv.org/pdf/2407.20824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20817v1","updated":"2024-07-30T13:32:26Z","published":"2024-07-30T13:32:26Z","title":"Robust Load Prediction of Power Network Clusters Based on\n Cloud-Model-Improved Transformer","summary":" Load data from power network clusters indicates economic development in each\narea, crucial for predicting regional trends and guiding power enterprise\ndecisions. The Transformer model, a leading method for load prediction, faces\nchallenges modeling historical data due to variables like weather, events,\nfestivals, and data volatility. 
To tackle this, the cloud model's fuzzy feature\nis utilized to manage uncertainties effectively. The proposed Cloud Model Improved Transformer (CMIT) method integrates the\nTransformer model with the cloud model using the particle swarm\noptimization algorithm, with the aim of achieving robust and precise power load\npredictions. Through comparative experiments conducted on 31 real datasets\nwithin a power network cluster, it is demonstrated that CMIT significantly\nsurpasses the Transformer model in terms of prediction accuracy, thereby\nhighlighting its effectiveness in enhancing forecasting capabilities within the\npower network cluster sector.\n","authors":["Cheng Jiang","Gang Lu","Xue Ma","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2407.20817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19320v2","updated":"2024-07-30T13:22:53Z","published":"2024-07-27T18:33:10Z","title":"WindsorML: High-Fidelity Computational Fluid Dynamics Dataset For\n Automotive Aerodynamics","summary":" This paper presents a new open-source high-fidelity dataset for Machine\nLearning (ML) containing 355 geometric variants of the Windsor body, to help\nthe development and testing of ML surrogate models for external automotive\naerodynamics. Each Computational Fluid Dynamics (CFD) simulation was run with\nGPU-native high-fidelity Wall-Modeled Large-Eddy Simulations (WMLES) using a\nCartesian immersed-boundary method with more than 280M cells to ensure the\ngreatest possible accuracy. The dataset contains geometry variants that\nexhibit a wide range of flow characteristics that are representative of those\nobserved on road cars. The dataset itself contains the 3D time-averaged volume\n& boundary data as well as the geometry and force & moment coefficients. This\npaper discusses the validation of the underlying CFD methods as well as the\ncontents and structure of the dataset. To the authors' knowledge, this\nrepresents the first large-scale, high-fidelity CFD dataset for the Windsor\nbody with a permissive open-source license (CC-BY-SA).\n","authors":["Neil Ashton","Jordan B. Angel","Aditya S. Ghate","Gaetan K. W. Kenway","Man Long Wong","Cetin Kiris","Astrid Walle","Danielle C. Maddix","Gary Page"],"pdf_url":"https://arxiv.org/pdf/2407.19320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20806v1","updated":"2024-07-30T13:11:45Z","published":"2024-07-30T13:11:45Z","title":"ARCLE: The Abstraction and Reasoning Corpus Learning Environment for\n Reinforcement Learning","summary":" This paper introduces ARCLE, an environment designed to facilitate\nreinforcement learning research on the Abstraction and Reasoning Corpus (ARC).\nAddressing this inductive reasoning benchmark with reinforcement learning\npresents these challenges: a vast action space, a hard-to-reach goal, and a\nvariety of tasks. We demonstrate that an agent with proximal policy\noptimization can learn individual tasks through ARCLE. The adoption of\nnon-factorial policies and auxiliary losses led to performance enhancements,\neffectively mitigating issues associated with action spaces and goal\nattainment. 
Based on these insights, we propose several research directions and\nmotivations for using ARCLE, including MAML, GFlowNets, and World Models.\n","authors":["Hosung Lee","Sejin Kim","Seungpil Lee","Sanha Hwang","Jihwan Lee","Byung-Jun Lee","Sundong Kim"],"pdf_url":"https://arxiv.org/pdf/2407.20806v1.pdf","comment":"Accepted by CoLLAs 2024, Project page:\n https://github.com/confeitoHS/arcle"},{"id":"http://arxiv.org/abs/2407.20801v1","updated":"2024-07-30T13:07:51Z","published":"2024-07-30T13:07:51Z","title":"AhmedML: High-Fidelity Computational Fluid Dynamics Dataset for\n Incompressible, Low-Speed Bluff Body Aerodynamics","summary":" The development of Machine Learning (ML) methods for Computational Fluid\nDynamics (CFD) is currently limited by the lack of openly available training\ndata. This paper presents a new open-source dataset comprising high-fidelity,\nscale-resolving CFD simulations of 500 geometric variations of the\nAhmed Car Body - a simplified car-like shape that exhibits many of the flow\ntopologies that are present on bluff bodies such as road vehicles. The dataset\ncontains simulation results that exhibit a broad set of fundamental flow\nphysics such as geometry and pressure-induced flow separation as well as 3D\nvortical structures. Each variation of the Ahmed car body was run using a\nhigh-fidelity, time-accurate, hybrid Reynolds-Averaged Navier-Stokes (RANS) -\nLarge-Eddy Simulation (LES) turbulence modelling approach using the open-source\nCFD code OpenFOAM. The dataset contains boundary, volume, geometry, and\ntime-averaged forces/moments in widely used open-source formats. In addition,\nthe OpenFOAM case setup is provided so that others can reproduce or extend the\ndataset. To the authors' knowledge, this represents the first open-source\nlarge-scale dataset using high-fidelity CFD methods for the widely used Ahmed\ncar body that is available to freely download with a permissive license\n(CC-BY-SA).\n","authors":["Neil Ashton","Danielle C. Maddix","Samuel Gundry","Parisa M. Shabestari"],"pdf_url":"https://arxiv.org/pdf/2407.20801v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.19320"},{"id":"http://arxiv.org/abs/2407.15526v2","updated":"2024-07-30T13:03:36Z","published":"2024-07-22T10:31:07Z","title":"Synthetic Image Learning: Preserving Performance and Preventing\n Membership Inference Attacks","summary":" Generative artificial intelligence has transformed the generation of\nsynthetic data, providing innovative solutions to challenges like data scarcity\nand privacy, which are particularly critical in fields such as medicine.\nHowever, the effective use of this synthetic data to train high-performance\nmodels remains a significant challenge. This paper addresses this issue by\nintroducing Knowledge Recycling (KR), a pipeline designed to optimise the\ngeneration and use of synthetic data for training downstream classifiers. At\nthe heart of this pipeline is Generative Knowledge Distillation (GKD), the\nproposed technique that significantly improves the quality and usefulness of\nthe information provided to classifiers through a synthetic dataset\nregeneration and soft labelling mechanism. The KR pipeline has been tested on a\nvariety of datasets, with a focus on six highly heterogeneous medical image\ndatasets, ranging from retinal images to organ scans. 
The results show a\nsignificant reduction in the performance gap between models trained on real and\nsynthetic data, with models based on synthetic data outperforming those trained\non real data in some cases. Furthermore, the resulting models show almost\ncomplete immunity to Membership Inference Attacks, manifesting privacy\nproperties missing in models trained with conventional techniques.\n","authors":["Eugenio Lomurno","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2407.15526v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20798v1","updated":"2024-07-30T13:01:31Z","published":"2024-07-30T13:01:31Z","title":"Diffusion Augmented Agents: A Framework for Efficient Exploration and\n Transfer Learning","summary":" We introduce Diffusion Augmented Agents (DAAG), a novel framework that\nleverages large language models, vision language models, and diffusion models\nto improve sample efficiency and transfer learning in reinforcement learning\nfor embodied agents. DAAG hindsight relabels the agent's past experience by\nusing diffusion models to transform videos in a temporally and geometrically\nconsistent way to align with target instructions with a technique we call\nHindsight Experience Augmentation. A large language model orchestrates this\nautonomous process without requiring human supervision, making it well-suited\nfor lifelong learning scenarios. The framework reduces the amount of\nreward-labeled data needed to 1) finetune a vision language model that acts as\na reward detector, and 2) train RL agents on new tasks. We demonstrate the\nsample efficiency gains of DAAG in simulated robotics environments involving\nmanipulation and navigation. Our results show that DAAG improves learning of\nreward detectors, transferring past experience, and acquiring new tasks - key\nabilities for developing efficient lifelong learning agents. Supplementary\nmaterial and visualizations are available on our website\nhttps://sites.google.com/view/diffusion-augmented-agents/\n","authors":["Norman Di Palo","Leonard Hasenclever","Jan Humplik","Arunkumar Byravan"],"pdf_url":"https://arxiv.org/pdf/2407.20798v1.pdf","comment":"Published at 3rd Conference on Lifelong Learning Agents (CoLLAs),\n 2024"},{"id":"http://arxiv.org/abs/2407.20786v1","updated":"2024-07-30T12:45:05Z","published":"2024-07-30T12:45:05Z","title":"Be aware of overfitting by hyperparameter optimization!","summary":" Hyperparameter optimization is very frequently employed in machine learning.\nHowever, an optimization of a large space of parameters could result in\noverfitting of models. In recent studies on solubility prediction the authors\ncollected seven thermodynamic and kinetic solubility datasets from different\ndata sources. They used state-of-the-art graph-based methods and compared\nmodels developed for each dataset using different data cleaning protocols and\nhyperparameter optimization. In our study we showed that hyperparameter\noptimization did not always result in better models, possibly due to\noverfitting when using the same statistical measures. Similar results could be\ncalculated using pre-set hyperparameters, reducing the computational effort by\naround 10,000 times. We also extended the previous analysis by adding a\nrepresentation learning method based on Natural Language Processing of smiles\ncalled Transformer CNN. 
We show that across all analyzed sets using exactly the\nsame protocol, Transformer CNN provided better results than graph-based methods\nfor 26 out of 28 pairwise comparisons by using only a tiny fraction of time as\ncompared to other methods. Last but not least, we stressed the importance of\ncomparing calculation results using exactly the same statistical measures.\n","authors":["Igor V. Tetko","Ruud van Deursen","Guillaume Godin"],"pdf_url":"https://arxiv.org/pdf/2407.20786v1.pdf","comment":"19 pages, 5 Tables"},{"id":"http://arxiv.org/abs/2407.20775v1","updated":"2024-07-30T12:22:03Z","published":"2024-07-30T12:22:03Z","title":"Interpretable Pre-Trained Transformers for Heart Time-Series Data","summary":" Decoder-only transformers are the backbone of the popular generative\npre-trained transformer (GPT) series of large language models. In this work, we\napply the same framework to periodic heart time-series data to create two\npre-trained general purpose cardiac models, namely PPG-PT and ECG-PT. We\ndemonstrate that both such pre-trained models are fully interpretable. This is\nachieved firstly through aggregate attention maps which show that the model\nfocuses on similar points in previous cardiac cycles in order to make\npredictions and gradually broadens its attention in deeper layers. Next, tokens\nwith the same value, that occur at different distinct points in the ECG and PPG\ncycle, form separate clusters in high dimensional space based on their phase as\nthey propagate through the transformer blocks. Finally, we highlight that\nindividual attention heads respond to specific physiologically relevant\nfeatures, such as the dicrotic notch in PPG and the P-wave in ECG. It is also\ndemonstrated that these pre-trained models can be easily fine-tuned for tasks\nsuch as classification of atrial fibrillation. In this specific example, the\nfine-tuning took 11 minutes of computer time, and achieved\nleave-one-subject-out AUCs of 0.99 and 0.93 for ECG and PPG, respectively.\nImportantly, these fine-tuned models are also fully explainable, with attention\nshifting to regions in the context that are strongly indicative of atrial\nfibrillation.\n","authors":["Harry J. Davies","James Monsen","Danilo P. Mandic"],"pdf_url":"https://arxiv.org/pdf/2407.20775v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.17270v2","updated":"2024-07-30T12:21:17Z","published":"2024-02-27T07:31:30Z","title":"Multi-Agent, Human-Agent and Beyond: A Survey on Cooperation in Social\n Dilemmas","summary":" The study of cooperation within social dilemmas has long been a fundamental\ntopic across various disciplines, including computer science and social\nscience. Recent advancements in Artificial Intelligence (AI) have significantly\nreshaped this field, offering fresh insights into understanding and enhancing\ncooperation. This survey examines three key areas at the intersection of AI and\ncooperation in social dilemmas. First, focusing on multi-agent cooperation, we\nreview the intrinsic and external motivations that support cooperation among\nrational agents, and the methods employed to develop effective strategies\nagainst diverse opponents. Second, looking into human-agent cooperation, we\ndiscuss the current AI algorithms for cooperating with humans and the human\nbiases towards AI agents. Third, we review the emergent field of leveraging AI\nagents to enhance cooperation among humans. 
We conclude by discussing future\nresearch avenues, such as using large language models, establishing unified\ntheoretical frameworks, revisiting existing theories of human cooperation, and\nexploring multiple real-world applications.\n","authors":["Chunjiang Mu","Hao Guo","Yang Chen","Chen Shen","Shuyue Hu","Zhen Wang"],"pdf_url":"https://arxiv.org/pdf/2402.17270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01690v7","updated":"2024-07-30T12:16:14Z","published":"2023-10-02T23:09:59Z","title":"Forecasting Tropical Cyclones with Cascaded Diffusion Models","summary":" As tropical cyclones become more intense due to climate change, the rise of\nAI-based modelling provides a more affordable and accessible approach compared\nto traditional methods based on mathematical models. This work leverages\ngenerative diffusion models to forecast cyclone trajectories and precipitation\npatterns by integrating satellite imaging, remote sensing, and atmospheric\ndata. It employs a cascaded approach that incorporates three main tasks:\nforecasting, super-resolution, and precipitation modelling. The training\ndataset includes 51 cyclones from six major tropical cyclone basins from\nJanuary 2019 - March 2023. Experiments demonstrate that the final forecasts\nfrom the cascaded models show accurate predictions up to a 36-hour rollout,\nwith excellent Structural Similarity (SSIM) and Peak-Signal-to-Noise Ratio\n(PSNR) values exceeding 0.5 and 20 dB, respectively, for all three tasks. The\n36-hour forecasts can be produced in as little as 30 mins on a single Nvidia\nA30/RTX 2080 Ti. This work also highlights the promising efficiency of AI\nmethods such as diffusion models for high-performance needs in weather\nforecasting, such as tropical cyclone forecasting, while remaining\ncomputationally affordable, making them ideal for highly vulnerable regions\nwith critical forecasting needs and financial limitations. Code accessible at\nhttps://github.com/nathzi1505/forecast-diffmodels.\n","authors":["Pritthijit Nath","Pancham Shukla","Shuai Wang","César Quilodrán-Casas"],"pdf_url":"https://arxiv.org/pdf/2310.01690v7.pdf","comment":"Accepted for poster presentation at the ICLR 2024 workshop on\n Tackling Climate Change with Machine Learning. 7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.20768v1","updated":"2024-07-30T12:13:18Z","published":"2024-07-30T12:13:18Z","title":"HyperMM: Robust Multimodal Learning with Varying-sized Inputs","summary":" Combining multiple modalities carrying complementary information through\nmultimodal learning (MML) has shown considerable benefits for diagnosing\nmultiple pathologies. However, the robustness of multimodal models to missing\nmodalities is often overlooked. Most works assume modality completeness in the\ninput data, while in clinical practice, it is common to have incomplete\nmodalities. Existing solutions that address this issue rely on modality\nimputation strategies before using supervised learning models. These\nstrategies, however, are complex, computationally costly and can strongly\nimpact subsequent prediction models. Hence, they should be used with parsimony\nin sensitive applications such as healthcare. We propose HyperMM, an end-to-end\nframework designed for learning with varying-sized inputs. Specifically, we\nfocus on the task of supervised MML with missing imaging modalities without\nusing imputation before training. 
We introduce a novel strategy for training a\nuniversal feature extractor using a conditional hypernetwork, and propose a\npermutation-invariant neural network that can handle inputs of varying\ndimensions to process the extracted features, in a two-phase task-agnostic\nframework. We experimentally demonstrate the advantages of our method in two\ntasks: Alzheimer's disease detection and breast cancer classification. We\ndemonstrate that our strategy is robust to high rates of missing data and that\nits flexibility allows it to handle varying-sized datasets beyond the scenario\nof missing modalities.\n","authors":["Hava Chaptoukaev","Vincenzo Marcianó","Francesco Galati","Maria A. Zuluaga"],"pdf_url":"https://arxiv.org/pdf/2407.20768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06620v2","updated":"2024-07-30T12:03:09Z","published":"2024-06-07T14:34:28Z","title":"DualTime: A Dual-Adapter Multimodal Language Model for Time Series\n Representation","summary":" The recent rapid development of language models (LMs) has attracted attention\nin the field of time series, including multimodal time series modeling.\nHowever, we note that current time series multimodal methods are biased, often\nassigning a primary role to one modality while the other assumes a secondary\nrole. They overlook the mutual benefits and complementary of different\nmodalities. For example, in seizure diagnosis, relying solely on textual\nclinical reports makes it difficult to pinpoint the area and type of the\ndisease, while electroencephalograms (EEGs) alone cannot provide an accurate\ndiagnosis without considering the symptoms. In this study, based on the\ncomplementary information mining of time series multimodal data, we propose\nDualTime, a Dual-adapter multimodal language model for Time series\nrepresentation implementing temporal-primary and textual-primary modeling\nsimultaneously. By injecting lightweight adaption tokens, the LM pipeline\nshared by dual adapters encourages embedding alignment and achieves efficient\nfine-tuning. Empirically, our method outperforms state-of-the-art models in\nboth supervised and unsupervised settings, highlighting the complementary\nbenefits of different modalities. In addition, we conduct few-shot label\ntransfer experiments, which further verifies the transferability and\nexpressiveness of our proposed DualTime.\n","authors":["Weiqi Zhang","Jiexia Ye","Ziyue Li","Jia Li","Fugee Tsung"],"pdf_url":"https://arxiv.org/pdf/2406.06620v2.pdf","comment":"15 pages, 12 figure, 5 tables"},{"id":"http://arxiv.org/abs/2407.03056v2","updated":"2024-07-30T11:56:43Z","published":"2024-07-03T12:24:40Z","title":"Improving Zero-shot Generalization of Learned Prompts via Unsupervised\n Knowledge Distillation","summary":" Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization\nto unseen tasks, but fall short of the performance of supervised methods in\ngeneralizing to downstream tasks with limited data. Prompt learning is emerging\nas a parameter-efficient method for adapting VLMs, but state-of-the-art\napproaches require annotated samples. In this paper we propose a novel approach\nto prompt learning based on unsupervised knowledge distillation from more\npowerful models. Our approach, which we call Knowledge Distillation Prompt\nLearning (KDPL), can be integrated into existing prompt learning techniques and\neliminates the need for labeled examples during adaptation. 
Our experiments on\nmore than ten standard benchmark datasets demonstrate that KDPL is very\neffective at improving generalization of learned prompts for zero-shot domain\ngeneralization, zero-shot cross-dataset generalization, and zero-shot\nbase-to-novel class generalization problems. KDPL requires no ground-truth\nlabels for adaptation, and moreover we show that even in the absence of any\nknowledge of training class names it can be used to effectively transfer\nknowledge. The code is publicly available at https://github.com/miccunifi/KDPL.\n","authors":["Marco Mistretta","Alberto Baldrati","Marco Bertini","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2407.03056v2.pdf","comment":"Accepted for publication at ECCV24"},{"id":"http://arxiv.org/abs/2407.20753v1","updated":"2024-07-30T11:55:52Z","published":"2024-07-30T11:55:52Z","title":"Efficient Quantum One-Class Support Vector Machines for Anomaly\n Detection Using Randomized Measurements and Variable Subsampling","summary":" Quantum one-class support vector machines leverage the advantage of quantum\nkernel methods for semi-supervised anomaly detection. However, their quadratic\ntime complexity with respect to data size poses challenges when dealing with\nlarge datasets. In recent work, quantum randomized measurements kernels and\nvariable subsampling were proposed, as two independent methods to address this\nproblem. The former achieves higher average precision, but suffers from\nvariance, while the latter achieves linear complexity to data size and has\nlower variance. The current work focuses instead on combining these two\nmethods, along with rotated feature bagging, to achieve linear time complexity\nboth to data size and to number of features. Despite their instability, the\nresulting models exhibit considerably higher performance and faster training\nand testing times.\n","authors":["Michael Kölle","Afrae Ahouzi","Pascal Debus","Elif Çetiner","Robert Müller","Daniëlle Schuman","Claudia Linnhoff-Popien"],"pdf_url":"https://arxiv.org/pdf/2407.20753v1.pdf","comment":"Submitted to Springer Nature CS"},{"id":"http://arxiv.org/abs/2403.07263v2","updated":"2024-07-30T11:31:31Z","published":"2024-03-12T02:45:24Z","title":"Adaptive Bounding Box Uncertainties via Two-Step Conformal Prediction","summary":" Quantifying a model's predictive uncertainty is essential for safety-critical\napplications such as autonomous driving. We consider quantifying such\nuncertainty for multi-object detection. In particular, we leverage conformal\nprediction to obtain uncertainty intervals with guaranteed coverage for object\nbounding boxes. One challenge in doing so is that bounding box predictions are\nconditioned on the object's class label. Thus, we develop a novel two-step\nconformal approach that propagates uncertainty in predicted class labels into\nthe uncertainty intervals of bounding boxes. This broadens the validity of our\nconformal coverage guarantees to include incorrectly classified objects, thus\noffering more actionable safety assurances. 
Moreover, we investigate novel\nensemble and quantile regression formulations to ensure the bounding box\nintervals are adaptive to object size, leading to a more balanced coverage.\nValidating our two-step approach on real-world datasets for 2D bounding box\nlocalization, we find that desired coverage levels are satisfied with\npractically tight predictive uncertainty intervals.\n","authors":["Alexander Timans","Christoph-Nikolas Straehle","Kaspar Sakmann","Eric Nalisnick"],"pdf_url":"https://arxiv.org/pdf/2403.07263v2.pdf","comment":"European Conference on Computer Vision (ECCV) 2024; 37 pages, 14\n figures, 6 tables (incl. appendix)"},{"id":"http://arxiv.org/abs/2407.20100v2","updated":"2024-07-30T11:27:55Z","published":"2024-07-29T15:28:26Z","title":"F-KANs: Federated Kolmogorov-Arnold Networks","summary":" In this paper, we present an innovative federated learning (FL) approach that\nutilizes Kolmogorov-Arnold Networks (KANs) for classification tasks. By\nutilizing the adaptive activation capabilities of KANs in a federated\nframework, we aim to improve classification capabilities while preserving\nprivacy. The study evaluates the performance of federated KANs (F-KANs)\ncompared to traditional Multi-Layer Perceptrons (MLPs) on classification tasks.\nThe results show that the F-KANs model significantly outperforms the federated\nMLP model in terms of accuracy, precision, recall, F1 score and stability, and\nachieves better performance, paving the way for more efficient and\nprivacy-preserving predictive analytics.\n","authors":["Engin Zeydan","Cristian J. Vaca-Rubio","Luis Blanco","Roberto Pereira","Marius Caus","Abdullah Aydeger"],"pdf_url":"https://arxiv.org/pdf/2407.20100v2.pdf","comment":"This work has been submitted to IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible. Related Code: https://github.com/ezeydan/F-KANs.git"},{"id":"http://arxiv.org/abs/2402.01393v3","updated":"2024-07-30T11:20:47Z","published":"2024-02-02T13:17:19Z","title":"ALERT-Transformer: Bridging Asynchronous and Synchronous Machine\n Learning for Real-Time Event-based Spatio-Temporal Data","summary":" We seek to enable classic processing of continuous ultra-sparse\nspatiotemporal data generated by event-based sensors with dense machine\nlearning models. We propose a novel hybrid pipeline composed of asynchronous\nsensing and synchronous processing that combines several ideas: (1) an\nembedding based on PointNet models -- the ALERT module -- that can continuously\nintegrate new and dismiss old events thanks to a leakage mechanism, (2) a\nflexible readout of the embedded data that allows to feed any downstream model\nwith always up-to-date features at any sampling rate, (3) exploiting the input\nsparsity in a patch-based approach inspired by Vision Transformer to optimize\nthe efficiency of the method. These embeddings are then processed by a\ntransformer model trained for object and gesture recognition. Using this\napproach, we achieve performances at the state-of-the-art with a lower latency\nthan competitors. 
We also demonstrate that our asynchronous model can operate\nat any desired sampling rate.\n","authors":["Carmen Martin-Turrero","Maxence Bouvier","Manuel Breitenstein","Pietro Zanuttigh","Vincent Parret"],"pdf_url":"https://arxiv.org/pdf/2402.01393v3.pdf","comment":"Originally published in the Proceedings of Machine Learning Research\n ICML 2024"},{"id":"http://arxiv.org/abs/2407.20741v1","updated":"2024-07-30T11:19:48Z","published":"2024-07-30T11:19:48Z","title":"Improving PINNs By Algebraic Inclusion of Boundary and Initial\n Conditions","summary":" \"AI for Science\" aims to solve fundamental scientific problems using AI\ntechniques. As most physical phenomena can be described as Partial Differential\nEquations (PDEs) , approximating their solutions using neural networks has\nevolved as a central component of scientific-ML. Physics-Informed Neural\nNetworks (PINNs) is the general method that has evolved for this task but its\ntraining is well-known to be very unstable. In this work we explore the\npossibility of changing the model being trained from being just a neural\nnetwork to being a non-linear transformation of it - one that algebraically\nincludes the boundary/initial conditions. This reduces the number of terms in\nthe loss function than the standard PINN losses. We demonstrate that our\nmodification leads to significant performance gains across a range of benchmark\ntasks, in various dimensions and without having to tweak the training\nalgorithm. Our conclusions are based on conducting hundreds of experiments, in\nthe fully unsupervised setting, over multiple linear and non-linear PDEs set to\nexactly solvable scenarios, which lends to a concrete measurement of our\nperformance gains in terms of order(s) of magnitude lower fractional errors\nbeing achieved, than by standard PINNs. The code accompanying this manuscript\nis publicly available at,\nhttps://github.com/MorganREN/Improving-PINNs-By-Algebraic-Inclusion-of-Boundary-and-Initial-Conditions\n","authors":["Mohan Ren","Zhihao Fang","Keren Li","Anirbit Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2407.20741v1.pdf","comment":"48 Pages, 25 Figures"},{"id":"http://arxiv.org/abs/2407.20734v1","updated":"2024-07-30T11:09:27Z","published":"2024-07-30T11:09:27Z","title":"Efficient Pareto Manifold Learning with Low-Rank Structure","summary":" Multi-task learning, which optimizes performance across multiple tasks, is\ninherently a multi-objective optimization problem. Various algorithms are\ndeveloped to provide discrete trade-off solutions on the Pareto front.\nRecently, continuous Pareto front approximations using a linear combination of\nbase networks have emerged as a compelling strategy. However, it suffers from\nscalability issues when the number of tasks is large. To address this issue, we\npropose a novel approach that integrates a main network with several low-rank\nmatrices to efficiently learn the Pareto manifold. It significantly reduces the\nnumber of parameters and facilitates the extraction of shared features. We also\nintroduce orthogonal regularization to further bolster performance. Extensive\nexperimental results demonstrate that the proposed approach outperforms\nstate-of-the-art baselines, especially on datasets with a large number of\ntasks.\n","authors":["Weiyu Chen","James T. 
Kwok"],"pdf_url":"https://arxiv.org/pdf/2407.20734v1.pdf","comment":"ICML 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2407.17654v2","updated":"2024-07-30T10:42:06Z","published":"2024-07-24T21:46:39Z","title":"Generative Learning for Simulation of Vehicle Faults","summary":" We develop a novel generative model to simulate vehicle health and forecast\nfaults, conditioned on practical operational considerations. The model, trained\non data from the US Army's Predictive Logistics program, aims to support\npredictive maintenance. It forecasts faults far enough in advance to execute a\nmaintenance intervention before a breakdown occurs. The model incorporates\nreal-world factors that affect vehicle health. It also allows us to understand\nthe vehicle's condition by analyzing operating data, and characterizing each\nvehicle into discrete states. Importantly, the model predicts the time to first\nfault with high accuracy. We compare its performance to other models and\ndemonstrate its successful training.\n","authors":["Patrick Kuiper","Sirui Lin","Jose Blanchet","Vahid Tarokh"],"pdf_url":"https://arxiv.org/pdf/2407.17654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10608v2","updated":"2024-07-30T10:38:31Z","published":"2024-05-17T08:12:53Z","title":"ECATS: Explainable-by-design concept-based anomaly detection for time\n series","summary":" Deep learning methods for time series have already reached excellent\nperformances in both prediction and classification tasks, including anomaly\ndetection. However, the complexity inherent in Cyber Physical Systems (CPS)\ncreates a challenge when it comes to explainability methods. To overcome this\ninherent lack of interpretability, we propose ECATS, a concept-based\nneuro-symbolic architecture where concepts are represented as Signal Temporal\nLogic (STL) formulae. Leveraging kernel-based methods for STL, concept\nembeddings are learnt in an unsupervised manner through a cross-attention\nmechanism. The network makes class predictions through these concept\nembeddings, allowing for a meaningful explanation to be naturally extracted for\neach input. Our preliminary experiments with a simple CPS-based dataset show\nthat our model is able to achieve great classification performance while\nensuring local interpretability.\n","authors":["Irene Ferfoglia","Gaia Saveri","Laura Nenzi","Luca Bortolussi"],"pdf_url":"https://arxiv.org/pdf/2405.10608v2.pdf","comment":"14 pages, 8 figures, accepted to 18th International Conference on\n Neural-Symbolic Learning and Reasoning (NeSy 2024)"},{"id":"http://arxiv.org/abs/2407.20722v1","updated":"2024-07-30T10:34:40Z","published":"2024-07-30T10:34:40Z","title":"Persistent Sampling: Unleashing the Potential of Sequential Monte Carlo","summary":" Sequential Monte Carlo (SMC) methods are powerful tools for Bayesian\ninference but suffer from requiring many particles for accurate estimates,\nleading to high computational costs. We introduce persistent sampling (PS), an\nextension of SMC that mitigates this issue by allowing particles from previous\niterations to persist. This generates a growing, weighted ensemble of particles\ndistributed across iterations. In each iteration, PS utilizes multiple\nimportance sampling and resampling from the mixture of all previous\ndistributions to produce the next generation of particles. This addresses\nparticle impoverishment and mode collapse, resulting in more accurate posterior\napproximations. 
Furthermore, this approach provides lower-variance marginal\nlikelihood estimates for model comparison. Additionally, the persistent\nparticles improve transition kernel adaptation for efficient exploration.\nExperiments on complex distributions show that PS consistently outperforms\nstandard methods, achieving lower squared bias in posterior moment estimation\nand significantly reduced marginal likelihood errors, all at a lower\ncomputational cost. PS offers a robust, efficient, and scalable framework for\nBayesian inference.\n","authors":["Minas Karamanis","Uroš Seljak"],"pdf_url":"https://arxiv.org/pdf/2407.20722v1.pdf","comment":"30 pages, 9 figures, 4 tables. Submitted to Statistics & Computing"},{"id":"http://arxiv.org/abs/2302.13268v5","updated":"2024-07-30T10:29:24Z","published":"2023-02-26T08:43:08Z","title":"A survey of machine learning techniques in medical applications","summary":" In recent years, machine learning (ML) has emerged as a powerful tool for\nsolving a wide range of problems, including medical decision-making. The\nexponential growth of medical data over the past two decades has surpassed the\ncapacity for manual analysis, prompting increased interest in automated data\nanalysis and processing. ML algorithms, capable of learning from data with\nminimal human intervention, are particularly well-suited for medical data\nanalysis and interpretation. One significant advantage of ML is the reduced\ncost of collecting labeled training data necessary for supervised learning.\nWhile numerous studies have explored the applications of ML in medicine, this\nsurvey specifically focuses on the use of ML across various medical research\nfields. We provide a comprehensive technical overview of existing studies on ML\napplications in medicine, highlighting the strengths and limitations of these\napproaches. Additionally, we discuss potential research directions for future\nexploration. These include the development of more sophisticated reward\nfunctions, as the accuracy of the reward function is crucial for ML\nperformance, the integration of ML with other techniques, and the application\nof ML to new and emerging areas in genomics research. Finally, we summarize our\nfindings and present the current state of the field and the future outlook for\nML in medical application.\n","authors":["M. Keramy","K. Jahanian","R. Sani","A. Agha","I. Dehzangy","M. Yan","H. Rokni"],"pdf_url":"https://arxiv.org/pdf/2302.13268v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20705v1","updated":"2024-07-30T10:00:16Z","published":"2024-07-30T10:00:16Z","title":"PIP: Prototypes-Injected Prompt for Federated Class Incremental Learning","summary":" Federated Class Incremental Learning (FCIL) is a new direction in continual\nlearning (CL) for addressing catastrophic forgetting and non-IID data\ndistribution simultaneously. Existing FCIL methods call for high communication\ncosts and exemplars from previous classes. We propose a novel rehearsal-free\nmethod for FCIL named prototypes-injected prompt (PIP) that involves 3 main\nideas: a) prototype injection on prompt learning, b) prototype augmentation,\nand c) weighted Gaussian aggregation on the server side. Our experiment result\nshows that the proposed method outperforms the current state of the arts\n(SOTAs) with a significant improvement (up to 33%) in CIFAR100, MiniImageNet\nand TinyImageNet datasets. 
Our extensive analysis demonstrates the robustness\nof PIP across different task sizes, and the advantage of requiring fewer\nparticipating local clients and fewer global rounds. For further study, the\nsource code of PIP, baselines, and experimental logs are shared publicly at\nhttps://github.com/anwarmaxsum/PIP.\n","authors":["Muhammad Anwar Ma'sum","Mahardhika Pratama","Savitha Ramasamy","Lin Liu","Habibullah Habibullah","Ryszard Kowalczyk"],"pdf_url":"https://arxiv.org/pdf/2407.20705v1.pdf","comment":"Conference on Information and Knowledge Management (CIKM) 2024\n (Accepted)"},{"id":"http://arxiv.org/abs/2302.03322v3","updated":"2024-07-30T09:59:47Z","published":"2023-02-07T08:54:37Z","title":"Attacking Cooperative Multi-Agent Reinforcement Learning by Adversarial\n Minority Influence","summary":" This study probes the vulnerabilities of cooperative multi-agent\nreinforcement learning (c-MARL) under adversarial attacks, a critical\ndeterminant of c-MARL's worst-case performance prior to real-world\nimplementation. Current observation-based attacks, constrained by white-box\nassumptions, overlook c-MARL's complex multi-agent interactions and cooperative\nobjectives, resulting in impractical and limited attack capabilities. To\naddress these shortcomings, we propose Adversarial Minority Influence (AMI), a\npractical and strong attack for c-MARL. AMI is a practical black-box attack and can be\nlaunched without knowing victim parameters. AMI is also strong by considering\nthe complex multi-agent interaction and the cooperative goal of agents,\nenabling a single adversarial agent to unilaterally mislead the majority of victims\ninto forming targeted worst-case cooperation. This mirrors minority influence\nphenomena in social psychology. To achieve maximum deviation in victim policies\nunder complex agent-wise interactions, our unilateral attack aims to\ncharacterize and maximize the impact of the adversary on the victims. This is\nachieved by adapting a unilateral agent-wise relation metric derived from\nmutual information, thereby mitigating the adverse effects of victim influence\non the adversary. To lead the victims into a jointly detrimental scenario, our\ntargeted attack deceives victims into a long-term, cooperatively harmful\nsituation by guiding each victim towards a specific target, determined through\na trial-and-error process executed by a reinforcement learning agent. Through\nAMI, we achieve the first successful attack against real-world robot swarms and\neffectively fool agents in simulated environments into collectively worst-case\nscenarios, including Starcraft II and Multi-agent Mujoco. The source code and\ndemonstrations can be found at: https://github.com/DIG-Beihang/AMI.\n","authors":["Simin Li","Jun Guo","Jingqiao Xiu","Yuwei Zheng","Pu Feng","Xin Yu","Aishan Liu","Yaodong Yang","Bo An","Wenjun Wu","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2302.03322v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20700v1","updated":"2024-07-30T09:53:55Z","published":"2024-07-30T09:53:55Z","title":"Industrial-Grade Smart Troubleshooting through Causal Technical Language\n Processing: a Proof of Concept","summary":" This paper describes the development of a causal diagnosis approach for\ntroubleshooting an industrial environment on the basis of the technical\nlanguage expressed in Return on Experience records. 
The proposed method\nleverages the vectorized linguistic knowledge contained in the distributed\nrepresentation of a Large Language Model, and the causal associations entailed\nby the embedded failure modes and mechanisms of the industrial assets. The\npaper presents the elementary but essential concepts of the solution, which is\nconceived as a causality-aware retrieval augmented generation system, and\nillustrates them experimentally on a real-world Predictive Maintenance setting.\nFinally, it discusses avenues of improvement for the maturity of the utilized\ncausal technology to meet the robustness challenges of increasingly complex\nscenarios in the industry.\n","authors":["Alexandre Trilla","Ossee Yiboe","Nenad Mijatovic","Jordi Vitrià"],"pdf_url":"https://arxiv.org/pdf/2407.20700v1.pdf","comment":"2nd Workshop on Causal Inference and Machine Learning in Practice at\n the KDD 2024 Conference. arXiv admin note: text overlap with arXiv:2407.11056"},{"id":"http://arxiv.org/abs/2402.00912v2","updated":"2024-07-30T09:49:51Z","published":"2024-02-01T10:18:43Z","title":"Can we Constrain Concept Bottleneck Models to Learn Semantically\n Meaningful Input Features?","summary":" Concept Bottleneck Models (CBMs) are regarded as inherently interpretable\nbecause they first predict a set of human-defined concepts which are used to\npredict a task label. For inherent interpretability to be fully realised, and\nensure trust in a model's output, it's desirable for concept predictions to use\nsemantically meaningful input features. For instance, in an image, pixels\nrepresenting a broken bone should contribute to predicting a fracture. However,\ncurrent literature suggests that concept predictions often rely on irrelevant\ninput features. We hypothesise that this occurs when dataset labels include\ninaccurate concept annotations, or the relationship between input features and\nconcepts is unclear. In general, the effect of dataset labelling on concept\nrepresentations remains an understudied area. In this paper, we demonstrate\nthat CBMs can learn to map concepts to semantically meaningful input features,\nby utilising datasets with a clear link between the input features and the\ndesired concept predictions. This is achieved, for instance, by ensuring\nmultiple concepts do not always co-occur and, therefore provide a clear\ntraining signal for the CBM to distinguish the relevant input features for each\nconcept. We validate our hypothesis on both synthetic and real-world image\ndatasets, and demonstrate under the correct conditions, CBMs can learn to\nattribute semantically meaningful input features to the correct concept\npredictions.\n","authors":["Jack Furby","Daniel Cunnington","Dave Braines","Alun Preece"],"pdf_url":"https://arxiv.org/pdf/2402.00912v2.pdf","comment":"Main paper: 8 pages, 9 figures, Appendix: 14 pages, 21 figures. This\n paper is a preprint"},{"id":"http://arxiv.org/abs/2407.20697v1","updated":"2024-07-30T09:46:03Z","published":"2024-07-30T09:46:03Z","title":"Weak neural variational inference for solving Bayesian inverse problems\n without forward models: applications in elastography","summary":" In this paper, we introduce a novel, data-driven approach for solving\nhigh-dimensional Bayesian inverse problems based on partial differential\nequations (PDEs), called Weak Neural Variational Inference (WNVI). The method\ncomplements real measurements with virtual observations derived from the\nphysical model. 
In particular, weighted residuals are employed as probes to the\ngoverning PDE in order to formulate and solve a Bayesian inverse problem\nwithout ever formulating nor solving a forward model. The formulation treats\nthe state variables of the physical model as latent variables, inferred using\nStochastic Variational Inference (SVI), along with the usual unknowns. The\napproximate posterior employed uses neural networks to approximate the inverse\nmapping from state variables to the unknowns. We illustrate the proposed method\nin a biomedical setting where we infer spatially varying material properties\nfrom noisy tissue deformation data. We demonstrate that WNVI is not only as\naccurate and more efficient than traditional methods that rely on repeatedly\nsolving the (non)linear forward problem as a black-box, but it can also handle\nill-posed forward problems (e.g., with insufficient boundary conditions).\n","authors":["Vincent C. Scholz","Yaohua Zang","Phaedon-Stelios Koutsourelakis"],"pdf_url":"https://arxiv.org/pdf/2407.20697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20695v1","updated":"2024-07-30T09:43:42Z","published":"2024-07-30T09:43:42Z","title":"Time Series Anomaly Detection with CNN for Environmental Sensors in\n Healthcare-IoT","summary":" This research develops a new method to detect anomalies in time series data\nusing Convolutional Neural Networks (CNNs) in healthcare-IoT. The proposed\nmethod creates a Distributed Denial of Service (DDoS) attack using an IoT\nnetwork simulator, Cooja, which emulates environmental sensors such as\ntemperature and humidity. CNNs detect anomalies in time series data, resulting\nin a 92\\% accuracy in identifying possible attacks.\n","authors":["Mirza Akhi Khatun","Mangolika Bhattacharya","Ciarán Eising","Lubna Luxmi Dhirani"],"pdf_url":"https://arxiv.org/pdf/2407.20695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20694v1","updated":"2024-07-30T09:43:35Z","published":"2024-07-30T09:43:35Z","title":"Detecting Causality in the Frequency Domain with Cross-Mapping Coherence","summary":" Understanding causal relationships within a system is crucial for uncovering\nits underlying mechanisms. Causal discovery methods, which facilitate the\nconstruction of such models from time-series data, hold the potential to\nsignificantly advance scientific and engineering fields.\n This study introduces the Cross-Mapping Coherence (CMC) method, designed to\nreveal causal connections in the frequency domain between time series. CMC\nbuilds upon nonlinear state-space reconstruction and extends the Convergent\nCross-Mapping algorithm to the frequency domain by utilizing coherence metrics\nfor evaluation. We tested the Cross-Mapping Coherence method using simulations\nof logistic maps, Lorenz systems, Kuramoto oscillators, and the Wilson-Cowan\nmodel of the visual cortex. CMC accurately identified the direction of causal\nconnections in all simulated scenarios. 
When applied to the Wilson-Cowan model,\nCMC yielded consistent results similar to spectral Granger causality.\n Furthermore, CMC exhibits high sensitivity in detecting weak connections,\ndemonstrates sample efficiency, and maintains robustness in the presence of\nnoise.\n In conclusion, the capability to determine directed causal influences across\ndifferent frequency bands allows CMC to provide valuable insights into the\ndynamics of complex, nonlinear systems.\n","authors":["Zsigmond Benkő","Bálint Varga","Marcell Stippinger","Zoltán Somogyvári"],"pdf_url":"https://arxiv.org/pdf/2407.20694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04474v2","updated":"2024-07-30T09:41:12Z","published":"2023-04-10T09:23:34Z","title":"Data Imputation from the Perspective of Graph Dirichlet Energy","summary":" Data imputation is a crucial task due to the widespread occurrence of missing\ndata. Many methods adopt a two-step approach: initially crafting a preliminary\nimputation (the \"draft\") and then refining it to produce the final missing data\nimputation result, commonly referred to as \"draft-then-refine\". In our study,\nwe examine this prevalent strategy through the lens of graph Dirichlet energy.\nWe observe that a basic \"draft\" imputation tends to decrease the Dirichlet\nenergy. Therefore, a subsequent \"refine\" step is necessary to restore the\noverall energy balance. Existing refinement techniques, such as the Graph\nConvolutional Network (GCN), often result in further energy reduction. To\naddress this, we introduce a new framework, the Graph Laplacian Pyramid Network\n(GLPN). GLPN incorporates a U-shaped autoencoder and residual networks to\ncapture both global and local details effectively. Through extensive\nexperiments on multiple real-world datasets, GLPN consistently outperforms\nstate-of-the-art methods across three different missing data mechanisms. The\ncode is available at https://github.com/liguanlue/GLPN.\n","authors":["Weiqi Zhang","Guanlue Li","Jianheng Tang","Jia Li","Fugee Tsung"],"pdf_url":"https://arxiv.org/pdf/2304.04474v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.20678v1","updated":"2024-07-30T09:20:15Z","published":"2024-07-30T09:20:15Z","title":"The Susceptibility of Example-Based Explainability Methods to Class\n Outliers","summary":" This study explores the impact of class outliers on the effectiveness of\nexample-based explainability methods for black-box machine learning models. We\nreformulate existing explainability evaluation metrics, such as correctness and\nrelevance, specifically for example-based methods, and introduce a new metric,\ndistinguishability. Using these metrics, we highlight the shortcomings of\ncurrent example-based explainability methods, including those who attempt to\nsuppress class outliers. We conduct experiments on two datasets, a text\nclassification dataset and an image classification dataset, and evaluate the\nperformance of four state-of-the-art explainability methods. 
Our findings\nunderscore the need for robust techniques to tackle the challenges posed by\nclass outliers.\n","authors":["Ikhtiyor Nematov","Dimitris Sacharidis","Tomer Sagi","Katja Hose"],"pdf_url":"https://arxiv.org/pdf/2407.20678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05373v3","updated":"2024-07-30T09:05:05Z","published":"2023-12-15T12:45:47Z","title":"Dynamic Spiking Framework for Graph Neural Networks","summary":" The integration of Spiking Neural Networks (SNNs) and Graph Neural Networks\n(GNNs) is gradually attracting attention due to the low power consumption and\nhigh efficiency in processing the non-Euclidean data represented by graphs.\nHowever, as a common problem, dynamic graph representation learning faces\nchallenges such as high complexity and large memory overheads. Current work\noften uses SNNs instead of Recurrent Neural Networks (RNNs) by using binary\nfeatures instead of continuous ones for efficient training, which\noverlooks graph structure information and leads to the loss of details during\npropagation. Additionally, optimizing dynamic spiking models typically requires\npropagation of information across time steps, which increases memory\nrequirements. To address these challenges, we present a framework named\nDynamic Spiking Graph Neural Networks. To mitigate the information loss\nproblem, the proposed framework propagates early-layer information directly to the last\nlayer for information compensation. To accommodate the memory requirements, we\napply implicit differentiation on the equilibrium state, which does not\nrely on the exact reverse of the forward computation. While traditional\nimplicit differentiation methods are usually used for static situations, the\nproposed framework extends this approach to the dynamic graph setting. Extensive experiments on\nthree large-scale real-world dynamic graph datasets validate the effectiveness\nof the proposed framework on dynamic node classification tasks with lower computational\ncosts.\n","authors":["Nan Yin","Mengzhu Wang","Zhenghan Chen","Giulia De Masi","Bin Gu","Huan Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.05373v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20667v1","updated":"2024-07-30T09:04:23Z","published":"2024-07-30T09:04:23Z","title":"Rethinking the Function of Neurons in KANs","summary":" The neurons of Kolmogorov-Arnold Networks (KANs) perform a simple summation\nmotivated by the Kolmogorov-Arnold representation theorem, which asserts that\nsum is the only fundamental multivariate function. In this work, we investigate\nthe potential for identifying an alternative multivariate function for KAN\nneurons that may offer increased practical utility. Our empirical research\ninvolves testing various multivariate functions in KAN neurons across a range\nof benchmark Machine Learning tasks.\n Our findings indicate that substituting the sum with the average function in\nKAN neurons results in significant performance enhancements compared to\ntraditional KANs. Our study demonstrates that this minor modification\ncontributes to the stability of training by confining the input to the spline\nwithin the effective range of the activation function. 
Our implementation and\nexperiments are available at: \\url{https://github.com/Ghaith81/dropkan}\n","authors":["Mohammed Ghaith Altarabichi"],"pdf_url":"https://arxiv.org/pdf/2407.20667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19456v2","updated":"2024-07-30T08:58:51Z","published":"2024-04-30T11:13:23Z","title":"A Survey of Imitation Learning Methods, Environments and Metrics","summary":" Imitation learning is an approach in which an agent learns how to execute a\ntask by trying to mimic how one or more teachers perform it. This learning\napproach offers a compromise between the time it takes to learn a new task and\nthe effort needed to collect teacher samples for the agent. It achieves this by\nbalancing learning from the teacher, who has some information on how to perform\nthe task, and deviating from their examples when necessary, such as states not\npresent in the teacher samples. Consequently, the field of imitation learning\nhas received much attention from researchers in recent years, resulting in many\nnew methods and applications. However, with this increase in published work and\npast surveys focusing mainly on methodology, a lack of standardisation became\nmore prominent in the field. This non-standardisation is evident in the use of\nenvironments, which appear in no more than two works, and evaluation processes,\nsuch as qualitative analysis, that have become rare in current literature. In\nthis survey, we systematically review current imitation learning literature and\npresent our findings by (i) classifying imitation learning techniques,\nenvironments and metrics by introducing novel taxonomies; (ii) reflecting on\nmain problems from the literature; and (iii) presenting challenges and future\ndirections for researchers.\n","authors":["Nathan Gavenski","Felipe Meneguzzi","Michael Luck","Odinaldo Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2404.19456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20013v2","updated":"2024-07-30T08:56:25Z","published":"2024-07-29T13:45:23Z","title":"Classification of freshwater snails of the genus Radomaniola with\n multimodal triplet networks","summary":" In this paper, we present our first proposal of a machine learning system for\nthe classification of freshwater snails of the genus Radomaniola. We elaborate\non the specific challenges encountered during system design, and how we tackled\nthem; namely a small, very imbalanced dataset with a high number of classes and\nhigh visual similarity between classes. We then show how we employed triplet\nnetworks and the multiple input modalities of images, measurements, and genetic\ninformation to overcome these challenges and reach a performance comparable to\nthat of a trained domain expert.\n","authors":["Dennis Vetter","Muhammad Ahsan","Diana Delicado","Thomas A. Neubauer","Thomas Wilke","Gemma Roig"],"pdf_url":"https://arxiv.org/pdf/2407.20013v2.pdf","comment":"Spotlight at ICML 2024 AI for Science workshop"},{"id":"http://arxiv.org/abs/2407.20662v1","updated":"2024-07-30T08:55:27Z","published":"2024-07-30T08:55:27Z","title":"DocXPand-25k: a large and diverse benchmark dataset for identity\n documents analysis","summary":" Identity document (ID) image analysis has become essential for many online\nservices, like bank account opening or insurance subscription. In recent years,\nmuch research has been conducted on subjects like document localization, text\nrecognition and fraud detection, to achieve a level of accuracy reliable enough\nto automatize identity verification. 
However, there are only a few available\ndatasets to benchmark ID analysis methods, mainly because of privacy\nrestrictions, security requirements and legal reasons.\n In this paper, we present the DocXPand-25k dataset, which consists of 24,994\nrichly labeled IDs images, generated using custom-made vectorial templates\nrepresenting nine fictitious ID designs, including four identity cards, two\nresidence permits and three passports designs. These synthetic IDs feature\nartificially generated personal information (names, dates, identifiers, faces,\nbarcodes, ...), and present a rich diversity in the visual layouts and textual\ncontents.\n We collected about 5.8k diverse backgrounds coming from real-world photos,\nscans and screenshots of IDs to guarantee the variety of the backgrounds. The\nsoftware we wrote to generate these images has been published\n(https://github.com/QuickSign/docxpand/) under the terms of the MIT license,\nand our dataset has been published\n(https://github.com/QuickSign/docxpand/releases/tag/v1.0.0) under the terms of\nthe CC-BY-NC-SA 4.0 License.\n","authors":["Julien Lerouge","Guillaume Betmont","Thomas Bres","Evgeny Stepankevich","Alexis Bergès"],"pdf_url":"https://arxiv.org/pdf/2407.20662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20657v1","updated":"2024-07-30T08:52:16Z","published":"2024-07-30T08:52:16Z","title":"Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks","summary":" Recent vision-language foundation models, such as CLIP, have demonstrated\nsuperior capabilities in learning representations that can be transferable\nacross diverse range of downstream tasks and domains. With the emergence of\nsuch powerful models, it has become crucial to effectively leverage their\ncapabilities in tackling challenging vision tasks. On the other hand, only a\nfew works have focused on devising adversarial examples that transfer well to\nboth unknown domains and model architectures. In this paper, we propose a novel\ntransfer attack method called PDCL-Attack, which leverages the CLIP model to\nenhance the transferability of adversarial perturbations generated by a\ngenerative model-based attack framework. Specifically, we formulate an\neffective prompt-driven feature guidance by harnessing the semantic\nrepresentation power of text, particularly from the ground-truth class labels\nof input images. To the best of our knowledge, we are the first to introduce\nprompt learning to enhance the transferable generative attacks. Extensive\nexperiments conducted across various cross-domain and cross-model settings\nempirically validate our approach, demonstrating its superiority over\nstate-of-the-art methods.\n","authors":["Hunmin Yang","Jongoh Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.20657v1.pdf","comment":"Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io"},{"id":"http://arxiv.org/abs/2407.20656v1","updated":"2024-07-30T08:52:10Z","published":"2024-07-30T08:52:10Z","title":"Efficient Multi-Objective Neural Architecture Search via Pareto\n Dominance-based Novelty Search","summary":" Neural Architecture Search (NAS) aims to automate the discovery of\nhigh-performing deep neural network architectures. Traditional objective-based\nNAS approaches typically optimize a certain performance metric (e.g.,\nprediction accuracy), overlooking large parts of the architecture search space\nthat potentially contain interesting network configurations. 
Furthermore,\nobjective-driven population-based metaheuristics in complex search spaces often\nquickly exhaust population diversity and succumb to premature convergence to\nlocal optima. This issue becomes more complicated in NAS when performance\nobjectives do not fully align with the actual performance of the candidate\narchitectures, as is often the case with training-free metrics. While\ntraining-free metrics have gained popularity for their rapid performance\nestimation of candidate architectures without incurring computation-heavy\nnetwork training, their effective incorporation into NAS remains a challenge.\nThis paper presents the Pareto Dominance-based Novelty Search for\nmulti-objective NAS with Multiple Training-Free metrics (MTF-PDNS). Unlike\nconventional NAS methods that optimize explicit objectives, MTF-PDNS promotes\npopulation diversity by utilizing a novelty score calculated based on multiple\ntraining-free performance and complexity metrics, thereby yielding a broader\nexploration of the search space. Experimental results on standard NAS benchmark\nsuites demonstrate that MTF-PDNS outperforms conventional methods driven by\nexplicit objectives in terms of convergence speed, diversity maintenance,\narchitecture transferability, and computational costs.\n","authors":["An Vo","Ngoc Hoang Luong"],"pdf_url":"https://arxiv.org/pdf/2407.20656v1.pdf","comment":"10 pages, 4 figures. Accepted as full paper at GECCO 2024"},{"id":"http://arxiv.org/abs/2407.20653v1","updated":"2024-07-30T08:50:06Z","published":"2024-07-30T08:50:06Z","title":"FACL-Attack: Frequency-Aware Contrastive Learning for Transferable\n Adversarial Attacks","summary":" Deep neural networks are known to be vulnerable to security risks due to the\ninherent transferable nature of adversarial examples. Despite the success of\nrecent generative model-based attacks demonstrating strong transferability, it\nstill remains a challenge to design an efficient attack strategy in a\nreal-world strict black-box setting, where both the target domain and model\narchitectures are unknown. In this paper, we seek to explore a feature\ncontrastive approach in the frequency domain to generate adversarial examples\nthat are robust in both cross-domain and cross-model settings. With that goal\nin mind, we propose two modules that are only employed during the training\nphase: a Frequency-Aware Domain Randomization (FADR) module to randomize\ndomain-variant low- and high-range frequency components and a\nFrequency-Augmented Contrastive Learning (FACL) module to effectively separate\ndomain-invariant mid-frequency features of clean and perturbed image. We\ndemonstrate strong transferability of our generated adversarial perturbations\nthrough extensive cross-domain and cross-model experiments, while keeping the\ninference time complexity.\n","authors":["Hunmin Yang","Jongoh Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.20653v1.pdf","comment":"Accepted to AAAI 2024, Project Page: https://FACL-Attack.github.io"},{"id":"http://arxiv.org/abs/2407.20651v1","updated":"2024-07-30T08:48:49Z","published":"2024-07-30T08:48:49Z","title":"Towards Generalizable Reinforcement Learning via Causality-Guided\n Self-Adaptive Representations","summary":" General intelligence requires quick adaption across tasks. 
While existing\nreinforcement learning (RL) methods have made progress in generalization, they\ntypically assume only distribution changes between source and target domains.\nIn this paper, we explore a wider range of scenarios where both the\ndistribution and environment spaces may change. For example, in Atari games, we\ntrain agents to generalize to tasks with different levels of mode and\ndifficulty, where there could be new state or action variables that never\noccurred in previous environments. To address this challenging setting, we\nintroduce a causality-guided self-adaptive representation-based approach,\ncalled CSR, that equips the agent to generalize effectively and efficiently\nacross a sequence of tasks with evolving dynamics. Specifically, we employ\ncausal representation learning to characterize the latent causal variables and\nworld models within the RL system. Such compact causal representations uncover\nthe structural relationships among variables, enabling the agent to\nautonomously determine whether changes in the environment stem from\ndistribution shifts or variations in space, and to precisely locate these\nchanges. We then devise a three-step strategy to fine-tune the model under\ndifferent scenarios accordingly. Empirical experiments show that CSR\nefficiently adapts to the target domains with only a few samples and\noutperforms state-of-the-art baselines on a wide range of scenarios, including\nour simulated environments, Cartpole, and Atari games.\n","authors":["Yupei Yang","Biwei Huang","Fan Feng","Xinyue Wang","Shikui Tu","Lei Xu"],"pdf_url":"https://arxiv.org/pdf/2407.20651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10592v2","updated":"2024-07-30T08:48:04Z","published":"2024-02-16T11:27:48Z","title":"Optimizing Adaptive Experiments: A Unified Approach to Regret\n Minimization and Best-Arm Identification","summary":" Practitioners conducting adaptive experiments often encounter two competing\npriorities: maximizing total welfare (or `reward') through effective treatment\nassignment and swiftly concluding experiments to implement population-wide\ntreatments. Current literature addresses these priorities separately, with\nregret minimization studies focusing on the former and best-arm identification\nresearch on the latter. This paper bridges this divide by proposing a unified\nmodel that simultaneously accounts for within-experiment performance and\npost-experiment outcomes. We provide a sharp theory of optimal performance in\nlarge populations that not only unifies canonical results in the literature but\nalso uncovers novel insights. Our theory reveals that familiar algorithms, such\nas the recently proposed top-two Thompson sampling algorithm, can optimize a\nbroad class of objectives if a single scalar parameter is appropriately\nadjusted. 
In addition, we demonstrate that substantial reductions in experiment\nduration can often be achieved with minimal impact on both within-experiment\nand post-experiment regret.\n","authors":["Chao Qin","Daniel Russo"],"pdf_url":"https://arxiv.org/pdf/2402.10592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20650v1","updated":"2024-07-30T08:47:02Z","published":"2024-07-30T08:47:02Z","title":"No learning rates needed: Introducing SALSA -- Stable Armijo Line Search\n Adaptation","summary":" In recent studies, line search methods have been demonstrated to\nsignificantly enhance the performance of conventional stochastic gradient\ndescent techniques across various datasets and architectures, while making an\notherwise critical choice of learning rate schedule superfluous. In this paper,\nwe identify problems of current state-of-the-art of line search methods,\npropose enhancements, and rigorously assess their effectiveness. Furthermore,\nwe evaluate these methods on orders of magnitude larger datasets and more\ncomplex data domains than previously done. More specifically, we enhance the\nArmijo line search method by speeding up its computation and incorporating a\nmomentum term into the Armijo criterion, making it better suited for stochastic\nmini-batching. Our optimization approach outperforms both the previous Armijo\nimplementation and a tuned learning rate schedule for the Adam and SGD\noptimizers. Our evaluation covers a diverse range of architectures, such as\nTransformers, CNNs, and MLPs, as well as data domains, including NLP and image\ndata.\n Our work is publicly available as a Python package, which provides a simple\nPytorch optimizer.\n","authors":["Philip Kenneweg","Tristan Kenneweg","Fabian Fumagalli","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2407.20650v1.pdf","comment":"published in IJCNN 2024. arXiv admin note: text overlap with\n arXiv:2403.18519"},{"id":"http://arxiv.org/abs/2407.20648v1","updated":"2024-07-30T08:45:32Z","published":"2024-07-30T08:45:32Z","title":"Leveraging Multi-facet Paths for Heterogeneous Graph Representation\n Learning","summary":" Recent advancements in graph neural networks (GNNs) and heterogeneous GNNs\n(HGNNs) have advanced node embeddings and relationship learning for various\ntasks. However, existing methods often rely on domain-specific predefined\nmeta-paths, which are coarse-grained and focus solely on aspects like node\ntype, limiting their ability to capture complex interactions. We introduce\nMF2Vec, a model that uses multi-faceted (fine-grained) paths instead of\npredefined meta-paths. MF2Vec extracts paths via random walks and generates\nmulti-faceted vectors, ignoring predefined schemas. This method learns diverse\naspects of nodes and their relationships, constructs a homogeneous network, and\ncreates node embeddings for classification, link prediction, and clustering.\nExtensive experiments show that MF2Vec outperforms existing methods, offering a\nmore flexible and comprehensive framework for analyzing complex networks. 
The\ncode is available at https://anonymous.4open.science/r/MF2Vec-6ABC.\n","authors":["JongWoo Kim","SeongYeub Chu","HyeongMin Park","Bryan Wong","MunYong Yi"],"pdf_url":"https://arxiv.org/pdf/2407.20648v1.pdf","comment":"9pages"},{"id":"http://arxiv.org/abs/2309.14928v3","updated":"2024-07-30T08:39:52Z","published":"2023-09-26T13:35:31Z","title":"Noise-Tolerant Few-Shot Unsupervised Adapter for Vision-Language Models","summary":" Recent advances in large-scale vision-language models have achieved\nimpressive performance in various zero-shot image classification tasks. While\nprior studies have demonstrated significant improvements by introducing\nfew-shot labelled target samples, they still require labelling of target\nsamples, which greatly degrades their scalability and generalizability while\nhandling various visual recognition tasks. We design NtUA, a Noise-tolerant\nUnsupervised Adapter that allows the learning of effective target models with\nfew unlabelled target samples. NtUA works as a key-value cache that formulates\nvisual features and predicted pseudo-labels of the few unlabelled target\nsamples as key-value pairs. It consists of two complementary designs. The first\nis adaptive cache formation that combats pseudo-label noises by weighting the\nkey-value pairs according to their prediction confidence. The second is\nknowledge-guided cache refinement, which refines pair values (i.e.,\npseudo-labels) and cache weights by leveraging knowledge distillation from\nlarge-scale vision language models. Extensive experiments show that NtUA\nachieves superior performance consistently across multiple widely adopted\nbenchmarks.\n","authors":["Eman Ali","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2309.14928v3.pdf","comment":"Accepted at BMVC 2024"},{"id":"http://arxiv.org/abs/2407.20640v1","updated":"2024-07-30T08:35:26Z","published":"2024-07-30T08:35:26Z","title":"Improved Bounds for Pure Private Agnostic Learning: Item-Level and\n User-Level Privacy","summary":" Machine Learning has made remarkable progress in a wide range of fields. In\nmany scenarios, learning is performed on datasets involving sensitive\ninformation, in which privacy protection is essential for learning algorithms.\nIn this work, we study pure private learning in the agnostic model -- a\nframework reflecting the learning process in practice. We examine the number of\nusers required under item-level (where each user contributes one example) and\nuser-level (where each user contributes multiple examples) privacy and derive\nseveral improved upper bounds. For item-level privacy, our algorithm achieves a\nnear optimal bound for general concept classes. We extend this to the\nuser-level setting, rendering a tighter upper bound than the one proved by\nGhazi et al. (2023). Lastly, we consider the problem of learning thresholds\nunder user-level privacy and present an algorithm with a nearly tight user\ncomplexity.\n","authors":["Bo Li","Wei Wang","Peng Ye"],"pdf_url":"https://arxiv.org/pdf/2407.20640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08925v3","updated":"2024-07-30T08:20:41Z","published":"2023-09-16T08:39:28Z","title":"DOMAIN: MilDly COnservative Model-BAsed OfflINe Reinforcement Learning","summary":" Model-based reinforcement learning (RL), which learns environment model from\noffline dataset and generates more out-of-distribution model data, has become\nan effective approach to the problem of distribution shift in offline RL. 
Due\nto the gap between the learned and actual environment, conservatism should be\nincorporated into the algorithm to balance accurate offline data and imprecise\nmodel data. The conservatism of current algorithms mostly relies on model\nuncertainty estimation. However, uncertainty estimation is unreliable and leads\nto poor performance in certain scenarios, and previous methods ignore\ndifferences among the model data, which leads to excessive conservatism. Therefore,\nthis paper proposes a milDly cOnservative Model-bAsed offlINe RL algorithm\n(DOMAIN) without estimating model uncertainty to address the above issues.\nDOMAIN introduces an adaptive sampling distribution of model samples, which can\nadaptively adjust the model data penalty. In this paper, we theoretically\ndemonstrate that the Q value learned by DOMAIN outside the region is a\nlower bound of the true Q value, that DOMAIN is less conservative than previous\nmodel-based offline RL algorithms, and that it guarantees safe policy\nimprovement. The results of extensive experiments show that DOMAIN outperforms\nprior RL algorithms on the D4RL dataset benchmark.\n","authors":["Xiao-Yin Liu","Xiao-Hu Zhou","Mei-Jiang Gui","Xiao-Liang Xie","Shi-Qi Liu","Shuang-Yi Wang","Hao Li","Tian-Yu Xiang","De-Xing Huang","Zeng-Guang Hou"],"pdf_url":"https://arxiv.org/pdf/2309.08925v3.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.03088v2","updated":"2024-07-30T08:19:53Z","published":"2024-04-03T22:03:28Z","title":"Robust Federated Learning for Wireless Networks: A Demonstration with\n Channel Estimation","summary":" Federated learning (FL) offers a privacy-preserving collaborative approach\nfor training models in wireless networks, with channel estimation emerging as a\npromising application. Despite extensive studies on FL-empowered channel\nestimation, the security concerns associated with FL require meticulous\nattention. In a scenario where small base stations (SBSs) serve as local models\ntrained on cached data, and a macro base station (MBS) functions as the global\nmodel setting, an attacker can exploit the vulnerabilities of FL by launching\nvarious adversarial attacks or deployment tactics. In this paper,\nwe analyze such vulnerabilities, bring forth corresponding solutions, and\nvalidate them through simulation.\n","authors":["Zexin Fang","Bin Han","Hans D. Schotten"],"pdf_url":"https://arxiv.org/pdf/2404.03088v2.pdf","comment":"Submitted to IEEE GLOBECOM 2024"},{"id":"http://arxiv.org/abs/2407.20623v1","updated":"2024-07-30T07:59:28Z","published":"2024-07-30T07:59:28Z","title":"SharkTrack: an accurate, generalisable software for streamlining shark\n and ray underwater video analysis","summary":" Elasmobranchs (sharks and rays) can be important components of marine\necosystems but are experiencing global population declines. Effective\nmonitoring of these populations is essential to their protection. Baited Remote\nUnderwater Video Stations (BRUVS) have been a key tool for monitoring, but\nrequire time-consuming manual analysis. To address these challenges, we\ndeveloped SharkTrack, an AI-enhanced BRUVS analysis software. SharkTrack uses\nConvolutional Neural Networks and Multi-Object Tracking to detect and track\nelasmobranchs and provides an annotation pipeline to manually classify\nelasmobranch species and compute MaxN, the standard metric of relative\nabundance. We tested SharkTrack on BRUVS footage from locations unseen by the\nmodel during training. 
SharkTrack computed MaxN with 89% accuracy over 207\nhours of footage. The semi-automatic SharkTrack pipeline required two minutes\nof manual classification per hour of video, a 97% reduction of manual BRUVS\nanalysis time compared to traditional methods, estimated conservatively at one\nhour per hour of video. Furthermore, we demonstrate SharkTrack application\nacross diverse marine ecosystems and elasmobranch species, an advancement\ncompared to previous models, which were limited to specific species or\nlocations. SharkTrack applications extend beyond BRUVS analysis, facilitating\nrapid annotation of unlabeled videos, aiding the development of further models\nto classify elasmobranch species. We provide public access to the software and\nan unprecedentedly diverse dataset, facilitating future research in an\nimportant area of marine conservation.\n","authors":["Filippo Varini","Francesco Ferretti","Jeremy Jenrette","Joel H. Gayford","Mark E. Bond","Matthew J. Witt","Michael R. Heithaus","Sophie Wilday","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2407.20623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13505v4","updated":"2024-07-30T07:52:34Z","published":"2024-02-21T03:39:04Z","title":"SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed\n Semi-Supervised Learning","summary":" Recent advancements in semi-supervised learning have focused on a more\nrealistic yet challenging task: addressing imbalances in labeled data while the\nclass distribution of unlabeled data remains both unknown and potentially\nmismatched. Current approaches in this sphere often presuppose rigid\nassumptions regarding the class distribution of unlabeled data, thereby\nlimiting the adaptability of models to only certain distribution ranges. In\nthis study, we propose a novel approach, introducing a highly adaptable\nframework, designated as SimPro, which does not rely on any predefined\nassumptions about the distribution of unlabeled data. Our framework, grounded\nin a probabilistic model, innovatively refines the expectation-maximization\n(EM) algorithm by explicitly decoupling the modeling of conditional and\nmarginal class distributions. This separation facilitates a closed-form\nsolution for class distribution estimation during the maximization phase,\nleading to the formulation of a Bayes classifier. The Bayes classifier, in\nturn, enhances the quality of pseudo-labels in the expectation phase.\nRemarkably, the SimPro framework not only comes with theoretical guarantees but\nalso is straightforward to implement. Moreover, we introduce two novel class\ndistributions broadening the scope of the evaluation. Our method showcases\nconsistent state-of-the-art performance across diverse benchmarks and data\ndistribution scenarios. Our code is available at\nhttps://github.com/LeapLabTHU/SimPro.\n","authors":["Chaoqun Du","Yizeng Han","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.13505v4.pdf","comment":"ICML2024 camera-ready version"},{"id":"http://arxiv.org/abs/2407.20620v1","updated":"2024-07-30T07:52:22Z","published":"2024-07-30T07:52:22Z","title":"Accelerated forward-backward and Douglas-Rachford splitting dynamics","summary":" We examine convergence properties of continuous-time variants of accelerated\nForward-Backward (FB) and Douglas-Rachford (DR) splitting algorithms for\nnonsmooth composite optimization problems. 
When the objective function is given\nby the sum of a quadratic and a nonsmooth term, we establish accelerated\nsublinear and exponential convergence rates for convex and strongly convex\nproblems, respectively. Moreover, for FB splitting dynamics, we demonstrate\nthat accelerated exponential convergence rate carries over to general strongly\nconvex problems. In our Lyapunov-based analysis we exploit the variable-metric\ngradient interpretations of FB and DR splittings to obtain smooth Lyapunov\nfunctions that allow us to establish accelerated convergence rates. We provide\ncomputational experiments to demonstrate the merits and the effectiveness of\nour analysis.\n","authors":["Ibrahim K. Ozaslan","Mihailo R. Jovanović"],"pdf_url":"https://arxiv.org/pdf/2407.20620v1.pdf","comment":"10 pages; 2 figures"},{"id":"http://arxiv.org/abs/2407.20611v1","updated":"2024-07-30T07:36:13Z","published":"2024-07-30T07:36:13Z","title":"The Entrapment Problem in Random Walk Decentralized Learning","summary":" This paper explores decentralized learning in a graph-based setting, where\ndata is distributed across nodes. We investigate a decentralized SGD algorithm\nthat utilizes a random walk to update a global model based on local data. Our\nfocus is on designing the transition probability matrix to speed up\nconvergence. While importance sampling can enhance centralized learning, its\ndecentralized counterpart, using the Metropolis-Hastings (MH) algorithm, can\nlead to the entrapment problem, where the random walk becomes stuck at certain\nnodes, slowing convergence. To address this, we propose the Metropolis-Hastings\nwith L\\'evy Jumps (MHLJ) algorithm, which incorporates random perturbations\n(jumps) to overcome entrapment. We theoretically establish the convergence rate\nand error gap of MHLJ and validate our findings through numerical experiments.\n","authors":["Zonghong Liu","Salim El Rouayheb","Matthew Dwyer"],"pdf_url":"https://arxiv.org/pdf/2407.20611v1.pdf","comment":"10 pages, accepted by 2024 IEEE International Symposium on\n Information Theory. The associated presentation of this paper can be found in\n https://www.youtube.com/watch?v=et0sR4lJK_s&ab_channel=LiuZonghong"},{"id":"http://arxiv.org/abs/2407.20601v1","updated":"2024-07-30T07:24:58Z","published":"2024-07-30T07:24:58Z","title":"Investigating Sparsity in Recurrent Neural Networks","summary":" In the past few years, neural networks have evolved from simple Feedforward\nNeural Networks to more complex neural networks, such as Convolutional Neural\nNetworks and Recurrent Neural Networks. Where CNNs are a perfect fit for tasks\nwhere the sequence is not important such as image recognition, RNNs are useful\nwhen order is important such as machine translation. An increasing number of\nlayers in a neural network is one way to improve its performance, but it also\nincreases its complexity making it much more time and power-consuming to train.\nOne way to tackle this problem is to introduce sparsity in the architecture of\nthe neural network. Pruning is one of the many methods to make a neural network\narchitecture sparse by clipping out weights below a certain threshold while\nkeeping the performance near to the original. Another way is to generate\narbitrary structures using random graphs and embed them between an input and\noutput layer of an Artificial Neural Network. Many researchers in past years\nhave focused on pruning mainly CNNs, while hardly any research is done for the\nsame in RNNs. 
The same also holds in creating sparse architectures for RNNs by\ngenerating and embedding arbitrary structures. Therefore, this thesis focuses\non investigating the effects of the before-mentioned two techniques on the\nperformance of RNNs. We first describe the pruning of RNNs, its impact on the\nperformance of RNNs, and the number of training epochs required to regain\naccuracy after the pruning is performed. Next, we continue with the creation\nand training of Sparse Recurrent Neural Networks and identify the relation\nbetween the performance and the graph properties of its underlying arbitrary\nstructure. We perform these experiments on RNN with Tanh nonlinearity\n(RNN-Tanh), RNN with ReLU nonlinearity (RNN-ReLU), GRU, and LSTM. Finally, we\nanalyze and discuss the results achieved from both the experiments.\n","authors":["Harshil Darji"],"pdf_url":"https://arxiv.org/pdf/2407.20601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20597v1","updated":"2024-07-30T07:17:46Z","published":"2024-07-30T07:17:46Z","title":"Joint Diffusion Processes as an Inductive Bias in Sheaf Neural Networks","summary":" Sheaf Neural Networks (SNNs) naturally extend Graph Neural Networks (GNNs) by\nendowing a cellular sheaf over the graph, equipping nodes and edges with vector\nspaces and defining linear mappings between them. While the attached geometric\nstructure has proven to be useful in analyzing heterophily and oversmoothing,\nso far the methods by which the sheaf is computed do not always guarantee a\ngood performance in such settings. In this work, drawing inspiration from\nopinion dynamics concepts, we propose two novel sheaf learning approaches that\n(i) provide a more intuitive understanding of the involved structure maps, (ii)\nintroduce a useful inductive bias for heterophily and oversmoothing, and (iii)\ninfer the sheaf in a way that does not scale with the number of features, thus\nusing fewer learnable parameters than existing methods. In our evaluation, we\nshow the limitations of the real-world benchmarks used so far on SNNs, and\ndesign a new synthetic task -- leveraging the symmetries of n-dimensional\nellipsoids -- that enables us to better assess the strengths and weaknesses of\nsheaf-based models. Our extensive experimentation on these novel datasets\nreveals valuable insights into the scenarios and contexts where SNNs in general\n-- and our proposed approaches in particular -- can be beneficial.\n","authors":["Ferran Hernandez Caralt","Guillermo Bernárdez Gil","Iulia Duta","Pietro Liò","Eduard Alarcón Cot"],"pdf_url":"https://arxiv.org/pdf/2407.20597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20596v1","updated":"2024-07-30T07:15:39Z","published":"2024-07-30T07:15:39Z","title":"Benchmarking Histopathology Foundation Models for Ovarian Cancer\n Bevacizumab Treatment Response Prediction from Whole Slide Images","summary":" Bevacizumab is a widely studied targeted therapeutic drug used in conjunction\nwith standard chemotherapy for the treatment of recurrent ovarian cancer. While\nits administration has shown to increase the progression-free survival (PFS) in\npatients with advanced stage ovarian cancer, the lack of identifiable\nbiomarkers for predicting patient response has been a major roadblock in its\neffective adoption towards personalized medicine. In this work, we leverage the\nlatest histopathology foundation models trained on large-scale whole slide\nimage (WSI) datasets to extract ovarian tumor tissue features for predicting\nbevacizumab response from WSIs. 
Our extensive experiments across a combination\nof different histopathology foundation models and multiple instance learning\n(MIL) strategies demonstrate capability of these large models in predicting\nbevacizumab response in ovarian cancer patients with the best models achieving\nan AUC score of 0.86 and an accuracy score of 72.5%. Furthermore, our survival\nmodels are able to stratify high- and low-risk cases with statistical\nsignificance (p < 0.05) even among the patients with the aggressive subtype of\nhigh-grade serous ovarian carcinoma. This work highlights the utility of\nhistopathology foundation models for the task of ovarian bevacizumab response\nprediction from WSIs. The high-attention regions of the WSIs highlighted by\nthese models not only aid the model explainability but also serve as promising\nimaging biomarkers for treatment prognosis.\n","authors":["Mayur Mallya","Ali Khajegili Mirabadi","Hossein Farahani","Ali Bashashati"],"pdf_url":"https://arxiv.org/pdf/2407.20596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10768v4","updated":"2024-07-30T07:06:05Z","published":"2024-07-15T14:50:15Z","title":"ISMRNN: An Implicitly Segmented RNN Method with Mamba for Long-Term Time\n Series Forecasting","summary":" Long time series forecasting aims to utilize historical information to\nforecast future states over extended horizons. Traditional RNN-based series\nforecasting methods struggle to effectively address long-term dependencies and\ngradient issues in long time series problems. Recently, SegRNN has emerged as a\nleading RNN-based model tailored for long-term series forecasting,\ndemonstrating state-of-the-art performance while maintaining a streamlined\narchitecture through innovative segmentation and parallel decoding techniques.\nNevertheless, SegRNN has several limitations: its fixed segmentation disrupts\ndata continuity and fails to effectively leverage information across different\nsegments, the segmentation strategy employed by SegRNN does not fundamentally\naddress the issue of information loss within the recurrent structure. To\naddress these issues, we propose the ISMRNN method with three key enhancements:\nwe introduce an implicit segmentation structure to decompose the time series\nand map it to segmented hidden states, resulting in denser information exchange\nduring the segmentation phase. Additionally, we incorporate residual structures\nin the encoding layer to mitigate information loss within the recurrent\nstructure. To extract information more effectively, we further integrate the\nMamba architecture to enhance time series information extraction. Experiments\non several real-world long time series forecasting datasets demonstrate that\nour model surpasses the performance of current state-of-the-art models.\n","authors":["GaoXiang Zhao","Li Zhou","XiaoQiang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.10768v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.03374v2","updated":"2024-07-30T06:49:04Z","published":"2022-11-07T09:00:33Z","title":"Deep Causal Learning: Representation, Discovery and Inference","summary":" Causal learning has garnered significant attention in recent years because it\nreveals the essential relationships that underpin phenomena and delineates the\nmechanisms by which the world evolves. 
Nevertheless, traditional causal\nlearning methods face numerous challenges and limitations, including\nhigh-dimensional, unstructured variables, combinatorial optimization problems,\nunobserved confounders, selection biases, and estimation inaccuracies. Deep\ncausal learning, which leverages deep neural networks, offers innovative\ninsights and solutions for addressing these challenges. Although numerous deep\nlearning-based methods for causal discovery and inference have been proposed,\nthere remains a dearth of reviews examining the underlying mechanisms by which\ndeep learning can enhance causal learning. In this article, we comprehensively\nreview how deep learning can contribute to causal learning by tackling\ntraditional challenges across three key dimensions: representation, discovery,\nand inference. We emphasize that deep causal learning is pivotal for advancing\nthe theoretical frontiers and broadening the practical applications of causal\nscience. We conclude by summarizing open issues and outlining potential\ndirections for future research.\n","authors":["Zizhen Deng","Xiaolong Zheng","Hu Tian","Daniel Dajun Zeng"],"pdf_url":"https://arxiv.org/pdf/2211.03374v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16663v2","updated":"2024-07-30T06:44:05Z","published":"2024-07-23T17:26:38Z","title":"Computable learning of natural hypothesis classes","summary":" This paper is about the recent notion of computably probably approximately\ncorrect learning, which lies between the statistical learning theory where\nthere is no computational requirement on the learner and efficient PAC where\nthe learner must be polynomially bounded. Examples have recently been given of\nhypothesis classes which are PAC learnable but not computably PAC learnable,\nbut these hypothesis classes are unnatural or non-canonical in the sense that\nthey depend on a numbering of proofs, formulas, or programs. We use the\non-a-cone machinery from computability theory to prove that, under mild\nassumptions such as that the hypothesis class can be computably listable, any\nnatural hypothesis class which is learnable must be computably learnable. Thus\nthe counterexamples given previously are necessarily unnatural.\n","authors":["Matthew Harrison-Trainor","Syed Akbari"],"pdf_url":"https://arxiv.org/pdf/2407.16663v2.pdf","comment":"This is a replacement of the earlier submission to just update the\n funding information"},{"id":"http://arxiv.org/abs/2407.20119v2","updated":"2024-07-30T06:33:48Z","published":"2024-07-29T15:51:09Z","title":"Adaptive Self-supervised Robust Clustering for Unstructured Data with\n Unknown Cluster Number","summary":" We introduce a novel self-supervised deep clustering approach tailored for\nunstructured data without requiring prior knowledge of the number of clusters,\ntermed Adaptive Self-supervised Robust Clustering (ASRC). In particular, ASRC\nadaptively learns the graph structure and edge weights to capture both local\nand global structural information. The obtained graph enables us to learn\nclustering-friendly feature representations by an enhanced graph auto-encoder\nwith contrastive learning technique. It further leverages the clustering\nresults adaptively obtained by robust continuous clustering (RCC) to generate\nprototypes for negative sampling, which can further contribute to promoting\nconsistency among positive pairs and enlarging the gap between positive and\nnegative samples. 
ASRC obtains the final clustering results by applying RCC to\nthe learned feature representations with their consistent graph structure and\nedge weights. Extensive experiments conducted on seven benchmark datasets\ndemonstrate the efficacy of ASRC and its superior performance over\nother popular clustering models. Notably, ASRC even outperforms methods that\nrely on prior knowledge of the number of clusters, highlighting its\neffectiveness in addressing the challenges of clustering unstructured data.\n","authors":["Chen-Lu Ding","Jiancan Wu","Wei Lin","Shiyang Shen","Xiang Wang","Yancheng Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.20119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20560v1","updated":"2024-07-30T05:28:10Z","published":"2024-07-30T05:28:10Z","title":"Invariant deep neural networks under the finite group for solving\n partial differential equations","summary":" Utilizing physics-informed neural networks (PINN) to solve partial\ndifferential equations (PDEs) has become a popular and powerful approach, but it\nstill suffers from limited prediction accuracy in the sampling domain and poor\nprediction ability beyond the sampling domain. These dilemmas are usually\nmitigated by adding the physical properties of PDEs into the loss function or by\nemploying smart techniques to change the form of the loss function for special\nPDEs. In this paper, we design a symmetry-enhanced deep\nneural network (sDNN) which makes the architecture of neural networks invariant\nunder the finite group through expanding the dimensions of weight matrices and\nbias vectors in each hidden layer by the order of the finite group if the group\nhas matrix representations, otherwise extending the set of input data and the\nhidden layers except for the first hidden layer by the order of the finite group.\nHowever, the total number of training parameters is only about one over the\norder of the finite group of the original PINN size due to the symmetric\narchitecture of sDNN. Furthermore, we give special forms of weight matrices and\nbias vectors of sDNN, and rigorously prove that the architecture itself is\ninvariant under the finite group and the sDNN has the universal approximation\nability to learn the function keeping the finite group. Numerical results show\nthat the sDNN has strong predictive ability in and beyond the sampling domain\nand performs far better than the vanilla PINN with fewer training points and\nsimpler architecture.\n","authors":["Zhi-Yong Zhang","Jie-Ying Li","Lei-Lei Guo"],"pdf_url":"https://arxiv.org/pdf/2407.20560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20557v1","updated":"2024-07-30T05:24:08Z","published":"2024-07-30T05:24:08Z","title":"CELLM: An Efficient Communication in Large Language Models Training for\n Federated Learning","summary":" Federated Learning (FL) is a recent model training paradigm in which client\ndevices collaboratively train a model without ever aggregating their data.\nCrucially, this scheme offers users potential privacy and security benefits by\nonly ever communicating updates to the model weights to a central server as\nopposed to traditional machine learning (ML) training which directly\ncommunicates and aggregates data. However, FL training suffers from statistical\nheterogeneity as clients may have differing local data distributions. Large\nlanguage models (LLMs) offer a potential solution to this issue of\nheterogeneity given that they have consistently been shown to be able to learn\non vast amounts of noisy data. 
While LLMs are a promising development for\nresolving the persistent issue of non-I.I.D. clients in federated settings, they\nexacerbate two other bottlenecks in FL: limited local computing and expensive\ncommunication. This thesis aims to develop efficient training methods for LLMs\nin FL. To this end, we employ two critical techniques in enabling efficient\ntraining. First, we use low-rank adaptation (LoRA) to reduce the computational\nload of local model training. Second, we communicate sparse updates throughout\ntraining to significantly cut down on communication costs. Taken together, our\nmethod reduces communication costs by up to 10x over vanilla LoRA and up to 5x\nover more complex sparse LoRA baselines while achieving greater utility. We\nemphasize the importance of carefully applying sparsity and picking effective\nrank and sparsity configurations for federated LLM training.\n","authors":["Raja Vavekanand","Kira Sam"],"pdf_url":"https://arxiv.org/pdf/2407.20557v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.20553v1","updated":"2024-07-30T05:15:19Z","published":"2024-07-30T05:15:19Z","title":"DiffusionCounterfactuals: Inferring High-dimensional Counterfactuals\n with Guidance of Causal Representations","summary":" Accurate estimation of counterfactual outcomes in high-dimensional data is\ncrucial for decision-making and understanding causal relationships and\nintervention outcomes in various domains, including healthcare, economics, and\nsocial sciences. However, existing methods often struggle to generate accurate\nand consistent counterfactuals, particularly when the causal relationships are\ncomplex. We propose a novel framework that incorporates causal mechanisms and\ndiffusion models to generate high-quality counterfactual samples guided by\ncausal representation. Our approach introduces a novel, theoretically grounded\ntraining and sampling process that enables the model to consistently generate\naccurate counterfactual high-dimensional data under multiple intervention\nsteps. Experimental results on various synthetic and real benchmarks\ndemonstrate that the proposed approach outperforms state-of-the-art methods in\ngenerating accurate and high-quality counterfactuals, using different\nevaluation metrics.\n","authors":["Jiageng Zhu","Hanchen Xie","Jiazhi Li","Wael Abd-Almageed"],"pdf_url":"https://arxiv.org/pdf/2407.20553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07171v5","updated":"2024-07-30T05:07:01Z","published":"2023-10-11T03:39:56Z","title":"Advocating for the Silent: Enhancing Federated Generalization for\n Non-Participating Clients","summary":" Federated Learning (FL) has surged in prominence due to its capability of\ncollaborative model training without direct data sharing. However, the vast\ndisparity in local data distributions among clients, often termed the\nNon-Independent Identically Distributed (Non-IID) challenge, poses a\nsignificant hurdle to FL's generalization efficacy. The scenario becomes even\nmore complex when not all clients participate in the training process, a common\noccurrence due to unstable network connections or limited computational\ncapacities. This can greatly complicate the assessment of the trained models'\ngeneralization abilities. 
While a plethora of recent studies has centered on\nthe generalization gap pertaining to unseen data from participating clients\nwith diverse distributions, the distinction between the training distributions\nof participating clients and the testing distributions of non-participating\nones has been largely overlooked. In response, our paper unveils an\ninformation-theoretic generalization framework for FL. Specifically, it\nquantifies generalization errors by evaluating the information entropy of local\ndistributions and discerning discrepancies across these distributions. Inspired\nby our deduced generalization bounds, we introduce a weighted aggregation\napproach and a duo of client selection strategies. These innovations are\ndesigned to strengthen FL's ability to generalize and thus ensure that trained\nmodels perform better on non-participating clients by incorporating a more\ndiverse range of client data distributions. Our extensive empirical evaluations\nreaffirm the potency of our proposed methods, aligning seamlessly with our\ntheoretical construct.\n","authors":["Zheshun Wu","Zenglin Xu","Dun Zeng","Qifan Wang","Jie Liu"],"pdf_url":"https://arxiv.org/pdf/2310.07171v5.pdf","comment":"Submitted to IEEE TNNLS, under minor revision"},{"id":"http://arxiv.org/abs/2407.20547v1","updated":"2024-07-30T05:05:09Z","published":"2024-07-30T05:05:09Z","title":"Neuromorphic on-chip reservoir computing with spiking neural network\n architectures","summary":" Reservoir computing is a promising approach for harnessing the computational\npower of recurrent neural networks while dramatically simplifying training.\nThis paper investigates the application of integrate-and-fire neurons within\nreservoir computing frameworks for two distinct tasks: capturing chaotic\ndynamics of the H\\'enon map and forecasting the Mackey-Glass time series.\nIntegrate-and-fire neurons can be implemented in low-power neuromorphic\narchitectures such as Intel Loihi. We explore the impact of network topologies\ncreated through random interactions on the reservoir's performance. Our study\nreveals task-specific variations in network effectiveness, highlighting the\nimportance of tailored architectures for distinct computational tasks. To\nidentify optimal network configurations, we employ a meta-learning approach\ncombined with simulated annealing. This method efficiently explores the space\nof possible network structures, identifying architectures that excel in\ndifferent scenarios. The resulting networks demonstrate a range of behaviors,\nshowcasing how inherent architectural features influence task-specific\ncapabilities. We study the reservoir computing performance using a custom\nintegrate-and-fire code, Intel's Lava neuromorphic computing software\nframework, and via an on-chip implementation in Loihi. We conclude with an\nanalysis of the energy performance of the Loihi architecture.\n","authors":["Samip Karki","Diego Chavez Arana","Andrew Sornborger","Francesco Caravelli"],"pdf_url":"https://arxiv.org/pdf/2407.20547v1.pdf","comment":"19 pages, 9 figures; single column"},{"id":"http://arxiv.org/abs/2407.20529v1","updated":"2024-07-30T04:08:00Z","published":"2024-07-30T04:08:00Z","title":"Can LLMs be Fooled? Investigating Vulnerabilities in LLMs","summary":" The advent of Large Language Models (LLMs) has garnered significant\npopularity and wielded immense power across various domains within Natural\nLanguage Processing (NLP). 
While their capabilities are undeniably impressive,\nit is crucial to identify and scrutinize their vulnerabilities, especially when\nthose vulnerabilities can have costly consequences. One such LLM, trained to\nprovide a concise summarization from medical documents, could unequivocally leak\npersonal patient data when prompted surreptitiously. This is just one of many\nunfortunate examples that have been unveiled, and further research is necessary\nto comprehend the underlying reasons behind such vulnerabilities. In this\nstudy, we delve into multiple categories of vulnerabilities, namely\nmodel-based, training-time, and inference-time vulnerabilities, and discuss\nmitigation strategies, including \"Model Editing\", which aims at modifying LLMs'\nbehavior, and \"Chroma Teaming\", which incorporates the synergy of multiple teaming\nstrategies to enhance LLMs' resilience. This paper synthesizes the findings\nfrom each vulnerability category and proposes new directions of research and\ndevelopment. By understanding the focal points of current vulnerabilities, we\ncan better anticipate and mitigate future risks, paving the road for more\nrobust and secure LLMs.\n","authors":["Sara Abdali","Jia He","CJ Barberan","Richard Anarfi"],"pdf_url":"https://arxiv.org/pdf/2407.20529v1.pdf","comment":"14 pages, 1 figure. arXiv admin note: text overlap with\n arXiv:2403.12503"},{"id":"http://arxiv.org/abs/2406.10787v3","updated":"2024-07-30T04:00:44Z","published":"2024-06-16T03:00:16Z","title":"Evidential Uncertainty Sets in Deep Classifiers Using Conformal\n Prediction","summary":" In this paper, we propose the Evidential Conformal Prediction (ECP) method for\nimage classifiers to generate conformal prediction sets. Our method is\ndesigned based on a non-conformity score function that has its roots in\nEvidential Deep Learning (EDL) as a method of quantifying model (epistemic)\nuncertainty in DNN classifiers. We use evidence derived from the logit\nvalues of target labels to compute the components of our non-conformity score\nfunction: the heuristic notion of uncertainty in CP, uncertainty surprisal, and\nexpected utility. Our extensive experimental evaluation demonstrates that ECP\noutperforms three state-of-the-art methods for generating CP sets, in terms of\ntheir set sizes and adaptivity, while maintaining the coverage of true labels.\n","authors":["Hamed Karimi","Reza Samavi"],"pdf_url":"https://arxiv.org/pdf/2406.10787v3.pdf","comment":"Accepted in 13th Symposium on Conformal and Probabilistic Prediction\n with Applications (COPA2024). To be published in the Proceedings of Machine\n Learning Research (PMLR), vol. 230, 2024 (25 Pages)"},{"id":"http://arxiv.org/abs/2407.08742v2","updated":"2024-07-30T03:53:32Z","published":"2024-05-29T01:23:19Z","title":"Improved Robustness and Hyperparameter Selection in Modern Hopfield\n Networks","summary":" The modern Hopfield network generalizes the classical Hopfield network by\nallowing for sharper interaction functions. This increases the capacity of the\nnetwork as an autoassociative memory as nearby learned attractors will not\ninterfere with one another. However, the implementation of the network relies\non applying large exponents to the dot product of memory vectors and probe\nvectors. If the dimension of the data is large the calculation can be very\nlarge and result in problems when using floating point numbers in a practical\nimplementation. 
We describe this problem in detail, modify the original network\ndescription to mitigate the problem, and show that the modification will not alter\nthe networks' dynamics during update or training. We also show our modification\ngreatly improves hyperparameter selection for the modern Hopfield network,\nremoving hyperparameter dependence on the interaction vertex and resulting in\nan optimal region of hyperparameters that does not significantly change with\nthe interaction vertex as it does in the original network.\n","authors":["Hayden McAlister","Anthony Robins","Lech Szymanski"],"pdf_url":"https://arxiv.org/pdf/2407.08742v2.pdf","comment":"Add subsection on exponential interaction function"},{"id":"http://arxiv.org/abs/2407.20516v1","updated":"2024-07-30T03:26:09Z","published":"2024-07-30T03:26:09Z","title":"Machine Unlearning in Generative AI: A Survey","summary":" Generative AI technologies have been deployed in many places, such as\n(multimodal) large language models and vision generative models. Their\nremarkable performance should be attributed to massive training data and\nemergent reasoning abilities. However, the models would memorize and generate\nsensitive, biased, or dangerous information originating from the training data,\nespecially data from web crawls. New machine unlearning (MU) techniques are\nbeing developed to reduce or eliminate undesirable knowledge and its effects\nfrom the models, because techniques designed for traditional\nclassification tasks cannot be applied to Generative AI. We offer a\ncomprehensive survey of many aspects of MU in Generative AI, including a new\nproblem formulation, evaluation methods, and a structured discussion of the\nadvantages and limitations of different kinds of MU techniques. It also\npresents several critical challenges and promising directions in MU research. A\ncurated list of readings can be found at:\nhttps://github.com/franciscoliu/GenAI-MU-Reading.\n","authors":["Zheyuan Liu","Guangyao Dou","Zhaoxuan Tan","Yijun Tian","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.20516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07472v3","updated":"2024-07-30T03:19:41Z","published":"2024-02-12T08:17:23Z","title":"Cartesian atomic cluster expansion for machine learning interatomic\n potentials","summary":" Machine learning interatomic potentials are revolutionizing large-scale,\naccurate atomistic modelling in material science and chemistry. Many potentials\nuse atomic cluster expansion or equivariant message passing frameworks. Such\nframeworks typically use spherical harmonics as angular basis functions, and\nthen use Clebsch-Gordan contraction to maintain rotational symmetry, which may\nintroduce redundancies in representations and computational overhead. We\npropose an alternative: a Cartesian-coordinates-based atomic density expansion.\nThis approach provides a complete set of polynomially independent features of\natomic environments while maintaining interaction body orders. Additionally, we\nintegrate low-dimensional embeddings of various chemical elements and\ninter-atomic message passing. The resulting potential, named Cartesian Atomic\nCluster Expansion (CACE), exhibits good accuracy, stability, and\ngeneralizability. 
We validate its performance in diverse systems, including\nbulk water, small molecules, and 25-element high-entropy alloys.\n","authors":["Bingqing Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.07472v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14359v2","updated":"2024-07-30T03:15:26Z","published":"2023-11-24T09:02:24Z","title":"Thompson sampling for zero-inflated count outcomes with an application\n to the Drink Less mobile health study","summary":" Mobile health (mHealth) interventions often aim to improve distal outcomes,\nsuch as clinical conditions, by optimizing proximal outcomes through\njust-in-time adaptive interventions. Contextual bandits provide a suitable\nframework for customizing such interventions according to individual\ntime-varying contexts. However, unique challenges, such as modeling count\noutcomes within bandit frameworks, have hindered the widespread application of\ncontextual bandits to mHealth studies. The current work addresses this\nchallenge by leveraging count data models into online decision-making\napproaches. Specifically, we combine four common offline count data models\n(Poisson, negative binomial, zero-inflated Poisson, and zero-inflated negative\nbinomial regressions) with Thompson sampling, a popular contextual bandit\nalgorithm. The proposed algorithms are motivated by and evaluated on a real\ndataset from the Drink Less trial, where they are shown to improve user\nengagement with the mHealth platform. The proposed methods are further\nevaluated on simulated data, achieving improvement in maximizing cumulative\nproximal outcomes over existing algorithms. Theoretical results on regret\nbounds are also derived. The countts R package provides an implementation of\nour approach.\n","authors":["Xueqing Liu","Nina Deliu","Tanujit Chakraborty","Lauren Bell","Bibhas Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2311.14359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20508v1","updated":"2024-07-30T02:53:26Z","published":"2024-07-30T02:53:26Z","title":"Unveiling the Potential of Spiking Dynamics in Graph Representation\n Learning through Spatial-Temporal Normalization and Coding Strategies","summary":" In recent years, spiking neural networks (SNNs) have attracted substantial\ninterest due to their potential to replicate the energy-efficient and\nevent-driven processing of biological neurons. Despite this, the application of\nSNNs in graph representation learning, particularly for non-Euclidean data,\nremains underexplored, and the influence of spiking dynamics on graph learning\nis not yet fully understood. This work seeks to address these gaps by examining\nthe unique properties and benefits of spiking dynamics in enhancing graph\nrepresentation learning. We propose a spike-based graph neural network model\nthat incorporates spiking dynamics, enhanced by a novel spatial-temporal\nfeature normalization (STFN) technique, to improve training efficiency and\nmodel stability. Our detailed analysis explores the impact of rate coding and\ntemporal coding on SNN performance, offering new insights into their advantages\nfor deep graph networks and addressing challenges such as the oversmoothing\nproblem. 
Experimental results demonstrate that our SNN models can achieve\ncompetitive performance with state-of-the-art graph neural networks (GNNs)\nwhile considerably reducing computational costs, highlighting the potential of\nSNNs for efficient neuromorphic computing applications in complex graph-based\nscenarios.\n","authors":["Mingkun Xu","Huifeng Yin","Yujie Wu","Guoqi Li","Faqiang Liu","Jing Pei","Shuai Zhong","Lei Deng"],"pdf_url":"https://arxiv.org/pdf/2407.20508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20506v1","updated":"2024-07-30T02:51:21Z","published":"2024-07-30T02:51:21Z","title":"Boosting Efficiency in Task-Agnostic Exploration through Causal\n Knowledge","summary":" The effectiveness of model training heavily relies on the quality of\navailable training resources. However, budget constraints often impose\nlimitations on data collection efforts. To tackle this challenge, we introduce\ncausal exploration in this paper, a strategy that leverages the underlying\ncausal knowledge for both data collection and model training. We, in\nparticular, focus on enhancing the sample efficiency and reliability of the\nworld model learning within the domain of task-agnostic reinforcement learning.\nDuring the exploration phase, the agent actively selects actions expected to\nyield causal insights most beneficial for world model training. Concurrently,\nthe causal knowledge is acquired and incrementally refined with the ongoing\ncollection of data. We demonstrate that causal exploration aids in learning\naccurate world models using fewer data and provide theoretical guarantees for\nits convergence. Empirical experiments, on both synthetic data and real-world\napplications, further validate the benefits of causal exploration.\n","authors":["Yupei Yang","Biwei Huang","Shikui Tu","Lei Xu"],"pdf_url":"https://arxiv.org/pdf/2407.20506v1.pdf","comment":"This paper was accepted by IJCAI'24"},{"id":"http://arxiv.org/abs/2407.20503v1","updated":"2024-07-30T02:38:27Z","published":"2024-07-30T02:38:27Z","title":"A federated large language model for long-term time series forecasting","summary":" Long-term time series forecasting in centralized environments poses unique\nchallenges regarding data privacy, communication overhead, and scalability. To\naddress these challenges, we propose FedTime, a federated large language model\n(LLM) tailored for long-range time series prediction. Specifically, we\nintroduce a federated pre-trained LLM with fine-tuning and alignment\nstrategies. Prior to the learning process, we employ K-means clustering to\npartition edge devices or clients into distinct clusters, thereby facilitating\nmore focused model training. We also incorporate channel independence and\npatching to better preserve local semantic information, ensuring that important\ncontextual details are retained while minimizing the risk of information loss.\nWe demonstrate the effectiveness of our FedTime model through extensive\nexperiments on various real-world forecasting benchmarks, showcasing\nsubstantial improvements over recent approaches. In addition, we demonstrate\nthe efficiency of FedTime in streamlining resource usage, resulting in reduced\ncommunication overhead.\n","authors":["Raed Abdel-Sater","A. 
Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2407.20503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05694v2","updated":"2024-07-30T02:37:56Z","published":"2024-07-08T07:53:06Z","title":"On the Limitations of Compute Thresholds as a Governance Strategy","summary":" At face value, this essay is about understanding a fairly esoteric governance\ntool called compute thresholds. However, in order to grapple with whether these\nthresholds will achieve anything, we must first understand how they came to be.\nTo do so, we need to engage with a decades-old debate at the heart of computer\nscience progress, namely, is bigger always better? Does a certain inflection\npoint of compute result in changes to the risk profile of a model? Hence, this\nessay may be of interest not only to policymakers and the wider public but also\nto computer scientists interested in understanding the role of compute in\nunlocking breakthroughs. This discussion is timely given the wide adoption of\ncompute thresholds in both the White House Executive Orders on AI Safety (EO)\nand the EU AI Act to identify more risky systems. A key conclusion of this\nessay is that compute thresholds, as currently implemented, are shortsighted\nand likely to fail to mitigate risk. The relationship between compute and risk\nis highly uncertain and rapidly changing. Relying upon compute thresholds\noverestimates our ability to predict what abilities emerge at different scales.\nThis essay ends with recommendations for a better way forward.\n","authors":["Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2407.05694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20499v1","updated":"2024-07-30T02:20:38Z","published":"2024-07-30T02:20:38Z","title":"Optimizing Long-tailed Link Prediction in Graph Neural Networks through\n Structure Representation Enhancement","summary":" Link prediction, as a fundamental task for graph neural networks (GNNs), has\nboasted significant progress in varied domains. Its success is typically\ninfluenced by the expressive power of node representation, but recent\ndevelopments reveal the inferior performance of low-degree nodes owing to their\nsparse neighbor connections, known as the degree-based long-tailed problem.\nWill the degree-based long-tailed distribution similarly constrain the efficacy\nof GNNs on link prediction? Unexpectedly, our study reveals that only a mild\ncorrelation exists between node degree and predictive accuracy, and more\nimportantly, the number of common neighbors between node pairs exhibits a\nstrong correlation with accuracy. Considering node pairs with less common\nneighbors, i.e., tail node pairs, make up a substantial fraction of the dataset\nbut achieve worse performance, we propose that link prediction also faces the\nlong-tailed problem. Therefore, link prediction of GNNs is greatly hindered by\nthe tail node pairs. After knowing the weakness of link prediction, a natural\nquestion is how can we eliminate the negative effects of the skewed long-tailed\ndistribution on common neighbors so as to improve the performance of link\nprediction? Towards this end, we introduce our long-tailed framework (LTLP),\nwhich is designed to enhance the performance of tail node pairs on link\nprediction by increasing common neighbors. 
Two key modules in LTLP respectively\nsupplement high-quality edges for tail node pairs and enforce representational\nalignment between head and tail node pairs within the same category, thereby\nimproving the performance of tail node pairs.\n","authors":["Yakun Wang","Daixin Wang","Hongrui Liu","Binbin Hu","Yingcui Yan","Qiyang Zhang","Zhiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13821v2","updated":"2024-07-30T02:20:15Z","published":"2023-10-20T21:18:04Z","title":"Geometric Learning with Positively Decomposable Kernels","summary":" Kernel methods are powerful tools in machine learning. Classical kernel\nmethods are based on positive-definite kernels, which map data spaces into\nreproducing kernel Hilbert spaces (RKHS). For non-Euclidean data spaces,\npositive-definite kernels are difficult to come by. In this case, we propose\nthe use of reproducing kernel Krein space (RKKS) based methods, which require\nonly kernels that admit a positive decomposition. We show that one does not\nneed to access this decomposition in order to learn in RKKS. We then\ninvestigate the conditions under which a kernel is positively decomposable. We\nshow that invariant kernels admit a positive decomposition on homogeneous\nspaces under tractable regularity assumptions. This makes them much easier to\nconstruct than positive-definite kernels, providing a route for learning with\nkernels for non-Euclidean data. By the same token, this provides theoretical\nfoundations for RKKS-based methods in general.\n","authors":["Nathael Da Costa","Cyrus Mostajeran","Juan-Pablo Ortega","Salem Said"],"pdf_url":"https://arxiv.org/pdf/2310.13821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09105v4","updated":"2024-07-30T02:06:17Z","published":"2024-07-12T09:10:37Z","title":"Enhancing Training Efficiency Using Packing with Flash Attention","summary":" Padding is often used in tuning LLM models by adding special tokens to\nshorter training examples to match the length of the longest sequence in each\nbatch. While this ensures uniformity for batch processing, it introduces\ninefficiencies by including irrelevant padding tokens in the computation and\nwastes GPU resources. Hugging Face SFT trainer has always offered the option to\nuse packing to combine multiple training examples, allowing for maximal\nutilization of GPU resources. However, up till now, it did not offer proper\nmasking of each packed training example. This capability has now been added to\nHugging Face Transformers 4.43. We analyse this new feature and show the\nbenefits across different variations of packing.\n","authors":["Achintya Kundu","Rhui Dih Lee","Laura Wynter","Raghu Kiran Ganti","Mayank Mishra"],"pdf_url":"https://arxiv.org/pdf/2407.09105v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20021v2","updated":"2024-07-30T02:03:06Z","published":"2024-07-29T13:57:40Z","title":"MimiQ: Low-Bit Data-Free Quantization of Vision Transformers with\n Encouraging Inter-Head Attention Similarity","summary":" Data-free quantization (DFQ) is a technique that creates a lightweight\nnetwork from its full-precision counterpart without the original training data,\noften through a synthetic dataset. Although several DFQ methods have been\nproposed for vision transformer (ViT) architectures, they fail to achieve\nefficacy in low-bit settings. 
Examining the existing methods, we identify that\ntheir synthetic data produce misaligned attention maps, while those of the real\nsamples are highly aligned. From the observation of aligned attention, we find\nthat aligning attention maps of synthetic data helps to improve the overall\nperformance of quantized ViTs. Motivated by this finding, we devise MimiQ, a\nnovel DFQ method designed for ViTs that focuses on inter-head attention\nsimilarity. First, we generate synthetic data by aligning head-wise attention\nresponses in relation to spatial query patches. Then, we apply head-wise\nstructural attention distillation to align the attention maps of the quantized\nnetwork to those of the full-precision teacher. The experimental results show\nthat the proposed method significantly outperforms baselines, setting a new\nstate-of-the-art performance for data-free ViT quantization.\n","authors":["Kanghyun Choi","Hye Yoon Lee","Dain Kwon","SunJong Park","Kyuyeun Kim","Noseong Park","Jinho Lee"],"pdf_url":"https://arxiv.org/pdf/2407.20021v2.pdf","comment":"Author Preprint"},{"id":"http://arxiv.org/abs/2407.20496v1","updated":"2024-07-30T01:40:50Z","published":"2024-07-30T01:40:50Z","title":"Toward Efficient Permutation for Hierarchical N:M Sparsity on GPUs","summary":" N:M sparsity pruning is a powerful technique for compressing deep neural\nnetworks, utilizing NVIDIA's Sparse Tensor Core technology. This method\nbenefits from hardware support for sparse indexing, enabling the adoption of\nfine-grained sparsity to maintain model accuracy while minimizing the overhead\ntypically associated with irregular data access. Although restricted to a fixed\nlevel of sparsity due to its reliance on hardware, N:M sparsity can be combined\nwith coarser sparsity techniques to achieve diverse compression ratios.\nInitially, column-wise vector sparsity is applied to a dense model, followed by\nrow-wise N:M sparsity on the preserved column vectors. We refer to this multi-level\napproach as hierarchical N:M (HiNM) sparsity. Similar to earlier single-level\nsparsity techniques, HiNM sparsity necessitates an effective channel\npermutation strategy to maximize the accuracy of the compressed networks.\nHowever, it introduces further complexities by requiring the rearrangement of\nboth input and output channels, addressing challenges such as permutation\nsequence, HiNM-sparsity-aware permutation, and maintaining consistency in\nchannel ordering across layers. In this paper, we introduce a channel\npermutation method designed specifically for HiNM sparsity, named\ngyro-permutation. This method is crafted to exploit the unique characteristics\nof HiNM pruning, incorporating a strategic policy in each permutation phase,\nincluding channel sampling, clustering, and assignment, to circumvent local\nminima. Additionally, we have developed a GPU kernel that facilitates\nindependent layer permutation during the execution of HiNM sparse networks. 
Our\nextensive experimental evaluations on various DNN models demonstrate that our\ngyro-permutation significantly enhances the accuracy of HiNM sparse networks,\nallowing them to reach performance levels comparable to those of unstructured\nsparse networks.\n","authors":["Seungmin Yu","Xiaodie Yi","Hayun Lee","Dongkun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.20496v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.15200v2","updated":"2024-07-30T01:26:46Z","published":"2024-07-21T15:43:52Z","title":"HyperbolicLR: Epoch insensitive learning rate scheduler","summary":" This study proposes two novel learning rate schedulers: the Hyperbolic\nLearning Rate Scheduler (HyperbolicLR) and the Exponential Hyperbolic Learning\nRate Scheduler (ExpHyperbolicLR). These schedulers attempt to address the\ninconsistent learning curves often observed in conventional schedulers when\nadjusting the number of epochs. By leveraging the asymptotic behavior of\nhyperbolic curves, the proposed schedulers maintain more consistent learning\ncurves across varying epoch settings. The HyperbolicLR algorithm directly\napplies this property to the epoch-learning rate space, while the\nExpHyperbolicLR maps this concept onto the exponential space of epochs and\nlearning rates. To evaluate the performance of these schedulers, we first found\nthe optimal hyperparameters for each scheduler on a small number of epochs,\nfixed these values, and compared their performance as the number of epochs\nincreased. Our experimental results on various deep learning tasks and\narchitectures demonstrate that both HyperbolicLR and ExpHyperbolicLR maintain\nmore consistent performance improvements compared to conventional schedulers as\nthe number of epochs increases. These findings suggest that our\nhyperbolic-based learning rate schedulers offer a more robust and efficient\napproach to training deep neural networks, especially in scenarios where\ncomputational resources or time constraints limit extensive hyperparameter\nsearches.\n","authors":["Tae-Geun Kim"],"pdf_url":"https://arxiv.org/pdf/2407.15200v2.pdf","comment":"30 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.20485v1","updated":"2024-07-30T01:13:42Z","published":"2024-07-30T01:13:42Z","title":"A2SF: Accumulative Attention Scoring with Forgetting Factor for Token\n Pruning in Transformer Decoder","summary":" Recently, large language models (LLMs) based on transformers are facing memory\nbottleneck issues due to KV cache, especially in long sequence handling.\nPrevious studies proposed KV cache compression techniques that identify\ninsignificant tokens based on Accumulative Attention Scores and remove their\nitems from the KV cache, noting that only a few tokens play an important role in\nattention operations. However, we have observed that the existing Accumulative\nAttention Score is not suitable for the transformer decoder structure. In the\ndecoder model, the number of times the Attention Score accumulates varies\ndepending on the order of token appearance due to the effect of masking,\ncausing an uneven comparison between tokens. 
To solve this, we propose\nAccumulative Attention Score with Forgetting Factor (A2SF) technique, which\nintroduces a Forgetting Factor in the Attention Score accumulation process.\nA2SF applies a penalty to the past Attention Score generated from old tokens by\nrepeatedly multiplying the Forgetting Factor to the Attention Score over time.\nTherefore, older tokens receive a larger penalty, providing fairness among\ndifferent ages of tokens. Through the fair comparison among tokens, we can more\neffectively select important tokens. We have verified the accuracy improvement\nthrough A2SF in the OPT and LLaMA models and A2SF improves the accuracy of\nLLaMA 2 by up to 7.8% and 5.1% on 1-shot and 0-shot.\n","authors":["Hyun Rae Jo","Dong Kun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.20485v1.pdf","comment":"11 pages(9 pages + reference 2 pages), 6 figures"},{"id":"http://arxiv.org/abs/2312.02230v3","updated":"2024-07-30T00:56:40Z","published":"2023-12-04T03:43:26Z","title":"A Simple and Scalable Representation for Graph Generation","summary":" Recently, there has been a surge of interest in employing neural networks for\ngraph generation, a fundamental statistical learning problem with critical\napplications like molecule design and community analysis. However, most\napproaches encounter significant limitations when generating large-scale\ngraphs. This is due to their requirement to output the full adjacency matrices\nwhose size grows quadratically with the number of nodes. In response to this\nchallenge, we introduce a new, simple, and scalable graph representation named\ngap encoded edge list (GEEL) that has a small representation size that aligns\nwith the number of edges. In addition, GEEL significantly reduces the\nvocabulary size by incorporating the gap encoding and bandwidth restriction\nschemes. GEEL can be autoregressively generated with the incorporation of node\npositional encoding, and we further extend GEEL to deal with attributed graphs\nby designing a new grammar. Our findings reveal that the adoption of this\ncompact representation not only enhances scalability but also bolsters\nperformance by simplifying the graph generation process. We conduct a\ncomprehensive evaluation across ten non-attributed and two molecular graph\ngeneration tasks, demonstrating the effectiveness of GEEL.\n","authors":["Yunhui Jang","Seul Lee","Sungsoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2312.02230v3.pdf","comment":"International Conference on Learning Representations (ICLR) 2024"},{"id":"http://arxiv.org/abs/2401.09622v2","updated":"2024-07-30T00:26:42Z","published":"2024-01-17T22:23:29Z","title":"Is Hyper-Parameter Optimization Different for Software Analytics?","summary":" Yes. SE data can have \"smoother\" boundaries between classes (compared to\ntraditional AI data sets). To be more precise, the magnitude of the second\nderivative of the loss function found in SE data is typically much smaller. A\nnew hyper-parameter optimizer, called SMOOTHIE, can exploit this idiosyncrasy\nof SE data. We compare SMOOTHIE and a state-of-the-art AI hyper-parameter\noptimizer on three tasks: (a) GitHub issue lifetime prediction (b) detecting\nstatic code warnings false alarm; (c) defect prediction. For completeness, we\nalso show experiments on some standard AI datasets. SMOOTHIE runs faster and\npredicts better on the SE data--but ties on non-SE data with the AI tool. 
Hence\nwe conclude that SE data can be different to other kinds of data; and those\ndifferences mean that we should use different kinds of algorithms for our data.\nTo support open science and other researchers working in this area, all our\nscripts and datasets are available on-line at\nhttps://github.com/yrahul3910/smoothness-hpo/.\n","authors":["Rahul Yedida","Tim Menzies"],"pdf_url":"https://arxiv.org/pdf/2401.09622v2.pdf","comment":"v2"},{"id":"http://arxiv.org/abs/2407.20475v1","updated":"2024-07-30T00:21:51Z","published":"2024-07-30T00:21:51Z","title":"Distribution Learning for Molecular Regression","summary":" Using \"soft\" targets to improve model performance has been shown to be\neffective in classification settings, but the usage of soft targets for\nregression is a much less studied topic in machine learning. The existing\nliterature on the usage of soft targets for regression fails to properly assess\nthe method's limitations, and empirical evaluation is quite limited. In this\nwork, we assess the strengths and drawbacks of existing methods when applied to\nmolecular property regression tasks. Our assessment outlines key biases present\nin existing methods and proposes methods to address them, evaluated through\ncareful ablation studies. We leverage these insights to propose Distributional\nMixture of Experts (DMoE): A model-independent, and data-independent method for\nregression which trains a model to predict probability distributions of its\ntargets. Our proposed loss function combines the cross entropy between\npredicted and target distributions and the L1 distance between their expected\nvalues to produce a loss function that is robust to the outlined biases. We\nevaluate the performance of DMoE on different molecular property prediction\ndatasets -- Open Catalyst (OC20), MD17, and QM9 -- across different backbone\nmodel architectures -- SchNet, GemNet, and Graphormer. Our results demonstrate\nthat the proposed method is a promising alternative to classical regression for\nmolecular property prediction tasks, showing improvements over baselines on all\ndatasets and architectures.\n","authors":["Nima Shoghi","Pooya Shoghi","Anuroop Sriram","Abhishek Das"],"pdf_url":"https://arxiv.org/pdf/2407.20475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20471v1","updated":"2024-07-30T00:16:50Z","published":"2024-07-30T00:16:50Z","title":"Relaxed Equivariant Graph Neural Networks","summary":" 3D Euclidean symmetry equivariant neural networks have demonstrated notable\nsuccess in modeling complex physical systems. We introduce a framework for\nrelaxed $E(3)$ graph equivariant neural networks that can learn and represent\nsymmetry breaking within continuous groups. Building on the existing e3nn\nframework, we propose the use of relaxed weights to allow for controlled\nsymmetry breaking. 
We show empirically that these relaxed weights learn the\ncorrect amount of symmetry breaking.\n","authors":["Elyssa Hofgard","Rui Wang","Robin Walters","Tess Smidt"],"pdf_url":"https://arxiv.org/pdf/2407.20471v1.pdf","comment":"Extended abstract presented at the Geometry-grounded Representation\n Learning and Generative Modeling Workshop (GRaM) at the 41st International\n Conference on Machine Learning, July 2024, Vienna, Austria"},{"id":"http://arxiv.org/abs/2312.16638v2","updated":"2024-07-30T00:07:01Z","published":"2023-12-27T17:00:09Z","title":"Fault Tolerant Serverless VFL Over Dynamic Device Environment","summary":" Vertical Federated Learning (VFL) is a class of FL where each client shares\nthe same set of samples but only owns a subset of the features. Usually, VFL\nassumes perfect hardware and communication capabilities. However, this\nassumption hinders the broad deployment of VFL, particularly on a network of\nedge devices, which are heterogeneous in their in-situ capabilities while any\ndevice may connect/disconnect from the network over time. To address this gap,\nwe study the test time performance of VFL under dynamic network conditions,\nwhich we call DN-VFL. We first formalize DN-VFL, including a message passing\ndistributed inference algorithm, the corresponding risk, and a serverless\nsetup. We develop a novel DN-VFL approach called Multiple Aggregation with\nGossip Rounds and Simulated Faults (MAGS) that synthesizes replication,\ngossiping, and selective feature omission to improve performance significantly\nover baselines. Furthermore, we propose metrics and extensively analyze MAGS\nusing a simulated sensor network. The results show that naively using VFL for\nDN-VFL is not the best approach. Rather, MAGS presents a better alternative to\nhandle changes in the network during inference.\n","authors":["Surojit Ganguli","Zeyu Zhou","Christopher G. Brinton","David I. Inouye"],"pdf_url":"https://arxiv.org/pdf/2312.16638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21248v1","updated":"2024-07-30T23:43:59Z","published":"2024-07-30T23:43:59Z","title":"Adaptive Pre-training Data Detection for Large Language Models via\n Surprising Tokens","summary":" While large language models (LLMs) are extensively used, there are growing\nconcerns regarding privacy, security, and copyright due to their opaque\ntraining data, which puts the problem of detecting pre-training data on the\ntable. Current solutions to this problem leverage techniques explored in\nmachine learning privacy such as Membership Inference Attacks (MIAs), which\nheavily depend on LLMs' capability of verbatim memorization. However, this\nreliance presents challenges, especially given the vast amount of training data\nand the restricted number of effective training epochs. In this paper, we\npropose an adaptive pre-training data detection method which alleviates this\nreliance and effectively amplifies the identification. Our method adaptively\nlocates \\textit{surprising tokens} of the input. A token is surprising to an LLM\nif the prediction on the token is \"certain but wrong\", which refers to low\nShannon entropy of the probability distribution and low probability of the\nground truth token at the same time. By using the prediction probability of\nsurprising tokens to measure \\textit{surprising}, the detection method is\nachieved based on the simple hypothesis that seeing seen data is less\nsurprising for the model compared with seeing unseen data. 
The method can be\napplied without any access to the pre-training data corpus or additional\ntraining like reference models. Our approach exhibits a consistent enhancement\ncompared to existing methods in diverse experiments conducted on various\nbenchmarks and models, achieving a maximum improvement of 29.5\\%. We also\nintroduce a new benchmark, Dolma-Book, developed upon a novel framework, which\nemploys book data collected both before and after model training to provide\nfurther evaluation.\n","authors":["Anqi Zhang","Chaofeng Wu"],"pdf_url":"https://arxiv.org/pdf/2407.21248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.02134v4","updated":"2024-07-30T23:39:47Z","published":"2020-12-03T18:13:26Z","title":"K-Deep Simplex: Deep Manifold Learning via Local Dictionaries","summary":" We propose K-Deep Simplex (KDS) which, given a set of data points, learns a\ndictionary comprising synthetic landmarks, along with representation\ncoefficients supported on a simplex. KDS employs a local weighted $\\ell_1$\npenalty that encourages each data point to represent itself as a convex\ncombination of nearby landmarks. We solve the proposed optimization program\nusing alternating minimization and design an efficient, interpretable\nautoencoder using algorithm unrolling. We theoretically analyze the proposed\nprogram by relating the weighted $\\ell_1$ penalty in KDS to a weighted $\\ell_0$\nprogram. Assuming that the data are generated from a Delaunay triangulation, we\nprove the equivalence of the weighted $\\ell_1$ and weighted $\\ell_0$ programs.\nWe further show the stability of the representation coefficients under mild\ngeometrical assumptions. If the representation coefficients are fixed, we prove\nthat the sub-problem of minimizing over the dictionary yields a unique\nsolution. Further, we show that low-dimensional representations can be\nefficiently obtained from the covariance of the coefficient matrix. Experiments\nshow that the algorithm is highly efficient and performs competitively on\nsynthetic and real data sets.\n","authors":["Pranay Tankala","Abiy Tasissa","James M. Murphy","Demba Ba"],"pdf_url":"https://arxiv.org/pdf/2012.02134v4.pdf","comment":"33 pages, 17 figures. This expanded version includes detailed\n numerical experiments in the supplementary material. Theorem 3 is a new\n stability result. The sections have been reorganized, and additional details\n have been provided for clarity"},{"id":"http://arxiv.org/abs/2312.11514v3","updated":"2024-07-30T23:37:20Z","published":"2023-12-12T18:57:08Z","title":"LLM in a flash: Efficient Large Language Model Inference with Limited\n Memory","summary":" Large language models (LLMs) are central to modern natural language\nprocessing, delivering exceptional performance in various tasks. However, their\nsubstantial computational and memory requirements present challenges,\nespecially for devices with limited DRAM capacity. This paper tackles the\nchallenge of efficiently running LLMs that exceed the available DRAM capacity\nby storing the model parameters in flash memory, but bringing them on demand to\nDRAM. Our method involves constructing an inference cost model that takes into\naccount the characteristics of flash memory, guiding us to optimize in two\ncritical areas: reducing the volume of data transferred from flash and reading\ndata in larger, more contiguous chunks. Within this hardware-informed\nframework, we introduce two principal techniques. 
First, \"windowing\"\nstrategically reduces data transfer by reusing previously activated neurons,\nand second, \"row-column bundling\", tailored to the sequential data access\nstrengths of flash memory, increases the size of data chunks read from flash\nmemory. These methods collectively enable running models up to twice the size\nof the available DRAM, with a 4-5x and 20-25x increase in inference speed\ncompared to naive loading approaches in CPU and GPU, respectively. Our\nintegration of sparsity awareness, context-adaptive loading, and a\nhardware-oriented design paves the way for effective inference of LLMs on\ndevices with limited memory.\n","authors":["Keivan Alizadeh","Iman Mirzadeh","Dmitry Belenko","Karen Khatamifard","Minsik Cho","Carlo C Del Mundo","Mohammad Rastegari","Mehrdad Farajtabar"],"pdf_url":"https://arxiv.org/pdf/2312.11514v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.21243v1","updated":"2024-07-30T23:29:29Z","published":"2024-07-30T23:29:29Z","title":"Informed Correctors for Discrete Diffusion Models","summary":" Discrete diffusion modeling is a promising framework for modeling and\ngenerating data in discrete spaces. To sample from these models, different\nstrategies present trade-offs between computation and sample quality. A\npredominant sampling strategy is predictor-corrector $\\tau$-leaping, which\nsimulates the continuous time generative process with discretized predictor\nsteps and counteracts the accumulation of discretization error via corrector\nsteps. However, for absorbing state diffusion, an important class of discrete\ndiffusion models, the standard forward-backward corrector can be ineffective in\nfixing such errors, resulting in subpar sample quality. To remedy this problem,\nwe propose a family of informed correctors that more reliably counteracts\ndiscretization error by leveraging information learned by the model. For\nfurther efficiency gains, we also propose $k$-Gillespie's, a sampling algorithm\nthat better utilizes each model evaluation, while still enjoying the speed and\nflexibility of $\\tau$-leaping. Across several real and synthetic datasets, we\nshow that $k$-Gillespie's with informed correctors reliably produces higher\nquality samples at lower computational cost.\n","authors":["Yixiu Zhao","Jiaxin Shi","Lester Mackey","Scott Linderman"],"pdf_url":"https://arxiv.org/pdf/2407.21243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21236v1","updated":"2024-07-30T22:58:23Z","published":"2024-07-30T22:58:23Z","title":"GNUMAP: A Parameter-Free Approach to Unsupervised Dimensionality\n Reduction via Graph Neural Networks","summary":" With the proliferation of Graph Neural Network (GNN) methods stemming from\ncontrastive learning, unsupervised node representation learning for graph data\nis rapidly gaining traction across various fields, from biology to molecular\ndynamics, where it is often used as a dimensionality reduction tool. However,\nthere remains a significant gap in understanding the quality of the\nlow-dimensional node representations these methods produce, particularly beyond\nwell-curated academic datasets. To address this gap, we propose here the first\ncomprehensive benchmarking of various unsupervised node embedding techniques\ntailored for dimensionality reduction, encompassing a range of manifold\nlearning tasks, along with various performance metrics. 
We emphasize the\nsensitivity of current methods to hyperparameter choices -- highlighting a\nfundamental issue as to their applicability in real-world settings where there\nis no established methodology for rigorous hyperparameter selection. Addressing\nthis issue, we introduce GNUMAP, a robust and parameter-free method for\nunsupervised node representation learning that merges the traditional UMAP\napproach with the expressivity of the GNN framework. We show that GNUMAP\nconsistently outperforms existing state-of-the-art GNN embedding methods in a\nvariety of contexts, including synthetic geometric datasets, citation networks,\nand real-world biomedical data -- making it a simple but reliable\ndimensionality reduction tool.\n","authors":["Jihee You","So Won Jeong","Claire Donnat"],"pdf_url":"https://arxiv.org/pdf/2407.21236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18076v2","updated":"2024-07-30T22:54:45Z","published":"2023-11-29T20:43:49Z","title":"Localization from structured distance matrices via low-rank matrix\n recovery","summary":" We study the problem of determining the configuration of $n$ points by using\ntheir distances to $m$ nodes, referred to as anchor nodes. One sampling scheme\nis Nystrom sampling, which assumes known distances between the anchors and\nbetween the anchors and the $n$ points, while the distances among the $n$\npoints are unknown. For this scheme, a simple adaptation of the Nystrom method,\nwhich is often used for kernel approximation, is a viable technique to estimate\nthe configuration of the anchors and the $n$ points. In this manuscript, we\npropose a modified version of Nystrom sampling, where the distances from every\nnode to one central node are known, but all other distances are incomplete. In\nthis setting, the standard Nystrom approach is not applicable, necessitating an\nalternative technique to estimate the configuration of the anchors and the $n$\npoints. We show that this problem can be framed as the recovery of a low-rank\nsubmatrix of a Gram matrix. Using synthetic and real data, we demonstrate that\nthe proposed approach can exactly recover configurations of points given\nsufficient distance samples. This underscores that, in contrast to methods that\nrely on global sampling of distance matrices, the task of estimating the\nconfiguration of points can be done efficiently via structured sampling with\nwell-chosen reliable anchors. Finally, our main analysis is grounded in a\nspecific centering of the points. With this in mind, we extend previous work in\nEuclidean distance geometry by providing a general dual basis approach for\npoints centered anywhere.\n","authors":["Samuel Lichtenberg","Abiy Tasissa"],"pdf_url":"https://arxiv.org/pdf/2311.18076v2.pdf","comment":"20 pages. Introduced a new sampling model. Experimental results on\n both synthetic and real data. A new optimization program for structured\n distance geometry based on low-rank recovery. The analysis of the previous\n sampling model is also discussed. 
Made changes to improve the clarity and\n presentation of the paper"},{"id":"http://arxiv.org/abs/2407.21231v1","updated":"2024-07-30T22:37:25Z","published":"2024-07-30T22:37:25Z","title":"Towards an Integrated Performance Framework for Fire Science and\n Management Workflows","summary":" Reliable performance metrics are necessary prerequisites to building\nlarge-scale end-to-end integrated workflows for collaborative scientific\nresearch, particularly within context of use-inspired decision making platforms\nwith many concurrent users and when computing real-time and urgent results\nusing large data. This work is a building block for the National Data Platform,\nwhich leverages multiple use-cases including the WIFIRE Data and Model Commons\nfor wildfire behavior modeling and the EarthScope Consortium for collaborative\ngeophysical research. This paper presents an artificial intelligence and\nmachine learning (AI/ML) approach to performance assessment and optimization of\nscientific workflows. An associated early AI/ML framework spanning performance\ndata collection, prediction and optimization is applied to wildfire science\napplications within the WIFIRE BurnPro3D (BP3D) platform for proactive fire\nmanagement and mitigation.\n","authors":["H. Ahmed","R. Shende","I. Perez","D. Crawl","S. Purawat","I. Altintas"],"pdf_url":"https://arxiv.org/pdf/2407.21231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03861v3","updated":"2024-07-30T22:19:20Z","published":"2024-03-06T17:11:38Z","title":"Designing Informative Metrics for Few-Shot Example Selection","summary":" Pretrained language models (PLMs) have shown remarkable few-shot learning\ncapabilities when provided with properly formatted examples. However, selecting\nthe \"best\" examples remains an open challenge. We propose a complexity-based\nprompt selection approach for sequence tagging tasks. This approach avoids the\ntraining of a dedicated model for selection of examples, and instead uses\ncertain metrics to align the syntactico-semantic complexity of test sentences\nand examples. We use both sentence- and word-level metrics to match the\ncomplexity of examples to the (test) sentence being considered. Our results\ndemonstrate that our approach extracts greater performance from PLMs: it\nachieves state-of-the-art performance on few-shot NER, achieving a 5% absolute\nimprovement in F1 score on the CoNLL2003 dataset for GPT-4. We also see large\ngains of upto 28.85 points (F1/Acc.) in smaller models like GPT-j-6B.\n","authors":["Rishabh Adiga","Lakshminarayanan Subramanian","Varun Chandrasekaran"],"pdf_url":"https://arxiv.org/pdf/2403.03861v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21220v1","updated":"2024-07-30T22:14:47Z","published":"2024-07-30T22:14:47Z","title":"DeepBaR: Fault Backdoor Attack on Deep Neural Network Layers","summary":" Machine Learning using neural networks has received prominent attention\nrecently because of its success in solving a wide variety of computational\ntasks, in particular in the field of computer vision. However, several works\nhave drawn attention to potential security risks involved with the training and\nimplementation of such networks. In this work, we introduce DeepBaR, a novel\napproach that implants backdoors on neural networks by faulting their behavior\nat training, especially during fine-tuning. 
Our technique aims to generate\nadversarial samples by optimizing a custom loss function that mimics the\nimplanted backdoors while adding an almost non-visible trigger in the image. We\nattack three popular convolutional neural network architectures and show that\nDeepBaR attacks have a success rate of up to 98.30\\%. Furthermore, DeepBaR does\nnot significantly affect the accuracy of the attacked networks after deployment\nwhen non-malicious inputs are given. Remarkably, DeepBaR allows attackers to\nchoose an input that looks similar to a given class, from a human perspective,\nbut that will be classified as belonging to an arbitrary target class.\n","authors":["C. A. Martínez-Mejía","J. Solano","J. Breier","D. Bucko","X. Hou"],"pdf_url":"https://arxiv.org/pdf/2407.21220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16491v2","updated":"2024-07-30T22:09:02Z","published":"2023-08-31T06:53:22Z","title":"In-class Data Analysis Replications: Teaching Students while Testing\n Science","summary":" Science is facing a reproducibility crisis. Previous work has proposed\nincorporating data analysis replications into classrooms as a potential\nsolution. However, despite the potential benefits, it is unclear whether this\napproach is feasible, and if so, what the involved stakeholders-students,\neducators, and scientists-should expect from it. Can students perform a data\nanalysis replication over the course of a class? What are the costs and\nbenefits for educators? And how can this solution help benchmark and improve\nthe state of science?\n In the present study, we incorporated data analysis replications in the\nproject component of the Applied Data Analysis course (CS-401) taught at EPFL\n(N=354 students). Here we report pre-registered findings based on surveys\nadministered throughout the course. First, we demonstrate that students can\nreplicate previously published scientific papers, most of them qualitatively\nand some exactly. We find discrepancies between what students expect of data\nanalysis replications and what they experience by doing them along with changes\nin expectations about reproducibility, which together serve as evidence of\nattitude shifts to foster students' critical thinking. Second, we provide\ninformation for educators about how much overhead is needed to incorporate\nreplications into the classroom and identify concerns that replications bring\nas compared to more traditional assignments. Third, we identify tangible\nbenefits of the in-class data analysis replications for scientific communities,\nsuch as a collection of replication reports and insights about replication\nbarriers in scientific work that should be avoided going forward.\n Overall, we demonstrate that incorporating replication tasks into a large\ndata science class can increase the reproducibility of scientific work as a\nby-product of data science instruction, thus benefiting both science and\nstudents.\n","authors":["Kristina Gligoric","Tiziano Piccardi","Jake Hofman","Robert West"],"pdf_url":"https://arxiv.org/pdf/2308.16491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21217v1","updated":"2024-07-30T22:01:14Z","published":"2024-07-30T22:01:14Z","title":"NeuroSEM: A hybrid framework for simulating multiphysics problems by\n coupling PINNs and spectral elements","summary":" Multiphysics problems that are characterized by complex interactions among\nfluid dynamics, heat transfer, structural mechanics, and electromagnetics, are\ninherently challenging due to their coupled nature. 
While experimental data on\ncertain state variables may be available, integrating these data with numerical\nsolvers remains a significant challenge. Physics-informed neural networks\n(PINNs) have shown promising results in various engineering disciplines,\nparticularly in handling noisy data and solving inverse problems. However,\ntheir effectiveness in forecasting nonlinear phenomena in multiphysics regimes\nis yet to be fully established. This study introduces NeuroSEM, a hybrid\nframework integrating PINNs with the high-fidelity Spectral Element Method\n(SEM) solver, Nektar++. NeuroSEM leverages strengths of both PINNs and SEM,\nproviding robust solutions for multiphysics problems. PINNs are trained to\nassimilate data and model physical phenomena in specific subdomains, which are\nthen integrated into Nektar++. We demonstrate the efficiency and accuracy of\nNeuroSEM for thermal convection in cavity flow and flow past a cylinder. The\nframework effectively handles data assimilation by addressing those subdomains\nand state variables where data are available. We applied NeuroSEM to the\nRayleigh-B\\'enard convection system, including cases with missing thermal\nboundary conditions. Our results indicate that NeuroSEM accurately models the\nphysical phenomena and assimilates the data within the specified subdomains.\nThe framework's plug-and-play nature facilitates its extension to other\nmultiphysics or multiscale problems. Furthermore, NeuroSEM is optimized for an\nefficient execution on emerging integrated GPU-CPU architectures. This hybrid\napproach enhances the accuracy and efficiency of simulations, making it a\npowerful tool for tackling complex engineering challenges in various scientific\ndomains.\n","authors":["Khemraj Shukla","Zongren Zou","Chi Hin Chan","Additi Pandey","Zhicheng Wang","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2407.21217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11694v5","updated":"2024-07-30T21:43:28Z","published":"2024-01-22T05:26:18Z","title":"Parametric Matrix Models","summary":" We present a general class of machine learning algorithms called parametric\nmatrix models. In contrast with most existing machine learning models that\nimitate the biology of neurons, parametric matrix models use matrix equations\nthat emulate the physics of quantum systems. Similar to how physics problems\nare usually solved, parametric matrix models learn the governing equations that\nlead to the desired outputs. Parametric matrix models can be efficiently\ntrained from empirical data, and the equations may use algebraic, differential,\nor integral relations. While originally designed for scientific computing, we\nprove that parametric matrix models are universal function approximators that\ncan be applied to general machine learning problems. After introducing the\nunderlying theory, we apply parametric matrix models to a series of different\nchallenges that show their performance for a wide range of problems. For all\nthe challenges tested here, parametric matrix models produce accurate results\nwithin an efficient and interpretable computational framework that allows for\ninput feature extrapolation.\n","authors":["Patrick Cook","Danny Jammooa","Morten Hjorth-Jensen","Daniel D. 
Lee","Dean Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11694v5.pdf","comment":"Exact same content as previous version (v4); corrected author email"},{"id":"http://arxiv.org/abs/2407.04822v2","updated":"2024-07-30T21:21:09Z","published":"2024-07-05T19:18:33Z","title":"YourMT3+: Multi-instrument Music Transcription with Enhanced Transformer\n Architectures and Cross-dataset Stem Augmentation","summary":" Multi-instrument music transcription aims to convert polyphonic music\nrecordings into musical scores assigned to each instrument. This task is\nchallenging for modeling as it requires simultaneously identifying multiple\ninstruments and transcribing their pitch and precise timing, and the lack of\nfully annotated data adds to the training difficulties. This paper introduces\nYourMT3+, a suite of models for enhanced multi-instrument music transcription\nbased on the recent language token decoding approach of MT3. We enhance its\nencoder by adopting a hierarchical attention transformer in the time-frequency\ndomain and integrating a mixture of experts. To address data limitations, we\nintroduce a new multi-channel decoding method for training with incomplete\nannotations and propose intra- and cross-stem augmentation for dataset mixing.\nOur experiments demonstrate direct vocal transcription capabilities,\neliminating the need for voice separation pre-processors. Benchmarks across ten\npublic datasets show our models' competitiveness with, or superiority to,\nexisting transcription models. Further testing on pop music recordings\nhighlights the limitations of current models. Fully reproducible code and\ndatasets are available with demos at \\url{https://github.com/mimbres/YourMT3}.\n","authors":["Sungkyun Chang","Emmanouil Benetos","Holger Kirchhoff","Simon Dixon"],"pdf_url":"https://arxiv.org/pdf/2407.04822v2.pdf","comment":"2024 IEEE International Workshop on Machine Learning for Signal\n Processing (MLSP), Sept.\\ 22--25, 2024, London, UK"},{"id":"http://arxiv.org/abs/2407.21195v1","updated":"2024-07-30T21:07:09Z","published":"2024-07-30T21:07:09Z","title":"Diffusion-Based Generation of Neural Activity from Disentangled Latent\n Codes","summary":" Recent advances in recording technology have allowed neuroscientists to\nmonitor activity from thousands of neurons simultaneously. Latent variable\nmodels are increasingly valuable for distilling these recordings into compact\nand interpretable representations. Here we propose a new approach to neural\ndata analysis that leverages advances in conditional generative modeling to\nenable the unsupervised inference of disentangled behavioral variables from\nrecorded neural activity. Our approach builds on InfoDiffusion, which augments\ndiffusion models with a set of latent variables that capture important factors\nof variation in the data. We apply our model, called Generating Neural\nObservations Conditioned on Codes with High Information (GNOCCHI), to time\nseries neural data and test its application to synthetic and biological\nrecordings of neural activity during reaching. In comparison to a VAE-based\nsequential autoencoder, GNOCCHI learns higher-quality latent spaces that are\nmore clearly structured and more disentangled with respect to key behavioral\nvariables. These properties enable accurate generation of novel samples (unseen\nbehavioral conditions) through simple linear traversal of the latent spaces\nproduced by GNOCCHI. 
Our work demonstrates the potential of unsupervised,\ninformation-based models for the discovery of interpretable latent spaces from\nneural data, enabling researchers to generate high-quality samples from unseen\nconditions.\n","authors":["Jonathan D. McCart","Andrew R. Sedler","Christopher Versteeg","Domenick Mifsud","Mattia Rigotti-Thompson","Chethan Pandarinath"],"pdf_url":"https://arxiv.org/pdf/2407.21195v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.05682v2","updated":"2024-07-30T21:03:15Z","published":"2023-03-10T03:20:03Z","title":"A dual basis approach to multidimensional scaling","summary":" Classical multidimensional scaling (CMDS) is a technique that embeds a set of\nobjects in a Euclidean space given their pairwise Euclidean distances. The main\npart of CMDS involves double centering a squared distance matrix and using a\ntruncated eigendecomposition to recover the point coordinates. In this paper,\nmotivated by a study in Euclidean distance geometry, we explore a dual basis\napproach to CMDS. We give an explicit formula for the dual basis vectors and\nfully characterize the spectrum of an essential matrix in the dual basis\nframework. We make connections to a related problem in metric nearness.\n","authors":["Samuel Lichtenberg","Abiy Tasissa"],"pdf_url":"https://arxiv.org/pdf/2303.05682v2.pdf","comment":"7 pages. The proof of dual basis representation is now compact. It is\n not constructive compared to the previous version, but it uses\n bi-orthogonality relation to establish the result more directly. A minor\n error in the proof of the spectrum of the dual basis has been fixed. We also\n made few changes for better clarity and presentation"},{"id":"http://arxiv.org/abs/2407.21193v1","updated":"2024-07-30T21:02:15Z","published":"2024-07-30T21:02:15Z","title":"Analyzing Customer-Facing Vendor Experiences with Time Series\n Forecasting and Monte Carlo Techniques","summary":" eBay partners with external vendors, which allows customers to freely select\na vendor to complete their eBay experiences. However, vendor outages can hinder\ncustomer experiences. Consequently, eBay can disable a problematic vendor to\nprevent customer loss. Disabling the vendor too late risks losing customers\nwilling to switch to other vendors, while disabling it too early risks losing\nthose unwilling to switch. In this paper, we propose a data-driven solution to\nanswer whether eBay should disable a problematic vendor and when to disable it.\nOur solution involves forecasting customer behavior. First, we use a\nmultiplicative seasonality model to represent behavior if all vendors are fully\nfunctioning. Next, we use a Monte Carlo simulation to represent behavior if the\nproblematic vendor remains enabled. Finally, we use a linear model to represent\nbehavior if the vendor is disabled. By comparing these forecasts, we determine\nthe optimal time for eBay to disable the problematic vendor.\n","authors":["Vivek Kaushik","Jason Tang"],"pdf_url":"https://arxiv.org/pdf/2407.21193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21191v1","updated":"2024-07-30T20:58:36Z","published":"2024-07-30T20:58:36Z","title":"GenRec: Generative Personalized Sequential Recommendation","summary":" Sequential recommendation is a task to capture hidden user preferences from\nhistorical user item interaction data. Significant progress has been made in\nthis domain by leveraging classification based learning methods. 
Inspired by\nthe recent paradigm of 'pretrain, prompt and predict' in NLP, we consider\nsequential recommendation as a sequence-to-sequence generation task and propose\na novel model named Generative Recommendation (GenRec). Unlike classification\nbased models that learn explicit user and item representations, GenRec utilizes\nthe sequence modeling capability of Transformer and adopts the masked item\nprediction objective to effectively learn the hidden bidirectional sequential\npatterns. Different from existing generative sequential recommendation models,\nGenRec does not rely on manually designed hard prompts. The input to GenRec is\na textual user-item sequence and the output is the top-ranked next items. Moreover,\nGenRec is lightweight and requires only a few hours to train effectively in\nlow-resource settings, making it highly applicable to real-world scenarios and\nhelping to democratize large language models in the sequential recommendation\ndomain. Our extensive experiments have demonstrated that GenRec generalizes on\nvarious public real-world datasets and achieves state-of-the-art results. Our\nexperiments also validate the effectiveness of the proposed masked item\nprediction objective that improves the model performance by a large margin.\n","authors":["Panfeng Cao","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2407.21191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21189v1","updated":"2024-07-30T20:54:07Z","published":"2024-07-30T20:54:07Z","title":"Multi-task Photonic Reservoir Computing: Wavelength Division\n Multiplexing for Parallel Computing with a Silicon Microring Resonator","summary":" Nowadays, as the ever-increasing demand for more powerful computing resources\ncontinues, alternative advanced computing paradigms are under extensive\ninvestigation. Significant effort has been made to deviate from conventional\nVon Neumann architectures. In-memory computing has emerged in the field of\nelectronics as a possible solution to the infamous bottleneck between memory\nand computing processors, which reduces the effective throughput of data. In\nphotonics, novel schemes attempt to collocate the computing processor and\nmemory in a single device. Photonics offers the flexibility of multiplexing\nstreams of data not only spatially and in time, but also in frequency or,\nequivalently, in wavelength, which makes it highly suitable for parallel\ncomputing. Here, we numerically show the use of time and wavelength division\nmultiplexing (WDM) to solve four independent tasks at the same time in a single\nphotonic chip, serving as a proof of concept for our proposal. The system is a\ntime-delay reservoir computing (TDRC) scheme based on a microring resonator (MRR). The\naddressed tasks cover different applications: Time-series prediction, waveform\nsignal classification, wireless channel equalization, and radar signal\nprediction. The system is also tested for simultaneous computing of up to 10\ninstances of the same task, exhibiting excellent performance. The footprint of\nthe system is reduced by using time-division multiplexing of the nodes that act\nas the neurons of the studied neural network scheme. WDM is used for the\nparallelization of wavelength channels, each addressing a single task. By\nadjusting the input power and frequency of each optical channel, we can achieve\nlevels of performance for each of the tasks that are comparable to those quoted\nin state-of-the-art reports focusing on single-task operation...\n","authors":["Bernard J. 
Giron Castro","Christophe Peucheret","Darko Zibar","Francesco Da Ros"],"pdf_url":"https://arxiv.org/pdf/2407.21189v1.pdf","comment":"Main text: 11 figures, 3 tables. Supplementary material: 2 figures, 4\n tables. The pre-print is under review in Frontiers: Advanced Optical\n Technologies. The abstract is shorter than in the PDF file to comply with\n arXiv requirements"},{"id":"http://arxiv.org/abs/2407.21185v1","updated":"2024-07-30T20:50:48Z","published":"2024-07-30T20:50:48Z","title":"Amelia: A Large Model and Dataset for Airport Surface Movement\n Forecasting","summary":" The growing demand for air travel requires technological advancements in air\ntraffic management as well as mechanisms for monitoring and ensuring safe and\nefficient operations. In terminal airspaces, predictive models of future\nmovements and traffic flows can help with proactive planning and efficient\ncoordination; however, varying airport topologies, and interactions with other\nagents, among other factors, make accurate predictions challenging. Data-driven\npredictive models have shown promise for handling numerous variables to enable\nvarious downstream tasks, including collision risk assessment, taxi-out time\nprediction, departure metering, and emission estimations. While data-driven\nmethods have shown improvements in these tasks, prior works lack large-scale\ncurated surface movement datasets within the public domain and the development\nof generalizable trajectory forecasting models. In response to this, we propose\ntwo contributions: (1) Amelia-48, a large surface movement dataset collected\nusing the System Wide Information Management (SWIM) Surface Movement Event\nService (SMES). With data collection beginning in Dec 2022, the dataset\nprovides more than a year's worth of SMES data (~30TB) and covers 48 airports\nwithin the US National Airspace System. In addition to releasing this data in\nthe public domain, we also provide post-processing scripts and associated\nairport maps to enable research in the forecasting domain and beyond. (2)\nAmelia-TF model, a transformer-based next-token-prediction large multi-agent\nmulti-airport trajectory forecasting model trained on 292 days or 9.4 billion\ntokens of position data encompassing 10 different airports with varying\ntopology. The open-sourced model is validated on unseen airports with\nexperiments showcasing the different prediction horizon lengths, ego-agent\nselection strategies, and training recipes to demonstrate the generalization\ncapabilities.\n","authors":["Ingrid Navarro","Pablo Ortega-Kral","Jay Patrikar","Haichuan Wang","Zelin Ye","Jong Hoon Park","Jean Oh","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2407.21185v1.pdf","comment":"24 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2407.21184v1","updated":"2024-07-30T20:50:30Z","published":"2024-07-30T20:50:30Z","title":"Optical Computing for Deep Neural Network Acceleration: Foundations,\n Recent Developments, and Emerging Directions","summary":" Emerging artificial intelligence applications across the domains of computer\nvision, natural language processing, graph processing, and sequence prediction\nincreasingly rely on deep neural networks (DNNs). These DNNs require\nsignificant compute and memory resources for training and inference.\nTraditional computing platforms such as CPUs, GPUs, and TPUs are struggling to\nkeep up with the demands of the increasingly complex and diverse DNNs. 
Optical\ncomputing represents an exciting new paradigm for light-speed acceleration of\nDNN workloads. In this article, we discuss the fundamentals and\nstate-of-the-art developments in optical computing, with an emphasis on DNN\nacceleration. Various promising approaches are described for engineering\noptical devices, enhancing optical circuits, and designing architectures that\ncan adapt optical computing to a variety of DNN workloads. Novel techniques for\nhardware/software co-design that can intelligently tune and map DNN models to\nimprove performance and energy-efficiency on optical computing platforms across\nhigh performance and resource constrained embedded, edge, and IoT platforms are\nalso discussed. Lastly, several open problems and future directions for\nresearch in this domain are highlighted.\n","authors":["Sudeep Pasricha"],"pdf_url":"https://arxiv.org/pdf/2407.21184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09955v2","updated":"2024-07-30T20:35:05Z","published":"2024-07-13T17:33:49Z","title":"LFFR: Logistic Function For (single-output) Regression","summary":" Privacy-preserving regression in machine learning is a crucial area of\nresearch, aimed at enabling the use of powerful machine learning techniques\nwhile protecting individuals' privacy. In this paper, we implement\nprivacy-preserving regression training using data encrypted under a fully\nhomomorphic encryption scheme. We first examine the common linear regression\nalgorithm and propose a (simplified) fixed Hessian for linear regression\ntraining, which can be applied to any dataset, even one not normalized into the\nrange $[0, 1]$. We also generalize this constant Hessian matrix to the ridge\nregression version, namely linear regression which includes a regularization\nterm to penalize large coefficients. However, our main contribution is to\ndevelop a novel and efficient algorithm called LFFR for homomorphic regression\nusing the logistic function, which could model more complex relations between\ninput values and output prediction in comparison with linear regression. We\nalso find a constant simplified Hessian to train our LFFR algorithm using the\nNewton-like method and compare it against our new fixed Hessian linear\nregression training over two real-world datasets. We suggest normalizing not\nonly the data but also the target predictions even for the original linear\nregression used in a privacy-preserving manner, which is helpful for keeping the\nweights in a small range, say $[-5, +5]$, suitable for the ciphertext refreshing\nparameters, and for avoiding tuning the regularization parameter $\\lambda$ via cross\nvalidation. The linear regression with normalized predictions could be a viable\nalternative to ridge regression.\n","authors":["John Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.09955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21176v1","updated":"2024-07-30T20:30:44Z","published":"2024-07-30T20:30:44Z","title":"DKL-KAN: Scalable Deep Kernel Learning using Kolmogorov-Arnold Networks","summary":" The need for scalable and expressive models in machine learning is paramount,\nparticularly in applications requiring both structural depth and flexibility.\nTraditional deep learning methods, such as multilayer perceptrons (MLP), offer\ndepth but lack the ability to integrate the structural characteristics of deep learning\narchitectures with the non-parametric flexibility of kernel methods. 
To address\nthis, deep kernel learning (DKL) was introduced, where inputs to a base kernel\nare transformed using a deep learning architecture. These kernels can replace\nstandard kernels, allowing both expressive power and scalability. The advent of\nKolmogorov-Arnold Networks (KAN) has generated considerable attention and\ndiscussion among researchers in scientific domain. In this paper, we introduce\na scalable deep kernel using KAN (DKL-KAN) as an effective alternative to DKL\nusing MLP (DKL-MLP). Our approach involves simultaneously optimizing these\nkernel attributes using marginal likelihood within a Gaussian process\nframework. We analyze two variants of DKL-KAN for a fair comparison with\nDKL-MLP: one with same number of neurons and layers as DKL-MLP, and another\nwith approximately same number of trainable parameters. To handle large\ndatasets, we use kernel interpolation for scalable structured Gaussian\nprocesses (KISS-GP) for low-dimensional inputs and KISS-GP with product kernels\nfor high-dimensional inputs. The efficacy of DKL-KAN is evaluated in terms of\ncomputational training time and test prediction accuracy across a wide range of\napplications. Additionally, the effectiveness of DKL-KAN is also examined in\nmodeling discontinuities and accurately estimating prediction uncertainty. The\nresults indicate that DKL-KAN outperforms DKL-MLP on datasets with a low number\nof observations. Conversely, DKL-MLP exhibits better scalability and higher\ntest prediction accuracy on datasets with large number of observations.\n","authors":["Shrenik Zinage","Sudeepta Mondal","Soumalya Sarkar"],"pdf_url":"https://arxiv.org/pdf/2407.21176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15946v2","updated":"2024-07-30T20:15:10Z","published":"2024-06-22T21:49:12Z","title":"Optimizing LaneSegNet for Real-Time Lane Topology Prediction in\n Autonomous Vehicles","summary":" With the increasing prevalence of autonomous vehicles, it is essential for\ncomputer vision algorithms to accurately assess road features in real-time.\nThis study explores the LaneSegNet architecture, a new approach to lane\ntopology prediction which integrates topological information with lane-line\ndata to provide a more contextual understanding of road environments. The\nLaneSegNet architecture includes a feature extractor, lane encoder, lane\ndecoder, and prediction head, leveraging components from ResNet-50, BEVFormer,\nand various attention mechanisms. We experimented with optimizations to the\nLaneSegNet architecture through feature extractor modification and transformer\nencoder-decoder stack modification. We found that modifying the encoder and\ndecoder stacks offered an interesting tradeoff between training time and\nprediction accuracy, with certain combinations showing promising results. Our\nimplementation, trained on a single NVIDIA Tesla A100 GPU, found that a 2:4\nratio reduced training time by 22.3% with only a 7.1% drop in mean average\nprecision, while a 4:8 ratio increased training time by only 11.1% but improved\nmean average precision by a significant 23.7%. These results indicate that\nstrategic hyperparameter tuning can yield substantial improvements depending on\nthe resources of the user. 
This study provides valuable insights for optimizing\nLaneSegNet according to available computation power, making it more accessible\nfor users with limited resources and increasing the capabilities for users with\nmore powerful resources.\n","authors":["William Stevens","Vishal Urs","Karthik Selvaraj","Gabriel Torres","Gaurish Lakhanpal"],"pdf_url":"https://arxiv.org/pdf/2406.15946v2.pdf","comment":"18 pages, 16 figures"},{"id":"http://arxiv.org/abs/2407.21159v1","updated":"2024-07-30T19:52:49Z","published":"2024-07-30T19:52:49Z","title":"Embedding Space Selection for Detecting Memorization and Fingerprinting\n in Generative Models","summary":" In the rapidly evolving landscape of artificial intelligence, generative\nmodels such as Generative Adversarial Networks (GANs) and Diffusion Models have\nbecome cornerstone technologies, driving innovation in diverse fields from art\ncreation to healthcare. Despite their potential, these models face the\nsignificant challenge of data memorization, which poses risks to privacy and\nthe integrity of generated content. Among various metrics of memorization\ndetection, our study delves into the memorization scores calculated from\nencoder layer embeddings, which involves measuring distances between samples in\nthe embedding spaces. Particularly, we find that the memorization scores\ncalculated from layer embeddings of Vision Transformers (ViTs) show an notable\ntrend - the latter (deeper) the layer, the less the memorization measured. It\nhas been found that the memorization scores from the early layers' embeddings\nare more sensitive to low-level memorization (e.g. colors and simple patterns\nfor an image), while those from the latter layers are more sensitive to\nhigh-level memorization (e.g. semantic meaning of an image). We also observe\nthat, for a specific model architecture, its degree of memorization on\ndifferent levels of information is unique. It can be viewed as an inherent\nproperty of the architecture. Building upon this insight, we introduce a unique\nfingerprinting methodology. This method capitalizes on the unique distributions\nof the memorization score across different layers of ViTs, providing a novel\napproach to identifying models involved in generating deepfakes and malicious\ncontent. Our approach demonstrates a marked 30% enhancement in identification\naccuracy over existing baseline methods, offering a more effective tool for\ncombating digital misinformation.\n","authors":["Jack He","Jianxing Zhao","Andrew Bai","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2407.21159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21151v1","updated":"2024-07-30T19:28:28Z","published":"2024-07-30T19:28:28Z","title":"Private Collaborative Edge Inference via Over-the-Air Computation","summary":" We consider collaborative inference at the wireless edge, where each client's\nmodel is trained independently on their local datasets. Clients are queried in\nparallel to make an accurate decision collaboratively. In addition to\nmaximizing the inference accuracy, we also want to ensure the privacy of local\nmodels. To this end, we leverage the superposition property of the multiple\naccess channel to implement bandwidth-efficient multi-user inference methods.\nSpecifically, we propose different methods for ensemble and multi-view\nclassification that exploit over-the-air computation. 
We show that these\nschemes perform better than their orthogonal counterparts with statistically\nsignificant differences while using fewer resources and providing privacy\nguarantees. We also provide experimental results verifying the benefits of the\nproposed over-the-air multi-user inference approach and perform an ablation\nstudy to demonstrate the effectiveness of our design choices. We share the\nsource code of the framework publicly on Github to facilitate further research\nand reproducibility.\n","authors":["Selim F. Yilmaz","Burak Hasircioglu","Li Qiao","Deniz Gunduz"],"pdf_url":"https://arxiv.org/pdf/2407.21151v1.pdf","comment":"15 pages, 8 figures. This work extends from our preliminary study\n presented at the 2022 IEEE International Symposium on Information Theory [1].\n arXiv admin note: text overlap with arXiv:2202.03129"},{"id":"http://arxiv.org/abs/2407.21138v1","updated":"2024-07-30T18:59:19Z","published":"2024-07-30T18:59:19Z","title":"Enhancing Deep Hedging of Options with Implied Volatility Surface\n Feedback Information","summary":" We present a dynamic hedging scheme for S&P 500 options, where rebalancing\ndecisions are enhanced by integrating information about the implied volatility\nsurface dynamics. The optimal hedging strategy is obtained through a deep\npolicy gradient-type reinforcement learning algorithm, with a novel hybrid\nneural network architecture improving the training performance. The favorable\ninclusion of forward-looking information embedded in the volatility surface\nallows our procedure to outperform several conventional benchmarks such as\npractitioner and smiled-implied delta hedging procedures, both in simulation\nand backtesting experiments.\n","authors":["Pascal François","Geneviève Gauthier","Frédéric Godin","Carlos Octavio Pérez Mendoza"],"pdf_url":"https://arxiv.org/pdf/2407.21138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17477v2","updated":"2024-07-30T18:47:31Z","published":"2024-07-01T17:20:37Z","title":"Toward Automated Detection of Biased Social Signals from the Content of\n Clinical Conversations","summary":" Implicit bias can impede patient-provider interactions and lead to inequities\nin care. Raising awareness is key to reducing such bias, but its manifestations\nin the social dynamics of patient-provider communication are difficult to\ndetect. In this study, we used automated speech recognition (ASR) and natural\nlanguage processing (NLP) to identify social signals in patient-provider\ninteractions. We built an automated pipeline to predict social signals from\naudio recordings of 782 primary care visits that achieved 90.1% average\naccuracy across codes, and exhibited fairness in its predictions for white and\nnon-white patients. Applying this pipeline, we identified statistically\nsignificant differences in provider communication behavior toward white versus\nnon-white patients. In particular, providers expressed more patient-centered\nbehaviors towards white patients including more warmth, engagement, and\nattentiveness. Our study underscores the potential of automated tools in\nidentifying subtle communication signals that may be linked with bias and\nimpact healthcare quality and equity.\n","authors":["Feng Chen","Manas Satish Bedmutha","Ray-Yuan Chung","Janice Sabin","Wanda Pratt","Brian R. Wood","Nadir Weibel","Andrea L. 
Hartzler","Trevor Cohen"],"pdf_url":"https://arxiv.org/pdf/2407.17477v2.pdf","comment":"Accepted by AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2407.21130v1","updated":"2024-07-30T18:44:40Z","published":"2024-07-30T18:44:40Z","title":"Computational music analysis from first principles","summary":" We use coupled hidden Markov models to automatically annotate the 371 Bach\nchorales in the Riemenschneider edition, a corpus containing approximately\n100,000 notes and 20,000 chords. We give three separate analyses that achieve\nprogressively greater accuracy at the cost of making increasingly strong\nassumptions about musical syntax. Although our method makes almost no use of\nhuman input, we are able to identify both chords and keys with an accuracy of\n85% or greater when compared to an expert human analysis, resulting in\nannotations accurate enough to be used for a range of music-theoretical\npurposes, while also being free of subjective human judgments. Our work bears\non longstanding debates about the objective reality of the structures\npostulated by standard Western harmonic theory, as well as on specific\nquestions about the nature of Western harmonic syntax.\n","authors":["Dmitri Tymoczko","Mark Newman"],"pdf_url":"https://arxiv.org/pdf/2407.21130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21124v1","updated":"2024-07-30T18:33:05Z","published":"2024-07-30T18:33:05Z","title":"Zero Shot Health Trajectory Prediction Using Transformer","summary":" Integrating modern machine learning and clinical decision-making has great\npromise for mitigating healthcare's increasing cost and complexity. We\nintroduce the Enhanced Transformer for Health Outcome Simulation (ETHOS), a\nnovel application of the transformer deep-learning architecture for analyzing\nhigh-dimensional, heterogeneous, and episodic health data. ETHOS is trained\nusing Patient Health Timelines (PHTs)-detailed, tokenized records of health\nevents-to predict future health trajectories, leveraging a zero-shot learning\napproach. ETHOS represents a significant advancement in foundation model\ndevelopment for healthcare analytics, eliminating the need for labeled data and\nmodel fine-tuning. Its ability to simulate various treatment pathways and\nconsider patient-specific factors positions ETHOS as a tool for care\noptimization and addressing biases in healthcare delivery. Future developments\nwill expand ETHOS' capabilities to incorporate a wider range of data types and\ndata sources. Our work demonstrates a pathway toward accelerated AI development\nand deployment in healthcare.\n","authors":["Pawel Renc","Yugang Jia","Anthony E. Samir","Jaroslaw Was","Quanzheng Li","David W. Bates","Arkadiusz Sitek"],"pdf_url":"https://arxiv.org/pdf/2407.21124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21121v1","updated":"2024-07-30T18:24:46Z","published":"2024-07-30T18:24:46Z","title":"Taming the Frequency Factory of Sinusoidal Networks","summary":" This work investigates the structure and representation capacity of\n$sinusoidal$ MLPs, which have recently shown promising results in encoding\nlow-dimensional signals. This success can be attributed to its smoothness and\nhigh representation capacity. The first allows the use of the network's\nderivatives during training, enabling regularization. However, defining the\narchitecture and initializing its parameters to achieve a desired capacity\nremains an empirical task. 
This work provides theoretical and experimental\nresults justifying the capacity property of sinusoidal MLPs and offers control\nmechanisms for their initialization and training.\n We approach this from a Fourier series perspective and link the training with\nthe model's spectrum. Our analysis is based on a $harmonic$ expansion of the\nsinusoidal MLP, which says that the composition of sinusoidal layers produces a\nlarge number of new frequencies expressed as integer linear combinations of the\ninput frequencies (weights of the input layer). We use this novel $identity$ to\ninitialize the input neurons which work as a sampling in the signal spectrum.\nWe also note that each hidden neuron produces the same frequencies with\namplitudes completely determined by the hidden weights. Finally, we give an\nupper bound for these amplitudes, which results in a $bounding$ scheme for the\nnetwork's spectrum during training.\n","authors":["Tiago Novello","Diana Aldana","Luiz Velho"],"pdf_url":"https://arxiv.org/pdf/2407.21121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21118v1","updated":"2024-07-30T18:19:38Z","published":"2024-07-30T18:19:38Z","title":"Palu: Compressing KV-Cache with Low-Rank Projection","summary":" KV-Cache compression methods generally sample a KV-Cache of effectual tokens\nor quantize it into lower bits. However, these methods cannot exploit the\nredundancy of the hidden dimension of KV tensors. This paper investigates a\nunique hidden dimension approach called Palu, a novel KV-Cache compression\nframework that utilizes low-rank projection. Palu decomposes the linear layers\ninto low-rank matrices, caches the smaller intermediate states, and\nreconstructs the full keys and values on the fly. To improve accuracy,\ncompression rate, and efficiency, Palu further encompasses (1) a medium-grained\nlow-rank decomposition scheme, (2) an efficient rank search algorithm, (3) a\nlow-rank-aware quantization algorithm, and (4) matrix fusion with optimized GPU\nkernels. Our extensive experiments with popular LLMs show that Palu can\ncompress KV-Cache by more than 91.25% while maintaining a significantly better\naccuracy (up to 1.19 lower perplexity) than state-of-the-art KV-Cache\nquantization methods at a similar or even higher memory usage. When compressing\nKV-Cache for 50%, Palu delivers up to 1.61x end-to-end speedup for the\nattention module. Our code is publicly available at\nhttps://github.com/shadowpa0327/Palu.\n","authors":["Chi-Chih Chang","Wei-Cheng Lin","Chien-Yu Lin","Chong-Yan Chen","Yu-Fang Hu","Pei-Shuo Wang","Ning-Chi Huang","Luis Ceze","Kai-Chiang Wu"],"pdf_url":"https://arxiv.org/pdf/2407.21118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03194v2","updated":"2024-07-30T18:18:12Z","published":"2024-07-03T15:26:02Z","title":"Prediction Instability in Machine Learning Ensembles","summary":" In machine learning ensembles predictions from multiple models are\naggregated. Despite widespread use and strong performance of ensembles in\napplied problems little is known about the mathematical properties of\naggregating models and associated consequences for safe, explainable use of\nsuch models. In this paper we prove a theorem that shows that any ensemble will\nexhibit at least one of the following forms of prediction instability. It will\neither ignore agreement among all underlying models, change its mind when none\nof the underlying models have done so, or be manipulable through inclusion or\nexclusion of options it would never actually predict. 
As a consequence,\nensemble aggregation procedures will always need to balance the benefits of\ninformation use against the risk of these prediction instabilities. This\nanalysis also sheds light on what specific forms of prediction instability to\nexpect from particular ensemble algorithms; for example popular tree ensembles\nlike random forest, or xgboost will violate basic, intuitive fairness\nproperties. Finally, we show that this can be ameliorated by using consistent\nmodels in asymptotic conditions.\n","authors":["Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2407.03194v2.pdf","comment":"15 pages, uses a modified version of ICML2024.sty"},{"id":"http://arxiv.org/abs/2305.16189v4","updated":"2024-07-30T18:14:56Z","published":"2023-05-25T15:49:38Z","title":"Martian time-series unraveled: A multi-scale nested approach with\n factorial variational autoencoders","summary":" Unsupervised source separation involves unraveling an unknown set of source\nsignals recorded through a mixing operator, with limited prior knowledge about\nthe sources, and only access to a dataset of signal mixtures. This problem is\ninherently ill-posed and is further challenged by the variety of timescales\nexhibited by sources in time series data from planetary space missions. As\nsuch, a systematic multi-scale unsupervised approach is needed to identify and\nseparate sources at different timescales. Existing methods typically rely on a\npreselected window size that determines their operating timescale, limiting\ntheir capacity to handle multi-scale sources. To address this issue, we propose\nan unsupervised multi-scale clustering and source separation framework by\nleveraging wavelet scattering spectra that provide a low-dimensional\nrepresentation of stochastic processes, capable of distinguishing between\ndifferent non-Gaussian stochastic processes. Nested within this representation\nspace, we develop a factorial variational autoencoder that is trained to\nprobabilistically cluster sources at different timescales. To perform source\nseparation, we use samples from clusters at multiple timescales obtained via\nthe factorial variational autoencoder as prior information and formulate an\noptimization problem in the wavelet scattering spectra representation space.\nWhen applied to the entire seismic dataset recorded during the NASA InSight\nmission on Mars, containing sources varying greatly in timescale, our approach\ndisentangles such different sources, e.g., minute-long transient one-sided\npulses (known as \"glitches\") and structured ambient noises resulting from\natmospheric activities that typically last for tens of minutes, and provides an\nopportunity to conduct further investigations into the isolated sources.\n","authors":["Ali Siahkoohi","Rudy Morel","Randall Balestriero","Erwan Allys","Grégory Sainton","Taichi Kawamura","Maarten V. de Hoop"],"pdf_url":"https://arxiv.org/pdf/2305.16189v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05385v4","updated":"2024-07-30T18:04:41Z","published":"2024-03-08T15:30:58Z","title":"Switching the Loss Reduces the Cost in Batch Reinforcement Learning","summary":" We propose training fitted Q-iteration with log-loss (FQI-log) for batch\nreinforcement learning (RL). We show that the number of samples needed to learn\na near-optimal policy with FQI-log scales with the accumulated cost of the\noptimal policy, which is zero in problems where acting optimally achieves the\ngoal and incurs no cost. 
In doing so, we provide a general framework for\nproving small-cost bounds, i.e. bounds that scale with the optimal achievable\ncost, in batch RL. Moreover, we empirically verify that FQI-log uses fewer\nsamples than FQI trained with squared loss on problems where the optimal policy\nreliably achieves the goal.\n","authors":["Alex Ayoub","Kaiwen Wang","Vincent Liu","Samuel Robertson","James McInerney","Dawen Liang","Nathan Kallus","Csaba Szepesvári"],"pdf_url":"https://arxiv.org/pdf/2403.05385v4.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.20962v1","updated":"2024-07-30T16:43:24Z","published":"2024-07-30T16:43:24Z","title":"MMTrail: A Multimodal Trailer Video Dataset with Language and Music\n Descriptions","summary":" Massive multi-modality datasets play a significant role in facilitating the\nsuccess of large video-language models. However, current video-language\ndatasets primarily provide text descriptions for visual frames, considering\naudio to be weakly related information. They usually overlook exploring the\npotential of inherent audio-visual correlation, leading to monotonous\nannotation within each modality instead of comprehensive and precise\ndescriptions. Such ignorance results in the difficulty of multiple\ncross-modality studies. To fulfill this gap, we present MMTrail, a large-scale\nmulti-modality video-language dataset incorporating more than 20M trailer clips\nwith visual captions, and 2M high-quality clips with multimodal captions.\nTrailers preview full-length video works and integrate context, visual frames,\nand background music. In particular, the trailer has two main advantages: (1)\nthe topics are diverse, and the content characters are of various types, e.g.,\nfilm, news, and gaming. (2) the corresponding background music is\ncustom-designed, making it more coherent with the visual context. Upon these\ninsights, we propose a systemic captioning framework, achieving various\nmodality annotations with more than 27.1k hours of trailer videos. Here, to\nensure the caption retains music perspective while preserving the authority of\nvisual context, we leverage the advanced LLM to merge all annotations\nadaptively. In this fashion, our MMtrail dataset potentially paves the path for\nfine-grained large multimodal-language model training. In experiments, we\nprovide evaluation metrics and benchmark results on our dataset, demonstrating\nthe high quality of our annotation and its effectiveness for model training.\n","authors":["Xiaowei Chi","Yatian Wang","Aosong Cheng","Pengjun Fang","Zeyue Tian","Yingqing He","Zhaoyang Liu","Xingqun Qi","Jiahao Pan","Rongyu Zhang","Mengfei Li","Ruibin Yuan","Yanbing Jiang","Wei Xue","Wenhan Luo","Qifeng Chen","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2407.20962v1.pdf","comment":"15 Pages. Dataset report"},{"id":"http://arxiv.org/abs/2311.13307v3","updated":"2024-07-30T16:11:48Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Inspired\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as a vision-and-language\ninteraction of biomedical fields. Previous works inherited the ideology of\ntraditional language generation tasks, aiming to generate paragraphs with high\nreadability as reports. 
Despite significant progress, the independence between\ndiseases-a specific property of RRG-was neglected, yielding the models being\nconfused by the co-occurrence of diseases brought on by the biased data\ndistribution, thus generating inaccurate reports. In this paper, to rethink\nthis issue, we first model the causal effects between the variables from a\ncausal perspective, through which we prove that the co-occurrence relationships\nbetween diseases on the biased distribution function as confounders, confusing\nthe accuracy through two backdoor paths, i.e. the Joint Vision Coupling and the\nConditional Sequential Coupling. Then, we proposed a novel model-agnostic\ncounterfactual augmentation method that contains two strategies, i.e. the\nPrototype-based Counterfactual Sample Synthesis (P-CSS) and the Magic-Cube-like\nCounterfactual Report Reconstruction (Cube), to intervene the backdoor paths,\nthus enhancing the accuracy and generalization of RRG models. Experimental\nresults on the widely used MIMIC-CXR dataset demonstrate the effectiveness of\nour proposed method. Additionally, a generalization performance is evaluated on\nIU X-Ray dataset, which verifies our work can effectively reduce the impact of\nco-occurrences caused by different distributions on the results.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Yan Liu","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v3.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2305.07216v2","updated":"2024-07-30T14:36:26Z","published":"2023-05-12T03:13:37Z","title":"Versatile audio-visual learning for emotion recognition","summary":" Most current audio-visual emotion recognition models lack the flexibility\nneeded for deployment in practical applications. We envision a multimodal\nsystem that works even when only one modality is available and can be\nimplemented interchangeably for either predicting emotional attributes or\nrecognizing categorical emotions. Achieving such flexibility in a multimodal\nemotion recognition system is difficult due to the inherent challenges in\naccurately interpreting and integrating varied data sources. It is also a\nchallenge to robustly handle missing or partial information while allowing\ndirect switch between regression or classification tasks. This study proposes a\nversatile audio-visual learning (VAVL) framework for handling unimodal and\nmultimodal systems for emotion regression or emotion classification tasks. We\nimplement an audio-visual framework that can be trained even when audio and\nvisual paired data is not available for part of the training set (i.e., audio\nonly or only video is present). We achieve this effective representation\nlearning with audio-visual shared layers, residual connections over shared\nlayers, and a unimodal reconstruction task. Our experimental results reveal\nthat our architecture significantly outperforms strong baselines on the\nCREMA-D, MSP-IMPROV, and CMU-MOSEI corpora. 
Notably, VAVL attains a new\nstate-of-the-art performance in the emotional attribute prediction task on the\nMSP-IMPROV corpus.\n","authors":["Lucas Goncalves","Seong-Gyun Leem","Wei-Cheng Lin","Berrak Sisman","Carlos Busso"],"pdf_url":"https://arxiv.org/pdf/2305.07216v2.pdf","comment":"18 pages, 4 Figures, 3 tables (published at IEEE Transactions on\n Affective Computing)"},{"id":"http://arxiv.org/abs/2407.20852v1","updated":"2024-07-30T14:27:32Z","published":"2024-07-30T14:27:32Z","title":"Optimizing 5G-Advanced Networks for Time-critical Applications: The Role\n of L4S","summary":" As 5G networks strive to support advanced time-critical applications, such as\nimmersive Extended Reality (XR), cloud gaming, and autonomous driving, the\ndemand for Real-time Broadband Communication (RTBC) grows. In this article, we\npresent the main mechanisms of Low Latency, Low Loss, and Scalable Throughput\n(L4S). Subsequently, we investigate the support and challenges of L4S\ntechnology in the latest 3GPP 5G-Advanced Release 18 (R18) standard. Our case\nstudy, using a prototype system for a real-time communication (RTC)\napplication, demonstrates the superiority of L4S technology. The experimental\nresults show that, compared with the GCC algorithm, the proposed L4S-GCC\nalgorithm can reduce the stalling rate by 1.51%-2.80% and increase the\nbandwidth utilization by 11.4%-31.4%. The results emphasize the immense\npotential of L4S technology in enhancing transmission performance in\ntime-critical applications.\n","authors":["Guangjin Pan","Shugong Xu","Pin Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.20852v1.pdf","comment":"7 pages, 3 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2407.19456v2","updated":"2024-07-30T11:57:37Z","published":"2024-07-28T10:15:17Z","title":"An Inverse Partial Optimal Transport Framework for Music-guided Movie\n Trailer Generation","summary":" Trailer generation is a challenging video clipping task that aims to select\nhighlighting shots from long videos like movies and re-organize them in an\nattractive way. In this study, we propose an inverse partial optimal transport\n(IPOT) framework to achieve music-guided movie trailer generation. In\nparticular, we formulate the trailer generation task as selecting and sorting\nkey movie shots based on audio shots, which involves matching the latent\nrepresentations across visual and acoustic modalities. We learn a multi-modal\nlatent representation model in the proposed IPOT framework to achieve this aim.\nIn this framework, a two-tower encoder derives the latent representations of\nmovie and music shots, respectively, and an attention-assisted Sinkhorn\nmatching network parameterizes the grounding distance between the shots' latent\nrepresentations and the distribution of the movie shots. Taking the\ncorrespondence between the movie shots and its trailer music shots as the\nobserved optimal transport plan defined on the grounding distances, we learn\nthe model by solving an inverse partial optimal transport problem, leading to a\nbi-level optimization strategy. We collect real-world movies and their trailers\nto construct a dataset with abundant label information called CMTD and,\naccordingly, train and evaluate various automatic trailer generators. 
Compared\nwith state-of-the-art methods, our IPOT method consistently shows superiority\nin subjective visual effects and objective quantitative measurements.\n","authors":["Yutong Wang","Sidan Zhu","Hongteng Xu","Dixin Luo"],"pdf_url":"https://arxiv.org/pdf/2407.19456v2.pdf","comment":"acmmm2024"},{"id":"http://arxiv.org/abs/2407.20693v1","updated":"2024-07-30T09:41:37Z","published":"2024-07-30T09:41:37Z","title":"Boosting Audio Visual Question Answering via Key Semantic-Aware Cues","summary":" The Audio Visual Question Answering (AVQA) task aims to answer questions\nrelated to various visual objects, sounds, and their interactions in videos.\nSuch naturally multimodal videos contain rich and complex dynamic audio-visual\ncomponents, with only a portion of them closely related to the given questions.\nHence, effectively perceiving audio-visual cues relevant to the given questions\nis crucial for correctly answering them. In this paper, we propose a\nTemporal-Spatial Perception Model (TSPM), which aims to empower the model to\nperceive key visual and auditory cues related to the questions. Specifically,\nconsidering the challenge of aligning non-declarative questions and visual\nrepresentations into the same semantic space using visual-language pretrained\nmodels, we construct declarative sentence prompts derived from the question\ntemplate, to assist the temporal perception module in better identifying\ncritical segments relevant to the questions. Subsequently, a spatial perception\nmodule is designed to merge visual tokens from selected segments to highlight\nkey latent targets, followed by cross-modal interaction with audio to perceive\npotential sound-aware areas. Finally, the significant temporal-spatial cues\nfrom these modules are integrated to answer the question. Extensive experiments\non multiple AVQA benchmarks demonstrate that our framework excels not only in\nunderstanding audio-visual scenes but also in answering complex questions\neffectively. Code is available at https://github.com/GeWu-Lab/TSPM.\n","authors":["Guangyao Li","Henghui Du","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2407.20693v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.20124v2","updated":"2024-07-30T07:40:17Z","published":"2024-07-29T15:54:43Z","title":"AxiomVision: Accuracy-Guaranteed Adaptive Visual Model Selection for\n Perspective-Aware Video Analytics","summary":" The rapid evolution of multimedia and computer vision technologies requires\nadaptive visual model deployment strategies to effectively handle diverse tasks\nand varying environments. This work introduces AxiomVision, a novel framework\nthat can guarantee accuracy by leveraging edge computing to dynamically select\nthe most efficient visual models for video analytics under diverse scenarios.\nUtilizing a tiered edge-cloud architecture, AxiomVision enables the deployment\nof a broad spectrum of visual models, from lightweight to complex DNNs, that\ncan be tailored to specific scenarios while considering camera source impacts.\nIn addition, AxiomVision provides three core innovations: (1) a dynamic visual\nmodel selection mechanism utilizing continual online learning, (2) an efficient\nonline method that efficiently takes into account the influence of the camera's\nperspective, and (3) a topology-driven grouping approach that accelerates the\nmodel selection process. 
With rigorous theoretical guarantees, these\nadvancements provide a scalable and effective solution for visual tasks\ninherent to multimedia systems, such as object detection, classification, and\ncounting. Empirically, AxiomVision achieves a 25.7\\% improvement in accuracy.\n","authors":["Xiangxiang Dai","Zeyu Zhang","Peng Yang","Yuedong Xu","Xutong Liu","John C. S. Lui"],"pdf_url":"https://arxiv.org/pdf/2407.20124v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.20592v1","updated":"2024-07-30T06:57:00Z","published":"2024-07-30T06:57:00Z","title":"EgoSonics: Generating Synchronized Audio for Silent Egocentric Videos","summary":" We introduce EgoSonics, a method to generate semantically meaningful and\nsynchronized audio tracks conditioned on silent egocentric videos. Generating\naudio for silent egocentric videos could open new applications in virtual\nreality, assistive technologies, or for augmenting existing datasets. Existing\nwork has been limited to domains like speech, music, or impact sounds and\ncannot easily capture the broad range of audio frequencies found in egocentric\nvideos. EgoSonics addresses these limitations by building on the strength of\nlatent diffusion models for conditioned audio synthesis. We first encode and\nprocess audio and video data into a form that is suitable for generation. The\nencoded data is used to train our model to generate audio tracks that capture\nthe semantics of the input video. Our proposed SyncroNet builds on top of\nControlNet to provide control signals that enables temporal synchronization to\nthe synthesized audio. Extensive evaluations show that our model outperforms\nexisting work in audio quality, and in our newly proposed synchronization\nevaluation method. Furthermore, we demonstrate downstream applications of our\nmodel in improving video summarization.\n","authors":["Aashish Rai","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2407.20592v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2407.20523v1","updated":"2024-07-30T03:45:04Z","published":"2024-07-30T03:45:04Z","title":"Wireless Multi-User Interactive Virtual Reality in Metaverse with\n Edge-Device Collaborative Computing","summary":" The immersive nature of the metaverse presents significant challenges for\nwireless multi-user interactive virtual reality (VR), such as ultra-low\nlatency, high throughput and intensive computing, which place substantial\ndemands on the wireless bandwidth and rendering resources of mobile edge\ncomputing (MEC). In this paper, we propose a wireless multi-user interactive VR\nwith edge-device collaborative computing framework to overcome the\nmotion-to-photon (MTP) threshold bottleneck. Specifically, we model the\nserial-parallel task execution in queues within a foreground and background\nseparation architecture. The rendering indices of background tiles within the\nprediction window are determined, and both the foreground and selected\nbackground tiles are loaded into respective processing queues based on the\nrendering locations. To minimize the age of sensor information and the power\nconsumption of mobile devices, we optimize rendering decisions and MEC resource\nallocation subject to the MTP constraint. To address this optimization problem,\nwe design a safe reinforcement learning (RL) algorithm, active queue\nmanagement-constrained updated projection (AQM-CUP). 
AQM-CUP constructs an\nenvironment suitable for queues, incorporating expired tiles actively discarded\nin processing buffers into its state and reward system. Experimental results\ndemonstrate that the proposed framework significantly enhances user immersion\nwhile reducing device power consumption, and the superiority of the proposed\nAQM-CUP algorithm over conventional methods in terms of the training\nconvergence and performance metrics.\n","authors":["Caolu Xu","Zhiyong Chen","Meixia Tao","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.20523v1.pdf","comment":"submitted to IEEE journal"},{"id":"http://arxiv.org/abs/2407.19224v2","updated":"2024-07-30T02:27:56Z","published":"2024-07-27T09:56:23Z","title":"RAVSS: Robust Audio-Visual Speech Separation in Multi-Speaker Scenarios\n with Missing Visual Cues","summary":" While existing Audio-Visual Speech Separation (AVSS) methods primarily\nconcentrate on the audio-visual fusion strategy for two-speaker separation,\nthey demonstrate a severe performance drop in the multi-speaker separation\nscenarios. Typically, AVSS methods employ guiding videos to sequentially\nisolate individual speakers from the given audio mixture, resulting in notable\nmissing and noisy parts across various segments of the separated speech. In\nthis study, we propose a simultaneous multi-speaker separation framework that\ncan facilitate the concurrent separation of multiple speakers within a singular\nprocess. We introduce speaker-wise interactions to establish distinctions and\ncorrelations among speakers. Experimental results on the VoxCeleb2 and LRS3\ndatasets demonstrate that our method achieves state-of-the-art performance in\nseparating mixtures with 2, 3, 4, and 5 speakers, respectively. Additionally,\nour model can utilize speakers with complete audio-visual information to\nmitigate other visual-deficient speakers, thereby enhancing its resilience to\nmissing visual cues. We also conduct experiments where visual information for\nspecific speakers is entirely absent or visual frames are partially missing.\nThe results demonstrate that our model consistently outperforms others,\nexhibiting the smallest performance drop across all settings involving 2, 3, 4,\nand 5 speakers.\n","authors":["Tianrui Pan","Jie Liu","Bohan Wang","Jie Tang","Gangshan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.19224v2.pdf","comment":"Accepted by MM 2024"}]},"2024-07-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.21792v1","updated":"2024-07-31T17:59:24Z","published":"2024-07-31T17:59:24Z","title":"Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress?","summary":" As artificial intelligence systems grow more powerful, there has been\nincreasing interest in \"AI safety\" research to address emerging and future\nrisks. However, the field of AI safety remains poorly defined and\ninconsistently measured, leading to confusion about how researchers can\ncontribute. This lack of clarity is compounded by the unclear relationship\nbetween AI safety benchmarks and upstream general capabilities (e.g., general\nknowledge and reasoning). To address these issues, we conduct a comprehensive\nmeta-analysis of AI safety benchmarks, empirically analyzing their correlation\nwith general capabilities across dozens of models and providing a survey of\nexisting directions in AI safety. 
Our findings reveal that many safety\nbenchmarks highly correlate with upstream model capabilities, potentially\nenabling \"safetywashing\" -- where capability improvements are misrepresented as\nsafety advancements. Based on these findings, we propose an empirical\nfoundation for developing more meaningful safety metrics and define AI safety\nin a machine learning research context as a set of clearly delineated research\ngoals that are empirically separable from generic capabilities advancements. In\ndoing so, we aim to provide a more rigorous framework for AI safety research,\nadvancing the science of safety evaluations and clarifying the path towards\nmeasurable progress.\n","authors":["Richard Ren","Steven Basart","Adam Khoja","Alice Gatti","Long Phan","Xuwang Yin","Mantas Mazeika","Alexander Pan","Gabriel Mukobi","Ryan H. Kim","Stephen Fitz","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2407.21792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21788v1","updated":"2024-07-31T17:57:32Z","published":"2024-07-31T17:57:32Z","title":"Vision-Language Model Based Handwriting Verification","summary":" Handwriting Verification is a critical in document forensics. Deep learning\nbased approaches often face skepticism from forensic document examiners due to\ntheir lack of explainability and reliance on extensive training data and\nhandcrafted features. This paper explores using Vision Language Models (VLMs),\nsuch as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By\nleveraging their Visual Question Answering capabilities and 0-shot\nChain-of-Thought (CoT) reasoning, our goal is to provide clear,\nhuman-understandable explanations for model decisions. Our experiments on the\nCEDAR handwriting dataset demonstrate that VLMs offer enhanced\ninterpretability, reduce the need for large training datasets, and adapt better\nto diverse handwriting styles. However, results show that the CNN-based\nResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach\nwith GPT-4o (Accuracy: 70%) and supervised fine-tuned PaliGemma (Accuracy:\n71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings\nhighlight the potential of VLMs in generating human-interpretable decisions\nwhile underscoring the need for further advancements to match the performance\nof specialized deep learning models.\n","authors":["Mihir Chauhan","Abhishek Satbhai","Mohammad Abuzar Hashemi","Mir Basheer Ali","Bina Ramamurthy","Mingchen Gao","Siwei Lyu","Sargur Srihari"],"pdf_url":"https://arxiv.org/pdf/2407.21788v1.pdf","comment":"4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine\n Vision and Image Processing (IMVIP) Conference"},{"id":"http://arxiv.org/abs/2407.20224v2","updated":"2024-07-31T17:57:20Z","published":"2024-07-29T17:58:06Z","title":"Can Editing LLMs Inject Harm?","summary":" Knowledge editing techniques have been increasingly adopted to efficiently\ncorrect the false or outdated knowledge in Large Language Models (LLMs), due to\nthe high cost of retraining from scratch. Meanwhile, one critical but\nunder-explored question is: can knowledge editing be used to inject harm into\nLLMs? In this paper, we propose to reformulate knowledge editing as a new type\nof safety threat for LLMs, namely Editing Attack, and conduct a systematic\ninvestigation with a newly constructed dataset EditAttack. Specifically, we\nfocus on two typical safety risks of Editing Attack including Misinformation\nInjection and Bias Injection. 
For the risk of misinformation injection, we\nfirst categorize it into commonsense misinformation injection and long-tail\nmisinformation injection. Then, we find that editing attacks can inject both\ntypes of misinformation into LLMs, and the effectiveness is particularly high\nfor commonsense misinformation injection. For the risk of bias injection, we\ndiscover that not only can biased sentences be injected into LLMs with high\neffectiveness, but also one single biased sentence injection can cause a bias\nincrease in general outputs of LLMs, which are even highly irrelevant to the\ninjected sentence, indicating a catastrophic impact on the overall fairness of\nLLMs. Then, we further illustrate the high stealthiness of editing attacks,\nmeasured by their impact on the general knowledge and reasoning capacities of\nLLMs, and show the hardness of defending editing attacks with empirical\nevidence. Our discoveries demonstrate the emerging misuse risks of knowledge\nediting techniques on compromising the safety alignment of LLMs.\n","authors":["Canyu Chen","Baixiang Huang","Zekun Li","Zhaorun Chen","Shiyang Lai","Xiongxiao Xu","Jia-Chen Gu","Jindong Gu","Huaxiu Yao","Chaowei Xiao","Xifeng Yan","William Yang Wang","Philip Torr","Dawn Song","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2407.20224v2.pdf","comment":"The first two authors contributed equally. 9 pages for main paper, 36\n pages including appendix. The code, results, dataset for this paper and more\n resources are on the project website: https://llm-editing.github.io"},{"id":"http://arxiv.org/abs/2407.21783v1","updated":"2024-07-31T17:54:27Z","published":"2024-07-31T17:54:27Z","title":"The Llama 3 Herd of Models","summary":" Modern artificial intelligence (AI) systems are powered by foundation models.\nThis paper presents a new set of foundation models, called Llama 3. It is a\nherd of language models that natively support multilinguality, coding,\nreasoning, and tool usage. Our largest model is a dense Transformer with 405B\nparameters and a context window of up to 128K tokens. This paper presents an\nextensive empirical evaluation of Llama 3. We find that Llama 3 delivers\ncomparable quality to leading language models such as GPT-4 on a plethora of\ntasks. We publicly release Llama 3, including pre-trained and post-trained\nversions of the 405B parameter language model and our Llama Guard 3 model for\ninput and output safety. The paper also presents the results of experiments in\nwhich we integrate image, video, and speech capabilities into Llama 3 via a\ncompositional approach. We observe this approach performs competitively with\nthe state-of-the-art on image, video, and speech recognition tasks. 
The\nresulting models are not yet being broadly released as they are still under\ndevelopment.\n","authors":["Abhimanyu Dubey","Abhinav Jauhri","Abhinav Pandey","Abhishek Kadian","Ahmad Al-Dahle","Aiesha Letman","Akhil Mathur","Alan Schelten","Amy Yang","Angela Fan","Anirudh Goyal","Anthony Hartshorn","Aobo Yang","Archi Mitra","Archie Sravankumar","Artem Korenev","Arthur Hinsvark","Arun Rao","Aston Zhang","Aurelien Rodriguez","Austen Gregerson","Ava Spataru","Baptiste Roziere","Bethany Biron","Binh Tang","Bobbie Chern","Charlotte Caucheteux","Chaya Nayak","Chloe Bi","Chris Marra","Chris McConnell","Christian Keller","Christophe Touret","Chunyang Wu","Corinne Wong","Cristian Canton Ferrer","Cyrus Nikolaidis","Damien Allonsius","Daniel Song","Danielle Pintz","Danny Livshits","David Esiobu","Dhruv Choudhary","Dhruv Mahajan","Diego Garcia-Olano","Diego Perino","Dieuwke Hupkes","Egor Lakomkin","Ehab AlBadawy","Elina Lobanova","Emily Dinan","Eric Michael Smith","Filip Radenovic","Frank Zhang","Gabriel Synnaeve","Gabrielle Lee","Georgia Lewis Anderson","Graeme Nail","Gregoire Mialon","Guan Pang","Guillem Cucurell","Hailey Nguyen","Hannah Korevaar","Hu Xu","Hugo Touvron","Iliyan Zarov","Imanol Arrieta Ibarra","Isabel Kloumann","Ishan Misra","Ivan Evtimov","Jade Copet","Jaewon Lee","Jan Geffert","Jana Vranes","Jason Park","Jay Mahadeokar","Jeet Shah","Jelmer van der Linde","Jennifer Billock","Jenny Hong","Jenya Lee","Jeremy Fu","Jianfeng Chi","Jianyu Huang","Jiawen Liu","Jie Wang","Jiecao Yu","Joanna Bitton","Joe Spisak","Jongsoo Park","Joseph Rocca","Joshua Johnstun","Joshua Saxe","Junteng Jia","Kalyan Vasuden Alwala","Kartikeya Upasani","Kate Plawiak","Ke Li","Kenneth Heafield","Kevin Stone","Khalid El-Arini","Krithika Iyer","Kshitiz Malik","Kuenley Chiu","Kunal Bhalla","Lauren Rantala-Yeary","Laurens van der Maaten","Lawrence Chen","Liang Tan","Liz Jenkins","Louis Martin","Lovish Madaan","Lubo Malo","Lukas Blecher","Lukas Landzaat","Luke de Oliveira","Madeline Muzzi","Mahesh Pasupuleti","Mannat Singh","Manohar Paluri","Marcin Kardas","Mathew Oldham","Mathieu Rita","Maya Pavlova","Melanie Kambadur","Mike Lewis","Min Si","Mitesh Kumar Singh","Mona Hassan","Naman Goyal","Narjes Torabi","Nikolay Bashlykov","Nikolay Bogoychev","Niladri Chatterji","Olivier Duchenne","Onur Çelebi","Patrick Alrassy","Pengchuan Zhang","Pengwei Li","Petar Vasic","Peter Weng","Prajjwal Bhargava","Pratik Dubal","Praveen Krishnan","Punit Singh Koura","Puxin Xu","Qing He","Qingxiao Dong","Ragavan Srinivasan","Raj Ganapathy","Ramon Calderer","Ricardo Silveira Cabral","Robert Stojnic","Roberta Raileanu","Rohit Girdhar","Rohit Patel","Romain Sauvestre","Ronnie Polidoro","Roshan Sumbaly","Ross Taylor","Ruan Silva","Rui Hou","Rui Wang","Saghar Hosseini","Sahana Chennabasappa","Sanjay Singh","Sean Bell","Seohyun Sonia Kim","Sergey Edunov","Shaoliang Nie","Sharan Narang","Sharath Raparthy","Sheng Shen","Shengye Wan","Shruti Bhosale","Shun Zhang","Simon Vandenhende","Soumya Batra","Spencer Whitman","Sten Sootla","Stephane Collot","Suchin Gururangan","Sydney Borodinsky","Tamar Herman","Tara Fowler","Tarek Sheasha","Thomas Georgiou","Thomas Scialom","Tobias Speckbacher","Todor Mihaylov","Tong Xiao","Ujjwal Karn","Vedanuj Goswami","Vibhor Gupta","Vignesh Ramanathan","Viktor Kerkez","Vincent Gonguet","Virginie Do","Vish Vogeti","Vladan Petrovic","Weiwei Chu","Wenhan Xiong","Wenyin Fu","Whitney Meers","Xavier Martinet","Xiaodong Wang","Xiaoqing Ellen Tan","Xinfeng Xie","Xuchao Jia","Xuewei Wang","Yaelle Goldschlag","Yashesh Gaur","Yasmine 
Babaei","Yi Wen","Yiwen Song","Yuchen Zhang","Yue Li","Yuning Mao","Zacharie Delpierre Coudert","Zheng Yan","Zhengxing Chen","Zoe Papakipos","Aaditya Singh","Aaron Grattafiori","Abha Jain","Adam Kelsey","Adam Shajnfeld","Adithya Gangidi","Adolfo Victoria","Ahuva Goldstand","Ajay Menon","Ajay Sharma","Alex Boesenberg","Alex Vaughan","Alexei Baevski","Allie Feinstein","Amanda Kallet","Amit Sangani","Anam Yunus","Andrei Lupu","Andres Alvarado","Andrew Caples","Andrew Gu","Andrew Ho","Andrew Poulton","Andrew Ryan","Ankit Ramchandani","Annie Franco","Aparajita Saraf","Arkabandhu Chowdhury","Ashley Gabriel","Ashwin Bharambe","Assaf Eisenman","Azadeh Yazdan","Beau James","Ben Maurer","Benjamin Leonhardi","Bernie Huang","Beth Loyd","Beto De Paola","Bhargavi Paranjape","Bing Liu","Bo Wu","Boyu Ni","Braden Hancock","Bram Wasti","Brandon Spence","Brani Stojkovic","Brian Gamido","Britt Montalvo","Carl Parker","Carly Burton","Catalina Mejia","Changhan Wang","Changkyu Kim","Chao Zhou","Chester Hu","Ching-Hsiang Chu","Chris Cai","Chris Tindal","Christoph Feichtenhofer","Damon Civin","Dana Beaty","Daniel Kreymer","Daniel Li","Danny Wyatt","David Adkins","David Xu","Davide Testuggine","Delia David","Devi Parikh","Diana Liskovich","Didem Foss","Dingkang Wang","Duc Le","Dustin Holland","Edward Dowling","Eissa Jamil","Elaine Montgomery","Eleonora Presani","Emily Hahn","Emily Wood","Erik Brinkman","Esteban Arcaute","Evan Dunbar","Evan Smothers","Fei Sun","Felix Kreuk","Feng Tian","Firat Ozgenel","Francesco Caggioni","Francisco Guzmán","Frank Kanayet","Frank Seide","Gabriela Medina Florez","Gabriella Schwarz","Gada Badeer","Georgia Swee","Gil Halpern","Govind Thattai","Grant Herman","Grigory Sizov"," Guangyi"," Zhang","Guna Lakshminarayanan","Hamid Shojanazeri","Han Zou","Hannah Wang","Hanwen Zha","Haroun Habeeb","Harrison Rudolph","Helen Suk","Henry Aspegren","Hunter Goldman","Igor Molybog","Igor Tufanov","Irina-Elena Veliche","Itai Gat","Jake Weissman","James Geboski","James Kohli","Japhet Asher","Jean-Baptiste Gaya","Jeff Marcus","Jeff Tang","Jennifer Chan","Jenny Zhen","Jeremy Reizenstein","Jeremy Teboul","Jessica Zhong","Jian Jin","Jingyi Yang","Joe Cummings","Jon Carvill","Jon Shepard","Jonathan McPhie","Jonathan Torres","Josh Ginsburg","Junjie Wang","Kai Wu","Kam Hou U","Karan Saxena","Karthik Prasad","Kartikay Khandelwal","Katayoun Zand","Kathy Matosich","Kaushik Veeraraghavan","Kelly Michelena","Keqian Li","Kun Huang","Kunal Chawla","Kushal Lakhotia","Kyle Huang","Lailin Chen","Lakshya Garg","Lavender A","Leandro Silva","Lee Bell","Lei Zhang","Liangpeng Guo","Licheng Yu","Liron Moshkovich","Luca Wehrstedt","Madian Khabsa","Manav Avalani","Manish Bhatt","Maria Tsimpoukelli","Martynas Mankus","Matan Hasson","Matthew Lennie","Matthias Reso","Maxim Groshev","Maxim Naumov","Maya Lathi","Meghan Keneally","Michael L. 
Seltzer","Michal Valko","Michelle Restrepo","Mihir Patel","Mik Vyatskov","Mikayel Samvelyan","Mike Clark","Mike Macey","Mike Wang","Miquel Jubert Hermoso","Mo Metanat","Mohammad Rastegari","Munish Bansal","Nandhini Santhanam","Natascha Parks","Natasha White","Navyata Bawa","Nayan Singhal","Nick Egebo","Nicolas Usunier","Nikolay Pavlovich Laptev","Ning Dong","Ning Zhang","Norman Cheng","Oleg Chernoguz","Olivia Hart","Omkar Salpekar","Ozlem Kalinli","Parkin Kent","Parth Parekh","Paul Saab","Pavan Balaji","Pedro Rittner","Philip Bontrager","Pierre Roux","Piotr Dollar","Polina Zvyagina","Prashant Ratanchandani","Pritish Yuvraj","Qian Liang","Rachad Alao","Rachel Rodriguez","Rafi Ayub","Raghotham Murthy","Raghu Nayani","Rahul Mitra","Raymond Li","Rebekkah Hogan","Robin Battey","Rocky Wang","Rohan Maheswari","Russ Howes","Ruty Rinott","Sai Jayesh Bondu","Samyak Datta","Sara Chugh","Sara Hunt","Sargun Dhillon","Sasha Sidorov","Satadru Pan","Saurabh Verma","Seiji Yamamoto","Sharadh Ramaswamy","Shaun Lindsay","Shaun Lindsay","Sheng Feng","Shenghao Lin","Shengxin Cindy Zha","Shiva Shankar","Shuqiang Zhang","Shuqiang Zhang","Sinong Wang","Sneha Agarwal","Soji Sajuyigbe","Soumith Chintala","Stephanie Max","Stephen Chen","Steve Kehoe","Steve Satterfield","Sudarshan Govindaprasad","Sumit Gupta","Sungmin Cho","Sunny Virk","Suraj Subramanian","Sy Choudhury","Sydney Goldman","Tal Remez","Tamar Glaser","Tamara Best","Thilo Kohler","Thomas Robinson","Tianhe Li","Tianjun Zhang","Tim Matthews","Timothy Chou","Tzook Shaked","Varun Vontimitta","Victoria Ajayi","Victoria Montanez","Vijai Mohan","Vinay Satish Kumar","Vishal Mangla","Vlad Ionescu","Vlad Poenaru","Vlad Tiberiu Mihailescu","Vladimir Ivanov","Wei Li","Wenchen Wang","Wenwen Jiang","Wes Bouaziz","Will Constable","Xiaocheng Tang","Xiaofang Wang","Xiaojian Wu","Xiaolan Wang","Xide Xia","Xilun Wu","Xinbo Gao","Yanjun Chen","Ye Hu","Ye Jia","Ye Qi","Yenda Li","Yilin Zhang","Ying Zhang","Yossi Adi","Youngjin Nam"," Yu"," Wang","Yuchen Hao","Yundi Qian","Yuzi He","Zach Rait","Zachary DeVito","Zef Rosnbrick","Zhaoduo Wen","Zhenyu Yang","Zhiwei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.21783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21772v1","updated":"2024-07-31T17:48:14Z","published":"2024-07-31T17:48:14Z","title":"ShieldGemma: Generative AI Content Moderation Based on Gemma","summary":" We present ShieldGemma, a comprehensive suite of LLM-based safety content\nmoderation models built upon Gemma2. These models provide robust,\nstate-of-the-art predictions of safety risks across key harm types (sexually\nexplicit, dangerous content, harassment, hate speech) in both user input and\nLLM-generated output. By evaluating on both public and internal benchmarks, we\ndemonstrate superior performance compared to existing models, such as Llama\nGuard (+10.8\\% AU-PRC on public benchmarks) and WildCard (+4.3\\%).\nAdditionally, we present a novel LLM-based data curation pipeline, adaptable to\na variety of safety-related tasks and beyond. We have shown strong\ngeneralization performance for model trained mainly on synthetic data. 
By\nreleasing ShieldGemma, we provide a valuable resource to the research\ncommunity, advancing LLM safety and enabling the creation of more effective\ncontent moderation solutions for developers.\n","authors":["Wenjun Zeng","Yuchi Liu","Ryan Mullins","Ludovic Peran","Joe Fernandez","Hamza Harkous","Karthik Narasimhan","Drew Proud","Piyush Kumar","Bhaktipriya Radharapu","Olivia Sturman","Oscar Wahltinez"],"pdf_url":"https://arxiv.org/pdf/2407.21772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19509v2","updated":"2024-07-31T17:08:48Z","published":"2024-04-30T12:43:53Z","title":"Do Large Language Models Understand Conversational Implicature -- A case\n study with a chinese sitcom","summary":" Understanding the non-literal meaning of an utterance is critical for large\nlanguage models (LLMs) to become human-like social communicators. In this work,\nwe introduce SwordsmanImp, the first Chinese multi-turn-dialogue-based dataset\naimed at conversational implicature, sourced from dialogues in the Chinese\nsitcom $\\textit{My Own Swordsman}$. It includes 200 carefully handcrafted\nquestions, all annotated on which Gricean maxims have been violated. We test\neight close-source and open-source LLMs under two tasks: a multiple-choice\nquestion task and an implicature explanation task. Our results show that GPT-4\nattains human-level accuracy (94%) on multiple-choice questions. CausalLM\ndemonstrates a 78.5% accuracy following GPT-4. Other models, including GPT-3.5\nand several open-source models, demonstrate a lower accuracy ranging from 20%\nto 60% on multiple-choice questions. Human raters were asked to rate the\nexplanation of the implicatures generated by LLMs on their reasonability, logic\nand fluency. While all models generate largely fluent and self-consistent text,\ntheir explanations score low on reasonability except for GPT-4, suggesting that\nmost LLMs cannot produce satisfactory explanations of the implicatures in the\nconversation. Moreover, we find LLMs' performance does not vary significantly\nby Gricean maxims, suggesting that LLMs do not seem to process implicatures\nderived from different maxims differently. Our data and code are available at\nhttps://github.com/sjtu-compling/llm-pragmatics.\n","authors":["Shisen Yue","Siyuan Song","Xinyuan Cheng","Hai Hu"],"pdf_url":"https://arxiv.org/pdf/2404.19509v2.pdf","comment":"14 pages, 8 tables and 5 figures"},{"id":"http://arxiv.org/abs/2406.14167v2","updated":"2024-07-31T16:20:45Z","published":"2024-06-20T10:13:08Z","title":"Definition generation for lexical semantic change detection","summary":" We use contextualized word definitions generated by large language models as\nsemantic representations in the task of diachronic lexical semantic change\ndetection (LSCD). In short, generated definitions are used as `senses', and the\nchange score of a target word is retrieved by comparing their distributions in\ntwo time periods under comparison. On the material of five datasets and three\nlanguages, we show that generated definitions are indeed specific and general\nenough to convey a signal sufficient to rank sets of words by the degree of\ntheir semantic change over time. Our approach is on par with or outperforms\nprior non-supervised sense-based LSCD methods. At the same time, it preserves\ninterpretability and allows to inspect the reasons behind a specific shift in\nterms of discrete definitions-as-senses. 
This is another step in the direction\nof explainable semantic change modeling.\n","authors":["Mariia Fedorova","Andrey Kutuzov","Yves Scherrer"],"pdf_url":"https://arxiv.org/pdf/2406.14167v2.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2407.21712v1","updated":"2024-07-31T16:04:03Z","published":"2024-07-31T16:04:03Z","title":"Adaptive Retrieval-Augmented Generation for Conversational Systems","summary":" Despite the success of integrating large language models into the development\nof conversational systems, many studies have shown the effectiveness of\nretrieving and augmenting external knowledge for informative responses. Hence,\nmany existing studies commonly assume that Retrieval Augmented\nGeneration (RAG) is always needed in a conversational system, without explicit control. This\nraises a research question about such a necessity. In this study, we propose to\ninvestigate whether each turn of the system response needs to be augmented with\nexternal knowledge. In particular, by leveraging human judgements on the binary\nchoice of adaptive augmentation, we develop RAGate, a gating model, which\nmodels conversation context and relevant inputs to predict if a conversational\nsystem requires RAG for improved responses. We conduct extensive experiments on\ndevising and applying RAGate to conversational models and well-rounded analyses\nof different conversational scenarios. Our experimental results and analysis\nshow that RAGate can be effectively applied in RAG-based conversational\nsystems, identifying the system responses that warrant RAG and yielding high-quality\nresponses with high generation confidence. This study also identifies the\ncorrelation between the generation's confidence level and the relevance of the\naugmented knowledge.\n","authors":["Xi Wang","Procheta Sen","Ruizhe Li","Emine Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2407.21712v1.pdf","comment":"12 pages, under review"},{"id":"http://arxiv.org/abs/2407.21669v1","updated":"2024-07-31T15:12:24Z","published":"2024-07-31T15:12:24Z","title":"Synth-Empathy: Towards High-Quality Synthetic Empathy Data","summary":" In recent years, with the rapid advancements in large language models (LLMs),\nachieving excellent empathetic response capabilities has become a crucial\nprerequisite. Consequently, managing and understanding empathetic datasets have\ngained increasing significance. However, empathetic data are typically\nhuman-labeled, leading to insufficient datasets and wasted human labor. In this\nwork, we present Synth-Empathy, an LLM-based data generation and quality and\ndiversity selection pipeline that automatically generates high-quality\nempathetic data while discarding low-quality data. With the data generated from\na low empathetic model, we are able to further improve empathetic response\nperformance and achieve state-of-the-art (SoTA) results across multiple\nbenchmarks. Moreover, our model achieves SoTA performance on various human\nevaluation benchmarks, demonstrating its effectiveness and robustness in\nreal-world applications. 
Furthermore, we show the trade-off between data\nquantity and quality, providing insights into empathetic data generation and\nselection.\n","authors":["Hao Liang","Linzhuang Sun","Jingxuan Wei","Xijie Huang","Linkun Sun","Bihui Yu","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21669v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.01937"},{"id":"http://arxiv.org/abs/2407.21659v1","updated":"2024-07-31T15:02:46Z","published":"2024-07-31T15:02:46Z","title":"Defending Jailbreak Attack in VLMs via Cross-modality Information\n Detector","summary":" Vision Language Models (VLMs) extend the capacity of LLMs to comprehensively\nunderstand vision information, achieving remarkable performance in many\nvision-centric tasks. Despite that, recent studies have shown that these models\nare susceptible to jailbreak attacks, which refer to an exploitative technique\nwhere malicious users can break the safety alignment of the target model and\ngenerate misleading and harmful answers. This potential threat is caused by\nboth the inherent vulnerabilities of LLM and the larger attack scope introduced\nby vision input. To enhance the security of VLMs against jailbreak attacks,\nresearchers have developed various defense techniques. However, these methods\neither require modifications to the model's internal structure or demand\nsignificant computational resources during the inference phase. Multimodal\ninformation is a double-edged sword. While it increases the risk of attacks, it\nalso provides additional data that can enhance safeguards. Inspired by this, we\npropose $\\underline{\\textbf{C}}$ross-modality\n$\\underline{\\textbf{I}}$nformation\n$\\underline{\\textbf{DE}}$tecto$\\underline{\\textbf{R}}$ ($\\textit{CIDER})$, a\nplug-and-play jailbreaking detector designed to identify maliciously perturbed\nimage inputs, utilizing the cross-modal similarity between harmful queries and\nadversarial images. This simple yet effective cross-modality information\ndetector, $\\textit{CIDER}$, is independent of the target VLMs and requires less\ncomputation cost. Extensive experimental results demonstrate the effectiveness\nand efficiency of $\\textit{CIDER}$, as well as its transferability to both\nwhite-box and black-box VLMs.\n","authors":["Yue Xu","Xiuyuan Qi","Zhan Qin","Wenjie Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21659v1.pdf","comment":"12 pages, 9 figures, ACL ARR 2024 June Submission"},{"id":"http://arxiv.org/abs/2407.21646v1","updated":"2024-07-31T14:48:27Z","published":"2024-07-31T14:48:27Z","title":"Towards Achieving Human Parity on End-to-end Simultaneous Speech\n Translation via LLM Agent","summary":" In this paper, we present Cross Language Agent -- Simultaneous\nInterpretation, CLASI, a high-quality and human-like Simultaneous Speech\nTranslation (SiST) System. Inspired by professional human interpreters, we\nutilize a novel data-driven read-write strategy to balance the translation\nquality and latency. To address the challenge of translating in-domain\nterminologies, CLASI employs a multi-modal retrieving module to obtain relevant\ninformation to augment the translation. Supported by LLMs, our approach can\ngenerate error-tolerated translation by considering the input audio, historical\ncontext, and retrieved information. Experimental results show that our system\noutperforms other systems by significant margins. 
Aligned with professional\nhuman interpreters, we evaluate CLASI with a better human evaluation metric,\nvalid information proportion (VIP), which measures the amount of information\nthat can be successfully conveyed to the listeners. In the real-world\nscenarios, where the speeches are often disfluent, informal, and unclear, CLASI\nachieves VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese\ntranslation directions, respectively. In contrast, state-of-the-art commercial\nor open-source systems only achieve 35.4% and 41.6%. On the extremely hard\ndataset, where other systems achieve under 13% VIP, CLASI can still achieve 70%\nVIP.\n","authors":["Shanbo Cheng","Zhichao Huang","Tom Ko","Hang Li","Ningxin Peng","Lu Xu","Qini Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21646v1.pdf","comment":"Authors are listed in alphabetical order by last name. Demonstrations\n and human-annotated test sets are available at\n https://byteresearchcla.github.io/clasi"},{"id":"http://arxiv.org/abs/2407.16205v2","updated":"2024-07-31T14:37:05Z","published":"2024-07-23T06:14:41Z","title":"Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models","summary":" The rapid development of Large Language Models (LLMs) has brought remarkable\ngenerative capabilities across diverse tasks. However, despite the impressive\nachievements, these models still have numerous security vulnerabilities,\nparticularly when faced with jailbreak attacks. Therefore, by investigating\njailbreak attacks, we can uncover hidden weaknesses in LLMs and guide us in\ndeveloping more robust defense mechanisms to fortify their security. In this\npaper, we further explore the boundary of jailbreak attacks on LLMs and propose\nAnalyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes\nadvantage of LLMs' growing analyzing and reasoning capability and reveals their\nunderlying vulnerabilities when facing analysis-based tasks. We conduct a\ndetailed evaluation of ABJ across various open-source and closed-source LLMs,\nwhich achieves 94.8% Attack Success Rate (ASR) and 1.06 Attack Efficiency (AE)\non GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and\nefficiency. Our research highlights the importance of prioritizing and\nenhancing the safety of LLMs to mitigate the risks of misuse.The code is\npublicly available at https://github.com/theshi-1128/ABJ-Attack.\n","authors":["Shi Lin","Rongchang Li","Xun Wang","Changting Lin","Wenpeng Xing","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2407.16205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21633v1","updated":"2024-07-31T14:26:41Z","published":"2024-07-31T14:26:41Z","title":"Zero-Shot Cross-Domain Dialogue State Tracking via Dual Low-Rank\n Adaptation","summary":" Zero-shot dialogue state tracking (DST) seeks to enable dialogue systems to\ntransition to unfamiliar domains without manual annotation or extensive\nretraining. Prior research has approached this objective by embedding prompts\ninto language models (LMs). Common methodologies include integrating prompts at\nthe input layer or introducing learnable variables at each transformer layer.\nNonetheless, each strategy exhibits inherent limitations. Prompts integrated at\nthe input layer risk underutilization, with their impact potentially\ndiminishing across successive transformer layers. Conversely, the addition of\nlearnable variables to each layer can complicate the training process and\nincrease inference latency. 
To tackle the issues mentioned above, this paper\nproposes Dual Low-Rank Adaptation (DualLoRA), a plug-and-play architecture\ndesigned for zero-shot DST. DualLoRA incorporates two distinct Low-Rank\nAdaptation (LoRA) components, targeting both dialogue context processing and\nprompt optimization, to ensure the comprehensive influence of prompts\nthroughout the transformer model layers. This is achieved without incurring\nadditional inference latency, showcasing an efficient integration into existing\narchitectures. Through rigorous evaluation on the MultiWOZ and SGD datasets,\nDualLoRA demonstrates notable improvements across multiple domains,\noutperforming traditional baseline methods in zero-shot settings. Our code is\naccessible at: \\url{https://github.com/suntea233/DualLoRA}.\n","authors":["Xiang Luo","Zhiwen Tang","Jin Wang","Xuejie Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21633v1.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2407.21630v1","updated":"2024-07-31T14:24:01Z","published":"2024-07-31T14:24:01Z","title":"TAROT: Task-Oriented Authorship Obfuscation Using Policy Optimization\n Methods","summary":" Authorship obfuscation aims to disguise the identity of an author within a\ntext by altering the writing style, vocabulary, syntax, and other linguistic\nfeatures associated with the text author. This alteration needs to balance\nprivacy and utility. While strong obfuscation techniques can effectively hide\nthe author's identity, they often degrade the quality and usefulness of the\ntext for its intended purpose. Conversely, maintaining high utility tends to\nprovide insufficient privacy, making it easier for an adversary to de-anonymize\nthe author. Thus, achieving an optimal trade-off between these two conflicting\nobjectives is crucial. In this paper, we propose TAROT: Task-Oriented\nAuthorship Obfuscation Using Policy Optimization, a new unsupervised authorship\nobfuscation method whose goal is to optimize the privacy-utility trade-off by\nregenerating the entire text considering its downstream utility. Our approach\nleverages policy optimization as a fine-tuning paradigm over small language\nmodels in order to rewrite texts by preserving author identity and downstream\ntask utility. We show that our approach largely reduce the accuracy of\nattackers while preserving utility. We make our code and models publicly\navailable.\n","authors":["Gabriel Loiseau","Damien Sileo","Damien Riquet","Maxime Meyer","Marc Tommasi"],"pdf_url":"https://arxiv.org/pdf/2407.21630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00569v3","updated":"2024-07-31T13:08:22Z","published":"2024-06-30T03:04:11Z","title":"Investigating and Mitigating the Multimodal Hallucination Snowballing in\n Large Vision-Language Models","summary":" Though advanced in understanding visual information with human languages,\nLarge Vision-Language Models (LVLMs) still suffer from multimodal\nhallucinations. A natural concern is that during multimodal interaction, the\ngenerated hallucinations could influence the LVLMs' subsequent generation.\nThus, we raise a question: When presented with a query relevant to the\npreviously generated hallucination, will LVLMs be misled and respond\nincorrectly, even though the ground visual information exists? 
To answer this,\nwe propose a framework called MMHalSnowball to evaluate LVLMs' behaviors when\nencountering generated hallucinations, where LVLMs are required to answer\nspecific visual questions within a curated hallucinatory conversation.\nCrucially, our experiment shows that the performance of open-source LVLMs drops\nby at least $31\\%$, indicating that LVLMs are prone to accept the generated\nhallucinations and make false claims that they would not have supported without\ndistractions. We term this phenomenon Multimodal Hallucination Snowballing. To\nmitigate this, we further propose a training-free method called Residual Visual\nDecoding, where we revise the output distribution of LVLMs with the one derived\nfrom the residual visual input, providing models with direct access to the\nvisual information. Experiments show that our method can mitigate more than\n$24\\%$ of the snowballed multimodal hallucination while maintaining\ncapabilities.\n","authors":["Weihong Zhong","Xiaocheng Feng","Liang Zhao","Qiming Li","Lei Huang","Yuxuan Gu","Weitao Ma","Yuan Xu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2407.00569v3.pdf","comment":"Accepted to ACL 2024 Main Conference. 21 pages, 20 figures"},{"id":"http://arxiv.org/abs/2310.20501v3","updated":"2024-07-31T13:08:08Z","published":"2023-10-31T14:42:23Z","title":"Neural Retrievers are Biased Towards LLM-Generated Content","summary":" Recently, the emergence of large language models (LLMs) has revolutionized\nthe paradigm of information retrieval (IR) applications, especially in web\nsearch, by generating vast amounts of human-like texts on the Internet. As a\nresult, IR systems in the LLM era are facing a new challenge: the indexed\ndocuments are now not only written by human beings but also automatically\ngenerated by the LLMs. How these LLM-generated documents influence the IR\nsystems is a pressing and still unexplored question. In this work, we conduct a\nquantitative evaluation of IR models in scenarios where both human-written and\nLLM-generated texts are involved. Surprisingly, our findings indicate that\nneural retrieval models tend to rank LLM-generated documents higher. We refer\nto this category of biases in neural retrievers towards the LLM-generated\ncontent as the \\textbf{source bias}. Moreover, we discover that this bias is\nnot confined to the first-stage neural retrievers, but extends to the\nsecond-stage neural re-rankers. Then, in-depth analyses from the perspective of\ntext compression indicate that LLM-generated texts exhibit more focused\nsemantics with less noise, making it easier for neural retrieval models to\nsemantic match. To mitigate the source bias, we also propose a plug-and-play\ndebiased constraint for the optimization objective, and experimental results\nshow its effectiveness. Finally, we discuss the potential severe concerns\nstemming from the observed source bias and hope our findings can serve as a\ncritical wake-up call to the IR community and beyond. 
To facilitate future\nexplorations of IR in the LLM era, the constructed two new benchmarks are\navailable at https://github.com/KID-22/Source-Bias.\n","authors":["Sunhao Dai","Yuqi Zhou","Liang Pang","Weihao Liu","Xiaolin Hu","Yong Liu","Xiao Zhang","Gang Wang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2310.20501v3.pdf","comment":"KDD 2024"},{"id":"http://arxiv.org/abs/2407.21571v1","updated":"2024-07-31T12:56:14Z","published":"2024-07-31T12:56:14Z","title":"PMoE: Progressive Mixture of Experts with Asymmetric Transformer for\n Continual Learning","summary":" Large Language Models (LLMs) encounter significant challenges in continual\nlearning due to catastrophic forgetting, where new information overwrites\npreviously acquired knowledge. This limitation leads to substantial\nenvironmental and economic waste. In this study, we introduce the PMoE,\nProgressive Mixture of Experts with Asymmetric Transformer, which aims to\nminimize forgetting by utilizing an asymmetric design with shallow layers\ndedicated to general knowledge and deep layers for new knowledge. PMoE\nincorporates progressively added experts in deep layers and a router that\nallocates new knowledge to the appropriate experts efficiently. The router,\npositioned adjacent to the deep layers, utilizes deep features aggregating\nconsolidated information. This enables the router to perform efficiently,\nallocating new knowledge to the appropriate experts, which progressively\nincrease in the deep layers. Extensive experiments on TRACE datasets and\ngeneral language understanding datasets demonstrate that the proposed PMoE\noutperforms previous state-of-the-art approaches.\n","authors":["Min Jae Jung","JooHee Kim"],"pdf_url":"https://arxiv.org/pdf/2407.21571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21560v1","updated":"2024-07-31T12:29:17Z","published":"2024-07-31T12:29:17Z","title":"Generative Sentiment Analysis via Latent Category Distribution and\n Constrained Decoding","summary":" Fine-grained sentiment analysis involves extracting and organizing sentiment\nelements from textual data. However, existing approaches often overlook issues\nof category semantic inclusion and overlap, as well as inherent structural\npatterns within the target sequence. This study introduces a generative\nsentiment analysis model. To address the challenges related to category\nsemantic inclusion and overlap, a latent category distribution variable is\nintroduced. By reconstructing the input of a variational autoencoder, the model\nlearns the intensity of the relationship between categories and text, thereby\nimproving sequence generation. 
Additionally, a trie data structure and\nconstrained decoding strategy are utilized to exploit structural patterns,\nwhich in turn reduces the search space and regularizes the generation process.\nExperimental results on the Restaurant-ACOS and Laptop-ACOS datasets\ndemonstrate a significant performance improvement compared to baseline models.\nAblation experiments further confirm the effectiveness of latent category\ndistribution and constrained decoding strategy.\n","authors":["Jun Zhou","Dongyang Yu","Kamran Aziz","Fangfang Su","Qing Zhang","Fei Li","Donghong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.21560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00614v2","updated":"2024-07-31T12:25:14Z","published":"2024-03-31T09:04:01Z","title":"Learning to Plan for Language Modeling from Unlabeled Data","summary":" By training to predict the next token in an unlabeled corpus, large language\nmodels learn to perform many tasks without any labeled data. However, their\nnext-token-prediction objective arguably limits their performance in scenarios\nthat require planning, such as writing a coherent article. In this paper, we\ntrain a module for planning the future writing process via a self-supervised\nlearning objective. Given the textual context, this planning module learns to\npredict future abstract writing actions, which correspond to centroids in a\nclustered text embedding space. By conditioning on these actions, our model\nextends the successful language model formula to more abstract planning in an\nunsupervised way. Empirically, we demonstrate that our method improves language\nmodeling performance in general, particularly with respect to the text\nstructure. Because our framework uses a planner module that is unsupervised and\nexternal to the language model, new planner modules can be trained at large\nscale and easily be shared with the community.\n","authors":["Nathan Cornille","Marie-Francine Moens","Florian Mai"],"pdf_url":"https://arxiv.org/pdf/2404.00614v2.pdf","comment":"Published at COLM 2024"},{"id":"http://arxiv.org/abs/2406.19884v2","updated":"2024-07-31T11:50:54Z","published":"2024-06-28T12:49:27Z","title":"Investigating the Timescales of Language Processing with EEG and\n Language Models","summary":" This study explores the temporal dynamics of language processing by examining\nthe alignment between word representations from a pre-trained transformer-based\nlanguage model, and EEG data. Using a Temporal Response Function (TRF) model,\nwe investigate how neural activity corresponds to model representations across\ndifferent layers, revealing insights into the interaction between artificial\nlanguage models and brain responses during language comprehension. Our analysis\nreveals patterns in TRFs from distinct layers, highlighting varying\ncontributions to lexical and compositional processing. Additionally, we used\nlinear discriminant analysis (LDA) to isolate part-of-speech (POS)\nrepresentations, offering insights into their influence on neural responses and\nthe underlying mechanisms of syntactic processing. These findings underscore\nEEG's utility for probing language processing dynamics with high temporal\nresolution. 
By bridging artificial language models and neural activity, this\nstudy advances our understanding of their interaction at fine timescales.\n","authors":["Davide Turco","Conor Houghton"],"pdf_url":"https://arxiv.org/pdf/2406.19884v2.pdf","comment":"Accepted at the 2024 Conference on Cognitive Computational\n Neuroscience (CCN 2024)"},{"id":"http://arxiv.org/abs/2407.21536v1","updated":"2024-07-31T11:47:36Z","published":"2024-07-31T11:47:36Z","title":"Tracing Intricate Cues in Dialogue: Joint Graph Structure and Sentiment\n Dynamics for Multimodal Emotion Recognition","summary":" Multimodal emotion recognition in conversation (MERC) has garnered\nsubstantial research attention recently. Existing MERC methods face several\nchallenges: (1) they fail to fully harness direct inter-modal cues, possibly\nleading to less-than-thorough cross-modal modeling; (2) they concurrently\nextract information from the same and different modalities at each network\nlayer, potentially triggering conflicts from the fusion of multi-source data;\n(3) they lack the agility required to detect dynamic sentimental changes,\nperhaps resulting in inaccurate classification of utterances with abrupt\nsentiment shifts. To address these issues, a novel approach named GraphSmile is\nproposed for tracking intricate emotional cues in multimodal dialogues.\nGraphSmile comprises two key components, i.e., GSF and SDP modules. GSF\ningeniously leverages graph structures to alternately assimilate inter-modal\nand intra-modal emotional dependencies layer by layer, adequately capturing\ncross-modal cues while effectively circumventing fusion conflicts. SDP is an\nauxiliary task to explicitly delineate the sentiment dynamics between\nutterances, promoting the model's ability to distinguish sentimental\ndiscrepancies. Furthermore, GraphSmile is effortlessly applied to multimodal\nsentiment analysis in conversation (MSAC), forging a unified multimodal\naffective model capable of executing MERC and MSAC tasks. Empirical results on\nmultiple benchmarks demonstrate that GraphSmile can handle complex emotional\nand sentimental patterns, significantly outperforming baseline models.\n","authors":["Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2407.21536v1.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2407.21531v1","updated":"2024-07-31T11:29:46Z","published":"2024-07-31T11:29:46Z","title":"Can LLMs \"Reason\" in Music? An Evaluation of LLMs' Capability of Music\n Understanding and Generation","summary":" Symbolic Music, akin to language, can be encoded in discrete symbols. Recent\nresearch has extended the application of large language models (LLMs) such as\nGPT-4 and Llama2 to the symbolic music domain including understanding and\ngeneration. Yet scant research explores the details of how these LLMs perform\non advanced music understanding and conditioned generation, especially from the\nmulti-step reasoning perspective, which is a critical aspect in the\nconditioned, editable, and interactive human-computer co-creation process. This\nstudy conducts a thorough investigation of LLMs' capability and limitations in\nsymbolic music processing. We identify that current LLMs exhibit poor\nperformance in song-level multi-step music reasoning, and typically fail to\nleverage learned music knowledge when addressing complex musical tasks. An\nanalysis of LLMs' responses highlights distinctly their pros and cons. 
Our\nfindings suggest that advanced musical capability is not intrinsically\nobtained by LLMs, and future research should focus more on bridging the gap\nbetween music knowledge and reasoning, to improve the co-creation experience\nfor musicians.\n","authors":["Ziya Zhou","Yuhang Wu","Zhiyue Wu","Xinyue Zhang","Ruibin Yuan","Yinghao Ma","Lu Wang","Emmanouil Benetos","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21531v1.pdf","comment":"Accepted by ISMIR2024"},{"id":"http://arxiv.org/abs/2407.21530v1","updated":"2024-07-31T11:26:57Z","published":"2024-07-31T11:26:57Z","title":"Data Contamination Report from the 2024 CONDA Shared Task","summary":" The 1st Workshop on Data Contamination (CONDA 2024) focuses on all relevant\naspects of data contamination in natural language processing, where data\ncontamination is understood as situations where evaluation data is included in\npre-training corpora used to train large-scale models, compromising evaluation\nresults. The workshop fostered a shared task to collect evidence on data\ncontamination in currently available datasets and models. The goal of the shared\ntask and associated database is to assist the community in understanding the\nextent of the problem and to assist researchers in avoiding reporting\nevaluation results on known contaminated resources. The shared task provides a\nstructured, centralized public database for the collection of contamination\nevidence, open to contributions from the community via GitHub pull requests.\nThis first compilation paper is based on 566 reported entries over 91\ncontaminated sources from a total of 23 contributors. The details of the\nindividual contamination events are available on the platform. The platform\ncontinues to be online, open to contributions from the community.\n","authors":["Oscar Sainz","Iker García-Ferrero","Alon Jacovi","Jon Ander Campos","Yanai Elazar","Eneko Agirre","Yoav Goldberg","Wei-Lin Chen","Jenny Chim","Leshem Choshen","Luca D'Amico-Wong","Melissa Dell","Run-Ze Fan","Shahriar Golchin","Yucheng Li","Pengfei Liu","Bhavish Pahwa","Ameya Prabhu","Suryansh Sharma","Emily Silcock","Kateryna Solonko","David Stap","Mihai Surdeanu","Yu-Min Tseng","Vishaal Udandarao","Zengzhi Wang","Ruijie Xu","Jinglin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21530v1.pdf","comment":"https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database"},{"id":"http://arxiv.org/abs/2407.21512v1","updated":"2024-07-31T10:30:31Z","published":"2024-07-31T10:30:31Z","title":"Interpreting and learning voice commands with a Large Language Model for\n a robot system","summary":" Robots are increasingly common in industry and daily life, such as in nursing\nhomes where they can assist staff. A key challenge is developing intuitive\ninterfaces for easy communication. The use of Large Language Models (LLMs) like\nGPT-4 has enhanced robot capabilities, allowing for real-time interaction and\ndecision-making. This integration improves robots' adaptability and\nfunctionality. 
This project focuses on merging LLMs with databases to improve\ndecision-making and enable knowledge acquisition for request interpretation\nproblems.\n","authors":["Stanislau Stankevich","Wojciech Dudek"],"pdf_url":"https://arxiv.org/pdf/2407.21512v1.pdf","comment":"PP-RAI 2024, 5th Polish Conference on Artificial Intelligence,\n 18-20.04.2024 Warsaw, Poland"},{"id":"http://arxiv.org/abs/2407.15425v2","updated":"2024-07-31T10:27:37Z","published":"2024-07-22T07:02:15Z","title":"Empirical Capacity Model for Self-Attention Neural Networks","summary":" Large pretrained self-attention neural networks, or transformers, have been\nvery successful in various tasks recently. The performance of a model on a\ngiven task depends on its ability to memorize and generalize the training data.\nLarge transformer models, which may have billions of parameters, in theory have\na huge capacity to memorize content. However, current optimization algorithms\nfall short of the theoretical capacity, and the capacity is also\nhighly dependent on the content. In this paper, we focus on the memory capacity\nof these models obtained using common training algorithms and synthetic\ntraining data. Based on the results, we derive an empirical capacity model\n(ECM) for a generic transformer. The ECM can be used to design task-specific\ntransformer models with an optimal number of parameters in cases where the\ntarget memorization capability of the task can be defined.\n","authors":["Aki Härmä","Marcin Pietrasik","Anna Wilbik"],"pdf_url":"https://arxiv.org/pdf/2407.15425v2.pdf","comment":"Submitted to BNAIC'24, 14 pages + refs"},{"id":"http://arxiv.org/abs/2406.11301v2","updated":"2024-07-31T10:18:50Z","published":"2024-06-17T08:08:11Z","title":"Enhancing and Assessing Instruction-Following with Fine-Grained\n Instruction Variants","summary":" The effective alignment of Large Language Models (LLMs) with precise\ninstructions is essential for their application in diverse real-world\nscenarios. Current methods focus on enhancing the diversity and complexity of\ntraining and evaluation samples, yet they fall short in accurately assessing\nLLMs' ability to follow similar instruction variants. We introduce an effective\ndata augmentation technique that decomposes complex instructions into simpler\nsub-components, modifies these, and reconstructs them into new variants,\nthereby preserving the original instruction's context and complexity while\nintroducing variability, which is critical for training and evaluating LLMs'\ninstruction-following precision. We developed the DeMoRecon dataset using this\nmethod to both fine-tune and evaluate LLMs. Our findings show that LLMs\nfine-tuned with DeMoRecon gain a significant performance boost on both our\nbenchmark and commonly used instruction-following benchmarks.\n","authors":["Jiuding Yang","Weidong Guo","Kaitong Yang","Xiangyang Li","Zhuwei Rao","Yu Xu","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2406.11301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21491v1","updated":"2024-07-31T10:02:21Z","published":"2024-07-31T10:02:21Z","title":"Generative Expressive Conversational Speech Synthesis","summary":" Conversational Speech Synthesis (CSS) aims to express a target utterance with\nthe proper speaking style in a user-agent conversation setting. Existing CSS\nmethods employ effective multi-modal context modeling techniques to achieve\nempathy understanding and expression. 
However, they often need to design\ncomplex network architectures and meticulously optimize the modules within\nthem. In addition, due to the limitations of small-scale datasets containing\nscripted recording styles, they often fail to simulate real natural\nconversational styles. To address the above issues, we propose a novel\ngenerative expressive CSS system, termed GPT-Talker. We transform the multimodal\ninformation of the multi-turn dialogue history into discrete token sequences\nand seamlessly integrate them to form a comprehensive user-agent dialogue\ncontext. Leveraging the power of GPT, we predict the token sequence of the\nagent's response, which includes both semantic and style knowledge. After\nthat, the expressive conversational speech is synthesized by the\nconversation-enriched VITS to deliver feedback to the user. Furthermore, we\npropose a large-scale Natural CSS Dataset called NCSSD, which includes both\nnaturally recorded conversational speech in improvised styles and dialogues\nextracted from TV shows. It encompasses both Chinese and English languages,\nwith a total duration of 236 hours. We conducted comprehensive experiments on\nthe reliability of the NCSSD and the effectiveness of our GPT-Talker. Both\nsubjective and objective evaluations demonstrate that our model outperforms\nother state-of-the-art CSS systems significantly in terms of naturalness and\nexpressiveness. The Code, Dataset, and Pre-trained Model are available at:\nhttps://github.com/AI-S2-Lab/GPT-Talker.\n","authors":["Rui Liu","Yifan Hu","Ren Yi","Yin Xiang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2407.21491v1.pdf","comment":"14 pages, 6 figures, 8 tables. Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.21489v1","updated":"2024-07-31T09:58:48Z","published":"2024-07-31T09:58:48Z","title":"Maverick: Efficient and Accurate Coreference Resolution Defying Recent\n Trends","summary":" Large autoregressive generative models have emerged as the cornerstone for\nachieving the highest performance across several Natural Language Processing\ntasks. However, the urge to attain superior results has, at times, led to the\npremature replacement of carefully designed task-specific approaches without\nexhaustive experimentation. The Coreference Resolution task is no exception;\nall recent state-of-the-art solutions adopt large generative autoregressive\nmodels that outperform encoder-based discriminative systems. In this work, we\nchallenge this recent trend by introducing Maverick, a carefully designed - yet\nsimple - pipeline, which enables running a state-of-the-art Coreference\nResolution system within the constraints of an academic budget, outperforming\nmodels with up to 13 billion parameters using as few as 500 million parameters.\nMaverick achieves state-of-the-art performance on the CoNLL-2012 benchmark,\ntraining with up to 0.006x the memory resources and obtaining a 170x faster\ninference compared to previous state-of-the-art systems. We extensively\nvalidate the robustness of the Maverick framework with an array of diverse\nexperiments, reporting improvements over prior systems in data-scarce,\nlong-document, and out-of-domain settings. We release our code and models for\nresearch purposes at https://github.com/SapienzaNLP/maverick-coref.\n","authors":["Giuliano Martinelli","Edoardo Barba","Roberto Navigli"],"pdf_url":"https://arxiv.org/pdf/2407.21489v1.pdf","comment":"Accepted at main conference of ACL 2024. 
15 pages"},{"id":"http://arxiv.org/abs/2407.21476v1","updated":"2024-07-31T09:37:27Z","published":"2024-07-31T09:37:27Z","title":"On the Problem of Text-To-Speech Model Selection for Synthetic Data\n Generation in Automatic Speech Recognition","summary":" The rapid development of neural text-to-speech (TTS) systems enabled its\nusage in other areas of natural language processing such as automatic speech\nrecognition (ASR) or spoken language translation (SLT). Due to the large number\nof different TTS architectures and their extensions, selecting which TTS\nsystems to use for synthetic data creation is not an easy task. We use the\ncomparison of five different TTS decoder architectures in the scope of\nsynthetic data generation to show the impact on CTC-based speech recognition\ntraining. We compare the recognition results to computable metrics like NISQA\nMOS and intelligibility, finding that there are no clear relations to the ASR\nperformance. We also observe that for data generation auto-regressive decoding\nperforms better than non-autoregressive decoding, and propose an approach to\nquantify TTS generalization capabilities.\n","authors":["Nick Rossenbach","Ralf Schlüter","Sakriani Sakti"],"pdf_url":"https://arxiv.org/pdf/2407.21476v1.pdf","comment":"Accepted at the SynData4GenAI 2024 workshop"},{"id":"http://arxiv.org/abs/2407.15017v2","updated":"2024-07-31T09:14:29Z","published":"2024-07-22T06:15:59Z","title":"Knowledge Mechanisms in Large Language Models: A Survey and Perspective","summary":" Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. We hope this work can help understand knowledge in LLMs\nand provide insights for future research.\n","authors":["Mengru Wang","Yunzhi Yao","Ziwen Xu","Shuofei Qiao","Shumin Deng","Peng Wang","Xiang Chen","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15017v2.pdf","comment":"Ongoing work (v2); add Section 5: Application of Knowledge Mechanism;\n revise Section 6 and 7; fix typos"},{"id":"http://arxiv.org/abs/2407.20524v2","updated":"2024-07-31T09:11:46Z","published":"2024-07-30T03:50:10Z","title":"Contrastive Feedback Mechanism for Simultaneous Speech Translation","summary":" Recent advances in simultaneous speech translation (SST) focus on the\ndecision policies that enable the use of offline-trained ST models for\nsimultaneous inference. These decision policies not only control the\nquality-latency trade-off in SST but also mitigate the impact of unstable\npredictions on translation quality by delaying translation for more context or\ndiscarding these predictions through stable hypothesis detection. However,\nthese policies often overlook the potential benefits of utilizing unstable\npredictions. We introduce the contrastive feedback mechanism (CFM) for SST, a\nnovel method that leverages these unstable predictions as feedback to improve\ntranslation quality. 
CFM guides the system to eliminate undesired model\nbehaviors from these predictions through a contrastive objective. The\nexperiments on 3 state-of-the-art decision policies across 8 languages in the\nMuST-C v1.0 dataset show that CFM effectively improves the performance of SST.\n","authors":["Haotian Tan","Sakriani Sakti"],"pdf_url":"https://arxiv.org/pdf/2407.20524v2.pdf","comment":"Accepted to Interspeech 2024 main conference"},{"id":"http://arxiv.org/abs/2407.21452v1","updated":"2024-07-31T08:55:57Z","published":"2024-07-31T08:55:57Z","title":"Navigating Beyond Instructions: Vision-and-Language Navigation in\n Obstructed Environments","summary":" Real-world navigation often involves dealing with unexpected obstructions\nsuch as closed doors, moved objects, and unpredictable entities. However,\nmainstream Vision-and-Language Navigation (VLN) tasks typically assume\ninstructions perfectly align with the fixed and predefined navigation graphs\nwithout any obstructions. This assumption overlooks potential discrepancies\nbetween actual navigation graphs and given instructions, which can cause major failures\nfor both indoor and outdoor agents. To address this issue, we integrate diverse\nobstructions into the R2R dataset by modifying both the navigation graphs and\nvisual observations, introducing an innovative dataset and task, R2R with\nUNexpected Obstructions (R2R-UNO). R2R-UNO contains various types and numbers\nof path obstructions to generate instruction-reality mismatches for VLN\nresearch. Experiments on R2R-UNO reveal that state-of-the-art VLN methods\ninevitably encounter significant challenges when facing such mismatches,\nindicating that they rigidly follow instructions rather than navigate\nadaptively. Therefore, we propose a novel method called ObVLN (Obstructed VLN),\nwhich includes a curriculum training strategy and virtual graph construction to\nhelp agents effectively adapt to obstructed environments. Empirical results\nshow that ObVLN not only maintains robust performance in unobstructed scenarios\nbut also achieves a substantial performance advantage with unexpected\nobstructions.\n","authors":["Haodong Hong","Sen Wang","Zi Huang","Qi Wu","Jiajun Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21452v1.pdf","comment":"Accepted to MM 2024"},{"id":"http://arxiv.org/abs/2407.21443v1","updated":"2024-07-31T08:48:48Z","published":"2024-07-31T08:48:48Z","title":"Improving Faithfulness of Large Language Models in Summarization via\n Sliding Generation and Self-Consistency","summary":" Although large language models (LLMs) have demonstrated impressive performance\nin various tasks, they still suffer from the factual inconsistency\nproblem known as hallucination. For instance, LLMs occasionally generate content\nthat diverges from the source article, and prefer to extract information that\nappears at the beginning and end of the context, especially in long document\nsummarization. Inspired by these findings, we propose to improve the\nfaithfulness of LLMs in summarization by impelling them to process the entire\narticle more fairly and faithfully. We present a novel summary generation\nstrategy, namely SliSum, which exploits the ideas of sliding windows and\nself-consistency. Specifically, SliSum divides the source article into\noverlapping windows, and utilizes an LLM to generate local summaries for the\ncontent in the windows. Finally, SliSum aggregates all local summaries using\na clustering and majority voting algorithm to produce a more faithful summary of the\nentire article. 
Extensive experiments demonstrate that SliSum significantly\nimproves the faithfulness of diverse LLMs including LLaMA-2, Claude-2 and\nGPT-3.5 in both short and long text summarization, while maintaining their\nfluency and informativeness and without additional fine-tuning and resources.\nWe further conduct qualitative and quantitative studies to investigate why\nSliSum works and impacts of hyperparameters in SliSum on performance.\n","authors":["Taiji Li","Zhi Li","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21443v1.pdf","comment":"Long paper accepted at LREC-COLING 2024 (oral)"},{"id":"http://arxiv.org/abs/2407.21441v1","updated":"2024-07-31T08:44:29Z","published":"2024-07-31T08:44:29Z","title":"QuestGen: Effectiveness of Question Generation Methods for Fact-Checking\n Applications","summary":" Verifying fact-checking claims poses a significant challenge, even for\nhumans. Recent approaches have demonstrated that decomposing claims into\nrelevant questions to gather evidence enhances the efficiency of the\nfact-checking process. In this paper, we provide empirical evidence showing\nthat this question decomposition can be effectively automated. We demonstrate\nthat smaller generative models, fine-tuned for the question generation task\nusing data augmentation from various datasets, outperform large language models\nby up to 8%. Surprisingly, in some cases, the evidence retrieved using\nmachine-generated questions proves to be significantly more effective for\nfact-checking than that obtained from human-written questions. We also perform\nmanual evaluation of the decomposed questions to assess the quality of the\nquestions generated.\n","authors":["Rivik Setty","Vinay Setty"],"pdf_url":"https://arxiv.org/pdf/2407.21441v1.pdf","comment":"Accepted in CIKM 2024 as a short paper 4 pages and 1 page references"},{"id":"http://arxiv.org/abs/2407.21439v1","updated":"2024-07-31T08:43:17Z","published":"2024-07-31T08:43:17Z","title":"MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented\n Generation via Knowledge-enhanced Reranking and Noise-injected Training","summary":" Multimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in processing and generating content across multiple data\nmodalities, including text, images, audio, and video. However, a significant\ndrawback of MLLMs is their reliance on static training data, leading to\noutdated information and limited contextual awareness. This static nature\nhampers their ability to provide accurate, up-to-date responses, particularly\nin dynamic or rapidly evolving contexts. Integrating Multimodal\nRetrieval-augmented Generation (Multimodal RAG) offers a promising solution,\nbut the system would inevitably encounter the multi-granularity noisy\ncorrespondence (MNC) problem, which involves two types of noise: coarse-grained\n(query-caption) and fine-grained (query-image). This noise hinders accurate\nretrieval and generation. In this work, we propose \\textbf{RagLLaVA}, a novel\nframework with knowledge-enhanced reranking and noise-injected training, to\naddress these limitations. We instruction-tune the MLLM with a simple yet\neffective instruction template to induce its ranking ability and serve it as a\nreranker to precisely filter the top-k retrieved images. For generation, we\ninject visual noise during training at the data and token levels to enhance the\ngenerator's robustness. 
Extensive experiments are conducted on the subsets of\ntwo datasets that require retrieving and reasoning over images to answer a\ngiven query. Our results demonstrate the superiority of RagLLaVA in retrieving\naccurately and generating robustly. Code and models are available at\nhttps://github.com/IDEA-FinAI/RagLLaVA.\n","authors":["Zhanpeng Chen","Chengjin Xu","Yiyan Qi","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21424v1","updated":"2024-07-31T08:19:06Z","published":"2024-07-31T08:19:06Z","title":"Cost-Effective Hallucination Detection for LLMs","summary":" Large language models (LLMs) can be prone to hallucinations - generating\nunreliable outputs that are unfaithful to their inputs, external facts or\ninternally inconsistent. In this work, we address several challenges for\npost-hoc hallucination detection in production settings. Our pipeline for\nhallucination detection entails: first, producing a confidence score\nrepresenting the likelihood that a generated answer is a hallucination; second,\ncalibrating the score conditional on attributes of the inputs and candidate\nresponse; finally, performing detection by thresholding the calibrated score.\nWe benchmark a variety of state-of-the-art scoring methods on different\ndatasets, encompassing question answering, fact checking, and summarization\ntasks. We employ diverse LLMs to ensure a comprehensive assessment of\nperformance. We show that calibrating individual scoring methods is critical\nfor ensuring risk-aware downstream decision making. Based on findings that no\nindividual score performs best in all situations, we propose a multi-scoring\nframework, which combines different scores and achieves top performance across\nall datasets. We further introduce cost-effective multi-scoring, which can\nmatch or even outperform more expensive detection methods, while significantly\nreducing computational overhead.\n","authors":["Simon Valentin","Jinmiao Fu","Gianluca Detommaso","Shaoyuan Xu","Giovanni Zappella","Bryan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21417v1","updated":"2024-07-31T08:05:04Z","published":"2024-07-31T08:05:04Z","title":"Dancing in Chains: Reconciling Instruction Following and Faithfulness in\n Language Models","summary":" Modern language models (LMs) need to follow human instructions while being\nfaithful; yet, they often fail to achieve both. Here, we provide concrete\nevidence of a trade-off between instruction following (i.e., follow open-ended\ninstructions) and faithfulness (i.e., ground responses in given context) when\ntraining LMs with these objectives. For instance, fine-tuning LLaMA-7B on\ninstruction following datasets renders it less faithful. Conversely,\ninstruction-tuned Vicuna-7B shows degraded performance at following\ninstructions when further optimized on tasks that require contextual grounding.\nOne common remedy is multi-task learning (MTL) with data mixing, yet it remains\nfar from achieving a synergic outcome. We propose a simple yet effective method\nthat relies on Rejection Sampling for Continued Self-instruction Tuning\n(ReSet), which significantly outperforms vanilla MTL. Surprisingly, we find\nthat less is more, as training ReSet with high-quality, yet substantially\nsmaller data (three-fold less) yields superior results. 
Our findings offer a\nbetter understanding of objective discrepancies in alignment training of LMs.\n","authors":["Zhengxuan Wu","Yuhao Zhang","Peng Qi","Yumo Xu","Rujun Han","Yian Zhang","Jifan Chen","Bonan Min","Zhiheng Huang"],"pdf_url":"https://arxiv.org/pdf/2407.21417v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2407.21414v1","updated":"2024-07-31T08:00:41Z","published":"2024-07-31T08:00:41Z","title":"Towards interfacing large language models with ASR systems using\n confidence measures and prompting","summary":" As large language models (LLMs) grow in parameter size and capabilities, such\nas interaction through prompting, they open up new ways of interfacing with\nautomatic speech recognition (ASR) systems beyond rescoring n-best lists. This\nwork investigates post-hoc correction of ASR transcripts with LLMs. To avoid\nintroducing errors into likely accurate transcripts, we propose a range of\nconfidence-based filtering methods. Our results indicate that this can improve\nthe performance of less competitive ASR systems.\n","authors":["Maryam Naderi","Enno Hermann","Alexandre Nanchen","Sevada Hovsepyan","Mathew Magimai. -Doss"],"pdf_url":"https://arxiv.org/pdf/2407.21414v1.pdf","comment":"5 pages, 3 figures, 5 tables. Accepted to Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.18483v4","updated":"2024-07-31T07:24:30Z","published":"2024-07-26T03:23:31Z","title":"A Role-specific Guided Large Language Model for Ophthalmic Consultation\n Based on Stylistic Differentiation","summary":" Ophthalmology consultations are crucial for diagnosing, treating, and\npreventing eye diseases. However, the growing demand for consultations exceeds\nthe availability of ophthalmologists. By leveraging large pre-trained language\nmodels, we can design effective dialogues for specific scenarios, aiding in\nconsultations. Traditional fine-tuning strategies for question-answering tasks\nare impractical due to increasing model sizes and because they often ignore patient-doctor\nrole functions during consultations. In this paper, we propose EyeDoctor, an\nophthalmic medical questioning large language model that enhances accuracy\nthrough doctor-patient role perception guidance and an augmented knowledge base\nwith external disease information. Experimental results show EyeDoctor achieves\nhigher question-answering precision in ophthalmology consultations. Notably,\nEyeDoctor demonstrated a 7.25% improvement in Rouge-1 scores and a 10.16%\nimprovement in F1 scores on multi-round datasets compared to the second-best model,\nChatGPT, highlighting the importance of doctor-patient role differentiation and\ndynamic knowledge base expansion for intelligent medical consultations. EyeDoc\nis also available as a free web-based service, and the source code is available\nat https://github.com/sperfu/EyeDoc.\n","authors":["Laiyi Fu","Binbin Fan","Hongkai Du","Yanxiang Feng","Chunhua Li","Huping Song"],"pdf_url":"https://arxiv.org/pdf/2407.18483v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21384v1","updated":"2024-07-31T07:15:33Z","published":"2024-07-31T07:15:33Z","title":"GEGA: Graph Convolutional Networks and Evidence Retrieval Guided\n Attention for Enhanced Document-level Relation Extraction","summary":" Document-level relation extraction (DocRE) aims to extract relations between\nentities from unstructured document text. Compared to sentence-level relation\nextraction, it requires more complex semantic understanding from a broader text\ncontext. 
Currently, some studies are utilizing logical rules within evidence\nsentences to enhance the performance of DocRE. However, in the data without\nprovided evidence sentences, researchers often obtain a list of evidence\nsentences for the entire document through evidence retrieval (ER). Therefore,\nDocRE suffers from two challenges: firstly, the relevance between evidence and\nentity pairs is weak; secondly, there is insufficient extraction of complex\ncross-relations between long-distance multi-entities. To overcome these\nchallenges, we propose GEGA, a novel model for DocRE. The model leverages graph\nneural networks to construct multiple weight matrices, guiding attention\nallocation to evidence sentences. It also employs multi-scale representation\naggregation to enhance ER. Subsequently, we integrate the most efficient\nevidence information to implement both fully supervised and weakly supervised\ntraining processes for the model. We evaluate the GEGA model on three widely\nused benchmark datasets: DocRED, Re-DocRED, and Revisit-DocRED. The\nexperimental results indicate that our model has achieved comprehensive\nimprovements compared to the existing SOTA model.\n","authors":["Yanxu Mao","Peipei Liu","Tiehan Cui"],"pdf_url":"https://arxiv.org/pdf/2407.21384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09002v4","updated":"2024-07-31T06:46:44Z","published":"2024-01-17T06:42:44Z","title":"AttackEval: How to Evaluate the Effectiveness of Jailbreak Attacking on\n Large Language Models","summary":" Ensuring the security of large language models (LLMs) against attacks has\nbecome increasingly urgent, with jailbreak attacks representing one of the most\nsophisticated threats. To deal with such risks, we introduce an innovative\nframework that can help evaluate the effectiveness of jailbreak attacks on\nLLMs. Unlike traditional binary evaluations focusing solely on the robustness\nof LLMs, our method assesses the effectiveness of the attacking prompts\nthemselves. We present two distinct evaluation frameworks: a coarse-grained\nevaluation and a fine-grained evaluation. Each framework uses a scoring range\nfrom 0 to 1, offering unique perspectives and allowing for the assessment of\nattack effectiveness in different scenarios. Additionally, we develop a\ncomprehensive ground truth dataset specifically tailored for jailbreak prompts.\nThis dataset serves as a crucial benchmark for our current study and provides a\nfoundational resource for future research. By comparing with traditional\nevaluation methods, our study shows that the current results align with\nbaseline metrics while offering a more nuanced and fine-grained assessment. It\nalso helps identify potentially harmful attack prompts that might appear\nharmless in traditional evaluations. Overall, our work establishes a solid\nfoundation for assessing a broader range of attack prompts in the area of\nprompt injection.\n","authors":["Dong shu","Mingyu Jin","Chong Zhang","Liangyao Li","Zihao Zhou","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09002v4.pdf","comment":"34 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.21368v1","updated":"2024-07-31T06:34:38Z","published":"2024-07-31T06:34:38Z","title":"Prompting Medical Large Vision-Language Models to Diagnose Pathologies\n by Visual Question Answering","summary":" Large Vision-Language Models (LVLMs) have achieved significant success in\nrecent years, and they have been extended to the medical domain. 
Although\ndemonstrating satisfactory performance on medical Visual Question Answering\n(VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem,\nwhich makes them fail to diagnose complex pathologies. Moreover, they readily\nfail to learn minority pathologies due to imbalanced training data. We propose\ntwo prompting strategies for MLVLMs that reduce hallucination and improve VQA\nperformance. In the first strategy, we provide a detailed explanation of the\nqueried pathology. In the second strategy, we fine-tune a cheap, weak learner\nto achieve high performance on a specific metric, and textually provide its\njudgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our\nmethods significantly improve the diagnostic F1 score, with the highest\nincrease being 0.27. We also demonstrate that our prompting strategies can be\nextended to general LVLM domains. Based on POPE metrics, it effectively\nsuppresses the false negative predictions of existing LVLMs and improves Recall\nby approximately 0.07.\n","authors":["Danfeng Guo","Demetri Terzopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.21368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06082v2","updated":"2024-07-31T05:59:31Z","published":"2024-03-10T04:01:49Z","title":"FrameQuant: Flexible Low-Bit Quantization for Transformers","summary":" Transformers are the backbone of powerful foundation models for many Vision\nand Natural Language Processing tasks. But their compute and memory/storage\nfootprint is large, and so, serving such models is expensive often requiring\nhigh-end hardware. To mitigate this difficulty, Post-Training Quantization\nseeks to modify a pre-trained model and quantize it to eight bits or lower,\nsignificantly boosting compute/memory/latency efficiency. Such models have been\nsuccessfully quantized to four bits with some performance loss. In this work,\nwe outline a simple scheme to quantize Transformer-based models to just two\nbits (plus some overhead) with only a small drop in accuracy. Key to our\nformulation is a concept borrowed from Harmonic analysis called Fusion Frames.\nOur main finding is that the quantization must take place not in the original\nweight space, but instead in the Fusion Frame representations. If quantization\nis interpreted as the addition of noise, our casting of the problem allows\ninvoking an extensive body of known consistent recovery and noise robustness\nguarantees. Further, if desired, de-noising filters are known in closed form.\nWe show empirically, via a variety of experiments, that (almost) two-bit\nquantization for Transformer models promises sizable efficiency gains. The code\nis available at https://github.com/vsingh-group/FrameQuant\n","authors":["Harshavardhan Adepu","Zhanpeng Zeng","Li Zhang","Vikas Singh"],"pdf_url":"https://arxiv.org/pdf/2403.06082v2.pdf","comment":"25 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.13765v2","updated":"2024-07-31T05:57:07Z","published":"2024-07-18T17:59:27Z","title":"Latent Causal Probing: A Formal Perspective on Probing with Causal\n Models of Data","summary":" As language models (LMs) deliver increasing performance on a range of NLP\ntasks, probing classifiers have become an indispensable technique in the effort\nto better understand their inner workings. 
A typical setup involves (1)\ndefining an auxiliary task consisting of a dataset of text annotated with\nlabels, then (2) supervising small classifiers to predict the labels from the\nrepresentations of a pretrained LM as it processed the dataset. A high probing\naccuracy is interpreted as evidence that the LM has learned to perform the\nauxiliary task as an unsupervised byproduct of its original pretraining\nobjective. Despite the widespread usage of probes, however, the robust design\nand analysis of probing experiments remains a challenge. We develop a formal\nperspective on probing using structural causal models (SCM). Specifically,\ngiven an SCM which explains the distribution of tokens observed during\ntraining, we frame the central hypothesis as whether the LM has learned to\nrepresent the latent variables of the SCM. Empirically, we extend a recent\nstudy of LMs in the context of a synthetic grid-world navigation task, where\nhaving an exact model of the underlying causal structure allows us to draw\nstrong inferences from the result of probing experiments. Our techniques\nprovide robust empirical evidence for the ability of LMs to induce the latent\nconcepts underlying text.\n","authors":["Charles Jin","Martin Rinard"],"pdf_url":"https://arxiv.org/pdf/2407.13765v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2407.20248v2","updated":"2024-07-31T05:16:30Z","published":"2024-07-19T09:24:29Z","title":"LAPIS: Language Model-Augmented Police Investigation System","summary":" Crime situations are a race against time. An AI-assisted criminal investigation\nsystem that provides prompt but precise legal counsel is needed by police\nofficers. We introduce LAPIS (Language Model Augmented Police Investigation\nSystem), an automated system that assists police officers to perform rational\nand legal investigative actions. We constructed a finetuning dataset and\nretrieval knowledgebase specialized in the crime investigation legal reasoning\ntask. We extended the dataset's quality by incorporating manual curation\nefforts done by a group of domain experts. We then finetuned the pretrained\nweights of a smaller Korean language model to the newly constructed dataset and\nintegrated it with the crime investigation knowledgebase retrieval approach.\nExperimental results show LAPIS' potential in providing reliable legal guidance\nfor police officers, even better than the proprietary GPT-4 model. Qualitative\nanalysis of the rationales generated by LAPIS demonstrates the model's reasoning\nability to leverage the premises and derive legally correct conclusions.\n","authors":["Heedou Kim","Dain Kim","Jiwoo Lee","Chanwoong Yoon","Donghee Choi","Mogan Gim","Jaewoo Kang"],"pdf_url":"https://arxiv.org/pdf/2407.20248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21330v1","updated":"2024-07-31T04:38:07Z","published":"2024-07-31T04:38:07Z","title":"Performance of Recent Large Language Models for a Low-Resourced Language","summary":" Large Language Models (LLMs) have shown significant advances in the past\nyear. In addition to new versions of GPT and Llama, several other LLMs have\nbeen introduced recently. Some of these are open models available for download\nand modification.\n Although multilingual large language models have been available for some\ntime, their performance on low-resourced languages such as Sinhala has been\npoor. We evaluated four recent LLMs on their performance directly in the\nSinhala language, and by translation to and from English. 
We also evaluated\ntheir fine-tunability with a small amount of fine-tuning data. Claude and GPT\n4o perform well out-of-the-box and do significantly better than previous\nversions. Llama and Mistral perform poorly but show some promise of improvement\nwith fine tuning.\n","authors":["Ravindu Jayakody","Gihan Dias"],"pdf_url":"https://arxiv.org/pdf/2407.21330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21315v1","updated":"2024-07-31T03:53:14Z","published":"2024-07-31T03:53:14Z","title":"Beyond Silent Letters: Amplifying LLMs in Emotion Recognition with Vocal\n Nuances","summary":" This paper introduces a novel approach to emotion detection in speech using\nLarge Language Models (LLMs). We address the limitation of LLMs in processing\naudio inputs by translating speech characteristics into natural language\ndescriptions. Our method integrates these descriptions into text prompts,\nenabling LLMs to perform multimodal emotion analysis without architectural\nmodifications. We evaluate our approach on two datasets: IEMOCAP and MELD,\ndemonstrating significant improvements in emotion recognition accuracy,\nparticularly for high-quality audio data. Our experiments show that\nincorporating speech descriptions yields a 2 percentage point increase in\nweighted F1 score on IEMOCAP (from 70.111\\% to 72.596\\%). We also compare\nvarious LLM architectures and explore the effectiveness of different feature\nrepresentations. Our findings highlight the potential of this approach in\nenhancing emotion detection capabilities of LLMs and underscore the importance\nof audio quality in speech-based emotion recognition tasks. We'll release the\nsource code on Github.\n","authors":["Zehui Wu","Ziwei Gong","Lin Ai","Pengyuan Shi","Kaan Donbekci","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2407.21315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03407v3","updated":"2024-07-31T03:52:46Z","published":"2024-03-06T02:23:32Z","title":"Human vs. Machine: Behavioral Differences Between Expert Humans and\n Language Models in Wargame Simulations","summary":" To some, the advent of AI promises better decision-making and increased\nmilitary effectiveness while reducing the influence of human error and\nemotions. However, there is still debate about how AI systems, especially large\nlanguage models (LLMs) that can be applied to many tasks, behave compared to\nhumans in high-stakes military decision-making scenarios with the potential for\nincreased risks towards escalation and unnecessary conflicts. To test this\npotential and scrutinize the use of LLMs for such purposes, we use a new\nwargame experiment with 107 national security experts designed to examine\ncrisis escalation in a fictional US-China scenario and compare the behavior of\nhuman player teams to LLM-simulated team responses in separate simulations.\nHere, we find that the LLM-simulated responses can be more aggressive and\nsignificantly affected by changes in the scenario. We show a considerable\nhigh-level agreement in the LLM and human responses and significant\nquantitative and qualitative differences in individual actions and strategic\ntendencies. These differences depend on intrinsic biases in LLMs regarding the\nappropriate level of violence following strategic instructions, the choice of\nLLM, and whether the LLMs are tasked to decide for a team of players directly\nor first to simulate dialog between a team of players. When simulating the\ndialog, the discussions lack quality and maintain a farcical harmony. 
The LLM\nsimulations cannot account for human player characteristics, showing no\nsignificant difference even for extreme traits, such as \"pacifist\" or\n\"aggressive sociopath.\" When probing behavioral consistency across individual\nmoves of the simulation, the tested LLMs deviated from each other but generally\nshowed somewhat consistent behavior. Our results motivate policymakers to be\ncautious before granting autonomy or following AI-based strategy\nrecommendations.\n","authors":["Max Lamparth","Anthony Corso","Jacob Ganz","Oriana Skylar Mastro","Jacquelyn Schneider","Harold Trinkunas"],"pdf_url":"https://arxiv.org/pdf/2403.03407v3.pdf","comment":"Updated based on reviewer feedback to match AIES accepted\n camera-ready version"},{"id":"http://arxiv.org/abs/2407.17487v2","updated":"2024-07-31T02:35:37Z","published":"2024-07-03T08:27:51Z","title":"Explainable Natural Language Processing for Corporate Sustainability\n Analysis","summary":" Sustainability commonly refers to entities, such as individuals, companies,\nand institutions, having a non-detrimental (or even positive) impact on the\nenvironment, society, and the economy. With sustainability becoming a synonym\nof acceptable and legitimate behaviour, it is being increasingly demanded and\nregulated. Several frameworks and standards have been proposed to measure the\nsustainability impact of corporations, including United Nations' sustainable\ndevelopment goals and the recently introduced global sustainability reporting\nframework, amongst others. However, the concept of corporate sustainability is\ncomplex due to the diverse and intricate nature of firm operations (i.e.\ngeography, size, business activities, interlinks with other stakeholders). As a\nresult, corporate sustainability assessments are plagued by subjectivity both\nwithin data that reflect corporate sustainability efforts (i.e. corporate\nsustainability disclosures) and the analysts evaluating them. This subjectivity\ncan be distilled into distinct challenges, such as incompleteness, ambiguity,\nunreliability and sophistication on the data dimension, as well as limited\nresources and potential bias on the analyst dimension. Put together,\nsubjectivity hinders effective cost attribution to entities non-compliant with\nprevailing sustainability expectations, potentially rendering sustainability\nefforts and its associated regulations futile. To this end, we argue that\nExplainable Natural Language Processing (XNLP) can significantly enhance\ncorporate sustainability analysis. Specifically, linguistic understanding\nalgorithms (lexical, semantic, syntactic), integrated with XAI capabilities\n(interpretability, explainability, faithfulness), can bridge gaps in analyst\nresources and mitigate subjectivity problems within data.\n","authors":["Keane Ong","Rui Mao","Ranjan Satapathy","Ricardo Shirota Filho","Erik Cambria","Johan Sulaeman","Gianmarco Mengaldo"],"pdf_url":"https://arxiv.org/pdf/2407.17487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04430v2","updated":"2024-07-31T02:15:31Z","published":"2023-08-08T17:58:15Z","title":"SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore","summary":" The legality of training language models (LMs) on copyrighted or otherwise\nrestricted data is under intense debate. However, as we show, model performance\nsignificantly degrades if trained only on low-risk text (e.g., out-of-copyright\nbooks or government documents), due to its limited size and domain coverage. 
We\npresent SILO, a new language model that manages this risk-performance tradeoff\nduring inference. SILO is built by (1) training a parametric LM on Open License\nCorpus (OLC), a new corpus we curate with 228B tokens of public domain and\npermissively licensed text and (2) augmenting it with a more general and easily\nmodifiable nonparametric datastore (e.g., containing copyrighted books or news)\nthat is only queried during inference. The datastore allows use of high-risk\ndata without training on it, supports sentence-level data attribution, and\nenables data producers to opt out from the model by removing content from the\nstore. These capabilities can foster compliance with data-use regulations such\nas the fair use doctrine in the United States and the GDPR in the European\nUnion. Our experiments show that the parametric LM struggles on domains not\ncovered by OLC. However, access to the datastore greatly improves out of domain\nperformance, closing 90% of the performance gap with an LM trained on the Pile,\na more diverse corpus with mostly high-risk text. We also analyze which\nnonparametric approach works best, where the remaining errors lie, and how\nperformance scales with datastore size. Our results suggest that it is possible\nto build high quality language models while mitigating their legal risk.\n","authors":["Sewon Min","Suchin Gururangan","Eric Wallace","Weijia Shi","Hannaneh Hajishirzi","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2308.04430v2.pdf","comment":"29 pages; 7 figures. Published as a conference paper at ICLR 2024\n (spotlight). Code, models, and data available at\n https://github.com/kernelmachine/silo-lm"},{"id":"http://arxiv.org/abs/2407.20485v2","updated":"2024-07-31T02:02:40Z","published":"2024-07-30T01:13:42Z","title":"A2SF: Accumulative Attention Scoring with Forgetting Factor for Token\n Pruning in Transformer Decoder","summary":" Recently, large language models (LLM) based on transformers are facing memory\nbottleneck issues due to KV cache, especially in long sequence handling.\nPrevious researches proposed KV cache compression techniques that identify\ninsignificant tokens based on Accumulative Attention Scores and removes their\nitems from KV cache, noting that only few tokens play an important role in\nattention operations. However, we have observed that the existing Accumulative\nAttention Score is not suitable for the transformer decoder structure. In the\ndecoder model, the number of times the Attention Score accumulates varies\ndepending on the order of token appearance due to the effect of masking,\ncausing an uneven comparison between tokens. To solve this, we propose\nAccumulative Attention Score with Forgetting Factor (A2SF) technique, which\nintroduces a Forgetting Factor in the Attention Score accumulation process.\nA2SF applies a penalty to the past Attention Score generated from old tokens by\nrepeatedly multiplying the Forgetting Factor to the Attention Score over time.\nTherefore, older tokens receive a larger penalty, providing fairness among\ndifferent ages of tokens. Through the fair comparison among tokens, we can more\neffectively select important tokens. 
We have verified the accuracy improvement\nthrough A2SF in the OPT and LLaMA models and A2SF improves the accuracy of\nLLaMA 2 by up to 7.8% and 5.1% on 1-shot and 0-shot.\n","authors":["Hyun-rae Jo","Dongkun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.20485v2.pdf","comment":"11 pages(9 pages + reference 2 pages), 6 figures"},{"id":"http://arxiv.org/abs/2407.21276v1","updated":"2024-07-31T01:51:24Z","published":"2024-07-31T01:51:24Z","title":"Multi-Level Querying using A Knowledge Pyramid","summary":" This paper addresses the need for improved precision in existing\nRetrieval-Augmented Generation (RAG) methods that primarily focus on enhancing\nrecall. We propose a multi-layer knowledge pyramid approach within the RAG\nframework to achieve a better balance between precision and recall. The\nknowledge pyramid consists of three layers: Ontologies, Knowledge Graphs (KGs),\nand chunk-based raw text. We employ cross-layer augmentation techniques for\ncomprehensive knowledge coverage and dynamic updates of the Ontology schema and\ninstances. To ensure compactness, we utilize cross-layer filtering methods for\nknowledge condensation in KGs. Our approach, named PolyRAG, follows a waterfall\nmodel for retrieval, starting from the top of the pyramid and progressing down\nuntil a confident answer is obtained. We introduce two benchmarks for\ndomain-specific knowledge retrieval, one in the academic domain and the other\nin the financial domain. The effectiveness of the methods has been validated\nthrough comprehensive experiments by outperforming 19 SOTA methods. An\nencouraging observation is that the proposed method has augmented the GPT-4,\nproviding 395\\% F1 gain by improving its performance from 0.1636 to 0.8109.\n","authors":["Rubing Chen","Xulu Zhang","Jiaxin Wu","Wenqi Fan","Xiao-Yong Wei","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.21276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21264v1","updated":"2024-07-31T00:56:09Z","published":"2024-07-31T00:56:09Z","title":"Model Attribution in Machine-Generated Disinformation: A Domain\n Generalization Approach with Supervised Contrastive Learning","summary":" Model attribution for machine-generated disinformation poses a significant\nchallenge in understanding its origins and mitigating its spread. This task is\nespecially challenging because modern large language models (LLMs) produce\ndisinformation with human-like quality. Additionally, the diversity in\nprompting methods used to generate disinformation complicates accurate source\nattribution. These methods introduce domain-specific features that can mask the\nfundamental characteristics of the models. In this paper, we introduce the\nconcept of model attribution as a domain generalization problem, where each\nprompting method represents a unique domain. We argue that an effective\nattribution model must be invariant to these domain-specific features. It\nshould also be proficient in identifying the originating models across all\nscenarios, reflecting real-world detection challenges. To address this, we\nintroduce a novel approach based on Supervised Contrastive Learning. This\nmethod is designed to enhance the model's robustness to variations in prompts\nand focuses on distinguishing between different source LLMs. We evaluate our\nmodel through rigorous experiments involving three common prompting methods:\n``open-ended'', ``rewriting'', and ``paraphrasing'', and three advanced LLMs:\n``llama 2'', ``chatgpt'', and ``vicuna''. 
Our results demonstrate the\neffectiveness of our approach in model attribution tasks, achieving\nstate-of-the-art performance across diverse and unseen datasets.\n","authors":["Alimohammad Beigi","Zhen Tan","Nivedh Mudiam","Canyu Chen","Kai Shu","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21264v1.pdf","comment":"10 pages, 2 figures, accepted at DSAA 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.21794v1","updated":"2024-07-31T17:59:58Z","published":"2024-07-31T17:59:58Z","title":"Generalized Out-of-Distribution Detection and Beyond in Vision Language\n Model Era: A Survey","summary":" Detecting out-of-distribution (OOD) samples is crucial for ensuring the\nsafety of machine learning systems and has shaped the field of OOD detection.\nMeanwhile, several other problems are closely related to OOD detection,\nincluding anomaly detection (AD), novelty detection (ND), open set recognition\n(OSR), and outlier detection (OD). To unify these problems, a generalized OOD\ndetection framework was proposed, taxonomically categorizing these five\nproblems. However, Vision Language Models (VLMs) such as CLIP have\nsignificantly changed the paradigm and blurred the boundaries between these\nfields, again confusing researchers. In this survey, we first present a\ngeneralized OOD detection v2, encapsulating the evolution of AD, ND, OSR, OOD\ndetection, and OD in the VLM era. Our framework reveals that, with some field\ninactivity and integration, the demanding challenges have become OOD detection\nand AD. In addition, we also highlight the significant shift in the definition,\nproblem settings, and benchmarks; we thus feature a comprehensive review of the\nmethodology for OOD detection, including the discussion over other related\ntasks to clarify their relationship to OOD detection. Finally, we explore the\nadvancements in the emerging Large Vision Language Model (LVLM) era, such as\nGPT-4V. We conclude this survey with open challenges and future directions.\n","authors":["Atsuyuki Miyai","Jingkang Yang","Jingyang Zhang","Yifei Ming","Yueqian Lin","Qing Yu","Go Irie","Shafiq Joty","Yixuan Li","Hai Li","Ziwei Liu","Toshihiko Yamasaki","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2407.21794v1.pdf","comment":"survey paper. We welcome questions, issues, and paper requests via\n https://github.com/AtsuMiyai/Awesome-OOD-VLM"},{"id":"http://arxiv.org/abs/2407.00278v2","updated":"2024-07-31T17:57:37Z","published":"2024-06-29T02:06:01Z","title":"PerAct2: Benchmarking and Learning for Robotic Bimanual Manipulation\n Tasks","summary":" Bimanual manipulation is challenging due to precise spatial and temporal\ncoordination required between two arms. While there exist several real-world\nbimanual systems, there is a lack of simulated benchmarks with a large task\ndiversity for systematically studying bimanual capabilities across a wide range\nof tabletop tasks. This paper addresses the gap by extending RLBench to\nbimanual manipulation. We open-source our code and benchmark comprising 13 new\ntasks with 23 unique task variations, each requiring a high degree of\ncoordination and adaptability. To kickstart the benchmark, we extended several\nstate-of-the art methods to bimanual manipulation and also present a\nlanguage-conditioned behavioral cloning agent -- PerAct2, which enables the\nlearning and execution of bimanual 6-DoF manipulation tasks. 
Our novel network\narchitecture efficiently integrates language processing with action prediction,\nallowing robots to understand and perform complex bimanual tasks in response to\nuser-specified goals. Project website with code is available at:\nhttp://bimanual.github.io\n","authors":["Markus Grotz","Mohit Shridhar","Tamim Asfour","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2407.00278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21788v1","updated":"2024-07-31T17:57:32Z","published":"2024-07-31T17:57:32Z","title":"Vision-Language Model Based Handwriting Verification","summary":" Handwriting Verification is a critical in document forensics. Deep learning\nbased approaches often face skepticism from forensic document examiners due to\ntheir lack of explainability and reliance on extensive training data and\nhandcrafted features. This paper explores using Vision Language Models (VLMs),\nsuch as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By\nleveraging their Visual Question Answering capabilities and 0-shot\nChain-of-Thought (CoT) reasoning, our goal is to provide clear,\nhuman-understandable explanations for model decisions. Our experiments on the\nCEDAR handwriting dataset demonstrate that VLMs offer enhanced\ninterpretability, reduce the need for large training datasets, and adapt better\nto diverse handwriting styles. However, results show that the CNN-based\nResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach\nwith GPT-4o (Accuracy: 70%) and supervised fine-tuned PaliGemma (Accuracy:\n71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings\nhighlight the potential of VLMs in generating human-interpretable decisions\nwhile underscoring the need for further advancements to match the performance\nof specialized deep learning models.\n","authors":["Mihir Chauhan","Abhishek Satbhai","Mohammad Abuzar Hashemi","Mir Basheer Ali","Bina Ramamurthy","Mingchen Gao","Siwei Lyu","Sargur Srihari"],"pdf_url":"https://arxiv.org/pdf/2407.21788v1.pdf","comment":"4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine\n Vision and Image Processing (IMVIP) Conference"},{"id":"http://arxiv.org/abs/2407.21783v1","updated":"2024-07-31T17:54:27Z","published":"2024-07-31T17:54:27Z","title":"The Llama 3 Herd of Models","summary":" Modern artificial intelligence (AI) systems are powered by foundation models.\nThis paper presents a new set of foundation models, called Llama 3. It is a\nherd of language models that natively support multilinguality, coding,\nreasoning, and tool usage. Our largest model is a dense Transformer with 405B\nparameters and a context window of up to 128K tokens. This paper presents an\nextensive empirical evaluation of Llama 3. We find that Llama 3 delivers\ncomparable quality to leading language models such as GPT-4 on a plethora of\ntasks. We publicly release Llama 3, including pre-trained and post-trained\nversions of the 405B parameter language model and our Llama Guard 3 model for\ninput and output safety. The paper also presents the results of experiments in\nwhich we integrate image, video, and speech capabilities into Llama 3 via a\ncompositional approach. We observe this approach performs competitively with\nthe state-of-the-art on image, video, and speech recognition tasks. 
The\nresulting models are not yet being broadly released as they are still under\ndevelopment.\n","authors":["Abhimanyu Dubey","Abhinav Jauhri","Abhinav Pandey","Abhishek Kadian","Ahmad Al-Dahle","Aiesha Letman","Akhil Mathur","Alan Schelten","Amy Yang","Angela Fan","Anirudh Goyal","Anthony Hartshorn","Aobo Yang","Archi Mitra","Archie Sravankumar","Artem Korenev","Arthur Hinsvark","Arun Rao","Aston Zhang","Aurelien Rodriguez","Austen Gregerson","Ava Spataru","Baptiste Roziere","Bethany Biron","Binh Tang","Bobbie Chern","Charlotte Caucheteux","Chaya Nayak","Chloe Bi","Chris Marra","Chris McConnell","Christian Keller","Christophe Touret","Chunyang Wu","Corinne Wong","Cristian Canton Ferrer","Cyrus Nikolaidis","Damien Allonsius","Daniel Song","Danielle Pintz","Danny Livshits","David Esiobu","Dhruv Choudhary","Dhruv Mahajan","Diego Garcia-Olano","Diego Perino","Dieuwke Hupkes","Egor Lakomkin","Ehab AlBadawy","Elina Lobanova","Emily Dinan","Eric Michael Smith","Filip Radenovic","Frank Zhang","Gabriel Synnaeve","Gabrielle Lee","Georgia Lewis Anderson","Graeme Nail","Gregoire Mialon","Guan Pang","Guillem Cucurell","Hailey Nguyen","Hannah Korevaar","Hu Xu","Hugo Touvron","Iliyan Zarov","Imanol Arrieta Ibarra","Isabel Kloumann","Ishan Misra","Ivan Evtimov","Jade Copet","Jaewon Lee","Jan Geffert","Jana Vranes","Jason Park","Jay Mahadeokar","Jeet Shah","Jelmer van der Linde","Jennifer Billock","Jenny Hong","Jenya Lee","Jeremy Fu","Jianfeng Chi","Jianyu Huang","Jiawen Liu","Jie Wang","Jiecao Yu","Joanna Bitton","Joe Spisak","Jongsoo Park","Joseph Rocca","Joshua Johnstun","Joshua Saxe","Junteng Jia","Kalyan Vasuden Alwala","Kartikeya Upasani","Kate Plawiak","Ke Li","Kenneth Heafield","Kevin Stone","Khalid El-Arini","Krithika Iyer","Kshitiz Malik","Kuenley Chiu","Kunal Bhalla","Lauren Rantala-Yeary","Laurens van der Maaten","Lawrence Chen","Liang Tan","Liz Jenkins","Louis Martin","Lovish Madaan","Lubo Malo","Lukas Blecher","Lukas Landzaat","Luke de Oliveira","Madeline Muzzi","Mahesh Pasupuleti","Mannat Singh","Manohar Paluri","Marcin Kardas","Mathew Oldham","Mathieu Rita","Maya Pavlova","Melanie Kambadur","Mike Lewis","Min Si","Mitesh Kumar Singh","Mona Hassan","Naman Goyal","Narjes Torabi","Nikolay Bashlykov","Nikolay Bogoychev","Niladri Chatterji","Olivier Duchenne","Onur Çelebi","Patrick Alrassy","Pengchuan Zhang","Pengwei Li","Petar Vasic","Peter Weng","Prajjwal Bhargava","Pratik Dubal","Praveen Krishnan","Punit Singh Koura","Puxin Xu","Qing He","Qingxiao Dong","Ragavan Srinivasan","Raj Ganapathy","Ramon Calderer","Ricardo Silveira Cabral","Robert Stojnic","Roberta Raileanu","Rohit Girdhar","Rohit Patel","Romain Sauvestre","Ronnie Polidoro","Roshan Sumbaly","Ross Taylor","Ruan Silva","Rui Hou","Rui Wang","Saghar Hosseini","Sahana Chennabasappa","Sanjay Singh","Sean Bell","Seohyun Sonia Kim","Sergey Edunov","Shaoliang Nie","Sharan Narang","Sharath Raparthy","Sheng Shen","Shengye Wan","Shruti Bhosale","Shun Zhang","Simon Vandenhende","Soumya Batra","Spencer Whitman","Sten Sootla","Stephane Collot","Suchin Gururangan","Sydney Borodinsky","Tamar Herman","Tara Fowler","Tarek Sheasha","Thomas Georgiou","Thomas Scialom","Tobias Speckbacher","Todor Mihaylov","Tong Xiao","Ujjwal Karn","Vedanuj Goswami","Vibhor Gupta","Vignesh Ramanathan","Viktor Kerkez","Vincent Gonguet","Virginie Do","Vish Vogeti","Vladan Petrovic","Weiwei Chu","Wenhan Xiong","Wenyin Fu","Whitney Meers","Xavier Martinet","Xiaodong Wang","Xiaoqing Ellen Tan","Xinfeng Xie","Xuchao Jia","Xuewei Wang","Yaelle Goldschlag","Yashesh Gaur","Yasmine 
Babaei","Yi Wen","Yiwen Song","Yuchen Zhang","Yue Li","Yuning Mao","Zacharie Delpierre Coudert","Zheng Yan","Zhengxing Chen","Zoe Papakipos","Aaditya Singh","Aaron Grattafiori","Abha Jain","Adam Kelsey","Adam Shajnfeld","Adithya Gangidi","Adolfo Victoria","Ahuva Goldstand","Ajay Menon","Ajay Sharma","Alex Boesenberg","Alex Vaughan","Alexei Baevski","Allie Feinstein","Amanda Kallet","Amit Sangani","Anam Yunus","Andrei Lupu","Andres Alvarado","Andrew Caples","Andrew Gu","Andrew Ho","Andrew Poulton","Andrew Ryan","Ankit Ramchandani","Annie Franco","Aparajita Saraf","Arkabandhu Chowdhury","Ashley Gabriel","Ashwin Bharambe","Assaf Eisenman","Azadeh Yazdan","Beau James","Ben Maurer","Benjamin Leonhardi","Bernie Huang","Beth Loyd","Beto De Paola","Bhargavi Paranjape","Bing Liu","Bo Wu","Boyu Ni","Braden Hancock","Bram Wasti","Brandon Spence","Brani Stojkovic","Brian Gamido","Britt Montalvo","Carl Parker","Carly Burton","Catalina Mejia","Changhan Wang","Changkyu Kim","Chao Zhou","Chester Hu","Ching-Hsiang Chu","Chris Cai","Chris Tindal","Christoph Feichtenhofer","Damon Civin","Dana Beaty","Daniel Kreymer","Daniel Li","Danny Wyatt","David Adkins","David Xu","Davide Testuggine","Delia David","Devi Parikh","Diana Liskovich","Didem Foss","Dingkang Wang","Duc Le","Dustin Holland","Edward Dowling","Eissa Jamil","Elaine Montgomery","Eleonora Presani","Emily Hahn","Emily Wood","Erik Brinkman","Esteban Arcaute","Evan Dunbar","Evan Smothers","Fei Sun","Felix Kreuk","Feng Tian","Firat Ozgenel","Francesco Caggioni","Francisco Guzmán","Frank Kanayet","Frank Seide","Gabriela Medina Florez","Gabriella Schwarz","Gada Badeer","Georgia Swee","Gil Halpern","Govind Thattai","Grant Herman","Grigory Sizov"," Guangyi"," Zhang","Guna Lakshminarayanan","Hamid Shojanazeri","Han Zou","Hannah Wang","Hanwen Zha","Haroun Habeeb","Harrison Rudolph","Helen Suk","Henry Aspegren","Hunter Goldman","Igor Molybog","Igor Tufanov","Irina-Elena Veliche","Itai Gat","Jake Weissman","James Geboski","James Kohli","Japhet Asher","Jean-Baptiste Gaya","Jeff Marcus","Jeff Tang","Jennifer Chan","Jenny Zhen","Jeremy Reizenstein","Jeremy Teboul","Jessica Zhong","Jian Jin","Jingyi Yang","Joe Cummings","Jon Carvill","Jon Shepard","Jonathan McPhie","Jonathan Torres","Josh Ginsburg","Junjie Wang","Kai Wu","Kam Hou U","Karan Saxena","Karthik Prasad","Kartikay Khandelwal","Katayoun Zand","Kathy Matosich","Kaushik Veeraraghavan","Kelly Michelena","Keqian Li","Kun Huang","Kunal Chawla","Kushal Lakhotia","Kyle Huang","Lailin Chen","Lakshya Garg","Lavender A","Leandro Silva","Lee Bell","Lei Zhang","Liangpeng Guo","Licheng Yu","Liron Moshkovich","Luca Wehrstedt","Madian Khabsa","Manav Avalani","Manish Bhatt","Maria Tsimpoukelli","Martynas Mankus","Matan Hasson","Matthew Lennie","Matthias Reso","Maxim Groshev","Maxim Naumov","Maya Lathi","Meghan Keneally","Michael L. 
Seltzer","Michal Valko","Michelle Restrepo","Mihir Patel","Mik Vyatskov","Mikayel Samvelyan","Mike Clark","Mike Macey","Mike Wang","Miquel Jubert Hermoso","Mo Metanat","Mohammad Rastegari","Munish Bansal","Nandhini Santhanam","Natascha Parks","Natasha White","Navyata Bawa","Nayan Singhal","Nick Egebo","Nicolas Usunier","Nikolay Pavlovich Laptev","Ning Dong","Ning Zhang","Norman Cheng","Oleg Chernoguz","Olivia Hart","Omkar Salpekar","Ozlem Kalinli","Parkin Kent","Parth Parekh","Paul Saab","Pavan Balaji","Pedro Rittner","Philip Bontrager","Pierre Roux","Piotr Dollar","Polina Zvyagina","Prashant Ratanchandani","Pritish Yuvraj","Qian Liang","Rachad Alao","Rachel Rodriguez","Rafi Ayub","Raghotham Murthy","Raghu Nayani","Rahul Mitra","Raymond Li","Rebekkah Hogan","Robin Battey","Rocky Wang","Rohan Maheswari","Russ Howes","Ruty Rinott","Sai Jayesh Bondu","Samyak Datta","Sara Chugh","Sara Hunt","Sargun Dhillon","Sasha Sidorov","Satadru Pan","Saurabh Verma","Seiji Yamamoto","Sharadh Ramaswamy","Shaun Lindsay","Shaun Lindsay","Sheng Feng","Shenghao Lin","Shengxin Cindy Zha","Shiva Shankar","Shuqiang Zhang","Shuqiang Zhang","Sinong Wang","Sneha Agarwal","Soji Sajuyigbe","Soumith Chintala","Stephanie Max","Stephen Chen","Steve Kehoe","Steve Satterfield","Sudarshan Govindaprasad","Sumit Gupta","Sungmin Cho","Sunny Virk","Suraj Subramanian","Sy Choudhury","Sydney Goldman","Tal Remez","Tamar Glaser","Tamara Best","Thilo Kohler","Thomas Robinson","Tianhe Li","Tianjun Zhang","Tim Matthews","Timothy Chou","Tzook Shaked","Varun Vontimitta","Victoria Ajayi","Victoria Montanez","Vijai Mohan","Vinay Satish Kumar","Vishal Mangla","Vlad Ionescu","Vlad Poenaru","Vlad Tiberiu Mihailescu","Vladimir Ivanov","Wei Li","Wenchen Wang","Wenwen Jiang","Wes Bouaziz","Will Constable","Xiaocheng Tang","Xiaofang Wang","Xiaojian Wu","Xiaolan Wang","Xide Xia","Xilun Wu","Xinbo Gao","Yanjun Chen","Ye Hu","Ye Jia","Ye Qi","Yenda Li","Yilin Zhang","Ying Zhang","Yossi Adi","Youngjin Nam"," Yu"," Wang","Yuchen Hao","Yundi Qian","Yuzi He","Zach Rait","Zachary DeVito","Zef Rosnbrick","Zhaoduo Wen","Zhenyu Yang","Zhiwei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.21783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05735v2","updated":"2024-07-31T17:50:50Z","published":"2024-07-08T08:40:15Z","title":"An Earth Rover dataset recorded at the ICRA@40 party","summary":" The ICRA conference is celebrating its $40^{th}$ anniversary in Rotterdam in\nSeptember 2024, with as highlight the Happy Birthday ICRA Party at the iconic\nHolland America Line Cruise Terminal. One month later the IROS conference will\ntake place, which will include the Earth Rover Challenge. In this challenge\nopen-world autonomous navigation models are studied truly open-world settings.\n As part of the Earth Rover Challenge several real-world navigation sets in\nseveral cities world-wide, like Auckland, Australia and Wuhan, China. The only\ndataset recorded in the Netherlands is the small village Oudewater. 
The\nproposal is to record a dataset with the robot used in the Earth Rover\nChallenge in Rotterdam, in front of the Holland America Line Cruise Terminal,\nbefore the festivities of the Happy Birthday ICRA Party start.\n See: https://github.com/SlamMate/vSLAM-on-FrodoBots-2K\n","authors":["Qi Zhang","Zhihao Lin","Arnoud Visser"],"pdf_url":"https://arxiv.org/pdf/2407.05735v2.pdf","comment":"3 page, accepted as Late-Breaking extended abstract to IEEE\n Conference on Robotics and Automation"},{"id":"http://arxiv.org/abs/2407.21773v1","updated":"2024-07-31T17:48:22Z","published":"2024-07-31T17:48:22Z","title":"RainMamba: Enhanced Locality Learning with State Space Models for Video\n Deraining","summary":" Outdoor vision systems are frequently contaminated by rain streaks and\nraindrops, which significantly degrade the performance of visual tasks and\nmultimedia applications. The nature of videos exhibits redundant temporal cues\nfor rain removal with higher stability. Traditional video deraining methods\nheavily rely on optical flow estimation and kernel-based manners, which have a\nlimited receptive field. Yet, transformer architectures, while enabling\nlong-term dependencies, bring about a significant increase in computational\ncomplexity. Recently, the linear-complexity operator of the state space models\n(SSMs) has contrarily facilitated efficient long-term temporal modeling, which\nis crucial for rain streaks and raindrops removal in videos. Unexpectedly, its\nuni-dimensional sequential process on videos destroys the local correlations\nacross the spatio-temporal dimension by distancing adjacent pixels. To address\nthis, we present an improved SSMs-based video deraining network (RainMamba)\nwith a novel Hilbert scanning mechanism to better capture sequence-level local\ninformation. We also introduce a difference-guided dynamic contrastive locality\nlearning strategy to enhance the patch-level self-similarity learning ability\nof the proposed network. Extensive experiments on four synthesized video\nderaining datasets and real-world rainy videos demonstrate the superiority of\nour network in the removal of rain streaks and raindrops.\n","authors":["Hongtao Wu","Yijun Yang","Huihui Xu","Weiming Wang","Jinni Zhou","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.21773v1.pdf","comment":"ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.21771v1","updated":"2024-07-31T17:46:57Z","published":"2024-07-31T17:46:57Z","title":"Paying More Attention to Image: A Training-Free Method for Alleviating\n Hallucination in LVLMs","summary":" Existing Large Vision-Language Models (LVLMs) primarily align image features\nof vision encoder with Large Language Models (LLMs) to leverage their superior\ntext generation capabilities. However, the scale disparity between vision\nencoder and language model may lead to LLMs assuming a predominant role in\nmulti-modal comprehension. This imbalance in LVLMs may result in instances\nof hallucination. Concretely, LVLMs may generate consistent descriptions with\nor without visual input, indicating that certain outputs are influenced solely\nby context text. We refer to this phenomenon as \"text inertia.\" To counteract\nthis issue, we introduce a training-free algorithm to find an equilibrium point\nbetween image comprehension and language inference. Specifically, we adaptively\nadjust and amplify the attention weights assigned to image\ntokens, thereby granting greater prominence to visual elements. 
Meanwhile, we\nsubtract the logits of multi-modal inputs from those of the pure-text input, which\ncan help LVLMs avoid being biased towards LLMs. By enhancing image tokens and\nreducing the stubborn output of the LLM, we can let the LVLM pay more attention to\nimages, towards alleviating text inertia and reducing the hallucination in\nLVLMs. Our extensive experiments show that this method substantially reduces\nthe frequency of hallucinatory outputs in various LVLMs in terms of different\nmetrics. Project page is available at https://lalbj.github.io/projects/PAI/.\n","authors":["Shi Liu","Kecheng Zheng","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2407.21771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02141v2","updated":"2024-07-31T17:41:14Z","published":"2023-12-04T18:58:20Z","title":"iMatching: Imperative Correspondence Learning","summary":" Learning feature correspondence is a foundational task in computer vision,\nholding immense importance for downstream applications such as visual odometry\nand 3D reconstruction. Despite recent progress in data-driven models, feature\ncorrespondence learning is still limited by the lack of accurate per-pixel\ncorrespondence labels. To overcome this difficulty, we introduce a new\nself-supervised scheme, imperative learning (IL), for training feature\ncorrespondence. It enables correspondence learning on arbitrary uninterrupted\nvideos without any camera pose or depth labels, heralding a new era for\nself-supervised correspondence learning. Specifically, we formulated the\nproblem of correspondence learning as a bilevel optimization, which takes the\nreprojection error from bundle adjustment as a supervisory signal for the\nmodel. To avoid large memory and computation overhead, we leverage the\nstationary point to effectively back-propagate the implicit gradients through\nbundle adjustment. Through extensive experiments, we demonstrate superior\nperformance on tasks including feature matching and pose estimation, in which\nwe obtained an average of 30% accuracy gain over the state-of-the-art matching\nmodels. This preprint corresponds to the Accepted Manuscript in European\nConference on Computer Vision (ECCV) 2024.\n","authors":["Zitong Zhan","Dasong Gao","Yun-Jou Lin","Youjie Xia","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02141v2.pdf","comment":"This preprint corresponds to the Accepted Manuscript in European\n Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2407.21757v1","updated":"2024-07-31T17:23:57Z","published":"2024-07-31T17:23:57Z","title":"Learning Video Context as Interleaved Multimodal Sequences","summary":" Narrative videos, such as movies, pose significant challenges in video\nunderstanding due to their rich contexts (characters, dialogues, storylines)\nand diverse demands (identify who, relationship, and reason). In this paper, we\nintroduce MovieSeq, a multimodal language model developed to address the wide\nrange of challenges in understanding video contexts. Our core idea is to\nrepresent videos as interleaved multimodal sequences (including images, plots,\nvideos, and subtitles), either by linking external knowledge databases or using\noffline models (such as whisper for subtitles). Through instruction-tuning,\nthis approach empowers the language model to interact with videos using\ninterleaved multimodal instructions. 
For example, instead of solely relying on\nvideo as input, we jointly provide character photos alongside their names and\ndialogues, allowing the model to associate these elements and generate more\ncomprehensive responses. To demonstrate its effectiveness, we validate\nMovieSeq's performance on six datasets (LVU, MAD, Movienet, CMD, TVC, MovieQA)\nacross five settings (video classification, audio description, video-text\nretrieval, video captioning, and video question-answering). The code will be\npublic at https://github.com/showlab/MovieSeq.\n","authors":["Kevin Qinghong Lin","Pengchuan Zhang","Difei Gao","Xide Xia","Joya Chen","Ziteng Gao","Jinheng Xie","Xuhong Xiao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2407.21757v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21740v1","updated":"2024-07-31T16:52:00Z","published":"2024-07-31T16:52:00Z","title":"Contrastive Factor Analysis","summary":" Factor analysis, often regarded as a Bayesian variant of matrix\nfactorization, offers superior capabilities in capturing uncertainty, modeling\ncomplex dependencies, and ensuring robustness. As the deep learning era\narrives, factor analysis is receiving less and less attention due to their\nlimited expressive ability. On the contrary, contrastive learning has emerged\nas a potent technique with demonstrated efficacy in unsupervised\nrepresentational learning. While the two methods are different paradigms,\nrecent theoretical analysis has revealed the mathematical equivalence between\ncontrastive learning and matrix factorization, providing a potential\npossibility for factor analysis combined with contrastive learning. Motivated\nby the interconnectedness of contrastive learning, matrix factorization, and\nfactor analysis, this paper introduces a novel Contrastive Factor Analysis\nframework, aiming to leverage factor analysis's advantageous properties within\nthe realm of contrastive learning. To further leverage the interpretability\nproperties of non-negative factor analysis, which can learn disentangled\nrepresentations, contrastive factor analysis is extended to a non-negative\nversion. Finally, extensive experimental validation showcases the efficacy of\nthe proposed contrastive (non-negative) factor analysis methodology across\nmultiple key properties, including expressiveness, robustness,\ninterpretability, and accurate uncertainty estimation.\n","authors":["Zhibin Duan","Tiansheng Wen","Yifei Wang","Chen Zhu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.21740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21739v1","updated":"2024-07-31T16:48:06Z","published":"2024-07-31T16:48:06Z","title":"A Federated Learning-Friendly Approach for Parameter-Efficient\n Fine-Tuning of SAM in 3D Segmentation","summary":" Adapting foundation models for medical image analysis requires finetuning\nthem on a considerable amount of data because of extreme distribution shifts\nbetween natural (source) data used for pretraining and medical (target) data.\nHowever, collecting task-specific medical data for such finetuning at a central\nlocation raises many privacy concerns. Although Federated learning (FL)\nprovides an effective means for training on private decentralized data,\ncommunication costs in federating large foundation models can quickly become a\nsignificant bottleneck, impacting the solution's scalability. 
In this work, we\naddress this problem of efficient communication while ensuring effective\nlearning in FL by combining the strengths of Parameter-Efficient Fine-tuning\n(PEFT) with FL. Specifically, we study plug-and-play Low-Rank Adapters (LoRA)\nin a federated manner to adapt the Segment Anything Model (SAM) for 3D medical\nimage segmentation. Unlike prior works that utilize LoRA and finetune the\nentire decoder, we critically analyze the contribution of each granular\ncomponent of SAM on finetuning performance. Thus, we identify specific layers\nto be federated that are very efficient in terms of communication cost while\nproducing on-par accuracy. Our experiments show that retaining the parameters\nof the SAM model (including most of the decoder) in their original state during\nadaptation is beneficial because fine-tuning on small datasets tends to distort\nthe inherent capabilities of the underlying foundation model. On Fed-KiTS, our\napproach decreases communication cost (~48x) compared to full fine-tuning while\nincreasing performance (~6% Dice score) in 3D segmentation tasks. Our approach\nperforms similar to SAMed while achieving ~2.8x reduction in communication and\nparameters to be finetuned. We further validate our approach with experiments\non Fed-IXI and Prostate MRI datasets.\n","authors":["Mothilal Asokan","Joseph Geo Benjamin","Mohammad Yaqub","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2407.21739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21738v1","updated":"2024-07-31T16:47:21Z","published":"2024-07-31T16:47:21Z","title":"Leveraging Self-Supervised Learning for Fetal Cardiac Planes\n Classification using Ultrasound Scan Videos","summary":" Self-supervised learning (SSL) methods are popular since they can address\nsituations with limited annotated data by directly utilising the underlying\ndata distribution. However, the adoption of such methods is not explored enough\nin ultrasound (US) imaging, especially for fetal assessment. We investigate the\npotential of dual-encoder SSL in utilizing unlabelled US video data to improve\nthe performance of challenging downstream Standard Fetal Cardiac Planes (SFCP)\nclassification using limited labelled 2D US images. We study 7 SSL approaches\nbased on reconstruction, contrastive loss, distillation, and information theory\nand evaluate them extensively on a large private US dataset. Our observations\nand findings are consolidated from more than 500 downstream training\nexperiments under different settings. Our primary observation shows that for\nSSL training, the variance of the dataset is more crucial than its size because\nit allows the model to learn generalisable representations, which improve the\nperformance of downstream tasks. Overall, the BarlowTwins method shows robust\nperformance, irrespective of the training settings and data variations, when\nused as an initialisation for downstream tasks. 
Notably, full fine-tuning with\n1% of labelled data outperforms ImageNet initialisation by 12% in F1-score and\noutperforms other SSL initialisations by at least 4% in F1-score, thus making\nit a promising candidate for transfer learning from US video to image data.\n","authors":["Joseph Geo Benjamin","Mothilal Asokan","Amna Alhosani","Hussain Alasmawi","Werner Gerhard Diehl","Leanne Bricker","Karthik Nandakumar","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2407.21738v1.pdf","comment":"Simplifying Medical Ultrasound: 4th International Workshop, ASMUS\n 2023, Held in Conjunction with MICCAI 2023, Vancouver, BC, Canada, October 8,\n 2023, Proceedings"},{"id":"http://arxiv.org/abs/2407.21735v1","updated":"2024-07-31T16:43:20Z","published":"2024-07-31T16:43:20Z","title":"Unifying Event-based Flow, Stereo and Depth Estimation via Feature\n Similarity Matching","summary":" As an emerging vision sensor, the event camera has gained popularity in\nvarious vision tasks such as optical flow estimation, stereo matching, and\ndepth estimation due to its high-speed, sparse, and asynchronous event streams.\nUnlike traditional approaches that use specialized architectures for each\nspecific task, we propose a unified framework, EventMatch, that reformulates\nthese tasks as an event-based dense correspondence matching problem, allowing\nthem to be solved with a single model by directly comparing feature\nsimilarities. By utilizing a shared feature similarities module, which\nintegrates knowledge from other event flows via temporal or spatial\ninteractions, and distinct task heads, our network can concurrently perform\noptical flow estimation from temporal inputs (e.g., two segments of event\nstreams in the temporal domain) and stereo matching from spatial inputs (e.g.,\ntwo segments of event streams from different viewpoints in the spatial domain).\nMoreover, we further demonstrate that our unified model inherently supports\ncross-task transfer since the architecture and parameters are shared across\ntasks. Without the need for retraining on each task, our model can effectively\nhandle both optical flow and disparity estimation simultaneously. The\nexperiment conducted on the DSEC benchmark demonstrates that our model exhibits\nsuperior performance in both optical flow and disparity estimation tasks,\noutperforming existing state-of-the-art methods. Our unified approach not only\nadvances event-based models but also opens new possibilities for cross-task\ntransfer and inter-task fusion in both spatial and temporal dimensions. Our\ncode will be available later.\n","authors":["Pengjie Zhang","Lin Zhu","Lizhi Wang","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2407.21735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21720v1","updated":"2024-07-31T16:13:29Z","published":"2024-07-31T16:13:29Z","title":"Detecting, Explaining, and Mitigating Memorization in Diffusion Models","summary":" Recent breakthroughs in diffusion models have exhibited exceptional\nimage-generation capabilities. However, studies show that some outputs are\nmerely replications of training data. Such replications present potential legal\nchallenges for model owners, especially when the generated content contains\nproprietary information. In this work, we introduce a straightforward yet\neffective method for detecting memorized prompts by inspecting the magnitude of\ntext-conditional predictions. 
Our proposed method seamlessly integrates without\ndisrupting sampling algorithms, and delivers high accuracy even at the first\ngeneration step, with a single generation per prompt. Building on our detection\nstrategy, we unveil an explainable approach that shows the contribution of\nindividual words or tokens to memorization. This offers an interactive medium\nfor users to adjust their prompts. Moreover, we propose two strategies i.e., to\nmitigate memorization by leveraging the magnitude of text-conditional\npredictions, either through minimization during inference or filtering during\ntraining. These proposed strategies effectively counteract memorization while\nmaintaining high-generation quality. Code is available at\nhttps://github.com/YuxinWenRick/diffusion_memorization.\n","authors":["Yuxin Wen","Yuchen Liu","Chen Chen","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2407.21720v1.pdf","comment":"16 pages, 9 figures, accepted as oral presentation in ICLR 2024"},{"id":"http://arxiv.org/abs/2403.04969v2","updated":"2024-07-31T16:03:51Z","published":"2024-03-08T00:53:49Z","title":"PIPsUS: Self-Supervised Point Tracking in Ultrasound","summary":" Finding point-level correspondences is a fundamental problem in ultrasound\n(US), since it can enable US landmark tracking for intraoperative image\nguidance in different surgeries, including head and neck. Most existing US\ntracking methods, e.g., those based on optical flow or feature matching, were\ninitially designed for RGB images before being applied to US. Therefore domain\nshift can impact their performance. Training could be supervised by\nground-truth correspondences, but these are expensive to acquire in US. To\nsolve these problems, we propose a self-supervised pixel-level tracking model\ncalled PIPsUS. Our model can track an arbitrary number of points in one forward\npass and exploits temporal information by considering multiple, instead of just\nconsecutive, frames. We developed a new self-supervised training strategy that\nutilizes a long-term point-tracking model trained for RGB images as a teacher\nto guide the model to learn realistic motions and use data augmentation to\nenforce tracking from US appearance. We evaluate our method on neck and oral US\nand echocardiography, showing higher point tracking accuracy when compared with\nfast normalized cross-correlation and tuned optical flow. Code will be\navailable once the paper is accepted.\n","authors":["Wanwen Chen","Adam Schmidt","Eitan Prisman","Septimiu E Salcudean"],"pdf_url":"https://arxiv.org/pdf/2403.04969v2.pdf","comment":"10 pages, 3 figures, submitted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.21705v1","updated":"2024-07-31T15:53:20Z","published":"2024-07-31T15:53:20Z","title":"Tora: Trajectory-oriented Diffusion Transformer for Video Generation","summary":" Recent advancements in Diffusion Transformer (DiT) have demonstrated\nremarkable proficiency in producing high-quality video content. Nonetheless,\nthe potential of transformer-based diffusion models for effectively generating\nvideos with controllable motion remains an area of limited exploration. This\npaper introduces Tora, the first trajectory-oriented DiT framework that\nintegrates textual, visual, and trajectory conditions concurrently for video\ngeneration. Specifically, Tora consists of a Trajectory Extractor~(TE), a\nSpatial-Temporal DiT, and a Motion-guidance Fuser~(MGF). The TE encodes\narbitrary trajectories into hierarchical spacetime motion patches with a 3D\nvideo compression network. 
The MGF integrates the motion patches into the DiT\nblocks to generate consistent videos following trajectories. Our design aligns\nseamlessly with DiT's scalability, allowing precise control of video content's\ndynamics with diverse durations, aspect ratios, and resolutions. Extensive\nexperiments demonstrate Tora's excellence in achieving high motion fidelity,\nwhile also meticulously simulating the movement of the physical world. Page can\nbe found at https://ali-videoai.github.io/tora_video.\n","authors":["Zhenghao Zhang","Junchao Liao","Menghao Li","Long Qin","Weizhi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21703v1","updated":"2024-07-31T15:50:11Z","published":"2024-07-31T15:50:11Z","title":"Hyper-parameter tuning for text guided image editing","summary":" The test-time finetuning text-guided image editing method, Forgedit, is\ncapable of tackling general and complex image editing problems given only the\ninput image itself and the target text prompt. During finetuning stage, using\nthe same set of finetuning hyper-paramters every time for every given image,\nForgedit remembers and understands the input image in 30 seconds. During\nediting stage, the workflow of Forgedit might seem complicated. However, in\nfact, the editing process of Forgedit is not more complex than previous SOTA\nImagic, yet completely solves the overfitting problem of Imagic. In this paper,\nwe will elaborate the workflow of Forgedit editing stage with examples. We will\nshow how to tune the hyper-parameters in an efficient way to obtain ideal\nediting results.\n","authors":["Shiwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21703v1.pdf","comment":"Codes are available at https://github.com/witcherofresearch/Forgedit/"},{"id":"http://arxiv.org/abs/2407.21691v1","updated":"2024-07-31T15:37:52Z","published":"2024-07-31T15:37:52Z","title":"Explainable Artificial Intelligence for Quantifying Interfering and\n High-Risk Behaviors in Autism Spectrum Disorder in a Real-World Classroom\n Environment Using Privacy-Preserving Video Analysis","summary":" Rapid identification and accurate documentation of interfering and high-risk\nbehaviors in ASD, such as aggression, self-injury, disruption, and restricted\nrepetitive behaviors, are important in daily classroom environments for\ntracking intervention effectiveness and allocating appropriate resources to\nmanage care needs. However, having a staff dedicated solely to observing is\ncostly and uncommon in most educational settings. Recently, multiple research\nstudies have explored developing automated, continuous, and objective tools\nusing machine learning models to quantify behaviors in ASD. However, the\nmajority of the work was conducted under a controlled environment and has not\nbeen validated for real-world conditions. In this work, we demonstrate that the\nlatest advances in video-based group activity recognition techniques can\nquantify behaviors in ASD in real-world activities in classroom environments\nwhile preserving privacy. Our explainable model could detect the episode of\nproblem behaviors with a 77% F1-score and capture distinctive behavior features\nin different types of behaviors in ASD. 
To the best of our knowledge, this is\nthe first work that shows the promise of objectively quantifying behaviors in\nASD in a real-world environment, which is an important step toward the\ndevelopment of a practical tool that can ease the burden of data collection for\nclassroom staff.\n","authors":["Barun Das","Conor Anderson","Tania Villavicencio","Johanna Lantz","Jenny Foster","Theresa Hamlin","Ali Bahrami Rad","Gari D. Clifford","Hyeokhyen Kwon"],"pdf_url":"https://arxiv.org/pdf/2407.21691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12499v2","updated":"2024-07-31T15:36:55Z","published":"2022-12-23T17:45:38Z","title":"Posterior-Variance-Based Error Quantification for Inverse Problems in\n Imaging","summary":" In this work, a method for obtaining pixel-wise error bounds in Bayesian\nregularization of inverse imaging problems is introduced. The proposed method\nemploys estimates of the posterior variance together with techniques from\nconformal prediction in order to obtain coverage guarantees for the error\nbounds, without making any assumption on the underlying data distribution. It\nis generally applicable to Bayesian regularization approaches, independent,\ne.g., of the concrete choice of the prior. Furthermore, the coverage guarantees\ncan also be obtained in case only approximate sampling from the posterior is\npossible. With this in particular, the proposed framework is able to\nincorporate any learned prior in a black-box manner. Guaranteed coverage\nwithout assumptions on the underlying distributions is only achievable since\nthe magnitude of the error bounds is, in general, unknown in advance.\nNevertheless, experiments with multiple regularization approaches presented in\nthe paper confirm that in practice, the obtained error bounds are rather tight.\nFor realizing the numerical experiments, also a novel primal-dual Langevin\nalgorithm for sampling from non-smooth distributions is introduced in this\nwork.\n","authors":["Dominik Narnhofer","Andreas Habring","Martin Holler","Thomas Pock"],"pdf_url":"https://arxiv.org/pdf/2212.12499v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20198v3","updated":"2024-07-31T15:36:22Z","published":"2024-07-29T17:24:52Z","title":"SpaER: Learning Spatio-temporal Equivariant Representations for Fetal\n Brain Motion Tracking","summary":" In this paper, we introduce SpaER, a pioneering method for fetal motion\ntracking that leverages equivariant filters and self-attention mechanisms to\neffectively learn spatio-temporal representations. Different from conventional\napproaches that statically estimate fetal brain motions from pairs of images,\nour method dynamically tracks the rigid movement patterns of the fetal head\nacross temporal and spatial dimensions. Specifically, we first develop an\nequivariant neural network that efficiently learns rigid motion sequences\nthrough low-dimensional spatial representations of images. Subsequently, we\nlearn spatio-temporal representations by incorporating time encoding and\nself-attention neural network layers. This approach allows for the capture of\nlong-term dependencies of fetal brain motion and addresses alignment errors due\nto contrast changes and severe motion artifacts. Our model also provides a\ngeometric deformation estimation that properly addresses image distortions\namong all time frames. To the best of our knowledge, our approach is the first\nto learn spatial-temporal representations via deep neural networks for fetal\nmotion tracking without data augmentation. 
We validated our model using real\nfetal echo-planar images with simulated and real motions. Our method carries\nsignificant potential value in accurately measuring, tracking, and correcting\nfetal motion in fetal MRI sequences.\n","authors":["Jian Wang","Razieh Faghihpirayesh","Polina Golland","Ali Gholipour"],"pdf_url":"https://arxiv.org/pdf/2407.20198v3.pdf","comment":"11 pages, 3 figures, Medical Image Computing and Computer Assisted\n Interventions (MICCAI) Workshop on Perinatal Imaging, Placental and Preterm\n Image analysis (PIPPI) 2024"},{"id":"http://arxiv.org/abs/2407.21687v1","updated":"2024-07-31T15:29:34Z","published":"2024-07-31T15:29:34Z","title":"Dynamic Object Queries for Transformer-based Incremental Object\n Detection","summary":" Incremental object detection (IOD) aims to sequentially learn new classes,\nwhile maintaining the capability to locate and identify old ones. As the\ntraining data arrives with annotations only with new classes, IOD suffers from\ncatastrophic forgetting. Prior methodologies mainly tackle the forgetting issue\nthrough knowledge distillation and exemplar replay, ignoring the conflict\nbetween limited model capacity and increasing knowledge. In this paper, we\nexplore \\textit{dynamic object queries} for incremental object detection built\non Transformer architecture. We propose the \\textbf{Dy}namic object\n\\textbf{Q}uery-based \\textbf{DE}tection \\textbf{TR}ansformer (DyQ-DETR), which\nincrementally expands the model representation ability to achieve\nstability-plasticity tradeoff. First, a new set of learnable object queries are\nfed into the decoder to represent new classes. These new object queries are\naggregated with those from previous phases to adapt both old and new knowledge\nwell. Second, we propose the isolated bipartite matching for object queries in\ndifferent phases, based on disentangled self-attention. The interaction among\nthe object queries at different phases is eliminated to reduce inter-class\nconfusion. Thanks to the separate supervision and computation over object\nqueries, we further present the risk-balanced partial calibration for effective\nexemplar replay. Extensive experiments demonstrate that DyQ-DETR significantly\nsurpasses the state-of-the-art methods, with limited parameter overhead. Code\nwill be made publicly available.\n","authors":["Jichuan Zhang","Wei Li","Shuang Cheng","Ya-Li Li","Shengjin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21686v1","updated":"2024-07-31T15:29:13Z","published":"2024-07-31T15:29:13Z","title":"Expressive Whole-Body 3D Gaussian Avatar","summary":" Facial expression and hand motions are necessary to express our emotions and\ninteract with the world. Nevertheless, most of the 3D human avatars modeled\nfrom a casually captured video only support body motions without facial\nexpressions and hand motions.In this work, we present ExAvatar, an expressive\nwhole-body 3D human avatar learned from a short monocular video. We design\nExAvatar as a combination of the whole-body parametric mesh model (SMPL-X) and\n3D Gaussian Splatting (3DGS). The main challenges are 1) a limited diversity of\nfacial expressions and poses in the video and 2) the absence of 3D\nobservations, such as 3D scans and RGBD images. The limited diversity in the\nvideo makes animations with novel facial expressions and poses non-trivial. 
In\naddition, the absence of 3D observations could cause significant ambiguity in\nhuman parts that are not observed in the video, which can result in noticeable\nartifacts under novel motions. To address them, we introduce our hybrid\nrepresentation of the mesh and 3D Gaussians. Our hybrid representation treats\neach 3D Gaussian as a vertex on the surface with pre-defined connectivity\ninformation (i.e., triangle faces) between them following the mesh topology of\nSMPL-X. This makes our ExAvatar animatable with novel facial expressions,\ndriven by the facial expression space of SMPL-X. In addition, by using\nconnectivity-based regularizers, we significantly reduce artifacts in novel\nfacial expressions and poses.\n","authors":["Gyeongsik Moon","Takaaki Shiratori","Shunsuke Saito"],"pdf_url":"https://arxiv.org/pdf/2407.21686v1.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://mks0601.github.io/ExAvatar/"},{"id":"http://arxiv.org/abs/2407.15689v2","updated":"2024-07-31T15:28:39Z","published":"2024-07-22T14:54:51Z","title":"Pediatric Wrist Fracture Detection in X-rays via YOLOv10 Algorithm and\n Dual Label Assignment System","summary":" Wrist fractures are highly prevalent among children and can significantly\nimpact their daily activities, such as attending school, participating in\nsports, and performing basic self-care tasks. If not treated properly, these\nfractures can result in chronic pain, reduced wrist functionality, and other\nlong-term complications. Recently, advancements in object detection have shown\npromise in enhancing fracture detection, with systems achieving accuracy\ncomparable to, or even surpassing, that of human radiologists. The YOLO series,\nin particular, has demonstrated notable success in this domain. This study is\nthe first to provide a thorough evaluation of various YOLOv10 variants to\nassess their performance in detecting pediatric wrist fractures using the\nGRAZPEDWRI-DX dataset. It investigates how changes in model complexity, scaling\nthe architecture, and implementing a dual-label assignment strategy can enhance\ndetection performance. Experimental results indicate that our trained model\nachieved a mean average precision (mAP@50-95) of 51.9\%, surpassing the current\nYOLOv9 benchmark of 43.3\% on this dataset. This represents an improvement of\n8.6\%. The implementation code is publicly available at\nhttps://github.com/ammarlodhi255/YOLOv10-Fracture-Detection\n","authors":["Ammar Ahmed","Abdul Manaf"],"pdf_url":"https://arxiv.org/pdf/2407.15689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10073v3","updated":"2024-07-31T15:22:39Z","published":"2024-04-15T18:26:03Z","title":"Explainable Light-Weight Deep Learning Pipeline for Improved Drought\n Stress Identification","summary":" Early identification of drought stress in crops is vital for implementing\neffective mitigation measures and reducing yield loss. Non-invasive imaging\ntechniques hold immense potential by capturing subtle physiological changes in\nplants under water deficit. Sensor-based imaging data serves as a rich source\nof information for machine learning and deep learning algorithms, facilitating\nfurther analysis aimed at identifying drought stress. While these approaches\nyield favorable results, real-time field applications require algorithms\nspecifically designed for the complexities of natural agricultural conditions.\nOur work proposes a novel deep learning framework for classifying drought\nstress in potato crops captured by UAVs in natural settings. 
The novelty lies\nin the synergistic combination of a pre-trained network with carefully designed\ncustom layers. This architecture leverages feature extraction capabilities of\nthe pre-trained network while the custom layers enable targeted dimensionality\nreduction and enhanced regularization, ultimately leading to improved\nperformance. A key innovation of our work involves the integration of\nGradient-Class Activation Mapping (Grad-CAM), an explainability technique.\nGrad-CAM sheds light on the internal workings of the deep learning model,\ntypically referred to as a black box. By visualizing the focus areas of the\nmodel within the images, Grad-CAM fosters interpretability and builds trust in\nthe decision-making process of the model. Our proposed framework achieves\nsuperior performance, particularly with the DenseNet121 pre-trained network,\nreaching a precision of 97% to identify the stressed class with an overall\naccuracy of 91%. Comparative analysis of existing state-of-the-art object\ndetection algorithms reveals the superiority of our approach in significantly\nhigher precision and accuracy.\n","authors":["Aswini Kumar Patra","Lingaraj Sahoo"],"pdf_url":"https://arxiv.org/pdf/2404.10073v3.pdf","comment":"16 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2404.10892v2","updated":"2024-07-31T15:18:40Z","published":"2024-04-16T20:30:16Z","title":"Automatic classification of prostate MR series type using image content\n and metadata","summary":" With the wealth of medical image data, efficient curation is essential.\nAssigning the sequence type to magnetic resonance images is necessary for\nscientific studies and artificial intelligence-based analysis. However,\nincomplete or missing metadata prevents effective automation. We therefore\npropose a deep-learning method for classification of prostate cancer scanning\nsequences based on a combination of image data and DICOM metadata. We\ndemonstrate superior results compared to metadata or image data alone, and make\nour code publicly available at\nhttps://github.com/deepakri201/DICOMScanClassification.\n","authors":["Deepa Krishnaswamy","Bálint Kovács","Stefan Denner","Steve Pieper","David Clunie","Christopher P. Bridge","Tina Kapur","Klaus H. Maier-Hein","Andrey Fedorov"],"pdf_url":"https://arxiv.org/pdf/2404.10892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21674v1","updated":"2024-07-31T15:14:17Z","published":"2024-07-31T15:14:17Z","title":"Synthetic Simplicity: Unveiling Bias in Medical Data Augmentation","summary":" Synthetic data is becoming increasingly integral in data-scarce fields such\nas medical imaging, serving as a substitute for real data. However, its\ninherent statistical characteristics can significantly impact downstream tasks,\npotentially compromising deployment performance. In this study, we empirically\ninvestigate this issue and uncover a critical phenomenon: downstream neural\nnetworks often exploit spurious distinctions between real and synthetic data\nwhen there is a strong correlation between the data source and the task label.\nThis exploitation manifests as \\textit{simplicity bias}, where models overly\nrely on superficial features rather than genuine task-related complexities.\nThrough principled experiments, we demonstrate that the source of data (real\nvs.\\ synthetic) can introduce spurious correlating factors leading to poor\nperformance during deployment when the correlation is absent. 
We first\ndemonstrate this vulnerability on a digit classification task, where the model\nspuriously utilizes the source of data instead of the digit to provide an\ninference. We provide further evidence of this phenomenon in a medical imaging\nproblem related to cardiac view classification in echocardiograms, particularly\ndistinguishing between 2-chamber and 4-chamber views. Given the increasing role\nof utilizing synthetic datasets, we hope that our experiments serve as\neffective guidelines for the utilization of synthetic datasets in model\ntraining.\n","authors":["Krishan Agyakari Raja Babu","Rachana Sathish","Mrunal Pattanaik","Rahul Venkataramani"],"pdf_url":"https://arxiv.org/pdf/2407.21674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21666v1","updated":"2024-07-31T15:08:26Z","published":"2024-07-31T15:08:26Z","title":"An Explainable Vision Transformer with Transfer Learning Combined with\n Support Vector Machine Based Efficient Drought Stress Identification","summary":" Early detection of drought stress is critical for taking timely measures for\nreducing crop loss before the drought impact becomes irreversible. The subtle\nphenotypical and physiological changes in response to drought stress are\ncaptured by non-invasive imaging techniques and these imaging data serve as a\nvaluable resource for machine learning methods to identify drought stress.\nWhile convolutional neural networks (CNNs) are in wide use, vision transformers\n(ViTs) present a promising alternative in capturing long-range dependencies and\nintricate spatial relationships, thereby enhancing the detection of subtle\nindicators of drought stress. We propose an explainable deep learning pipeline\nthat leverages the power of ViTs for drought stress detection in potato crops\nusing aerial imagery. We applied two distinct approaches: a synergistic\ncombination of ViT and support vector machine (SVM), where ViT extracts\nintricate spatial features from aerial images and SVM classifies the crops as\nstressed or healthy, and an end-to-end approach using a dedicated classification\nlayer within ViT to directly detect drought stress. Our key findings explain\nthe ViT model's decision-making process by visualizing attention maps. These\nmaps highlight the specific spatial features within the aerial images that the\nViT model focuses on as the drought stress signature. Our findings demonstrate\nthat the proposed methods not only achieve high accuracy in drought stress\nidentification but also shed light on the diverse subtle plant features\nassociated with drought stress. This offers a robust and interpretable solution\nfor drought stress monitoring, enabling farmers to make informed decisions for\nimproved crop management.\n","authors":["Aswini Kumar Patra","Ankit Varshney","Lingaraj Sahoo"],"pdf_url":"https://arxiv.org/pdf/2407.21666v1.pdf","comment":"30 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.11865v2","updated":"2024-07-31T15:05:03Z","published":"2024-07-16T15:50:45Z","title":"Novel Hybrid Integrated Pix2Pix and WGAN Model with Gradient Penalty for\n Binary Images Denoising","summary":" This paper introduces a novel approach to image denoising that leverages the\nadvantages of Generative Adversarial Networks (GANs). Specifically, we propose\na model that combines elements of the Pix2Pix model and the Wasserstein GAN\n(WGAN) with Gradient Penalty (WGAN-GP). 
This hybrid framework seeks to\ncapitalize on the denoising capabilities of conditional GANs, as demonstrated\nin the Pix2Pix model, while mitigating the need for an exhaustive search for\noptimal hyperparameters that could potentially ruin the stability of the\nlearning process. In the proposed method, the GAN's generator is employed to\nproduce denoised images, harnessing the power of a conditional GAN for noise\nreduction. Simultaneously, the implementation of the Lipschitz continuity\nconstraint during updates, as featured in WGAN-GP, aids in reducing\nsusceptibility to mode collapse. This innovative design allows the proposed\nmodel to benefit from the strong points of both Pix2Pix and WGAN-GP, generating\nsuperior denoising results while ensuring training stability. Drawing on\nprevious work on image-to-image translation and GAN stabilization techniques,\nthe proposed research highlights the potential of GANs as a general-purpose\nsolution for denoising. The paper details the development and testing of this\nmodel, showcasing its effectiveness through numerical experiments. The dataset\nwas created by adding synthetic noise to clean images. Numerical results based\non real-world dataset validation underscore the efficacy of this approach in\nimage-denoising tasks, exhibiting significant enhancements over traditional\ntechniques. Notably, the proposed model demonstrates strong generalization\ncapabilities, performing effectively even when trained with synthetic noise.\n","authors":["Luca Tirel","Ali Mohamed Ali","Hashim A. Hashim"],"pdf_url":"https://arxiv.org/pdf/2407.11865v2.pdf","comment":"Systems and Soft Computing"},{"id":"http://arxiv.org/abs/2407.21654v1","updated":"2024-07-31T14:56:42Z","published":"2024-07-31T14:56:42Z","title":"MTA-CLIP: Language-Guided Semantic Segmentation with Mask-Text Alignment","summary":" Recent approaches have shown that large-scale vision-language models such as\nCLIP can improve semantic segmentation performance. These methods typically aim\nfor pixel-level vision-language alignment, but often rely on low resolution\nimage features from CLIP, resulting in class ambiguities along boundaries.\nMoreover, the global scene representations in CLIP text embeddings do not\ndirectly correlate with the local and detailed pixel-level features, making\nmeaningful alignment more difficult. To address these limitations, we introduce\nMTA-CLIP, a novel framework employing mask-level vision-language alignment.\nSpecifically, we first propose Mask-Text Decoder that enhances the mask\nrepresentations using rich textual data with the CLIP language model.\nSubsequently, it aligns mask representations with text embeddings using\nMask-to-Text Contrastive Learning. Furthermore, we introduce MaskText Prompt\nLearning, utilizing multiple context-specific prompts for text embeddings to\ncapture diverse class representations across masks. 
Overall, MTA-CLIP achieves\nstate-of-the-art performance, surpassing prior works by an average of 2.8% and 1.3% on\nstandard benchmark datasets, ADE20k and Cityscapes, respectively.\n","authors":["Anurag Das","Xinting Hu","Li Jiang","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2407.21654v1.pdf","comment":"accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21652v1","updated":"2024-07-31T14:53:41Z","published":"2024-07-31T14:53:41Z","title":"Spatial Transformer Network YOLO Model for Agricultural Object Detection","summary":" Object detection plays a crucial role in the field of computer vision by\nautonomously identifying and locating objects of interest. The You Only Look\nOnce (YOLO) model is an effective single-shot detector. However, YOLO faces\nchallenges in cluttered or partially occluded scenes and can struggle with\nsmall, low-contrast objects. We propose a new method that integrates spatial\ntransformer networks (STNs) into YOLO to improve performance. The proposed\nSTN-YOLO aims to enhance the model's effectiveness by focusing on important\nareas of the image and improving the spatial invariance of the model before the\ndetection process. Our proposed method improved object detection performance\nboth qualitatively and quantitatively. We explore the impact of different\nlocalization networks within the STN module as well as the robustness of the\nmodel across different spatial transformations. We apply the STN-YOLO on\nbenchmark datasets for agricultural object detection as well as a new dataset\nfrom a state-of-the-art plant phenotyping greenhouse facility. Our code and\ndataset are publicly available.\n","authors":["Yash Zambre","Ekdev Rajkitkul","Akshatha Mohan","Joshua Peeples"],"pdf_url":"https://arxiv.org/pdf/2407.21652v1.pdf","comment":"7 pages, 5 figures, submitted for review"},{"id":"http://arxiv.org/abs/2403.05245v2","updated":"2024-07-31T14:53:08Z","published":"2024-03-08T12:07:18Z","title":"Noise Level Adaptive Diffusion Model for Robust Reconstruction of\n Accelerated MRI","summary":" In general, diffusion model-based MRI reconstruction methods incrementally\nremove artificially added noise while imposing data consistency to reconstruct\nthe underlying images. However, real-world MRI acquisitions already contain\ninherent noise due to thermal fluctuations. This phenomenon is particularly\nnotable when using ultra-fast, high-resolution imaging sequences for advanced\nresearch, or using low-field systems favored by low- and middle-income\ncountries. These common scenarios can lead to sub-optimal performance or\ncomplete failure of existing diffusion model-based reconstruction techniques.\nSpecifically, as the artificially added noise is gradually removed, the\ninherent MRI noise becomes increasingly pronounced, making the actual noise\nlevel inconsistent with the predefined denoising schedule and consequently\nleading to inaccurate image reconstruction. To tackle this problem, we propose a posterior\nsampling strategy with a novel NoIse Level Adaptive Data Consistency (Nila-DC)\noperation. Extensive experiments are conducted on two public datasets and an\nin-house clinical dataset with field strength ranging from 0.3T to 3T, showing\nthat our method surpasses the state-of-the-art MRI reconstruction methods, and\nis highly robust against various noise levels. 
The code for Nila is available\nat https://github.com/Solor-pikachu/Nila.\n","authors":["Shoujin Huang","Guanxiong Luo","Xi Wang","Ziran Chen","Yuwan Wang","Huaishui Yang","Pheng-Ann Heng","Lingyan Zhang","Mengye Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.05245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21640v1","updated":"2024-07-31T14:41:10Z","published":"2024-07-31T14:41:10Z","title":"MSA2Net: Multi-scale Adaptive Attention-guided Network for Medical Image\n Segmentation","summary":" Medical image segmentation involves identifying and separating object\ninstances in a medical image to delineate various tissues and structures, a\ntask complicated by the significant variations in size, shape, and density of\nthese features. Convolutional neural networks (CNNs) have traditionally been\nused for this task but have limitations in capturing long-range dependencies.\nTransformers, equipped with self-attention mechanisms, aim to address this\nproblem. However, in medical image segmentation it is beneficial to merge both\nlocal and global features to effectively integrate feature maps across various\nscales, capturing both detailed features and broader semantic elements for\ndealing with variations in structures. In this paper, we introduce MSA2Net, a\nnew deep segmentation framework featuring an expedient design of\nskip-connections. These connections facilitate feature fusion by dynamically\nweighting and combining coarse-grained encoder features with fine-grained\ndecoder feature maps. Specifically, we propose a Multi-Scale Adaptive Spatial\nAttention Gate (MASAG), which dynamically adjusts the receptive field (Local\nand Global contextual information) to ensure that spatially relevant features\nare selectively highlighted while minimizing background distractions. Extensive\nevaluations involving dermatology, and radiological datasets demonstrate that\nour MSA2Net outperforms state-of-the-art (SOTA) works or matches their\nperformance. The source code is publicly available at\nhttps://github.com/xmindflow/MSA-2Net.\n","authors":["Sina Ghorbani Kolahi","Seyed Kamal Chaharsooghi","Toktam Khatibi","Afshin Bozorgpour","Reza Azad","Moein Heidari","Ilker Hacihaliloglu","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2407.21640v1.pdf","comment":"Accepted at BMVC 2025. Supplementary materials included at the end of\n the main paper (3 pages, 2 figures, 1 table)"},{"id":"http://arxiv.org/abs/2407.21638v1","updated":"2024-07-31T14:37:00Z","published":"2024-07-31T14:37:00Z","title":"Quality Control for Radiology Report Generation Models via Auxiliary\n Auditing Components","summary":" Automation of medical image interpretation could alleviate bottlenecks in\ndiagnostic workflows, and has become of particular interest in recent years due\nto advancements in natural language processing. Great strides have been made\ntowards automated radiology report generation via AI, yet ensuring clinical\naccuracy in generated reports is a significant challenge, hindering deployment\nof such methods in clinical practice. In this work we propose a quality control\nframework for assessing the reliability of AI-generated radiology reports with\nrespect to semantics of diagnostic importance using modular auxiliary auditing\ncomponents (AC). Evaluating our pipeline on the MIMIC-CXR dataset, our findings\nshow that incorporating ACs in the form of disease-classifiers can enable\nauditing that identifies more reliable reports, resulting in higher F1 scores\ncompared to unfiltered generated reports. 
Additionally, leveraging the\nconfidence of the AC labels further improves the audit's effectiveness.\n","authors":["Hermione Warr","Yasin Ibrahim","Daniel R. McGowan","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2407.21638v1.pdf","comment":"Accepted to MICCAI UNSURE Workshop"},{"id":"http://arxiv.org/abs/2407.09033v2","updated":"2024-07-31T14:27:06Z","published":"2024-07-12T06:49:16Z","title":"Textual Query-Driven Mask Transformer for Domain Generalized\n Segmentation","summary":" In this paper, we introduce a method to tackle Domain Generalized Semantic\nSegmentation (DGSS) by utilizing domain-invariant semantic knowledge from text\nembeddings of vision-language models. We employ the text embeddings as object\nqueries within a transformer-based segmentation framework (textual object\nqueries). These queries are regarded as a domain-invariant basis for pixel\ngrouping in DGSS. To leverage the power of textual object queries, we introduce\na novel framework named the textual query-driven mask transformer (tqdm). Our\ntqdm aims to (1) generate textual object queries that maximally encode\ndomain-invariant semantics and (2) enhance the semantic clarity of dense visual\nfeatures. Additionally, we suggest three regularization losses to improve the\nefficacy of tqdm by aligning visual and textual features. By utilizing\nour method, the model can comprehend inherent semantic information for classes\nof interest, enabling it to generalize to extreme domains (e.g., sketch style).\nOur tqdm achieves 68.9 mIoU on GTA5$\\rightarrow$Cityscapes, outperforming the\nprior state-of-the-art method by 2.5 mIoU. The project page is available at\nhttps://byeonghyunpak.github.io/tqdm.\n","authors":["Byeonghyun Pak","Byeongju Woo","Sunghwan Kim","Dae-hwan Kim","Hoseong Kim"],"pdf_url":"https://arxiv.org/pdf/2407.09033v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21631v1","updated":"2024-07-31T14:25:16Z","published":"2024-07-31T14:25:16Z","title":"RoadFormer+: Delivering RGB-X Scene Parsing through Scale-Aware\n Information Decoupling and Advanced Heterogeneous Feature Fusion","summary":" Task-specific data-fusion networks have marked considerable achievements in\nurban scene parsing. Among these networks, our recently proposed RoadFormer\nsuccessfully extracts heterogeneous features from RGB images and surface normal\nmaps and fuses these features through attention mechanisms, demonstrating\ncompelling efficacy in RGB-Normal road scene parsing. However, its performance\nsignificantly deteriorates when handling other types/sources of data or\nperforming more universal, all-category scene parsing tasks. To overcome these\nlimitations, this study introduces RoadFormer+, an efficient, robust, and\nadaptable model capable of effectively fusing RGB-X data, where ``X''\nrepresents additional types/modalities of data such as depth, thermal, surface\nnormal, and polarization. Specifically, we propose a novel hybrid feature\ndecoupling encoder to extract heterogeneous features and decouple them into\nglobal and local components. These decoupled features are then fused through a\ndual-branch multi-scale heterogeneous feature fusion block, which employs\nparallel Transformer attentions and convolutional neural network modules to\nmerge multi-scale features across different scales and receptive fields. The\nfused features are subsequently fed into a decoder to generate the final\nsemantic predictions. 
Notably, our proposed RoadFormer+ ranks first on the\nKITTI Road benchmark and achieves state-of-the-art performance in mean\nintersection over union on the Cityscapes, MFNet, FMB, and ZJU datasets.\nMoreover, it reduces the number of learnable parameters by 65\\% compared to\nRoadFormer. Our source code will be publicly available at\nmias.group/RoadFormerPlus.\n","authors":["Jianxin Huang","Jiahang Li","Ning Jia","Yuxiang Sun","Chengju Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2407.21631v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.21616v1","updated":"2024-07-31T14:06:06Z","published":"2024-07-31T14:06:06Z","title":"EZSR: Event-based Zero-Shot Recognition","summary":" This paper studies zero-shot object recognition using event camera data.\nGuided by CLIP, which is pre-trained on RGB images, existing approaches achieve\nzero-shot object recognition by maximizing embedding similarities between event\ndata encoded by an event encoder and RGB images encoded by the CLIP image\nencoder. Alternatively, several methods learn RGB frame reconstructions from\nevent data for the CLIP image encoder. However, these approaches often result\nin suboptimal zero-shot performance.\n This study develops an event encoder without relying on additional\nreconstruction networks. We theoretically analyze the performance bottlenecks\nof previous approaches: global similarity-based objective (i.e., maximizing the\nembedding similarities) cause semantic misalignments between the learned event\nembedding space and the CLIP text embedding space due to the degree of freedom.\nTo mitigate the issue, we explore a scalar-wise regularization strategy.\nFurthermore, to scale up the number of events and RGB data pairs for training,\nwe also propose a pipeline for synthesizing event data from static RGB images.\n Experimentally, our data synthesis strategy exhibits an attractive scaling\nproperty, and our method achieves superior zero-shot object recognition\nperformance on extensive standard benchmark datasets, even compared with past\nsupervised learning approaches. For example, we achieve 47.84% zero-shot\naccuracy on the N-ImageNet dataset.\n","authors":["Yan Yang","Liyuan Pan","Dongxu Li","Liu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15328v2","updated":"2024-07-31T13:58:55Z","published":"2024-07-22T02:19:30Z","title":"Iterative Ensemble Training with Anti-Gradient Control for Mitigating\n Memorization in Diffusion Models","summary":" Diffusion models, known for their tremendous ability to generate novel and\nhigh-quality samples, have recently raised concerns due to their data\nmemorization behavior, which poses privacy risks. Recent approaches for memory\nmitigation either only focused on the text modality problem in cross-modal\ngeneration tasks or utilized data augmentation strategies. In this paper, we\npropose a novel training framework for diffusion models from the perspective of\nvisual modality, which is more generic and fundamental for mitigating\nmemorization. To facilitate forgetting of stored information in diffusion model\nparameters, we propose an iterative ensemble training strategy by splitting the\ndata into multiple shards for training multiple models and intermittently\naggregating these model parameters. Moreover, practical analysis of losses\nillustrates that the training loss for easily memorable images tends to be\nobviously lower. 
Thus, we propose an anti-gradient control method to exclude\nthe sample with a lower loss value from the current mini-batch to avoid\nmemorizing. Extensive experiments and analysis on four datasets are conducted\nto illustrate the effectiveness of our method, and results show that our method\nsuccessfully reduces memory capacity while even improving the performance\nslightly. Moreover, to save the computing cost, we successfully apply our\nmethod to fine-tune the well-trained diffusion models by limited epochs,\ndemonstrating the applicability of our method. Code is available in\nhttps://github.com/liuxiao-guan/IET_AGC.\n","authors":["Xiao Liu","Xiaoliu Guan","Yu Wu","Jiaxu Miao"],"pdf_url":"https://arxiv.org/pdf/2407.15328v2.pdf","comment":"To appear in ECCV 2024, 20 pages with 7 figures"},{"id":"http://arxiv.org/abs/2110.12509v5","updated":"2024-07-31T13:41:24Z","published":"2021-10-24T19:09:28Z","title":"U-Net-based Lung Thickness Map for Pixel-level Lung Volume Estimation of\n Chest X-rays","summary":" Purpose: We aimed to estimate the total lung volume (TLV) from real and\nsynthetic frontal X-ray radiographs on a pixel level using lung thickness maps\ngenerated by a U-Net.\n Methods: 5,959 thorax X-ray computed tomography (CT) scans were retrieved\nfrom two publicly available datasets of the lung nodule analysis 2016 (n=656)\nand the RSNA pulmonary embolism detection challenge 2020 (n=5,303).\nAdditionally, thorax CT scans from 72 subjects (33 healthy: 20 men, mean age\n[range] = 62.4 [34, 80]; 39 suffering from chronic obstructive pulmonary\ndisease: 25 men, mean age [range] = 69.0 [47, 91]) were retrospectively\nselected (10.2018-12.2019) from our in-house dataset such that for each\nsubject, a frontal chest X-ray radiograph no older than seven days was\navailable. All CT scans and their corresponding lung segmentation were forward\nprojected using a simulated X-ray spectrum to generate synthetic radiographs\nand lung thickness maps, respectively. A U-Net model was trained and tested on\nsynthetic radiographs from the public datasets to predict lung thickness maps\nand consequently estimate TLV. Model performance was further assessed by\nevaluating the TLV estimations for the in-house synthetic and real radiograph\npairs using Pearson correlation coefficient (r) and significance testing.\n Results: Strong correlations were measured between the predicted and\nCT-derived ground truth TLV values for test data from synthetic\n($n_{Public}$=1,191, r=0.987, P < 0.001; $n_{In-house}$=72, r=0.973, P < 0.001)\nand real radiographs (n=72, r=0.908, P < 0.001).\n Conclusion: TLV from U-Net-generated pixel-level lung thickness maps were\nsuccessfully estimated for synthetic and real radiographs.\n","authors":["Tina Dorosti","Manuel Schultheiss","Philipp Schmette","Jule Heuchert","Johannes Thalhammer","Florian Schaff","Thorsten Sellerer","Rafael Schick","Kirsten Taphorn","Korbinian Mechlem","Lorenz Birnbacher","Franz Pfeiffer","Daniela Pfeiffer"],"pdf_url":"https://arxiv.org/pdf/2110.12509v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21604v1","updated":"2024-07-31T13:38:47Z","published":"2024-07-31T13:38:47Z","title":"MicroMIL: Graph-based Contextual Multiple Instance Learning for Patient\n Diagnosis Using Microscopy Images","summary":" Current histopathology research has primarily focused on using whole-slide\nimages (WSIs) produced by scanners with weakly-supervised multiple instance\nlearning (MIL). However, WSIs are costly, memory-intensive, and require\nextensive analysis time. 
As an alternative, microscopy-based analysis offers\ncost and memory efficiency, though microscopy images face issues with unknown\nabsolute positions and redundant images due to multiple captures from the\nsubjective perspectives of pathologists. To this end, we introduce MicroMIL, a\nweakly-supervised MIL framework specifically built to address these challenges\nby dynamically clustering images using deep cluster embedding (DCE) and Gumbel\nSoftmax for representative image extraction. Graph edges are then constructed\nfrom the upper triangular similarity matrix, with nodes connected to their most\nsimilar neighbors, and a graph neural network (GNN) is utilized to capture\nlocal and diverse areas of contextual information. Unlike existing graph-based\nMIL methods designed for WSIs that require absolute positions, MicroMIL\nefficiently handles the graph edges without this need. Extensive evaluations on\nreal-world colon cancer (Seegene) and public BreakHis datasets demonstrate that\nMicroMIL outperforms state-of-the-art (SOTA) methods, offering a robust and\nefficient solution for patient diagnosis using microscopy images. The code is\navailable at https://anonymous.4open.science/r/MicroMIL-6C7C\n","authors":["JongWoo Kim","Bryan Wong","YoungSin Ko","MunYong Yi"],"pdf_url":"https://arxiv.org/pdf/2407.21604v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2407.21600v1","updated":"2024-07-31T13:34:14Z","published":"2024-07-31T13:34:14Z","title":"Robust Simultaneous Multislice MRI Reconstruction Using Deep Generative\n Priors","summary":" Simultaneous multislice (SMS) imaging is a powerful technique for\naccelerating magnetic resonance imaging (MRI) acquisitions. However, SMS\nreconstruction remains challenging due to the complex signal interactions\nbetween and within the excited slices. This study presents a robust SMS MRI\nreconstruction method using deep generative priors. Starting from Gaussian\nnoise, we leverage denoising diffusion probabilistic models (DDPM) to gradually\nrecover the individual slices through reverse diffusion iterations while\nimposing data consistency from the measured k-space under readout concatenation\nframework. The posterior sampling procedure is designed such that the DDPM\ntraining can be performed on single-slice images without special adjustments\nfor SMS tasks. Additionally, our method integrates a low-frequency enhancement\n(LFE) module to address a practical issue that SMS-accelerated fast spin echo\n(FSE) and echo-planar imaging (EPI) sequences cannot easily embed\nautocalibration signals. Extensive experiments demonstrate that our approach\nconsistently outperforms existing methods and generalizes well to unseen\ndatasets. The code is available at https://github.com/Solor-pikachu/ROGER after\nthe review process.\n","authors":["Shoujin Huang","Guanxiong Luo","Yuwan Wang","Kexin Yang","Lingyan Zhang","Jingzhe Liu","Hua Guo","Min Wang","Mengye Lyu"],"pdf_url":"https://arxiv.org/pdf/2407.21600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21596v1","updated":"2024-07-31T13:32:10Z","published":"2024-07-31T13:32:10Z","title":"Evaluating SAM2's Role in Camouflaged Object Detection: From SAM to SAM2","summary":" The Segment Anything Model (SAM), introduced by Meta AI Research as a generic\nobject segmentation model, quickly garnered widespread attention and\nsignificantly influenced the academic community. 
To extend its application to\nvideo, Meta further develops Segment Anything Model 2 (SAM2), a unified model\ncapable of both video and image segmentation. SAM2 shows notable improvements\nover its predecessor in terms of applicable domains, promptable segmentation\naccuracy, and running speed. However, this report reveals a decline in SAM2's\nability to perceive different objects in images without prompts in its auto\nmode, compared to SAM. Specifically, we employ the challenging task of\ncamouflaged object detection to assess this performance decrease, hoping to\ninspire further exploration of the SAM model family by researchers. The results\nof this paper are provided in \\url{https://github.com/luckybird1994/SAMCOD}.\n","authors":["Lv Tang","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2407.21596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21586v1","updated":"2024-07-31T13:19:39Z","published":"2024-07-31T13:19:39Z","title":"Adaptive Mix for Semi-Supervised Medical Image Segmentation","summary":" Mix-up is a key technique for consistency regularization-based\nsemi-supervised learning methods, generating strong-perturbed samples for\nstrong-weak pseudo-supervision. Existing mix-up operations are performed either\nrandomly or with predefined rules, such as replacing low-confidence patches\nwith high-confidence ones. The former lacks control over the perturbation\ndegree, leading to overfitting on randomly perturbed samples, while the latter\ntends to generate images with trivial perturbations, both of which limit the\neffectiveness of consistency learning. This paper aims to answer the following\nquestion: How can image mix-up perturbation be adaptively performed during\ntraining? To this end, we propose an Adaptive Mix algorithm (AdaMix) for image\nmix-up in a self-paced learning manner. Given that, in general, a model's\nperformance gradually improves during training, AdaMix is equipped with a\nself-paced curriculum that, in the initial training stage, provides relatively\nsimple perturbed samples and then gradually increases the difficulty of\nperturbed images by adaptively controlling the perturbation degree based on the\nmodel's learning state estimated by a self-paced regularizer. We develop three\nframeworks with our AdaMix, i.e., AdaMix-ST, AdaMix-MT, and AdaMix-CT, for\nsemi-supervised medical image segmentation. Extensive experiments on three\npublic datasets, including both 2D and 3D modalities, show that the proposed\nframeworks are capable of achieving superior performance. For example, compared\nwith the state-of-the-art, AdaMix-CT achieves relative improvements of 2.62% in\nDice and 48.25% in average surface distance on the ACDC dataset with 10%\nlabeled data. The results demonstrate that mix-up operations with dynamically\nadjusted perturbation strength based on the segmentation model's state can\nsignificantly enhance the effectiveness of consistency regularization.\n","authors":["Zhiqiang Shen","Peng Cao","Junming Su","Jinzhu Yang","Osmar R. Zaiane"],"pdf_url":"https://arxiv.org/pdf/2407.21586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21581v1","updated":"2024-07-31T13:11:14Z","published":"2024-07-31T13:11:14Z","title":"InScope: A New Real-world 3D Infrastructure-side Collaborative\n Perception Dataset for Open Traffic Scenarios","summary":" Perception systems of autonomous vehicles are susceptible to occlusion,\nespecially when examined from a vehicle-centric perspective. 
Such occlusion can\nlead to overlooked object detections, e.g., larger vehicles such as trucks or\nbuses may create blind spots where cyclists or pedestrians could be obscured,\naccentuating the safety concerns associated with such perception system\nlimitations. To mitigate these challenges, the vehicle-to-everything (V2X)\nparadigm suggests employing an infrastructure-side perception system (IPS) to\ncomplement autonomous vehicles with a broader perceptual scope. Nevertheless,\nthe scarcity of real-world 3D infrastructure-side datasets constrains the\nadvancement of V2X technologies. To bridge these gaps, this paper introduces a\nnew 3D infrastructure-side collaborative perception dataset, abbreviated as\nInScope. Notably, InScope is the first dataset dedicated to addressing\nocclusion challenges by strategically deploying multiple-position Light\nDetection and Ranging (LiDAR) systems on the infrastructure side. Specifically,\nInScope encapsulates a 20-day capture duration with 303 tracking trajectories\nand 187,787 3D bounding boxes annotated by experts. Four different benchmarks\nare presented for open traffic scenarios,\nincluding collaborative 3D object detection, multisource data fusion, data\ndomain transfer, and 3D multiobject tracking tasks. Additionally, a new metric\nis designed to quantify the impact of occlusion, facilitating the evaluation of\ndetection degradation ratios among various algorithms. The experimental\nfindings showcase the enhanced performance of leveraging InScope to assist in\ndetecting and tracking 3D multiobjects in real-world scenarios, particularly in\ntracking obscured, small, and distant objects. The dataset and benchmarks are\navailable at https://github.com/xf-zh/InScope.\n","authors":["Xiaofei Zhang","Yining Li","Jinping Wang","Xiangyi Qin","Ying Shen","Zhengping Fan","Xiaojun Tan"],"pdf_url":"https://arxiv.org/pdf/2407.21581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21580v1","updated":"2024-07-31T13:10:59Z","published":"2024-07-31T13:10:59Z","title":"Voxel Scene Graph for Intracranial Hemorrhage","summary":" Patients with Intracranial Hemorrhage (ICH) face a potentially\nlife-threatening condition, and patient-centered individualized treatment\nremains challenging due to possible clinical complications. Deep-Learning-based\nmethods can efficiently analyze the routinely acquired head CTs to support the\nclinical decision-making. The majority of early work focuses on the detection\nand segmentation of ICH, but does not model the complex relations between ICH and\nadjacent brain structures. In this work, we design a tailored object detection\nmethod for ICH, which we unite with segmentation-grounded Scene Graph\nGeneration (SGG) methods to learn a holistic representation of the clinical\ncerebral scene. To the best of our knowledge, this is the first application of\nSGG for 3D voxel images. We evaluate our method on two head-CT datasets and\ndemonstrate that our model can recall up to 74% of clinically relevant\nrelations. This work lays the foundation towards SGG for 3D voxel data. The\ngenerated Scene Graphs can already provide insights for the clinician, but are\nalso valuable for all downstream tasks as a compact and interpretable\nrepresentation.\n","authors":["Antoine P. Sanner","Nils F. Grauhan","Marc A. Brockmann","Ahmed E. 
Othman","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2407.21580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00569v3","updated":"2024-07-31T13:08:22Z","published":"2024-06-30T03:04:11Z","title":"Investigating and Mitigating the Multimodal Hallucination Snowballing in\n Large Vision-Language Models","summary":" Though advanced in understanding visual information with human languages,\nLarge Vision-Language Models (LVLMs) still suffer from multimodal\nhallucinations. A natural concern is that during multimodal interaction, the\ngenerated hallucinations could influence the LVLMs' subsequent generation.\nThus, we raise a question: When presented with a query relevant to the\npreviously generated hallucination, will LVLMs be misled and respond\nincorrectly, even though the ground visual information exists? To answer this,\nwe propose a framework called MMHalSnowball to evaluate LVLMs' behaviors when\nencountering generated hallucinations, where LVLMs are required to answer\nspecific visual questions within a curated hallucinatory conversation.\nCrucially, our experiment shows that the performance of open-source LVLMs drops\nby at least $31\\%$, indicating that LVLMs are prone to accept the generated\nhallucinations and make false claims that they would not have supported without\ndistractions. We term this phenomenon Multimodal Hallucination Snowballing. To\nmitigate this, we further propose a training-free method called Residual Visual\nDecoding, where we revise the output distribution of LVLMs with the one derived\nfrom the residual visual input, providing models with direct access to the\nvisual information. Experiments show that our method can mitigate more than\n$24\\%$ of the snowballed multimodal hallucination while maintaining\ncapabilities.\n","authors":["Weihong Zhong","Xiaocheng Feng","Liang Zhao","Qiming Li","Lei Huang","Yuxuan Gu","Weitao Ma","Yuan Xu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2407.00569v3.pdf","comment":"Accepted to ACL 2024 Main Conference. 21 pages, 20 figures"},{"id":"http://arxiv.org/abs/2211.06088v2","updated":"2024-07-31T13:05:56Z","published":"2022-11-11T09:44:23Z","title":"RepGhost: A Hardware-Efficient Ghost Module via Re-parameterization","summary":" Feature reuse has been a key technique in light-weight convolutional neural\nnetworks (CNNs) architecture design. Current methods usually utilize a\nconcatenation operator to keep large channel numbers cheaply (thus large\nnetwork capacity) by reusing feature maps from other layers. Although\nconcatenation is parameters- and FLOPs-free, its computational cost on hardware\ndevices is non-negligible. To address this, this paper provides a new\nperspective to realize feature reuse implicitly and more efficiently instead of\nconcatenation. A novel hardware-efficient RepGhost module is proposed for\nimplicit feature reuse via reparameterization, instead of using concatenation\noperator. Based on the RepGhost module, we develop our efficient RepGhost\nbottleneck and RepGhostNet. Experiments on ImageNet and COCO benchmarks\ndemonstrate that our RepGhostNet is much more effective and efficient than\nGhostNet and MobileNetV3 on mobile devices. Specially, our RepGhostNet\nsurpasses GhostNet 0.5x by 2.5% Top-1 accuracy on ImageNet dataset with less\nparameters and comparable latency on an ARM-based mobile device. 
Code and model\nweights are available at https://github.com/ChengpengChen/RepGhost.\n","authors":["Chengpeng Chen","Zichao Guo","Haien Zeng","Pengfei Xiong","Jian Dong"],"pdf_url":"https://arxiv.org/pdf/2211.06088v2.pdf","comment":"tech report"},{"id":"http://arxiv.org/abs/2407.21577v1","updated":"2024-07-31T13:05:32Z","published":"2024-07-31T13:05:32Z","title":"Multi-Site Class-Incremental Learning with Weighted Experts in\n Echocardiography","summary":" Building an echocardiography view classifier that maintains performance in\nreal-life cases requires diverse multi-site data, and frequent updates with\nnewly available data to mitigate model drift. Simply fine-tuning on new\ndatasets results in \"catastrophic forgetting\", and cannot adapt to variations\nof view labels between sites. Alternatively, collecting all data on a single\nserver and re-training may not be feasible as data sharing agreements may\nrestrict image transfer, or datasets may only become available at different\ntimes. Furthermore, time and cost associated with re-training grows with every\nnew dataset. We propose a class-incremental learning method which learns an\nexpert network for each dataset, and combines all expert networks with a score\nfusion model. The influence of ``unqualified experts'' is minimised by\nweighting each contribution with a learnt in-distribution score. These weights\npromote transparency as the contribution of each expert is known during\ninference. Instead of using the original images, we use learned features from\neach dataset, which are easier to share and raise fewer licensing and privacy\nconcerns. We validate our work on six datasets from multiple sites,\ndemonstrating significant reductions in training time while improving view\nclassification performance.\n","authors":["Kit M. Bransby","Woo-jin Cho Kim","Jorge Oliveira","Alex Thorley","Arian Beqiri","Alberto Gomez","Agisilaos Chartsias"],"pdf_url":"https://arxiv.org/pdf/2407.21577v1.pdf","comment":"Accepted for Oral at MICCAI workshop ASMUS-2024"},{"id":"http://arxiv.org/abs/2312.03048v3","updated":"2024-07-31T13:02:51Z","published":"2023-12-05T18:34:12Z","title":"DGInStyle: Domain-Generalizable Semantic Segmentation with Image\n Diffusion Models and Stylized Semantic Control","summary":" Large, pretrained latent diffusion models (LDMs) have demonstrated an\nextraordinary ability to generate creative content, specialize to user data\nthrough few-shot fine-tuning, and condition their output on other modalities,\nsuch as semantic maps. However, are they usable as large-scale data generators,\ne.g., to improve tasks in the perception stack, like semantic segmentation? We\ninvestigate this question in the context of autonomous driving, and answer it\nwith a resounding \"yes\". We propose an efficient data generation pipeline\ntermed DGInStyle. First, we examine the problem of specializing a pretrained\nLDM to semantically-controlled generation within a narrow domain. Second, we\npropose a Style Swap technique to endow the rich generative prior with the\nlearned semantic control. Third, we design a Multi-resolution Latent Fusion\ntechnique to overcome the bias of LDMs towards dominant objects. Using\nDGInStyle, we generate a diverse dataset of street scenes, train a\ndomain-agnostic semantic segmentation model on it, and evaluate the model on\nmultiple popular autonomous driving datasets. Our approach consistently\nincreases the performance of several domain generalization methods compared to\nthe previous state-of-the-art methods. 
The source code and the generated\ndataset are available at https://dginstyle.github.io.\n","authors":["Yuru Jia","Lukas Hoyer","Shengyu Huang","Tianfu Wang","Luc Van Gool","Konrad Schindler","Anton Obukhov"],"pdf_url":"https://arxiv.org/pdf/2312.03048v3.pdf","comment":"ECCV 2024, camera ready"},{"id":"http://arxiv.org/abs/2407.17152v2","updated":"2024-07-31T12:56:22Z","published":"2024-07-24T10:51:46Z","title":"XMeCap: Meme Caption Generation with Sub-Image Adaptability","summary":" Humor, deeply rooted in societal meanings and cultural details, poses a\nunique challenge for machines. While advances have been made in natural\nlanguage processing, real-world humor often thrives in a multi-modal context,\nencapsulated distinctively by memes. This paper poses a particular emphasis on\nthe impact of multi-images on meme captioning. After that, we introduce the\n\\textsc{XMeCap} framework, a novel approach that adopts supervised fine-tuning\nand reinforcement learning based on an innovative reward model, which factors\nin both global and local similarities between visuals and text. Our results,\nbenchmarked against contemporary models, manifest a marked improvement in\ncaption generation for both single-image and multi-image memes, as well as\ndifferent meme categories. \\textsc{XMeCap} achieves an average evaluation score\nof 75.85 for single-image memes and 66.32 for multi-image memes, outperforming\nthe best baseline by 3.71\\% and 4.82\\%, respectively. This research not only\nestablishes a new frontier in meme-related studies but also underscores the\npotential of machines in understanding and generating humor in a multi-modal\nsetting.\n","authors":["Yuyan Chen","Songzhou Yan","Zhihong Zhu","Zhixu Li","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.17152v2.pdf","comment":"Accepted to MM 2024"},{"id":"http://arxiv.org/abs/2404.02830v2","updated":"2024-07-31T12:34:39Z","published":"2024-04-03T16:04:59Z","title":"Enhancing Interpretability of Vertebrae Fracture Grading using\n Human-interpretable Prototypes","summary":" Vertebral fracture grading classifies the severity of vertebral fractures,\nwhich is a challenging task in medical imaging and has recently attracted Deep\nLearning (DL) models. Only a few works attempted to make such models\nhuman-interpretable despite the need for transparency and trustworthiness in\ncritical use cases like DL-assisted medical diagnosis. Moreover, such models\neither rely on post-hoc methods or additional annotations. In this work, we\npropose a novel interpretable-by-design method, ProtoVerse, to find relevant\nsub-parts of vertebral fractures (prototypes) that reliably explain the model's\ndecision in a human-understandable way. Specifically, we introduce a novel\ndiversity-promoting loss to mitigate prototype repetitions in small datasets\nwith intricate semantics. We have experimented with the VerSe'19 dataset and\noutperformed the existing prototype-based method. Further, our model provides\nsuperior interpretability against the post-hoc method. 
Importantly, expert\nradiologists validated the visual interpretability of our results, showing\nclinical applicability.\n","authors":["Poulami Sinhamahapatra","Suprosanna Shit","Anjany Sekuboyina","Malek Husseini","David Schinz","Nicolas Lenhart","Joern Menze","Jan Kirschke","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2404.02830v2.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:015"},{"id":"http://arxiv.org/abs/2407.21554v1","updated":"2024-07-31T12:22:57Z","published":"2024-07-31T12:22:57Z","title":"Conditioned Prompt-Optimization for Continual Deepfake Detection","summary":" The rapid advancement of generative models has significantly enhanced the\nrealism and customization of digital content creation. The increasing power of\nthese tools, coupled with their ease of access, fuels the creation of\nphotorealistic fake content, termed deepfakes, that raises substantial concerns\nabout their potential misuse. In response, there has been notable progress in\ndeveloping detection mechanisms to identify content produced by these advanced\nsystems. However, existing methods often struggle to adapt to the continuously\nevolving landscape of deepfake generation. This paper introduces Prompt2Guard,\na novel solution for exemplar-free continual deepfake detection of images, that\nleverages Vision-Language Models (VLMs) and domain-specific multimodal prompts.\nCompared to previous VLM-based approaches that are either bounded by prompt\nselection accuracy or necessitate multiple forward passes, we leverage a\nprediction ensembling technique with read-only prompts. Read-only prompts do\nnot interact with VLMs internal representation, mitigating the need for\nmultiple forward passes. Thus, we enhance efficiency and accuracy in detecting\ngenerated content. Additionally, our method exploits a text-prompt conditioning\ntailored to deepfake detection, which we demonstrate is beneficial in our\nsetting. We evaluate Prompt2Guard on CDDB-Hard, a continual deepfake detection\nbenchmark composed of five deepfake detection datasets spanning multiple\ndomains and generators, achieving a new state-of-the-art. Additionally, our\nresults underscore the effectiveness of our approach in addressing the\nchallenges posed by continual deepfake detection, paving the way for more\nrobust and adaptable solutions in deepfake detection.\n","authors":["Francesco Laiti","Benedetta Liberatori","Thomas De Min","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2407.21554v1.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2404.05584v2","updated":"2024-07-31T11:54:48Z","published":"2024-04-08T14:59:53Z","title":"Neural Cellular Automata for Lightweight, Robust and Explainable\n Classification of White Blood Cell Images","summary":" Diagnosis of hematological malignancies depends on accurate identification of\nwhite blood cells in peripheral blood smears. Deep learning techniques are\nemerging as a viable solution to scale and optimize this process by automatic\ncell classification. However, these techniques face several challenges such as\nlimited generalizability, sensitivity to domain shifts, and lack of\nexplainability. Here, we introduce a novel approach for white blood cell\nclassification based on neural cellular automata (NCA). We test our approach on\nthree datasets of white blood cell images and show that we achieve competitive\nperformance compared to conventional methods. 
Our NCA-based method is\nsignificantly smaller in terms of parameters and exhibits robustness to domain\nshifts. Furthermore, the architecture is inherently explainable, providing\ninsights into the decision process for each classification, which helps to\nunderstand and validate model predictions. Our results demonstrate that NCA can\nbe used for image classification, and that they address key challenges of\nconventional methods, indicating a high potential for applicability in clinical\npractice.\n","authors":["Michael Deutges","Ario Sadafi","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2404.05584v2.pdf","comment":"Accepted for publication at the 27th International Conference on\n Medical Image Computing and Computer Assisted Intervention - MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.21534v1","updated":"2024-07-31T11:40:29Z","published":"2024-07-31T11:40:29Z","title":"ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large\n Language Models","summary":" In this work, we propose a training-free method to inject visual referring\ninto Multimodal Large Language Models (MLLMs) through learnable visual token\noptimization. We observe the relationship between text prompt tokens and visual\ntokens in MLLMs, where attention layers model the connection between them. Our\napproach involves adjusting visual tokens from the MLP output during inference,\ncontrolling which text prompt tokens attend to which visual tokens. We optimize\na learnable visual token based on an energy function, enhancing the strength of\nreferential regions in the attention map. This enables detailed region\ndescription and reasoning without the need for substantial training costs or\nmodel retraining. Our method offers a promising direction for integrating\nreferential abilities into MLLMs. Our method support referring with box, mask,\nscribble and point. The results demonstrate that our method exhibits\ncontrollability and interpretability.\n","authors":["Mingrui Wu","Xinyue Cai","Jiayi Ji","Jiale Li","Oucheng Huang","Gen Luo","Hao Fei","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.21534v1.pdf","comment":"Code:https://github.com/mrwu-mac/ControlMLLM"},{"id":"http://arxiv.org/abs/2407.20664v2","updated":"2024-07-31T11:11:20Z","published":"2024-07-30T08:59:05Z","title":"3D-GRES: Generalized 3D Referring Expression Segmentation","summary":" 3D Referring Expression Segmentation (3D-RES) is dedicated to segmenting a\nspecific instance within a 3D space based on a natural language description.\nHowever, current approaches are limited to segmenting a single target,\nrestricting the versatility of the task. To overcome this limitation, we\nintroduce Generalized 3D Referring Expression Segmentation (3D-GRES), which\nextends the capability to segment any number of instances based on natural\nlanguage instructions. In addressing this broader task, we propose the\nMulti-Query Decoupled Interaction Network (MDIN), designed to break down\nmulti-object segmentation tasks into simpler, individual segmentations. MDIN\ncomprises two fundamental components: Text-driven Sparse Queries (TSQ) and\nMulti-object Decoupling Optimization (MDO). TSQ generates sparse point cloud\nfeatures distributed over key targets as the initialization for queries.\nMeanwhile, MDO is tasked with assigning each target in multi-object scenarios\nto different queries while maintaining their semantic consistency. To adapt to\nthis new task, we build a new dataset, namely Multi3DRes. 
Our comprehensive\nevaluations on this dataset demonstrate substantial enhancements over existing\nmodels, thus charting a new path for intricate multi-object 3D scene\ncomprehension. The benchmark and code are available at\nhttps://github.com/sosppxo/MDIN.\n","authors":["Changli Wu","Yihang Liu","Jiayi Ji","Yiwei Ma","Haowei Wang","Gen Luo","Henghui Ding","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.20664v2.pdf","comment":"Accepted by ACM MM 2024 (Oral), Code: https://github.com/sosppxo/MDIN"},{"id":"http://arxiv.org/abs/2407.21525v1","updated":"2024-07-31T11:04:41Z","published":"2024-07-31T11:04:41Z","title":"Skeleton-Based Action Recognition with Spatial-Structural Graph\n Convolution","summary":" Human Activity Recognition (HAR) is a field of study that focuses on\nidentifying and classifying human activities. Skeleton-based Human Activity\nRecognition has received much attention in recent years, where Graph\nConvolutional Network (GCN) based method is widely used and has achieved\nremarkable results. However, the representation of skeleton data and the issue\nof over-smoothing in GCN still need to be studied. 1). Compared to central\nnodes, edge nodes can only aggregate limited neighbor information, and\ndifferent edge nodes of the human body are always structurally related.\nHowever, the information from edge nodes is crucial for fine-grained activity\nrecognition. 2). The Graph Convolutional Network suffers from a significant\nover-smoothing issue, causing nodes to become increasingly similar as the\nnumber of network layers increases. Based on these two ideas, we propose a\ntwo-stream graph convolution method called Spatial-Structural GCN (SpSt-GCN).\nSpatial GCN performs information aggregation based on the topological structure\nof the human body, and structural GCN performs differentiation based on the\nsimilarity of edge node sequences. The spatial connection is fixed, and the\nhuman skeleton naturally maintains this topology regardless of the actions\nperformed by humans. However, the structural connection is dynamic and depends\non the type of movement the human body is performing. Based on this idea, we\nalso propose an entirely data-driven structural connection, which greatly\nincreases flexibility. We evaluate our method on two large-scale datasets,\ni.e., NTU RGB+D and NTU RGB+D 120. The proposed method achieves good results\nwhile being efficient.\n","authors":["Jingyao Wang","Emmanuel Bergeret","Issam Falih"],"pdf_url":"https://arxiv.org/pdf/2407.21525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21519v1","updated":"2024-07-31T10:44:31Z","published":"2024-07-31T10:44:31Z","title":"PhysFlow: Skin tone transfer for remote heart rate estimation through\n conditional normalizing flows","summary":" In recent years, deep learning methods have shown impressive results for\ncamera-based remote physiological signal estimation, clearly surpassing\ntraditional methods. However, the performance and generalization ability of\nDeep Neural Networks heavily depends on rich training data truly representing\ndifferent factors of variation encountered in real applications. Unfortunately,\nmany current remote photoplethysmography (rPPG) datasets lack diversity,\nparticularly in darker skin tones, leading to biased performance of existing\nrPPG approaches. To mitigate this bias, we introduce PhysFlow, a novel method\nfor augmenting skin diversity in remote heart rate estimation using conditional\nnormalizing flows. 
PhysFlow adopts end-to-end training optimization, enabling\nsimultaneous training of supervised rPPG approaches on both original and\ngenerated data. Additionally, we condition our model using CIELAB color space\nskin features directly extracted from the facial videos without the need for\nskin-tone labels. We validate PhysFlow on publicly available datasets,\nUCLA-rPPG and MMPD, demonstrating reduced heart rate error, particularly in\ndark skin tones. Furthermore, we demonstrate its versatility and adaptability\nacross different data-driven rPPG methods.\n","authors":["Joaquim Comas","Antonia Alomar","Adria Ruiz","Federico Sukno"],"pdf_url":"https://arxiv.org/pdf/2407.21519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21517v1","updated":"2024-07-31T10:38:11Z","published":"2024-07-31T10:38:11Z","title":"A Simple Low-bit Quantization Framework for Video Snapshot Compressive\n Imaging","summary":" Video Snapshot Compressive Imaging (SCI) aims to use a low-speed 2D camera to\ncapture high-speed scene as snapshot compressed measurements, followed by a\nreconstruction algorithm to reconstruct the high-speed video frames.\nState-of-the-art (SOTA) deep learning-based algorithms have achieved impressive\nperformance, yet with heavy computational workload. Network quantization is a\npromising way to reduce computational cost. However, a direct low-bit\nquantization will bring large performance drop. To address this challenge, in\nthis paper, we propose a simple low-bit quantization framework (dubbed Q-SCI)\nfor the end-to-end deep learning-based video SCI reconstruction methods which\nusually consist of a feature extraction, feature enhancement, and video\nreconstruction module. Specifically, we first design a high-quality feature\nextraction module and a precise video reconstruction module to extract and\npropagate high-quality features in the low-bit quantized model. In addition, to\nalleviate the information distortion of the Transformer branch in the quantized\nfeature enhancement module, we introduce a shift operation on the query and key\ndistributions to further bridge the performance gap. Comprehensive experimental\nresults manifest that our Q-SCI framework can achieve superior performance,\ne.g., 4-bit quantized EfficientSCI-S derived by our Q-SCI framework can\ntheoretically accelerate the real-valued EfficientSCI-S by 7.8X with only 2.3%\nperformance gap on the simulation testing datasets. Code is available at\nhttps://github.com/mcao92/QuantizedSCI.\n","authors":["Miao Cao","Lishun Wang","Huan Wang","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.21517v1.pdf","comment":"18 pages, Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2311.12682v2","updated":"2024-07-31T10:37:00Z","published":"2023-11-21T15:39:21Z","title":"Transferring to Real-World Layouts: A Depth-aware Framework for Scene\n Adaptation","summary":" Scene segmentation via unsupervised domain adaptation (UDA) enables the\ntransfer of knowledge acquired from source synthetic data to real-world target\ndata, which largely reduces the need for manual pixel-level annotations in the\ntarget domain. To facilitate domain-invariant feature learning, existing\nmethods typically mix data from both the source domain and target domain by\nsimply copying and pasting the pixels. Such vanilla methods are usually\nsub-optimal since they do not take into account how well the mixed layouts\ncorrespond to real-world scenarios. Real-world scenarios are with an inherent\nlayout. 
We observe that semantic categories, such as sidewalks, buildings, and\nsky, display relatively consistent depth distributions, and could be clearly\ndistinguished in a depth map. Based on such observation, we propose a\ndepth-aware framework to explicitly leverage depth estimation to mix the\ncategories and facilitate the two complementary tasks, i.e., segmentation and\ndepth learning in an end-to-end manner. In particular, the framework contains a\nDepth-guided Contextual Filter (DCF) for data augmentation and a cross-task\nencoder for contextual learning. DCF simulates the real-world layouts, while\nthe cross-task encoder further adaptively fuses the complementing features\nbetween two tasks. Besides, it is worth noting that several public datasets do\nnot provide depth annotation. Therefore, we leverage the off-the-shelf depth\nestimation network to generate the pseudo depth. Extensive experiments show\nthat our proposed methods, even with pseudo depth, achieve competitive\nperformance on two widely-used benchmarks, i.e. 77.7 mIoU on GTA to Cityscapes\nand 69.3 mIoU on Synthia to Cityscapes.\n","authors":["Mu Chen","Zhedong Zheng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2311.12682v2.pdf","comment":"ACM MM 2024 (Oral)"},{"id":"http://arxiv.org/abs/2407.21516v1","updated":"2024-07-31T10:36:41Z","published":"2024-07-31T10:36:41Z","title":"Expanding the Medical Decathlon dataset: segmentation of colon and\n colorectal cancer from computed tomography images","summary":" Colorectal cancer is the third-most common cancer in the Western Hemisphere.\nThe segmentation of colon and colorectal cancer by computed tomography is\nan urgent problem in medicine. Indeed, a system capable of solving this problem\nwill enable the detection of colorectal cancer at early stages of the disease,\nfacilitate the search for pathology by the radiologist, and significantly\naccelerate the process of diagnosing the disease. However, scientific\npublications on medical image processing mostly use closed, non-public data.\nThis paper presents an extension of the Medical Decathlon dataset with\ncolorectal markups in order to improve the quality of segmentation algorithms.\nAn experienced radiologist validated the data, categorized it into subsets by\nquality, and published it in the public domain. Based on the obtained results,\nwe trained neural network models of the UNet architecture with 5-part\ncross-validation and achieved a Dice metric quality of $0.6988 \\pm 0.3$. The\npublished markups will improve the quality of colorectal cancer detection and\nsimplify the radiologist's job for study description.\n","authors":["I. M. Chernenkiy","Y. A. Drach","S. R. Mustakimova","V. V. Kazantseva","N. A. Ushakov","S. K. Efetov","M. V. Feldsherov"],"pdf_url":"https://arxiv.org/pdf/2407.21516v1.pdf","comment":"8 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.21510v1","updated":"2024-07-31T10:28:49Z","published":"2024-07-31T10:28:49Z","title":"PEAR: Phrase-Based Hand-Object Interaction Anticipation","summary":" First-person hand-object interaction anticipation aims to predict the\ninteraction process over a forthcoming period based on current scenes and\nprompts. This capability is crucial for embodied intelligence and human-robot\ncollaboration. The complete interaction process involves both pre-contact\ninteraction intention (i.e., hand motion trends and interaction hotspots) and\npost-contact interaction manipulation (i.e., manipulation trajectories and hand\nposes with contact). 
Existing research typically anticipates only interaction\nintention while neglecting manipulation, resulting in incomplete predictions\nand an increased likelihood of intention errors due to the lack of manipulation\nconstraints. To address this, we propose a novel model, PEAR (Phrase-Based\nHand-Object Interaction Anticipation), which jointly anticipates interaction\nintention and manipulation. To handle uncertainties in the interaction process,\nwe employ a twofold approach. Firstly, we perform cross-alignment of verbs,\nnouns, and images to reduce the diversity of hand movement patterns and object\nfunctional attributes, thereby mitigating intention uncertainty. Secondly, we\nestablish bidirectional constraints between intention and manipulation using\ndynamic integration and residual connections, ensuring consistency among\nelements and thus overcoming manipulation uncertainty. To rigorously evaluate\nthe performance of the proposed model, we collect a new task-relevant dataset,\nEGO-HOIP, with comprehensive annotations. Extensive experimental results\ndemonstrate the superiority of our method.\n","authors":["Zichen Zhang","Hongchen Luo","Wei Zhai","Yang Cao","Yu Kang"],"pdf_url":"https://arxiv.org/pdf/2407.21510v1.pdf","comment":"22 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.21498v1","updated":"2024-07-31T10:12:14Z","published":"2024-07-31T10:12:14Z","title":"MaskUno: Switch-Split Block For Enhancing Instance Segmentation","summary":" Instance segmentation is an advanced form of image segmentation which, beyond\ntraditional segmentation, requires identifying individual instances of\nrepeating objects in a scene. Mask R-CNN is the most common architecture for\ninstance segmentation, and improvements to this architecture include steps such\nas benefiting from bounding box refinements, adding semantics, or backbone\nenhancements. In all the proposed variations to date, the problem of competing\nkernels (each class aims to maximize its own accuracy) persists when models try\nto synchronously learn numerous classes. In this paper, we propose mitigating\nthis problem by replacing mask prediction with a Switch-Split block that\nprocesses refined ROIs, classifies them, and assigns them to specialized mask\npredictors. We name the method MaskUno and test it on various models from the\nliterature, which are then trained on multiple classes using the benchmark COCO\ndataset. An increase in the mean Average Precision (mAP) of 2.03% was observed\nfor the high-performing DetectoRS when trained on 80 classes. MaskUno proved to\nenhance the mAP of instance segmentation models regardless of the number and\ntyp\n","authors":["Jawad Haidar","Marc Mouawad","Imad Elhajj","Daniel Asmar"],"pdf_url":"https://arxiv.org/pdf/2407.21498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21497v1","updated":"2024-07-31T10:11:57Z","published":"2024-07-31T10:11:57Z","title":"Mitral Regurgitation Recogniton based on Unsupervised\n Out-of-Distribution Detection with Residual Diffusion Amplification","summary":" Mitral regurgitation (MR) is a serious heart valve disease. Early and\naccurate diagnosis of MR via ultrasound video is critical for timely clinical\ndecision-making and surgical intervention. However, manual MR diagnosis heavily\nrelies on the operator's experience, which may cause misdiagnosis and\ninter-observer variability. 
Since MR data is limited and has large intra-class\nvariability, we propose an unsupervised out-of-distribution (OOD) detection\nmethod to identify MR rather than building a deep classifier. To our knowledge,\nwe are the first to explore OOD in MR ultrasound videos. Our method consists of\na feature extractor, a feature reconstruction model, and a residual\naccumulation amplification algorithm. The feature extractor obtains features\nfrom the video clips and feeds them into the feature reconstruction model to\nrestore the original features. The residual accumulation amplification\nalgorithm then iteratively performs noise feature reconstruction, amplifying\nthe reconstructed error of OOD features. This algorithm is straightforward yet\nefficient and can seamlessly integrate as a plug-and-play component in\nreconstruction-based OOD detection methods. We validated the proposed method on\na large ultrasound dataset containing 893 non-MR and 267 MR videos.\nExperimental results show that our OOD detection method can effectively\nidentify MR samples.\n","authors":["Zhe Liu","Xiliang Zhu","Tong Han","Yuhao Huang","Jian Wang","Lian Liu","Fang Wang","Dong Ni","Zhongshan Gou","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21497v1.pdf","comment":"Accepted by MICCAI MLMI 2024, 11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.21490v1","updated":"2024-07-31T09:59:20Z","published":"2024-07-31T09:59:20Z","title":"Explainable and Controllable Motion Curve Guided Cardiac Ultrasound\n Video Generation","summary":" Echocardiography video is a primary modality for diagnosing heart diseases,\nbut the limited data poses challenges for both clinical teaching and machine\nlearning training. Recently, video generative models have emerged as a\npromising strategy to alleviate this issue. However, previous methods often\nrelied on holistic conditions during generation, hindering the flexible\nmovement control over specific cardiac structures. In this context, we propose\nan explainable and controllable method for echocardiography video generation,\ntaking an initial frame and a motion curve as guidance. Our contributions are\nthree-fold. First, we extract motion information from each heart substructure\nto construct motion curves, enabling the diffusion model to synthesize\ncustomized echocardiography videos by modifying these curves. Second, we\npropose the structure-to-motion alignment module, which can map semantic\nfeatures onto motion curves across cardiac structures. Third, The\nposition-aware attention mechanism is designed to enhance video consistency\nutilizing Gaussian masks with structural position information. Extensive\nexperiments on three echocardiography datasets show that our method outperforms\nothers regarding fidelity and consistency. The full code will be released at\nhttps://github.com/mlmi-2024-72/ECM.\n","authors":["Junxuan Yu","Rusi Chen","Yongsong Zhou","Yanlin Chen","Yaofei Duan","Yuhao Huang","Han Zhou","Tan Tao","Xin Yang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2407.21490v1.pdf","comment":"Accepted by MICCAI MLMI 2024"},{"id":"http://arxiv.org/abs/2407.21475v1","updated":"2024-07-31T09:36:58Z","published":"2024-07-31T09:36:58Z","title":"Fine-gained Zero-shot Video Sampling","summary":" Incorporating a temporal dimension into pretrained image diffusion models for\nvideo generation is a prevalent approach. However, this method is\ncomputationally demanding and necessitates large-scale video datasets. 
More\ncritically, the heterogeneity between image and video datasets often results in\ncatastrophic forgetting of the image expertise. Recent attempts to directly\nextract video snippets from image diffusion models have somewhat mitigated\nthese problems. Nevertheless, these methods can only generate brief video clips\nwith simple movements and fail to capture fine-grained motion or non-grid\ndeformation. In this paper, we propose a novel Zero-Shot video Sampling\nalgorithm, denoted as $\\mathcal{ZS}^2$, capable of directly sampling\nhigh-quality video clips from existing image synthesis methods, such as Stable\nDiffusion, without any training or optimization. Specifically, $\\mathcal{ZS}^2$\nutilizes the dependency noise model and temporal momentum attention to ensure\ncontent consistency and animation coherence, respectively. This ability enables\nit to excel in related tasks, such as conditional and context-specialized video\ngeneration and instruction-guided video editing. Experimental results\ndemonstrate that $\\mathcal{ZS}^2$ achieves state-of-the-art performance in\nzero-shot video generation, occasionally outperforming recent supervised\nmethods.\n Homepage: \\url{https://densechen.github.io/zss/}.\n","authors":["Dengsheng Chen","Jie Hu","Xiaoming Wei","Enhua Wu"],"pdf_url":"https://arxiv.org/pdf/2407.21475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21467v1","updated":"2024-07-31T09:26:20Z","published":"2024-07-31T09:26:20Z","title":"Deep Learning-Based Longitudinal Prediction of Childhood Myopia\n Progression Using Fundus Image Sequences and Baseline Refraction Data","summary":" Childhood myopia constitutes a significant global health concern. It exhibits\nan escalating prevalence and has the potential to evolve into severe,\nirreversible conditions that detrimentally impact familial well-being and\ncreate substantial economic costs. Contemporary research underscores the\nimportance of precisely predicting myopia progression to enable timely and\neffective interventions, thereby averting severe visual impairment in children.\nSuch predictions predominantly rely on subjective clinical assessments, which\nare inherently biased and resource-intensive, thus hindering their widespread\napplication. In this study, we introduce a novel, high-accuracy method for\nquantitatively predicting the myopic trajectory and myopia risk in children\nusing only fundus images and baseline refraction data. This approach was\nvalidated through a six-year longitudinal study of 3,408 children in Henan,\nutilizing 16,211 fundus images and corresponding refractive data. Our method\nbased on deep learning demonstrated predictive accuracy with an error margin of\n0.311D per year and AUC scores of 0.944 and 0.995 for forecasting the risks of\ndeveloping myopia and high myopia, respectively. These findings confirm the\nutility of our model in supporting early intervention strategies and in\nsignificantly reducing healthcare costs, particularly by obviating the need for\nadditional metadata and repeated consultations. Furthermore, our method was\ndesigned to rely only on fundus images and refractive error data, without the\nneed for meta data or multiple inquiries from doctors, strongly reducing the\nassociated medical costs and facilitating large-scale screening. 
Our model can\neven provide good predictions based on only a single time measurement.\nConsequently, the proposed method is an important means to reduce medical\ninequities caused by economic disparities.\n","authors":["Mengtian Kang","Yansong Hu","Shuo Gao","Yuanyuan Liu","Hongbei Meng","Xuemeng Li","Xuhang Chen","Hubin Zhao","Jing Fu","Guohua Hu","Wei Wang","Yanning Dai","Arokia Nathan","Peter Smielewski","Ningli Wang","Shiming Li"],"pdf_url":"https://arxiv.org/pdf/2407.21467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21465v1","updated":"2024-07-31T09:23:57Z","published":"2024-07-31T09:23:57Z","title":"MarvelOVD: Marrying Object Recognition and Vision-Language Models for\n Robust Open-Vocabulary Object Detection","summary":" Learning from pseudo-labels that generated with VLMs~(Vision Language Models)\nhas been shown as a promising solution to assist open vocabulary detection\n(OVD) in recent studies. However, due to the domain gap between VLM and\nvision-detection tasks, pseudo-labels produced by the VLMs are prone to be\nnoisy, while the training design of the detector further amplifies the bias. In\nthis work, we investigate the root cause of VLMs' biased prediction under the\nOVD context. Our observations lead to a simple yet effective paradigm, coded\nMarvelOVD, that generates significantly better training targets and optimizes\nthe learning procedure in an online manner by marrying the capability of the\ndetector with the vision-language model. Our key insight is that the detector\nitself can act as a strong auxiliary guidance to accommodate VLM's inability of\nunderstanding both the ``background'' and the context of a proposal within the\nimage. Based on it, we greatly purify the noisy pseudo-labels via Online Mining\nand propose Adaptive Reweighting to effectively suppress the biased training\nboxes that are not well aligned with the target object. In addition, we also\nidentify a neglected ``base-novel-conflict'' problem and introduce stratified\nlabel assignments to prevent it. Extensive experiments on COCO and LVIS\ndatasets demonstrate that our method outperforms the other state-of-the-arts by\nsignificant margins. Codes are available at https://github.com/wkfdb/MarvelOVD\n","authors":["Kuo Wang","Lechao Cheng","Weikai Chen","Pingping Zhang","Liang Lin","Fan Zhou","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2407.21465v1.pdf","comment":"Codes are available at https://github.com/wkfdb/MarvelOVD"},{"id":"http://arxiv.org/abs/2407.15017v2","updated":"2024-07-31T09:14:29Z","published":"2024-07-22T06:15:59Z","title":"Knowledge Mechanisms in Large Language Models: A Survey and Perspective","summary":" Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. 
We hope this work can help understand knowledge in LLMs\nand provide insights for future research.\n","authors":["Mengru Wang","Yunzhi Yao","Ziwen Xu","Shuofei Qiao","Shumin Deng","Peng Wang","Xiang Chen","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15017v2.pdf","comment":"Ongoing work (v2); add Section 5: Application of Knowledge Mechanism;\n revise Section 6 and 7; fix typos"},{"id":"http://arxiv.org/abs/2407.21454v1","updated":"2024-07-31T08:59:33Z","published":"2024-07-31T08:59:33Z","title":"StreetSurfaceVis: a dataset of crowdsourced street-level imagery with\n semi-automated annotations of road surface type and quality","summary":" Road unevenness significantly impacts the safety and comfort of various\ntraffic participants, especially vulnerable road users such as cyclists and\nwheelchair users. This paper introduces StreetSurfaceVis, a novel dataset\ncomprising 9,122 street-level images collected from a crowdsourcing platform\nand manually annotated by road surface type and quality. The dataset is\nintended to train models for comprehensive surface assessments of road\nnetworks. Existing open datasets are constrained by limited geospatial coverage\nand camera setups, typically excluding cycleways and footways. By crafting a\nheterogeneous dataset, we aim to fill this gap and enable robust models that\nmaintain high accuracy across diverse image sources. However, the frequency\ndistribution of road surface types and qualities is highly imbalanced. We\naddress the challenge of ensuring sufficient images per class while reducing\nmanual annotation by proposing a sampling strategy that incorporates various\nexternal label prediction resources. More precisely, we estimate the impact of\n(1) enriching the image data with OpenStreetMap tags, (2) iterative training\nand application of a custom surface type classification model, (3) amplifying\nunderrepresented classes through prompt-based classification with GPT-4o or\nsimilarity search using image embeddings. We show that utilizing a combination\nof these strategies effectively reduces manual annotation workload while\nensuring sufficient class representation.\n","authors":["Alexandra Kapp","Edith Hoffmann","Esther Weigmann","Helena Mihaljević"],"pdf_url":"https://arxiv.org/pdf/2407.21454v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2403.08383v3","updated":"2024-07-31T08:57:57Z","published":"2024-03-13T09:48:04Z","title":"AFGI: Towards Accurate and Fast-convergent Gradient Inversion Attack in\n Federated Learning","summary":" Federated learning (FL) empowers privacypreservation in model training by\nonly exposing users' model gradients. Yet, FL users are susceptible to gradient\ninversion attacks (GIAs) which can reconstruct ground-truth training data such\nas images based on model gradients. However, reconstructing high-resolution\nimages by existing GIAs faces two challenges: inferior accuracy and\nslow-convergence, especially when duplicating labels exist in the training\nbatch. 
To address these challenges, we present an Accurate and Fast-convergent\nGradient Inversion attack algorithm, called AFGI, with two components: a Label\nRecovery Block (LRB), which can accurately restore duplicating labels of private\nimages based on exposed gradients; and a VME Regularization Term, which includes the\ntotal variance of reconstructed images, the discrepancy between three-channel\nmeans and edges, and between values from exposed gradients and reconstructed\nimages, respectively. AFGI can be regarded as a white-box attack strategy\nthat reconstructs images by leveraging the labels recovered by LRB. In particular,\nAFGI is efficient in that it accurately reconstructs ground-truth images when users'\ntraining batch size is up to 48. Our experimental results show that AFGI\ncan reduce time costs by 85% while achieving superb inversion quality on the\nImageNet dataset. Finally, our study unveils the shortcomings of FL in\nprivacy preservation, prompting the development of more advanced countermeasure\nstrategies.\n","authors":["Can Liu","Jin Wang","Yipeng Zhou","Yachao Yuan","Quanzheng Sheng","Kejie Lu"],"pdf_url":"https://arxiv.org/pdf/2403.08383v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21452v1","updated":"2024-07-31T08:55:57Z","published":"2024-07-31T08:55:57Z","title":"Navigating Beyond Instructions: Vision-and-Language Navigation in\n Obstructed Environments","summary":" Real-world navigation often involves dealing with unexpected obstructions\nsuch as closed doors, moved objects, and unpredictable entities. However,\nmainstream Vision-and-Language Navigation (VLN) tasks typically assume\ninstructions perfectly align with the fixed and predefined navigation graphs\nwithout any obstructions. This assumption overlooks potential discrepancies in\nactual navigation graphs and given instructions, which can cause major failures\nfor both indoor and outdoor agents. To address this issue, we integrate diverse\nobstructions into the R2R dataset by modifying both the navigation graphs and\nvisual observations, introducing an innovative dataset and task, R2R with\nUNexpected Obstructions (R2R-UNO). R2R-UNO contains various types and numbers\nof path obstructions to generate instruction-reality mismatches for VLN\nresearch. Experiments on R2R-UNO reveal that state-of-the-art VLN methods\ninevitably encounter significant challenges when facing such mismatches,\nindicating that they rigidly follow instructions rather than navigate\nadaptively. Therefore, we propose a novel method called ObVLN (Obstructed VLN),\nwhich includes a curriculum training strategy and virtual graph construction to\nhelp agents effectively adapt to obstructed environments. Empirical results\nshow that ObVLN not only maintains robust performance in unobstructed scenarios\nbut also achieves a substantial performance advantage with unexpected\nobstructions.\n","authors":["Haodong Hong","Sen Wang","Zi Huang","Qi Wu","Jiajun Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21452v1.pdf","comment":"Accepted to MM 2024"},{"id":"http://arxiv.org/abs/2407.21450v1","updated":"2024-07-31T08:54:50Z","published":"2024-07-31T08:54:50Z","title":"Forecasting Future Videos from Novel Views via Disentangled 3D Scene\n Representation","summary":" Video extrapolation in space and time (VEST) enables viewers to forecast a 3D\nscene into the future and view it from novel viewpoints. 
Recent methods propose\nto learn an entangled representation, aiming to model layered scene geometry,\nmotion forecasting and novel view synthesis together, while assuming simplified\naffine motion and homography-based warping at each scene layer, leading to\ninaccurate video extrapolation. Instead of entangled scene representation and\nrendering, our approach chooses to disentangle scene geometry from scene\nmotion, via lifting the 2D scene to 3D point clouds, which enables high quality\nrendering of future videos from novel views. To model future 3D scene motion,\nwe propose a disentangled two-stage approach that initially forecasts\nego-motion and subsequently the residual motion of dynamic objects (e.g., cars,\npeople). This approach ensures more precise motion predictions by reducing\ninaccuracies from entanglement of ego-motion with dynamic object motion, where\nbetter ego-motion forecasting could significantly enhance the visual outcomes.\nExtensive experimental analysis on two urban scene datasets demonstrate\nsuperior performance of our proposed method in comparison to strong baselines.\n","authors":["Sudhir Yarram","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.21450v1.pdf","comment":"Accepted to ECCV 2024. Project Page:\n https://skrya.github.io/projects/ffn-dsr/"},{"id":"http://arxiv.org/abs/2407.21448v1","updated":"2024-07-31T08:53:10Z","published":"2024-07-31T08:53:10Z","title":"Accelerating Image Super-Resolution Networks with Pixel-Level\n Classification","summary":" In recent times, the need for effective super-resolution (SR) techniques has\nsurged, especially for large-scale images ranging 2K to 8K resolutions. For\nDNN-based SISR, decomposing images into overlapping patches is typically\nnecessary due to computational constraints. In such patch-decomposing scheme,\none can allocate computational resources differently based on each patch's\ndifficulty to further improve efficiency while maintaining SR performance.\nHowever, this approach has a limitation: computational resources is uniformly\nallocated within a patch, leading to lower efficiency when the patch contain\npixels with varying levels of restoration difficulty. To address the issue, we\npropose the Pixel-level Classifier for Single Image Super-Resolution (PCSR), a\nnovel method designed to distribute computational resources adaptively at the\npixel level. A PCSR model comprises a backbone, a pixel-level classifier, and a\nset of pixel-level upsamplers with varying capacities. The pixel-level\nclassifier assigns each pixel to an appropriate upsampler based on its\nrestoration difficulty, thereby optimizing computational resource usage. Our\nmethod allows for performance and computational cost balance during inference\nwithout re-training. Our experiments demonstrate PCSR's advantage over existing\npatch-distributing methods in PSNR-FLOP trade-offs across different backbone\nmodels and benchmarks. The code is available at\nhttps://github.com/3587jjh/PCSR.\n","authors":["Jinho Jeong","Jinwoo Kim","Younghyun Jo","Seon Joo Kim"],"pdf_url":"https://arxiv.org/pdf/2407.21448v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20566v2","updated":"2024-07-31T08:45:19Z","published":"2024-07-30T05:45:06Z","title":"Monocular Human-Object Reconstruction in the Wild","summary":" Learning the prior knowledge of the 3D human-object spatial relation is\ncrucial for reconstructing human-object interaction from images and\nunderstanding how humans interact with objects in 3D space. 
Previous works\nlearn this prior from datasets collected in controlled environments, but due to\nthe diversity of domains, they struggle to generalize to real-world scenarios.\nTo overcome this limitation, we present a 2D-supervised method that learns the\n3D human-object spatial relation prior purely from 2D images in the wild. Our\nmethod utilizes a flow-based neural network to learn the prior distribution of\nthe 2D human-object keypoint layout and viewports for each image in the\ndataset. The effectiveness of the prior learned from 2D images is demonstrated\non the human-object reconstruction task by applying the prior to tune the\nrelative pose between the human and the object during the post-optimization\nstage. To validate and benchmark our method on in-the-wild images, we collect\nthe WildHOI dataset from the YouTube website, which consists of various\ninteractions with 8 objects in real-world scenarios. We conduct the experiments\non the indoor BEHAVE dataset and the outdoor WildHOI dataset. The results show\nthat our method achieves almost comparable performance with fully 3D supervised\nmethods on the BEHAVE dataset, even if we have only utilized the 2D layout\ninformation, and outperforms previous methods in terms of generality and\ninteraction diversity on in-the-wild images.\n","authors":["Chaofan Huo","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20566v2.pdf","comment":"Accepted by MM '24"},{"id":"http://arxiv.org/abs/2407.18839v2","updated":"2024-07-31T08:43:34Z","published":"2024-07-26T16:02:37Z","title":"Scalable Group Choreography via Variational Phase Manifold Learning","summary":" Generating group dance motion from the music is a challenging task with\nseveral industrial applications. Although several methods have been proposed to\ntackle this problem, most of them prioritize optimizing the fidelity in dancing\nmovement, constrained by predetermined dancer counts in datasets. This\nlimitation impedes adaptability to real-world applications. Our study addresses\nthe scalability problem in group choreography while preserving naturalness and\nsynchronization. In particular, we propose a phase-based variational generative\nmodel for group dance generation on learning a generative manifold. Our method\nachieves high-fidelity group dance motion and enables the generation with an\nunlimited number of dancers while consuming only a minimal and constant amount\nof memory. The intensive experiments on two public datasets show that our\nproposed method outperforms recent state-of-the-art approaches by a large\nmargin and is scalable to a great number of dancers beyond the training data.\n","authors":["Nhat Le","Khoa Do","Xuan Bui","Tuong Do","Erman Tjiputra","Quang D. Tran","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.18839v2.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21438v1","updated":"2024-07-31T08:42:48Z","published":"2024-07-31T08:42:48Z","title":"A Plug-and-Play Method for Rare Human-Object Interactions Detection by\n Bridging Domain Gap","summary":" Human-object interactions (HOI) detection aims at capturing human-object\npairs in images and corresponding actions. It is an important step toward\nhigh-level visual reasoning and scene understanding. However, due to the\nnatural bias from the real world, existing methods mostly struggle with rare\nhuman-object pairs and lead to sub-optimal results. 
Recently, with the\ndevelopment of the generative model, a straightforward approach is to construct\na more balanced dataset based on a group of supplementary samples.\nUnfortunately, there is a significant domain gap between the generated data and\nthe original data, and simply merging the generated images into the original\ndataset cannot significantly boost the performance. To alleviate the above\nproblem, we present a novel model-agnostic framework called\n\textbf{C}ontext-\textbf{E}nhanced \textbf{F}eature \textbf{A}lignment (CEFA)\nmodule, which can effectively align the generated data with the original data\nat the feature level and bridge the domain gap. Specifically, CEFA consists of\na feature alignment module and a context enhancement module. On one hand,\nconsidering the crucial role of human-object pairs information in HOI tasks,\nthe feature alignment module aligns the human-object pairs by aggregating\ninstance information. On the other hand, to mitigate the issue of losing\nimportant context information caused by the traditional discriminator-style\nalignment method, we employ a context-enhanced image reconstruction module to\nimprove the model's learning ability of contextual cues. Extensive experiments\nhave shown that our method can serve as a plug-and-play module to improve the\ndetection performance of HOI models on rare\ncategories\footnote{https://github.com/LijunZhang01/CEFA}.\n","authors":["Lijun Zhang","Wei Suo","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21436v1","updated":"2024-07-31T08:38:55Z","published":"2024-07-31T08:38:55Z","title":"Enriching thermal point clouds of buildings using semantic 3D building\n models","summary":" Thermal point clouds integrate thermal radiation and laser point clouds\neffectively. However, the semantic information for the interpretation of\nbuilding thermal point clouds can hardly be precisely inferred. Transferring\nthe semantics encapsulated in 3D building models at LoD3 has a potential to\nfill this gap. In this work, we propose a workflow enriching thermal point\nclouds with the geo-position and semantics of LoD3 building models, which\nutilizes features of both modalities: The proposed method can automatically\nco-register the point clouds from different sources and enrich the thermal\npoint cloud in facade-detailed semantics. The enriched thermal point cloud\nsupports thermal analysis and can facilitate the development of currently\nscarce deep learning models operating directly on thermal point clouds.\n","authors":["Jingwei Zhu","Olaf Wysocki","Christoph Holst","Thomas H. Kolbe"],"pdf_url":"https://arxiv.org/pdf/2407.21436v1.pdf","comment":"Accepted to the 3D GeoInfo 2024"},{"id":"http://arxiv.org/abs/2407.21432v1","updated":"2024-07-31T08:33:41Z","published":"2024-07-31T08:33:41Z","title":"Analyzing the impact of semantic LoD3 building models on image-based\n vehicle localization","summary":" Numerous navigation applications rely on data from global navigation\nsatellite systems (GNSS), even though their accuracy is compromised in urban\nareas, posing a significant challenge, particularly for precise autonomous car\nlocalization. Extensive research has focused on enhancing localization accuracy\nby integrating various sensor types to address this issue. 
This paper\nintroduces a novel approach for car localization, leveraging image features\nthat correspond with highly detailed semantic 3D building models. The core\nconcept involves augmenting positioning accuracy by incorporating prior\ngeometric and semantic knowledge into calculations. The work assesses outcomes\nusing Level of Detail 2 (LoD2) and Level of Detail 3 (LoD3) models, analyzing\nwhether facade-enriched models yield superior accuracy. This comprehensive\nanalysis encompasses diverse methods, including off-the-shelf feature matching\nand deep learning, facilitating thorough discussion. Our experiments\ncorroborate that LoD3 enables detecting up to 69\\% more features than using\nLoD2 models. We believe that this study will contribute to the research of\nenhancing positioning accuracy in GNSS-denied urban canyons. It also shows a\npractical application of under-explored LoD3 building models on map-based car\npositioning.\n","authors":["Antonia Bieringer","Olaf Wysocki","Sebastian Tuttas","Ludwig Hoegner","Christoph Holst"],"pdf_url":"https://arxiv.org/pdf/2407.21432v1.pdf","comment":"Accepted to the 3D GeoInfo 2024"},{"id":"http://arxiv.org/abs/2311.12751v4","updated":"2024-07-31T08:24:16Z","published":"2023-11-21T17:52:30Z","title":"Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with\n Spatial Relation Matching","summary":" Navigating drones through natural language commands remains challenging due\nto the dearth of accessible multi-modal datasets and the stringent precision\nrequirements for aligning visual and textual data. To address this pressing\nneed, we introduce GeoText-1652, a new natural language-guided geo-localization\nbenchmark. This dataset is systematically constructed through an interactive\nhuman-computer process leveraging Large Language Model (LLM) driven annotation\ntechniques in conjunction with pre-trained vision models. GeoText-1652 extends\nthe established University-1652 image dataset with spatial-aware text\nannotations, thereby establishing one-to-one correspondences between image,\ntext, and bounding box elements. We further introduce a new optimization\nobjective to leverage fine-grained spatial associations, called blending\nspatial matching, for region-level spatial relation matching. Extensive\nexperiments reveal that our approach maintains a competitive recall rate\ncomparing other prevailing cross-modality methods. This underscores the\npromising potential of our approach in elevating drone control and navigation\nthrough the seamless integration of natural language commands in real-world\nscenarios.\n","authors":["Meng Chu","Zhedong Zheng","Wei Ji","Tingyu Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2311.12751v4.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21422v1","updated":"2024-07-31T08:17:23Z","published":"2024-07-31T08:17:23Z","title":"Generalized Tampered Scene Text Detection in the era of Generative AI","summary":" The rapid advancements of generative AI have fueled the potential of\ngenerative text image editing while simultaneously escalating the threat of\nmisinformation spreading. However, existing forensics methods struggle to\ndetect unseen forgery types that they have not been trained on, leaving the\ndevelopment of a model capable of generalized detection of tampered scene text\nas an unresolved issue. 
To tackle this, we propose a novel task: open-set\ntampered scene text detection, which evaluates forensics models on their\nability to identify both seen and previously unseen forgery types. We have\ncurated a comprehensive, high-quality dataset, featuring the texts tampered by\neight text editing models, to thoroughly assess the open-set generalization\ncapabilities. Further, we introduce a novel and effective pre-training paradigm\nthat subtly alters the texture of selected texts within an image and trains the\nmodel to identify these regions. This approach not only mitigates the scarcity\nof high-quality training data but also enhances models' fine-grained perception\nand open-set generalization abilities. Additionally, we present DAF, a novel\nframework that improves open-set generalization by distinguishing between the\nfeatures of authentic and tampered text, rather than focusing solely on the\ntampered text's features. Our extensive experiments validate the remarkable\nefficacy of our methods. For example, our zero-shot performance can even beat\nthe previous state-of-the-art full-shot model by a large margin. Our dataset\nand code will be open-source.\n","authors":["Chenfan Qu","Yiwu Zhong","Fengjun Guo","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2407.21422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21416v1","updated":"2024-07-31T08:04:32Z","published":"2024-07-31T08:04:32Z","title":"VIPeR: Visual Incremental Place Recognition with Adaptive Mining and\n Lifelong Learning","summary":" Visual place recognition (VPR) is an essential component of many autonomous\nand augmented/virtual reality systems. It enables the systems to robustly\nlocalize themselves in large-scale environments. Existing VPR methods\ndemonstrate attractive performance at the cost of heavy pre-training and\nlimited generalizability. When deployed in unseen environments, these methods\nexhibit significant performance drops. Targeting this issue, we present VIPeR,\na novel approach for visual incremental place recognition with the ability to\nadapt to new environments while retaining the performance of previous\nenvironments. We first introduce an adaptive mining strategy that balances the\nperformance within a single environment and the generalizability across\nmultiple environments. Then, to prevent catastrophic forgetting in lifelong\nlearning, we draw inspiration from human memory systems and design a novel\nmemory bank for our VIPeR. Our memory bank contains a sensory memory, a working\nmemory and a long-term memory, with the first two focusing on the current\nenvironment and the last one for all previously visited environments.\nAdditionally, we propose a probabilistic knowledge distillation to explicitly\nsafeguard the previously learned knowledge. We evaluate our proposed VIPeR on\nthree large-scale datasets, namely Oxford Robotcar, Nordland, and TartanAir.\nFor comparison, we first set a baseline performance with naive finetuning.\nThen, several more recent lifelong learning methods are compared. 
Our VIPeR\nachieves better performance in almost all aspects with the biggest improvement\nof 13.65% in average performance.\n","authors":["Yuhang Ming","Minyang Xu","Xingrui Yang","Weicai Ye","Weihan Wang","Yong Peng","Weichen Dai","Wanzeng Kong"],"pdf_url":"https://arxiv.org/pdf/2407.21416v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.21408v1","updated":"2024-07-31T07:54:26Z","published":"2024-07-31T07:54:26Z","title":"Benchmarking AIGC Video Quality Assessment: A Dataset and Unified Model","summary":" In recent years, artificial intelligence (AI) driven video generation has\ngarnered significant attention due to advancements in stable diffusion and\nlarge language model techniques. Thus, there is a great demand for accurate\nvideo quality assessment (VQA) models to measure the perceptual quality of\nAI-generated content (AIGC) videos as well as optimize video generation\ntechniques. However, assessing the quality of AIGC videos is quite challenging\ndue to the highly complex distortions they exhibit (e.g., unnatural action,\nirrational objects, etc.). Therefore, in this paper, we try to systemically\ninvestigate the AIGC-VQA problem from both subjective and objective quality\nassessment perspectives. For the subjective perspective, we construct a\nLarge-scale Generated Vdeo Quality assessment (LGVQ) dataset, consisting of\n2,808 AIGC videos generated by 6 video generation models using 468 carefully\nselected text prompts. Unlike previous subjective VQA experiments, we evaluate\nthe perceptual quality of AIGC videos from three dimensions: spatial quality,\ntemporal quality, and text-to-video alignment, which hold utmost importance for\ncurrent video generation techniques. For the objective perspective, we\nestablish a benchmark for evaluating existing quality assessment metrics on the\nLGVQ dataset, which reveals that current metrics perform poorly on the LGVQ\ndataset. Thus, we propose a Unify Generated Video Quality assessment (UGVQ)\nmodel to comprehensively and accurately evaluate the quality of AIGC videos\nacross three aspects using a unified model, which uses visual, textual and\nmotion features of video and corresponding prompt, and integrates key features\nto enhance feature expression. We hope that our benchmark can promote the\ndevelopment of quality evaluation metrics for AIGC videos. The LGVQ dataset and\nthe UGVQ metric will be publicly released.\n","authors":["Zhichao Zhang","Xinyue Li","Wei Sun","Jun Jia","Xiongkuo Min","Zicheng Zhang","Chunyi Li","Zijian Chen","Puyi Wang","Zhongpeng Ji","Fengyu Sun","Shangling Jui","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.21408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21402v1","updated":"2024-07-31T07:43:58Z","published":"2024-07-31T07:43:58Z","title":"DD-rPPGNet: De-interfering and Descriptive Feature Learning for\n Unsupervised rPPG Estimation","summary":" Remote Photoplethysmography (rPPG) aims to measure physiological signals and\nHeart Rate (HR) from facial videos. Recent unsupervised rPPG estimation methods\nhave shown promising potential in estimating rPPG signals from facial regions\nwithout relying on ground truth rPPG signals. However, these methods seem\noblivious to interference existing in rPPG signals and still result in\nunsatisfactory performance. In this paper, we propose a novel De-interfered and\nDescriptive rPPG Estimation Network (DD-rPPGNet) to eliminate the interference\nwithin rPPG features for learning genuine rPPG signals. 
First, we investigate\nthe characteristics of local spatial-temporal similarities of interference and\ndesign a novel unsupervised model to estimate the interference. Next, we\npropose an unsupervised de-interfered method to learn genuine rPPG signals with\ntwo stages. In the first stage, we estimate the initial rPPG signals by\ncontrastive learning from both the training data and their augmented\ncounterparts. In the second stage, we use the estimated interference features\nto derive de-interfered rPPG features and encourage the rPPG signals to be\ndistinct from the interference. In addition, we propose an effective\ndescriptive rPPG feature learning by developing a strong 3D Learnable\nDescriptive Convolution (3DLDC) to capture the subtle chrominance changes for\nenhancing rPPG estimation. Extensive experiments conducted on five rPPG\nbenchmark datasets demonstrate that the proposed DD-rPPGNet outperforms\nprevious unsupervised rPPG estimation methods and achieves competitive\nperformances with state-of-the-art supervised rPPG methods.\n","authors":["Pei-Kai Huang","Tzu-Hsien Chen","Ya-Ting Chan","Kuan-Wen Chen","Chiou-Ting Hsu"],"pdf_url":"https://arxiv.org/pdf/2407.21402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16813v3","updated":"2024-07-31T07:33:00Z","published":"2024-05-27T04:14:20Z","title":"SiNGR: Brain Tumor Segmentation via Signed Normalized Geodesic Transform\n Regression","summary":" One of the primary challenges in brain tumor segmentation arises from the\nuncertainty of voxels close to tumor boundaries. However, the conventional\nprocess of generating ground truth segmentation masks fails to treat such\nuncertainties properly. Those \"hard labels\" with 0s and 1s conceptually\ninfluenced the majority of prior studies on brain image segmentation. As a\nresult, tumor segmentation is often solved through voxel classification. In\nthis work, we instead view this problem as a voxel-level regression, where the\nground truth represents a certainty mapping from any pixel to the border of the\ntumor. We propose a novel ground truth label transformation, which is based on\na signed geodesic transform, to capture the uncertainty in brain tumors'\nvicinity. We combine this idea with a Focal-like regression L1-loss that\nenables effective regression learning in high-dimensional output space by\nappropriately weighting voxels according to their difficulty. We thoroughly\nconduct an experimental evaluation to validate the components of our proposed\nmethod, compare it to a diverse array of state-of-the-art segmentation models,\nand show that it is architecture-agnostic. The code of our method is made\npublicly available (\\url{https://github.com/Oulu-IMEDS/SiNGR/}).\n","authors":["Trung Dang","Huy Hoang Nguyen","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2405.16813v3.pdf","comment":"Accepted as a conference paper at MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.21394v1","updated":"2024-07-31T07:32:18Z","published":"2024-07-31T07:32:18Z","title":"Force Sensing Guided Artery-Vein Segmentation via Sequential Ultrasound\n Images","summary":" Accurate identification of arteries and veins in ultrasound images is crucial\nfor vascular examinations and interventions in robotics-assisted surgeries.\nHowever, current methods for ultrasound vessel segmentation face challenges in\ndistinguishing between arteries and veins due to their morphological\nsimilarities. 
To address this challenge, this study introduces a novel force\nsensing guided segmentation approach to enhance artery-vein segmentation\naccuracy by leveraging their distinct deformability. Our proposed method\nutilizes force magnitude to identify key frames with the most significant\nvascular deformation in a sequence of ultrasound images. These key frames are\nthen integrated with the current frame through attention mechanisms, with\nweights assigned in accordance with force magnitude. Our proposed force sensing\nguided framework can be seamlessly integrated into various segmentation\nnetworks and achieves significant performance improvements in multiple U-shaped\nnetworks such as U-Net, Swin-unet and Transunet. Furthermore, we contribute the\nfirst multimodal ultrasound artery-vein segmentation dataset, Mus-V, which\nencompasses both force and image data simultaneously. The dataset comprises\n3114 ultrasound images of carotid and femoral vessels extracted from 105\nvideos, with corresponding force data recorded by the force sensor mounted on\nthe US probe. Our code and dataset will be publicly available.\n","authors":["Yimeng Geng","Gaofeng Meng","Mingcong Chen","Guanglin Cao","Mingyang Zhao","Jianbo Zhao","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21391v1","updated":"2024-07-31T07:31:13Z","published":"2024-07-31T07:31:13Z","title":"Design and Development of Laughter Recognition System Based on\n Multimodal Fusion and Deep Learning","summary":" This study aims to design and implement a laughter recognition system based\non multimodal fusion and deep learning, leveraging image and audio processing\ntechnologies to achieve accurate laughter recognition and emotion analysis.\nFirst, the system loads video files and uses the OpenCV library to extract\nfacial information while employing the Librosa library to process audio\nfeatures such as MFCC. Then, multimodal fusion techniques are used to integrate\nimage and audio features, followed by training and prediction using deep\nlearning models. Evaluation results indicate that the model achieved 80%\naccuracy, precision, and recall on the test dataset, with an F1 score of 80%,\ndemonstrating robust performance and the ability to handle real-world data\nvariability. This study not only verifies the effectiveness of multimodal\nfusion methods in laughter recognition but also highlights their potential\napplications in affective computing and human-computer interaction. Future work\nwill focus on further optimizing feature extraction and model architecture to\nimprove recognition accuracy and expand application scenarios, promoting the\ndevelopment of laughter recognition technology in fields such as mental health\nmonitoring and educational activity evaluation\n","authors":["Fuzheng Zhao","Yu Bai"],"pdf_url":"https://arxiv.org/pdf/2407.21391v1.pdf","comment":"7 pages,2 figures"},{"id":"http://arxiv.org/abs/2407.21385v1","updated":"2024-07-31T07:16:40Z","published":"2024-07-31T07:16:40Z","title":"SmileyNet -- Towards the Prediction of the Lottery by Reading Tea Leaves\n with AI","summary":" We introduce SmileyNet, a novel neural network with psychic abilities. It is\ninspired by the fact that a positive mood can lead to improved cognitive\ncapabilities including classification tasks. The network is hence presented in\na first phase with smileys and an encouraging loss function is defined to bias\nit into a good mood. 
SmileyNet is then used to forecast the flipping of a coin\nbased on an established method of Tasseology, namely by reading tea leaves.\nTraining and testing in this second phase are done with a high-fidelity\nsimulation based on real-world pixels sampled from a professional tea-reading\ncup. SmileyNet has an amazing accuracy of 72% to correctly predict the flip of\na coin. Resnet-34, respectively YOLOv5 achieve only 49%, respectively 53%. It\nis then shown how multiple SmileyNets can be combined to win the lottery.\n","authors":["Andreas Birk"],"pdf_url":"https://arxiv.org/pdf/2407.21385v1.pdf","comment":"This is a satirical accumulation of misconceptions, mistakes, and\n flawed reasoning I have encountered in recent times as a reviewer and\n sometimes even as a reader of published papers. I hope it is entertaining and\n useful in the context of the education of BSc, MSc, and PhD students in\n Machine Learning, Artificial Intelligence, and Cognitive Science"},{"id":"http://arxiv.org/abs/2407.21381v1","updated":"2024-07-31T07:12:06Z","published":"2024-07-31T07:12:06Z","title":"Identity-Consistent Diffusion Network for Grading Knee Osteoarthritis\n Progression in Radiographic Imaging","summary":" Knee osteoarthritis (KOA), a common form of arthritis that causes physical\ndisability, has become increasingly prevalent in society. Employing\ncomputer-aided techniques to automatically assess the severity and progression\nof KOA can greatly benefit KOA treatment and disease management. Particularly,\nthe advancement of X-ray technology in KOA demonstrates its potential for this\npurpose. Yet, existing X-ray prognosis research generally yields a singular\nprogression severity grade, overlooking the potential visual changes for\nunderstanding and explaining the progression outcome. Therefore, in this study,\na novel generative model is proposed, namely Identity-Consistent Radiographic\nDiffusion Network (IC-RDN), for multifaceted KOA prognosis encompassing a\npredicted future knee X-ray scan conditioned on the baseline scan.\nSpecifically, an identity prior module for the diffusion and a downstream\ngeneration-guided progression prediction module are introduced. Compared to\nconventional image-to-image generative models, identity priors regularize and\nguide the diffusion to focus more on the clinical nuances of the prognosis\nbased on a contrastive learning strategy. The progression prediction module\nutilizes both forecasted and baseline knee scans, and a more comprehensive\nformulation of KOA severity progression grading is expected. Extensive\nexperiments on a widely used public dataset, OAI, demonstrate the effectiveness\nof the proposed method.\n","authors":["Wenhua Wu","Kun Hu","Wenxi Yue","Wei Li","Milena Simic","Changyang Li","Wei Xiang","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21381v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20461v2","updated":"2024-07-31T07:00:31Z","published":"2024-07-29T23:40:13Z","title":"Weakly Supervised Intracranial Hemorrhage Segmentation with YOLO and an\n Uncertainty Rectified Segment Anything Model","summary":" Intracranial hemorrhage (ICH) is a life-threatening condition that requires\nrapid and accurate diagnosis to improve treatment outcomes and patient survival\nrates. Recent advancements in supervised deep learning have greatly improved\nthe analysis of medical images, but often rely on extensive datasets with\nhigh-quality annotations, which are costly, time-consuming, and require medical\nexpertise to prepare. 
To mitigate the need for large amounts of expert-prepared\nsegmentation data, we have developed a novel weakly supervised ICH segmentation\nmethod that utilizes the YOLO object detection model and an\nuncertainty-rectified Segment Anything Model (SAM). In addition, we have\nproposed a novel point prompt generator for this model to further improve\nsegmentation results with YOLO-predicted bounding box prompts. Our approach\nachieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along\nwith a mean Dice score of 0.629 for ICH segmentation, outperforming existing\nweakly supervised and popular supervised (UNet and Swin-UNETR) approaches.\nOverall, the proposed method provides a robust and accurate alternative to the\nmore commonly used supervised techniques for ICH quantification without\nrequiring refined segmentation ground truths during model training.\n","authors":["Pascal Spiegler","Amirhossein Rasoulian","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.20461v2.pdf","comment":"Manuscript was accepted at SWITCH2024. 10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.21374v1","updated":"2024-07-31T06:56:46Z","published":"2024-07-31T06:56:46Z","title":"Dynamic Gesture Recognition in Ultra-Range Distance for Effective\n Human-Robot Interaction","summary":" This paper presents a novel approach for ultra-range gesture recognition,\naddressing Human-Robot Interaction (HRI) challenges over extended distances. By\nleveraging human gestures in video data, we propose the Temporal-Spatiotemporal\nFusion Network (TSFN) model that surpasses the limitations of current methods,\nenabling robots to understand gestures from long distances. With applications\nin service robots, search and rescue operations, and drone-based interactions,\nour approach enhances HRI in expansive environments. Experimental validation\ndemonstrates significant advancements in gesture recognition accuracy,\nparticularly in prolonged gesture sequences.\n","authors":["Eran Bamani Beeri","Eden Nissinman","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2407.21374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01853v3","updated":"2024-07-31T06:46:16Z","published":"2023-12-04T12:35:43Z","title":"Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing","summary":" Executing contact-rich manipulation tasks necessitates the fusion of tactile\nand visual feedback. However, the distinct nature of these modalities poses\nsignificant challenges. In this paper, we introduce a system that leverages\nvisual and tactile sensory inputs to enable dexterous in-hand manipulation.\nSpecifically, we propose Robot Synesthesia, a novel point cloud-based tactile\nrepresentation inspired by human tactile-visual synesthesia. This approach\nallows for the simultaneous and seamless integration of both sensory inputs,\noffering richer spatial information and facilitating better reasoning about\nrobot actions. The method, trained in a simulated environment and then deployed\nto a real robot, is applicable to various in-hand object rotation tasks.\nComprehensive ablations are performed on how the integration of vision and\ntouch can improve reinforcement learning and Sim2Real performance. 
Our project\npage is available at https://yingyuan0414.github.io/visuotactile/ .\n","authors":["Ying Yuan","Haichuan Che","Yuzhe Qin","Binghao Huang","Zhao-Heng Yin","Kang-Won Lee","Yi Wu","Soo-Chul Lim","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01853v3.pdf","comment":"Project page: https://yingyuan0414.github.io/visuotactile/"},{"id":"http://arxiv.org/abs/2407.21368v1","updated":"2024-07-31T06:34:38Z","published":"2024-07-31T06:34:38Z","title":"Prompting Medical Large Vision-Language Models to Diagnose Pathologies\n by Visual Question Answering","summary":" Large Vision-Language Models (LVLMs) have achieved significant success in\nrecent years, and they have been extended to the medical domain. Although\ndemonstrating satisfactory performance on medical Visual Question Answering\n(VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem,\nwhich makes them fail to diagnose complex pathologies. Moreover, they readily\nfail to learn minority pathologies due to imbalanced training data. We propose\ntwo prompting strategies for MLVLMs that reduce hallucination and improve VQA\nperformance. In the first strategy, we provide a detailed explanation of the\nqueried pathology. In the second strategy, we fine-tune a cheap, weak learner\nto achieve high performance on a specific metric, and textually provide its\njudgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our\nmethods significantly improve the diagnostic F1 score, with the highest\nincrease being 0.27. We also demonstrate that our prompting strategies can be\nextended to general LVLM domains. Based on POPE metrics, it effectively\nsuppresses the false negative predictions of existing LVLMs and improves Recall\nby approximately 0.07.\n","authors":["Danfeng Guo","Demetri Terzopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.21368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21363v1","updated":"2024-07-31T06:20:21Z","published":"2024-07-31T06:20:21Z","title":"ESIQA: Perceptual Quality Assessment of Vision-Pro-based Egocentric\n Spatial Images","summary":" With the development of eXtended Reality (XR), head-mounted shooting and\ndisplay technology have experienced significant advancement and gained\nconsiderable attention. Egocentric spatial images and videos are emerging as a\ncompelling form of stereoscopic XR content. Different from traditional 2D\nimages, egocentric spatial images present challenges for perceptual quality\nassessment due to their special shooting, processing methods, and stereoscopic\ncharacteristics. However, the corresponding image quality assessment (IQA)\nresearch for egocentric spatial images is still lacking. In this paper, we\nestablish the Egocentric Spatial Images Quality Assessment Database (ESIQAD),\nthe first IQA database dedicated for egocentric spatial images as far as we\nknow. Our ESIQAD includes 500 egocentric spatial images, containing 400 images\ncaptured with the Apple Vision Pro and 100 images generated via an iPhone's\n\"Spatial Camera\" app. The corresponding mean opinion scores (MOSs) are\ncollected under three viewing modes, including 2D display, 3D-window display,\nand 3D-immersive display. Furthermore, based on our database, we conduct a\nbenchmark experiment and evaluate the performance of 22 state-of-the-art IQA\nmodels under three different viewing modes. We hope this research can\nfacilitate future IQA research on egocentric spatial images. 
The database is\navailable at https://github.com/IntMeGroup/ESIQA.\n","authors":["Xilei Zhu","Liu Yang","Huiyu Duan","Xiongkuo Min","Guangtao Zhai","Patrick Le Callet"],"pdf_url":"https://arxiv.org/pdf/2407.21363v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.05586v2","updated":"2024-07-31T06:04:25Z","published":"2024-07-08T03:46:56Z","title":"Dynamic Neural Radiance Field From Defocused Monocular Video","summary":" Dynamic Neural Radiance Field (NeRF) from monocular videos has recently been\nexplored for space-time novel view synthesis and achieved excellent results.\nHowever, defocus blur caused by depth variation often occurs in video capture,\ncompromising the quality of dynamic reconstruction because the lack of sharp\ndetails interferes with modeling temporal consistency between input views. To\ntackle this issue, we propose D2RF, the first dynamic NeRF method designed to\nrestore sharp novel views from defocused monocular videos. We introduce layered\nDepth-of-Field (DoF) volume rendering to model the defocus blur and reconstruct\na sharp NeRF supervised by defocused views. The blur model is inspired by the\nconnection between DoF rendering and volume rendering. The opacity in volume\nrendering aligns with the layer visibility in DoF rendering. To execute the\nblurring, we modify the layered blur kernel to the ray-based kernel and employ\nan optimized sparse kernel to gather the input rays efficiently and render the\noptimized rays with our layered DoF volume rendering. We synthesize a dataset\nwith defocused dynamic scenes for our task, and extensive experiments on our\ndataset show that our method outperforms existing approaches in synthesizing\nall-in-focus novel views from defocus blur while maintaining spatial-temporal\nconsistency in the scene.\n","authors":["Xianrui Luo","Huiqiang Sun","Juewen Peng","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2407.05586v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.11087v2","updated":"2024-07-31T05:46:54Z","published":"2024-07-14T12:22:05Z","title":"Restore-RWKV: Efficient and Effective Medical Image Restoration with\n RWKV","summary":" Transformers have revolutionized medical image restoration, but the quadratic\ncomplexity still poses limitations for their application to high-resolution\nmedical images. The recent advent of RWKV in the NLP field has attracted much\nattention as it can process long sequences efficiently. To leverage its\nadvanced design, we propose Restore-RWKV, the first RWKV-based model for\nmedical image restoration. Since the original RWKV model is designed for 1D\nsequences, we make two necessary modifications for modeling spatial relations\nin 2D images. First, we present a recurrent WKV (Re-WKV) attention mechanism\nthat captures global dependencies with linear computational complexity. Re-WKV\nincorporates bidirectional attention as basic for a global receptive field and\nrecurrent attention to effectively model 2D dependencies from various scan\ndirections. Second, we develop an omnidirectional token shift (Omni-Shift)\nlayer that enhances local dependencies by shifting tokens from all directions\nand across a wide context range. 
These adaptations make the proposed\nRestore-RWKV an efficient and effective model for medical image restoration.\nExtensive experiments demonstrate that Restore-RWKV achieves superior\nperformance across various medical image restoration tasks, including MRI image\nsuper-resolution, CT image denoising, PET image synthesis, and all-in-one\nmedical image restoration. Code is available at:\n\\href{https://github.com/Yaziwel/Restore-RWKV.git}{https://github.com/Yaziwel/Restore-RWKV}.\n","authors":["Zhiwen Yang","Hui Zhang","Dan Zhao","Bingzheng Wei","Yan Xu"],"pdf_url":"https://arxiv.org/pdf/2407.11087v2.pdf","comment":"This paper introduces the first RWKV-based model for image\n restoration"},{"id":"http://arxiv.org/abs/2407.21351v1","updated":"2024-07-31T05:43:36Z","published":"2024-07-31T05:43:36Z","title":"Small Object Few-shot Segmentation for Vision-based Industrial\n Inspection","summary":" Vision-based industrial inspection (VII) aims to locate defects quickly and\naccurately. Supervised learning under a close-set setting and industrial\nanomaly detection, as two common paradigms in VII, face different problems in\npractical applications. The former is that various and sufficient defects are\ndifficult to obtain, while the latter is that specific defects cannot be\nlocated. To solve these problems, in this paper, we focus on the few-shot\nsemantic segmentation (FSS) method, which can locate unseen defects conditioned\non a few annotations without retraining. Compared to common objects in natural\nimages, the defects in VII are small. This brings two problems to current FSS\nmethods: 1 distortion of target semantics and 2 many false positives for\nbackgrounds. To alleviate these problems, we propose a small object few-shot\nsegmentation (SOFS) model. The key idea for alleviating 1 is to avoid the\nresizing of the original image and correctly indicate the intensity of target\nsemantics. SOFS achieves this idea via the non-resizing procedure and the\nprototype intensity downsampling of support annotations. To alleviate 2, we\ndesign an abnormal prior map in SOFS to guide the model to reduce false\npositives and propose a mixed normal Dice loss to preferentially prevent the\nmodel from predicting false positives. SOFS can achieve FSS and few-shot\nanomaly detection determined by support masks. Diverse experiments substantiate\nthe superior performance of SOFS. Code is available at\nhttps://github.com/zhangzilongc/SOFS.\n","authors":["Zilong Zhang","Chang Niu","Zhibin Zhao","Xingwu Zhang","Xuefeng Chen"],"pdf_url":"https://arxiv.org/pdf/2407.21351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21343v1","updated":"2024-07-31T05:17:31Z","published":"2024-07-31T05:17:31Z","title":"MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation\n Framework","summary":" Medical imaging segmentation is a highly active area of research, with deep\nlearning-based methods achieving state-of-the-art results in several\nbenchmarks. However, the lack of standardized tools for training, testing, and\nevaluating new methods makes the comparison of methods difficult. To address\nthis, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple,\nmodular, and end-to-end medical imaging segmentation framework designed to\nfacilitate consistent training, testing, and evaluation of deep learning-based\nmedical imaging segmentation methods. MIST standardizes data analysis,\npreprocessing, and evaluation pipelines, accommodating multiple architectures\nand loss functions. 
This standardization ensures reproducible and fair\ncomparisons across different methods. We detail MIST's data format\nrequirements, pipelines, and auxiliary features and demonstrate its efficacy\nusing the BraTS Adult Glioma Post-Treatment Challenge dataset. Our results\nhighlight MIST's ability to produce accurate segmentation masks and its\nscalability across multiple GPUs, showcasing its potential as a powerful tool\nfor future medical imaging research and development.\n","authors":["Adrian Celaya","Evan Lim","Rachel Glenn","Brayden Mi","Alex Balsells","Tucker Netherton","Caroline Chung","Beatrice Riviere","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2407.21343v1.pdf","comment":"Submitted to BraTS 2024"},{"id":"http://arxiv.org/abs/2407.21341v1","updated":"2024-07-31T05:15:24Z","published":"2024-07-31T05:15:24Z","title":"High-throughput 3D shape completion of potato tubers on a harvester","summary":" Potato yield is an important metric for farmers to further optimize their\ncultivation practices. Potato yield can be estimated on a harvester using an\nRGB-D camera that can estimate the three-dimensional (3D) volume of individual\npotato tubers. A challenge, however, is that the 3D shape derived from RGB-D\nimages is only partially completed, underestimating the actual volume. To\naddress this issue, we developed a 3D shape completion network, called CoRe++,\nwhich can complete the 3D shape from RGB-D images. CoRe++ is a deep learning\nnetwork that consists of a convolutional encoder and a decoder. The encoder\ncompresses RGB-D images into latent vectors that are used by the decoder to\ncomplete the 3D shape using the deep signed distance field network (DeepSDF).\nTo evaluate our CoRe++ network, we collected partial and complete 3D point\nclouds of 339 potato tubers on an operational harvester in Japan. On the 1425\nRGB-D images in the test set (representing 51 unique potato tubers), our\nnetwork achieved a completion accuracy of 2.8 mm on average. For volumetric\nestimation, the root mean squared error (RMSE) was 22.6 ml, and this was better\nthan the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml).\nWe found that the RMSE can be further reduced to 18.2 ml when performing the 3D\nshape completion in the center of the RGB-D image. With an average 3D shape\ncompletion time of 10 milliseconds per tuber, we can conclude that CoRe++ is\nboth fast and accurate enough to be implemented on an operational harvester for\nhigh-throughput potato yield estimation. Our code, network weights and dataset\nare publicly available at\nhttps://github.com/UTokyo-FieldPhenomics-Lab/corepp.git.\n","authors":["Pieter M. Blok","Federico Magistri","Cyrill Stachniss","Haozhou Wang","James Burridge","Wei Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21341v1.pdf","comment":"18 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.21335v1","updated":"2024-07-31T04:57:06Z","published":"2024-07-31T04:57:06Z","title":"On-the-fly Point Feature Representation for Point Clouds Analysis","summary":" Point cloud analysis is challenging due to its unique characteristics of\nunorderness, sparsity and irregularity. Prior works attempt to capture local\nrelationships by convolution operations or attention mechanisms, exploiting\ngeometric information from coordinates implicitly. These methods, however, are\ninsufficient to describe the explicit local geometry, e.g., curvature and\norientation. 
In this paper, we propose On-the-fly Point Feature Representation\n(OPFR), which captures abundant geometric information explicitly through Curve\nFeature Generator module. This is inspired by Point Feature Histogram (PFH)\nfrom computer vision community. However, the utilization of vanilla PFH\nencounters great difficulties when applied to large datasets and dense point\nclouds, as it demands considerable time for feature generation. In contrast, we\nintroduce the Local Reference Constructor module, which approximates the local\ncoordinate systems based on triangle sets. Owing to this, our OPFR only\nrequires extra 1.56ms for inference (65x faster than vanilla PFH) and 0.012M\nmore parameters, and it can serve as a versatile plug-and-play module for\nvarious backbones, particularly MLP-based and Transformer-based backbones\nexamined in this study. Additionally, we introduce the novel Hierarchical\nSampling module aimed at enhancing the quality of triangle sets, thereby\nensuring robustness of the obtained geometric features. Our proposed method\nimproves overall accuracy (OA) on ModelNet40 from 90.7% to 94.5% (+3.8%) for\nclassification, and OA on S3DIS Area-5 from 86.4% to 90.0% (+3.6%) for semantic\nsegmentation, respectively, building upon PointNet++ backbone. When integrated\nwith Point Transformer backbone, we achieve state-of-the-art results on both\ntasks: 94.8% OA on ModelNet40 and 91.7% OA on S3DIS Area-5.\n","authors":["Jiangyi Wang","Zhongyao Cheng","Na Zhao","Jun Cheng","Xulei Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21335v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.21333v1","updated":"2024-07-31T04:49:46Z","published":"2024-07-31T04:49:46Z","title":"Chat2Layout: Interactive 3D Furniture Layout with a Multimodal LLM","summary":" Automatic furniture layout is long desired for convenient interior design.\nLeveraging the remarkable visual reasoning capabilities of multimodal large\nlanguage models (MLLMs), recent methods address layout generation in a static\nmanner, lacking the feedback-driven refinement essential for interactive user\nengagement. We introduce Chat2Layout, a novel interactive furniture layout\ngeneration system that extends the functionality of MLLMs into the realm of\ninteractive layout design. To achieve this, we establish a unified\nvision-question paradigm for in-context learning, enabling seamless\ncommunication with MLLMs to steer their behavior without altering model\nweights. Within this framework, we present a novel training-free visual\nprompting mechanism. This involves a visual-text prompting technique that\nassist MLLMs in reasoning about plausible layout plans, followed by an\nOffline-to-Online search (O2O-Search) method, which automatically identifies\nthe minimal set of informative references to provide exemplars for visual-text\nprompting. By employing an agent system with MLLMs as the core controller, we\nenable bidirectional interaction. The agent not only comprehends the 3D\nenvironment and user requirements through linguistic and visual perception but\nalso plans tasks and reasons about actions to generate and arrange furniture\nwithin the virtual space. Furthermore, the agent iteratively updates based on\nvisual feedback from execution results. 
Experimental results demonstrate that\nour approach facilitates language-interactive generation and arrangement for\ndiverse and complex 3D furniture.\n","authors":["Can Wang","Hongliang Zhong","Menglei Chai","Mingming He","Dongdong Chen","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2407.21333v1.pdf","comment":"Main paper with supplemental materials"},{"id":"http://arxiv.org/abs/2407.21331v1","updated":"2024-07-31T04:41:49Z","published":"2024-07-31T04:41:49Z","title":"CAMAv2: A Vision-Centric Approach for Static Map Element Annotation","summary":" The recent development of online static map element (a.k.a. HD map)\nconstruction algorithms has raised a vast demand for data with ground truth\nannotations. However, available public datasets currently cannot provide\nhigh-quality training data regarding consistency and accuracy. For instance,\nthe manual labelled (low efficiency) nuScenes still contains misalignment and\ninconsistency between the HD maps and images (e.g., around 8.03 pixels\nreprojection error on average). To this end, we present CAMAv2: a\nvision-centric approach for Consistent and Accurate Map Annotation. Without\nLiDAR inputs, our proposed framework can still generate high-quality 3D\nannotations of static map elements. Specifically, the annotation can achieve\nhigh reprojection accuracy across all surrounding cameras and is\nspatial-temporal consistent across the whole sequence. We apply our proposed\nframework to the popular nuScenes dataset to provide efficient and highly\naccurate annotations. Compared with the original nuScenes static map element,\nour CAMAv2 annotations achieve lower reprojection errors (e.g., 4.96 vs. 8.03\npixels). Models trained with annotations from CAMAv2 also achieve lower\nreprojection errors (e.g., 5.62 vs. 8.43 pixels).\n","authors":["Shiyuan Chen","Jiaxin Zhang","Ruohong Mei","Yingfeng Cai","Haoran Yin","Tao Chen","Wei Sui","Cong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21331v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2309.11754"},{"id":"http://arxiv.org/abs/2407.21328v1","updated":"2024-07-31T04:32:43Z","published":"2024-07-31T04:32:43Z","title":"Knowledge-Guided Prompt Learning for Lifespan Brain MR Image\n Segmentation","summary":" Automatic and accurate segmentation of brain MR images throughout the human\nlifespan into tissue and structure is crucial for understanding brain\ndevelopment and diagnosing diseases. However, challenges arise from the\nintricate variations in brain appearance due to rapid early brain development,\naging, and disorders, compounded by the limited availability of\nmanually-labeled datasets. In response, we present a two-step segmentation\nframework employing Knowledge-Guided Prompt Learning (KGPL) for brain MRI.\nSpecifically, we first pre-train segmentation models on large-scale datasets\nwith sub-optimal labels, followed by the incorporation of knowledge-driven\nembeddings learned from image-text alignment into the models. The introduction\nof knowledge-wise prompts captures semantic relationships between anatomical\nvariability and biological processes, enabling models to learn structural\nfeature embeddings across diverse age groups. Experimental findings demonstrate\nthe superiority and robustness of our proposed method, particularly noticeable\nwhen employing Swin UNETR as the backbone. Our approach achieves average DSC\nvalues of 95.17% and 94.19% for brain tissue and structure segmentation,\nrespectively. 
Our code is available at https://github.com/TL9792/KGPL.\n","authors":["Lin Teng","Zihao Zhao","Jiawei Huang","Zehong Cao","Runqi Meng","Feng Shi","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2407.21328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14841v3","updated":"2024-07-31T04:26:43Z","published":"2024-05-23T17:55:11Z","title":"MOD-UV: Learning Mobile Object Detectors from Unlabeled Videos","summary":" Embodied agents must detect and localize objects of interest, e.g. traffic\nparticipants for self-driving cars. Supervision in the form of bounding boxes\nfor this task is extremely expensive. As such, prior work has looked at\nunsupervised instance detection and segmentation, but in the absence of\nannotated boxes, it is unclear how pixels must be grouped into objects and\nwhich objects are of interest. This results in over-/under-segmentation and\nirrelevant objects. Inspired by human visual system and practical applications,\nwe posit that the key missing cue for unsupervised detection is motion: objects\nof interest are typically mobile objects that frequently move and their motions\ncan specify separate instances. In this paper, we propose MOD-UV, a Mobile\nObject Detector learned from Unlabeled Videos only. We begin with instance\npseudo-labels derived from motion segmentation, but introduce a novel training\nparadigm to progressively discover small objects and static-but-mobile objects\nthat are missed by motion segmentation. As a result, though only learned from\nunlabeled videos, MOD-UV can detect and segment mobile objects from a single\nstatic image. Empirically, we achieve state-of-the-art performance in\nunsupervised mobile object detection on Waymo Open, nuScenes, and KITTI\nDatasets without using any external data or supervised models. Code is\navailable at https://github.com/YihongSun/MOD-UV.\n","authors":["Yihong Sun","Bharath Hariharan"],"pdf_url":"https://arxiv.org/pdf/2405.14841v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21323v1","updated":"2024-07-31T04:06:47Z","published":"2024-07-31T04:06:47Z","title":"STANet: A Novel Spatio-Temporal Aggregation Network for Depression\n Classification with Small and Unbalanced FMRI Data","summary":" Accurate diagnosis of depression is crucial for timely implementation of\noptimal treatments, preventing complications and reducing the risk of suicide.\nTraditional methods rely on self-report questionnaires and clinical assessment,\nlacking objective biomarkers. Combining fMRI with artificial intelligence can\nenhance depression diagnosis by integrating neuroimaging indicators. However,\nthe specificity of fMRI acquisition for depression often results in unbalanced\nand small datasets, challenging the sensitivity and accuracy of classification\nmodels. In this study, we propose the Spatio-Temporal Aggregation Network\n(STANet) for diagnosing depression by integrating CNN and RNN to capture both\ntemporal and spatial features of brain activity. STANet comprises the following\nsteps:(1) Aggregate spatio-temporal information via ICA. (2) Utilize\nmulti-scale deep convolution to capture detailed features. (3) Balance data\nusing the SMOTE to generate new samples for minority classes. (4) Employ the\nAFGRU classifier, which combines Fourier transformation with GRU, to capture\nlong-term dependencies, with an adaptive weight assignment mechanism to enhance\nmodel generalization. 
The experimental results demonstrate that STANet achieves\nsuperior depression diagnostic performance with 82.38% accuracy and a 90.72%\nAUC. The STFA module enhances classification by capturing deeper features at\nmultiple scales. The AFGRU classifier, with adaptive weights and stacked GRU,\nattains higher accuracy and AUC. SMOTE outperforms other oversampling methods.\nAdditionally, spatio-temporal aggregated features achieve better performance\ncompared to using only temporal or spatial features. STANet outperforms\ntraditional or deep learning classifiers, and functional connectivity-based\nclassifiers, as demonstrated by ten-fold cross-validation.\n","authors":["Wei Zhang","Weiming Zeng","Hongyu Chen","Jie Liu","Hongjie Yan","Kaile Zhang","Ran Tao","Wai Ting Siok","Nizhuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21317v1","updated":"2024-07-31T03:58:48Z","published":"2024-07-31T03:58:48Z","title":"Pathology Foundation Models","summary":" Pathology has played a crucial role in the diagnosis and evaluation of\npatient tissue samples obtained from surgeries and biopsies for many years. The\nadvent of Whole Slide Scanners and the development of deep learning\ntechnologies have significantly advanced the field, leading to extensive\nresearch and development in pathology AI (Artificial Intelligence). These\nadvancements have contributed to reducing the workload of pathologists and\nsupporting decision-making in treatment plans. Recently, large-scale AI models\nknown as Foundation Models (FMs), which are more accurate and applicable to a\nwide range of tasks compared to traditional AI, have emerged, and expanded\ntheir application scope in the healthcare field. Numerous FMs have been\ndeveloped in pathology, and there are reported cases of their application in\nvarious tasks, such as disease diagnosis, rare cancer diagnosis, patient\nsurvival prognosis prediction, biomarker expression prediction, and the scoring\nof immunohistochemical expression intensity. However, several challenges remain\nfor the clinical application of FMs, which healthcare professionals, as users,\nmust be aware of. Research is ongoing to address these challenges. In the\nfuture, it is expected that the development of Generalist Medical AI, which\nintegrates pathology FMs with FMs from other medical domains, will progress,\nleading to the effective utilization of AI in real clinical settings to promote\nprecision and personalized medicine.\n","authors":["Mieko Ochi","Daisuke Komura","Shumpei Ishikawa"],"pdf_url":"https://arxiv.org/pdf/2407.21317v1.pdf","comment":"19 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2404.18401v2","updated":"2024-07-31T03:42:47Z","published":"2024-04-29T03:36:05Z","title":"Spectral-Spatial Mamba for Hyperspectral Image Classification","summary":" Recently, deep learning models have achieved excellent performance in\nhyperspectral image (HSI) classification. Among the many deep models,\nTransformer has gradually attracted interest for its excellence in modeling the\nlong-range dependencies of spatial-spectral features in HSI. However,\nTransformer has the problem of quadratic computational complexity due to the\nself-attention mechanism, which is heavier than other models and thus has\nlimited adoption in HSI processing. Fortunately, the recently emerging state\nspace model-based Mamba shows great computational efficiency while achieving\nthe modeling power of Transformers. 
Therefore, in this paper, we make a\npreliminary attempt to apply the Mamba to HSI classification, leading to the\nproposed spectral-spatial Mamba (SS-Mamba). Specifically, the proposed SS-Mamba\nmainly consists of spectral-spatial token generation module and several stacked\nspectral-spatial Mamba blocks. Firstly, the token generation module converts\nany given HSI cube to spatial and spectral tokens as sequences. And then these\ntokens are sent to stacked spectral-spatial mamba blocks (SS-MB). Each SS-MB\nblock consists of two basic mamba blocks and a spectral-spatial feature\nenhancement module. The spatial and spectral tokens are processed separately by\nthe two basic mamba blocks, respectively. Besides, the feature enhancement\nmodule modulates spatial and spectral tokens using HSI sample's center region\ninformation. In this way, the spectral and spatial tokens cooperate with each\nother and achieve information fusion within each block. The experimental\nresults conducted on widely used HSI datasets reveal that the proposed model\nachieves competitive results compared with the state-of-the-art methods. The\nMamba-based method opens a new window for HSI classification.\n","authors":["Lingbo Huang","Yushi Chen","Xin He"],"pdf_url":"https://arxiv.org/pdf/2404.18401v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2403.00228v2","updated":"2024-07-31T03:36:32Z","published":"2024-03-01T02:19:40Z","title":"DISORF: A Distributed Online 3D Reconstruction Framework for Mobile\n Robots","summary":" We present a framework, DISORF, to enable online 3D reconstruction and\nvisualization of scenes captured by resource-constrained mobile robots and edge\ndevices. To address the limited computing capabilities of edge devices and\npotentially limited network availability, we design a framework that\nefficiently distributes computation between the edge device and the remote\nserver. We leverage on-device SLAM systems to generate posed keyframes and\ntransmit them to remote servers that can perform high-quality 3D reconstruction\nand visualization at runtime by leveraging recent advances in neural 3D\nmethods. We identify a key challenge with online training where naive image\nsampling strategies can lead to significant degradation in rendering quality.\nWe propose a novel shifted exponential frame sampling method that addresses\nthis challenge for online training. We demonstrate the effectiveness of our\nframework in enabling high-quality real-time reconstruction and visualization\nof unknown scenes as they are captured and streamed from cameras in mobile\nrobots and edge devices.\n","authors":["Chunlin Li","Hanrui Fan","Xiaorui Huang","Ruofan Liang","Sankeerth Durvasula","Nandita Vijaykumar"],"pdf_url":"https://arxiv.org/pdf/2403.00228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21311v1","updated":"2024-07-31T03:29:28Z","published":"2024-07-31T03:29:28Z","title":"EUDA: An Efficient Unsupervised Domain Adaptation via Self-Supervised\n Vision Transformer","summary":" Unsupervised domain adaptation (UDA) aims to mitigate the domain shift issue,\nwhere the distribution of training (source) data differs from that of testing\n(target) data. Many models have been developed to tackle this problem, and\nrecently vision transformers (ViTs) have shown promising results. However, the\ncomplexity and large number of trainable parameters of ViTs restrict their\ndeployment in practical applications. 
This underscores the need for an\nefficient model that not only reduces trainable parameters but also allows for\nadjustable complexity based on specific needs while delivering comparable\nperformance. To achieve this, in this paper we introduce an Efficient\nUnsupervised Domain Adaptation (EUDA) framework. EUDA employs the DINOv2, which\nis a self-supervised ViT, as a feature extractor followed by a simplified\nbottleneck of fully connected layers to refine features for enhanced domain\nadaptation. Additionally, EUDA employs the synergistic domain alignment loss\n(SDAL), which integrates cross-entropy (CE) and maximum mean discrepancy (MMD)\nlosses, to balance adaptation by minimizing classification errors in the source\ndomain while aligning the source and target domain distributions. The\nexperimental results indicate the effectiveness of EUDA in producing comparable\nresults as compared with other state-of-the-art methods in domain adaptation\nwith significantly fewer trainable parameters, between 42% to 99.7% fewer. This\nshowcases the ability to train the model in a resource-limited environment. The\ncode of the model is available at: https://github.com/A-Abedi/EUDA.\n","authors":["Ali Abedi","Q. M. Jonathan Wu","Ning Zhang","Farhad Pourpanah"],"pdf_url":"https://arxiv.org/pdf/2407.21311v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.21308v1","updated":"2024-07-31T03:20:11Z","published":"2024-07-31T03:20:11Z","title":"Enhanced Self-Checkout System for Retail Based on Improved YOLOv10","summary":" With the rapid advancement of deep learning technologies, computer vision has\nshown immense potential in retail automation. This paper presents a novel\nself-checkout system for retail based on an improved YOLOv10 network, aimed at\nenhancing checkout efficiency and reducing labor costs. We propose targeted\noptimizations to the YOLOv10 model, by incorporating the detection head\nstructure from YOLOv8, which significantly improves product recognition\naccuracy. Additionally, we develop a post-processing algorithm tailored for\nself-checkout scenarios, to further enhance the application of system.\nExperimental results demonstrate that our system outperforms existing methods\nin both product recognition accuracy and checkout speed. This research not only\nprovides a new technical solution for retail automation but offers valuable\ninsights into optimizing deep learning models for real-world applications.\n","authors":["Lianghao Tan","Shubing Liu","Jing Gao","Xiaoyi Liu","Linyue Chu","Huangqi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.21308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11271v3","updated":"2024-07-31T03:06:40Z","published":"2024-06-17T07:21:36Z","title":"MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal\n Dataset with One Trillion Tokens","summary":" Multimodal interleaved datasets featuring free-form interleaved sequences of\nimages and text are crucial for training frontier large multimodal models\n(LMMs). Despite the rapid progression of open-source LMMs, there remains a\npronounced scarcity of large-scale, diverse open-source multimodal interleaved\ndatasets. In response, we introduce MINT-1T, the most extensive and diverse\nopen-source Multimodal INTerleaved dataset to date. MINT-1T comprises one\ntrillion text tokens and 3.4 billion images, a 10x scale-up from existing\nopen-source datasets. Additionally, we include previously untapped sources such\nas PDFs and ArXiv papers. 
As scaling multimodal interleaved datasets requires\nsubstantial engineering effort, sharing the data curation process and releasing\nthe dataset greatly benefits the community. Our experiments show that LMMs\ntrained on MINT-1T rival the performance of models trained on the previous\nleading dataset, OBELICS. Our data and code will be released at\nhttps://github.com/mlfoundations/MINT-1T.\n","authors":["Anas Awadalla","Le Xue","Oscar Lo","Manli Shu","Hannah Lee","Etash Kumar Guha","Matt Jordan","Sheng Shen","Mohamed Awadalla","Silvio Savarese","Caiming Xiong","Ran Xu","Yejin Choi","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2406.11271v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21293v1","updated":"2024-07-31T02:35:33Z","published":"2024-07-31T02:35:33Z","title":"SimpleLLM4AD: An End-to-End Vision-Language Model with Graph Visual\n Question Answering for Autonomous Driving","summary":" Many fields could benefit from the rapid development of the large language\nmodels (LLMs). The end-to-end autonomous driving (e2eAD) is one of the\ntypically fields facing new opportunities as the LLMs have supported more and\nmore modalities. Here, by utilizing vision-language model (VLM), we proposed an\ne2eAD method called SimpleLLM4AD. In our method, the e2eAD task are divided\ninto four stages, which are perception, prediction, planning, and behavior.\nEach stage consists of several visual question answering (VQA) pairs and VQA\npairs interconnect with each other constructing a graph called Graph VQA\n(GVQA). By reasoning each VQA pair in the GVQA through VLM stage by stage, our\nmethod could achieve e2e driving with language. In our method, vision\ntransformers (ViT) models are employed to process nuScenes visual data, while\nVLM are utilized to interpret and reason about the information extracted from\nthe visual inputs. In the perception stage, the system identifies and\nclassifies objects from the driving environment. The prediction stage involves\nforecasting the potential movements of these objects. The planning stage\nutilizes the gathered information to develop a driving strategy, ensuring the\nsafety and efficiency of the autonomous vehicle. Finally, the behavior stage\ntranslates the planned actions into executable commands for the vehicle. Our\nexperiments demonstrate that SimpleLLM4AD achieves competitive performance in\ncomplex driving scenarios.\n","authors":["Peiru Zheng","Yun Zhao","Zhan Gong","Hong Zhu","Shaohua Wu"],"pdf_url":"https://arxiv.org/pdf/2407.21293v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.12712v2","updated":"2024-07-31T02:33:31Z","published":"2024-03-19T13:19:41Z","title":"Saliency Guided Image Warping for Unsupervised Domain Adaptation","summary":" Driving is challenging in conditions like night, rain, and snow. The lack of\ngood labeled datasets has hampered progress in scene understanding under such\nconditions. Unsupervised domain adaptation (UDA) using large labeled clear-day\ndatasets is a promising research direction in such cases. Current UDA methods,\nhowever, treat all image pixels uniformly, leading to over-reliance on the\ndominant scene backgrounds (e.g., roads, sky, sidewalks) that appear\ndramatically different across domains. As a result, they struggle to learn\neffective features of smaller and often sparse foreground objects (e.g.,\npeople, vehicles, signs).\n In this work, we improve UDA training by using in-place image warping to\nfocus on salient object regions. 
Our insight is that while backgrounds vary\nsignificantly across domains (e.g., snowy night vs. clear day), object\nappearances vary to a lesser extent. Therefore, we design instance-level\nsaliency guidance to adaptively oversample object regions, which reduces\nadverse effects from background context and enhances backbone feature learning.\nWe then unwarp the better learned features while adapting from source to\ntarget. Our approach improves adaptation across geographies, lighting, and\nweather conditions, and is agnostic to the task (segmentation, detection),\ndomain adaptation algorithm, saliency guidance, and underlying model\narchitecture. Result highlights include +6.1 mAP50 for BDD100K Clear\n$\\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\\rightarrow$ Night, +3.0\nmAP50 for BDD100K Clear $\\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes\n$\\rightarrow$ ACDC. Our method adds minimal training memory and incurs no\nadditional inference latency. Please see Appendix for more results and\nanalysis.\n","authors":["Shen Zheng","Anurag Ghosh","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.12712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21289v1","updated":"2024-07-31T02:25:30Z","published":"2024-07-31T02:25:30Z","title":"Fine-grained Metrics for Point Cloud Semantic Segmentation","summary":" Two forms of imbalances are commonly observed in point cloud semantic\nsegmentation datasets: (1) category imbalances, where certain objects are more\nprevalent than others; and (2) size imbalances, where certain objects occupy\nmore points than others. Because of this, the majority of categories and large\nobjects are favored in the existing evaluation metrics. This paper suggests\nfine-grained mIoU and mAcc for a more thorough assessment of point cloud\nsegmentation algorithms in order to address these issues. Richer statistical\ninformation is provided for models and datasets by these fine-grained metrics,\nwhich also lessen the bias of current semantic segmentation metrics towards\nlarge objects. The proposed metrics are used to train and assess various\nsemantic segmentation algorithms on three distinct indoor and outdoor semantic\nsegmentation datasets.\n","authors":["Zhuheng Lu","Ting Wu","Yuewei Dai","Weiqing Li","Zhiyong Su"],"pdf_url":"https://arxiv.org/pdf/2407.21289v1.pdf","comment":"PRCV 2024"},{"id":"http://arxiv.org/abs/2407.21284v1","updated":"2024-07-31T02:16:28Z","published":"2024-07-31T02:16:28Z","title":"Robust Box Prompt based SAM for Medical Image Segmentation","summary":" The Segment Anything Model (SAM) can achieve satisfactory segmentation\nperformance under high-quality box prompts. However, SAM's robustness is\ncompromised by the decline in box quality, limiting its practicality in\nclinical reality. In this study, we propose a novel Robust Box prompt based SAM\n(\\textbf{RoBox-SAM}) to ensure SAM's segmentation performance under prompts\nwith different qualities. Our contribution is three-fold. First, we propose a\nprompt refinement module to implicitly perceive the potential targets, and\noutput the offsets to directly transform the low-quality box prompt into a\nhigh-quality one. We then provide an online iterative strategy for further\nprompt refinement. Second, we introduce a prompt enhancement module to\nautomatically generate point prompts to assist the box-promptable segmentation\neffectively. Last, we build a self-information extractor to encode the prior\ninformation from the input image. 
These features can optimize the image\nembeddings and attention calculation, thus, the robustness of SAM can be\nfurther enhanced. Extensive experiments on the large medical segmentation\ndataset including 99,299 images, 5 modalities, and 25 organs/targets validated\nthe efficacy of our proposed RoBox-SAM.\n","authors":["Yuhao Huang","Xin Yang","Han Zhou","Yan Cao","Haoran Dou","Fajin Dong","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2407.21284v1.pdf","comment":"Accepted by MICCAI MLMI 2024"},{"id":"http://arxiv.org/abs/2406.11196v3","updated":"2024-07-31T02:03:04Z","published":"2024-06-17T04:09:04Z","title":"Vid3D: Synthesis of Dynamic 3D Scenes using 2D Video Diffusion","summary":" A recent frontier in computer vision has been the task of 3D video\ngeneration, which consists of generating a time-varying 3D representation of a\nscene. To generate dynamic 3D scenes, current methods explicitly model 3D\ntemporal dynamics by jointly optimizing for consistency across both time and\nviews of the scene. In this paper, we instead investigate whether it is\nnecessary to explicitly enforce multiview consistency over time, as current\napproaches do, or if it is sufficient for a model to generate 3D\nrepresentations of each timestep independently. We hence propose a model,\nVid3D, that leverages 2D video diffusion to generate 3D videos by first\ngenerating a 2D \"seed\" of the video's temporal dynamics and then independently\ngenerating a 3D representation for each timestep in the seed video. We evaluate\nVid3D against two state-of-the-art 3D video generation methods and find that\nVid3D is achieves comparable results despite not explicitly modeling 3D\ntemporal dynamics. We further ablate how the quality of Vid3D depends on the\nnumber of views generated per frame. While we observe some degradation with\nfewer views, performance degradation remains minor. Our results thus suggest\nthat 3D temporal knowledge may not be necessary to generate high-quality\ndynamic 3D scenes, potentially enabling simpler generative algorithms for this\ntask.\n","authors":["Rishab Parthasarathy","Zachary Ankner","Aaron Gokaslan"],"pdf_url":"https://arxiv.org/pdf/2406.11196v3.pdf","comment":"14 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.21273v1","updated":"2024-07-31T01:36:47Z","published":"2024-07-31T01:36:47Z","title":"Enhanced Uncertainty Estimation in Ultrasound Image Segmentation with\n MSU-Net","summary":" Efficient intravascular access in trauma and critical care significantly\nimpacts patient outcomes. However, the availability of skilled medical\npersonnel in austere environments is often limited. Autonomous robotic\nultrasound systems can aid in needle insertion for medication delivery and\nsupport non-experts in such tasks. Despite advances in autonomous needle\ninsertion, inaccuracies in vessel segmentation predictions pose risks.\nUnderstanding the uncertainty of predictive models in ultrasound imaging is\ncrucial for assessing their reliability. We introduce MSU-Net, a novel\nmultistage approach for training an ensemble of U-Nets to yield accurate\nultrasound image segmentation maps. We demonstrate substantial improvements,\n18.1% over a single Monte Carlo U-Net, enhancing uncertainty evaluations, model\ntransparency, and trustworthiness. By highlighting areas of model certainty,\nMSU-Net can guide safe needle insertions, empowering non-experts to accomplish\nsuch tasks.\n","authors":["Rohini Banerjee","Cecilia G. 
Morales","Artur Dubrawski"],"pdf_url":"https://arxiv.org/pdf/2407.21273v1.pdf","comment":"Accepted for the 5th International Workshop of Advances in\n Simplifying Medical UltraSound (ASMUS), held in conjunction with MICCAI 2024,\n the 27th International Conference on Medical Image Computing and Computer\n Assisted Intervention"},{"id":"http://arxiv.org/abs/2407.21272v1","updated":"2024-07-31T01:33:47Z","published":"2024-07-31T01:33:47Z","title":"Automated Quantification of Hyperreflective Foci in SD-OCT With Diabetic\n Retinopathy","summary":" The presence of hyperreflective foci (HFs) is related to retinal disease\nprogression, and the quantity has proven to be a prognostic factor of visual\nand anatomical outcome in various retinal diseases. However, lack of efficient\nquantitative tools for evaluating the HFs has deprived ophthalmologist of\nassessing the volume of HFs. For this reason, we propose an automated\nquantification algorithm to segment and quantify HFs in spectral domain optical\ncoherence tomography (SD-OCT). The proposed algorithm consists of two parallel\nprocesses namely: region of interest (ROI) generation and HFs estimation. To\ngenerate the ROI, we use morphological reconstruction to obtain the\nreconstructed image and histogram constructed for data distributions and\nclustering. In parallel, we estimate the HFs by extracting the extremal regions\nfrom the connected regions obtained from a component tree. Finally, both the\nROI and the HFs estimation process are merged to obtain the segmented HFs. The\nproposed algorithm was tested on 40 3D SD-OCT volumes from 40 patients\ndiagnosed with non-proliferative diabetic retinopathy (NPDR), proliferative\ndiabetic retinopathy (PDR), and diabetic macular edema (DME). The average dice\nsimilarity coefficient (DSC) and correlation coefficient (r) are 69.70%, 0.99\nfor NPDR, 70.31%, 0.99 for PDR, and 71.30%, 0.99 for DME, respectively. The\nproposed algorithm can provide ophthalmologist with good HFs quantitative\ninformation, such as volume, size, and location of the HFs.\n","authors":["Idowu Paul Okuwobi","Zexuan Ji","Wen Fan","Songtao Yuan","Loza Bekalo","Qiang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.21272v1.pdf","comment":"IEEE Journal of Biomedical and Health Informatics, Volume: 24, Issue:\n 4, pp. 1125 - 1136, 2020"},{"id":"http://arxiv.org/abs/2407.21267v1","updated":"2024-07-31T01:13:25Z","published":"2024-07-31T01:13:25Z","title":"DEF-oriCORN: efficient 3D scene understanding for robust\n language-directed manipulation without demonstrations","summary":" We present DEF-oriCORN, a framework for language-directed manipulation tasks.\nBy leveraging a novel object-based scene representation and\ndiffusion-model-based state estimation algorithm, our framework enables\nefficient and robust manipulation planning in response to verbal commands, even\nin tightly packed environments with sparse camera views without any\ndemonstrations. Unlike traditional representations, our representation affords\nefficient collision checking and language grounding. Compared to\nstate-of-the-art baselines, our framework achieves superior estimation and\nmotion planning performance from sparse RGB images and zero-shot generalizes to\nreal-world scenarios with diverse materials, including transparent and\nreflective objects, despite being trained exclusively in simulation. 
Our code\nfor data generation, training, inference, and pre-trained weights are publicly\navailable at: https://sites.google.com/view/def-oricorn/home.\n","authors":["Dongwon Son","Sanghyeon Son","Jaehyung Kim","Beomjoon Kim"],"pdf_url":"https://arxiv.org/pdf/2407.21267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21266v1","updated":"2024-07-31T01:07:21Z","published":"2024-07-31T01:07:21Z","title":"DDU-Net: A Domain Decomposition-based CNN on Multiple GPUs","summary":" The segmentation of ultra-high resolution images poses challenges such as\nloss of spatial information or computational inefficiency. In this work, a\nnovel approach that combines encoder-decoder architectures with domain\ndecomposition strategies to address these challenges is proposed. Specifically,\na domain decomposition-based U-Net (DDU-Net) architecture is introduced, which\npartitions input images into non-overlapping patches that can be processed\nindependently on separate devices. A communication network is added to\nfacilitate inter-patch information exchange to enhance the understanding of\nspatial context. Experimental validation is performed on a synthetic dataset\nthat is designed to measure the effectiveness of the communication network.\nThen, the performance is tested on the DeepGlobe land cover classification\ndataset as a real-world benchmark data set. The results demonstrate that the\napproach, which includes inter-patch communication for images divided into\n$16\\times16$ non-overlapping subimages, achieves a $2-3\\,\\%$ higher\nintersection over union (IoU) score compared to the same network without\ninter-patch communication. The performance of the network which includes\ncommunication is equivalent to that of a baseline U-Net trained on the full\nimage, showing that our model provides an effective solution for segmenting\nultra-high-resolution images while preserving spatial context. The code is\navailable at https://github.com/corne00/HiRes-Seg-CNN.\n","authors":["Corné Verburg","Alexander Heinlein","Eric C. Cyr"],"pdf_url":"https://arxiv.org/pdf/2407.21266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21263v1","updated":"2024-07-31T00:56:06Z","published":"2024-07-31T00:56:06Z","title":"Outlier Detection in Large Radiological Datasets using UMAP","summary":" The success of machine learning algorithms heavily relies on the quality of\nsamples and the accuracy of their corresponding labels. However, building and\nmaintaining large, high-quality datasets is an enormous task. This is\nespecially true for biomedical data and for meta-sets that are compiled from\nsmaller ones, as variations in image quality, labeling, reports, and archiving\ncan lead to errors, inconsistencies, and repeated samples. Here, we show that\nthe uniform manifold approximation and projection (UMAP) algorithm can find\nthese anomalies essentially by forming independent clusters that are distinct\nfrom the main (good) data but similar to other points with the same error type.\nAs a representative example, we apply UMAP to discover outliers in the publicly\navailable ChestX-ray14, CheXpert, and MURA datasets. While the results are\narchival and retrospective and focus on radiological images, the graph-based\nmethods work for any data type and will prove equally beneficial for curation\nat the time of dataset creation.\n","authors":["Mohammad Tariqul Islam","Jason W. 
Fleischer"],"pdf_url":"https://arxiv.org/pdf/2407.21263v1.pdf","comment":"Accepted in MICCAI-2024 Workshop on Topology- and Graph-Informed\n Imaging Informatics (TGI3)"},{"id":"http://arxiv.org/abs/2312.14124v2","updated":"2024-07-31T00:54:09Z","published":"2023-12-21T18:46:27Z","title":"Neural Point Cloud Diffusion for Disentangled 3D Shape and Appearance\n Generation","summary":" Controllable generation of 3D assets is important for many practical\napplications like content creation in movies, games and engineering, as well as\nin AR/VR. Recently, diffusion models have shown remarkable results in\ngeneration quality of 3D objects. However, none of the existing models enable\ndisentangled generation to control the shape and appearance separately. For the\nfirst time, we present a suitable representation for 3D diffusion models to\nenable such disentanglement by introducing a hybrid point cloud and neural\nradiance field approach. We model a diffusion process over point positions\njointly with a high-dimensional feature space for a local density and radiance\ndecoder. While the point positions represent the coarse shape of the object,\nthe point features allow modeling the geometry and appearance details. This\ndisentanglement enables us to sample both independently and therefore to\ncontrol both separately. Our approach sets a new state of the art in generation\ncompared to previous disentanglement-capable methods by reduced FID scores of\n30-90% and is on-par with other non disentanglement-capable state-of-the art\nmethods.\n","authors":["Philipp Schröppel","Christopher Wewer","Jan Eric Lenssen","Eddy Ilg","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2312.14124v2.pdf","comment":"CVPR 2024. Project page:\n https://neural-point-cloud-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2407.21256v1","updated":"2024-07-31T00:34:37Z","published":"2024-07-31T00:34:37Z","title":"Leveraging Adaptive Implicit Representation Mapping for Ultra\n High-Resolution Image Segmentation","summary":" Implicit representation mapping (IRM) can translate image features to any\ncontinuous resolution, showcasing its potent capability for\nultra-high-resolution image segmentation refinement. Current IRM-based methods\nfor refining ultra-high-resolution image segmentation often rely on CNN-based\nencoders to extract image features and apply a Shared Implicit Representation\nMapping Function (SIRMF) to convert pixel-wise features into segmented results.\nHence, these methods exhibit two crucial limitations. Firstly, the CNN-based\nencoder may not effectively capture long-distance information, resulting in a\nlack of global semantic information in the pixel-wise features. Secondly, SIRMF\nis shared across all samples, which limits its ability to generalize and handle\ndiverse inputs. To address these limitations, we propose a novel approach that\nleverages the newly proposed Adaptive Implicit Representation Mapping (AIRM)\nfor ultra-high-resolution Image Segmentation. Specifically, the proposed method\ncomprises two components: (1) the Affinity Empowered Encoder (AEE), a robust\nfeature extractor that leverages the benefits of the transformer architecture\nand semantic affinity to model long-distance features effectively, and (2) the\nAdaptive Implicit Representation Mapping Function (AIRMF), which adaptively\ntranslates pixel-wise features without neglecting the global semantic\ninformation, allowing for flexible and precise feature translation. 
We\nevaluated our method on the commonly used ultra-high-resolution segmentation\nrefinement datasets, i.e., BIG and PASCAL VOC 2012. The extensive experiments\ndemonstrate that our method outperforms competitors by a large margin. The code\nis provided in supplementary material.\n","authors":["Ziyu Zhao","Xiaoguang Li","Pingping Cai","Canyu Zhang","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21252v1","updated":"2024-07-31T00:19:22Z","published":"2024-07-31T00:19:22Z","title":"Lifelong Person Search","summary":" Person search is the task to localize a query person in gallery datasets of\nscene images. Existing methods have been mainly developed to handle a single\ntarget dataset only, however diverse datasets are continuously given in\npractical applications of person search. In such cases, they suffer from the\ncatastrophic knowledge forgetting in the old datasets when trained on new\ndatasets. In this paper, we first introduce a novel problem of lifelong person\nsearch (LPS) where the model is incrementally trained on the new datasets while\npreserving the knowledge learned in the old datasets. We propose an end-to-end\nLPS framework that facilitates the knowledge distillation to enforce the\nconsistency learning between the old and new models by utilizing the prototype\nfeatures of the foreground persons as well as the hard background proposals in\nthe old domains. Moreover, we also devise the rehearsal-based instance matching\nto further improve the discrimination ability in the old domains by using the\nunlabeled person instances additionally. Experimental results demonstrate that\nthe proposed method achieves significantly superior performance of both the\ndetection and re-identification to preserve the knowledge learned in the old\ndomains compared with the existing methods.\n","authors":["Jae-Won Yang","Seungbin Hong","Jae-Young Sim"],"pdf_url":"https://arxiv.org/pdf/2407.21252v1.pdf","comment":"10 pages, 6 figure"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.21758v1","updated":"2024-07-31T17:26:40Z","published":"2024-07-31T17:26:40Z","title":"MOSAIC: Multimodal Multistakeholder-aware Visual Art Recommendation","summary":" Visual art (VA) recommendation is complex, as it has to consider the\ninterests of users (e.g. museum visitors) and other stakeholders (e.g. museum\ncurators). We study how to effectively account for key stakeholders in VA\nrecommendations while also considering user-centred measures such as novelty,\nserendipity, and diversity. We propose MOSAIC, a novel multimodal\nmultistakeholder-aware approach using state-of-the-art CLIP and BLIP backbone\narchitectures and two joint optimisation objectives: popularity and\nrepresentative selection of paintings across different categories. We conducted\nan offline evaluation using preferences elicited from 213 users followed by a\nuser study with 100 crowdworkers. We found a strong effect of popularity, which\nwas positively perceived by users, and a minimal effect of representativeness.\nMOSAIC's impact extends beyond visitors, benefiting various art stakeholders.\nIts user-centric approach has broader applicability, offering advancements for\ncontent recommendation across domains that require considering multiple\nstakeholders.\n","authors":["Bereket A. Yilma","Luis A. 
Leiva"],"pdf_url":"https://arxiv.org/pdf/2407.21758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21712v1","updated":"2024-07-31T16:04:03Z","published":"2024-07-31T16:04:03Z","title":"Adaptive Retrieval-Augmented Generation for Conversational Systems","summary":" Despite the success of integrating large language models into the development\nof conversational systems, many studies have shown the effectiveness of\nretrieving and augmenting external knowledge for informative responses. Hence,\nmany existing studies commonly assume the always need for Retrieval Augmented\nGeneration (RAG) in a conversational system without explicit control. This\nraises a research question about such a necessity. In this study, we propose to\ninvestigate the need for each turn of system response to be augmented with\nexternal knowledge. In particular, by leveraging human judgements on the binary\nchoice of adaptive augmentation, we develop RAGate, a gating model, which\nmodels conversation context and relevant inputs to predict if a conversational\nsystem requires RAG for improved responses. We conduct extensive experiments on\ndevising and applying RAGate to conversational models and well-rounded analyses\nof different conversational scenarios. Our experimental results and analysis\nindicate the effective application of RAGate in RAG-based conversational\nsystems in identifying system responses for appropriate RAG with high-quality\nresponses and a high generation confidence. This study also identifies the\ncorrelation between the generation's confidence level and the relevance of the\naugmented knowledge.\n","authors":["Xi Wang","Procheta Sen","Ruizhe Li","Emine Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2407.21712v1.pdf","comment":"12 pages, under review"},{"id":"http://arxiv.org/abs/2312.14433v2","updated":"2024-07-31T16:03:43Z","published":"2023-12-22T04:46:21Z","title":"Attribute-driven Disentangled Representation Learning for Multimodal\n Recommendation","summary":" Recommendation algorithms forecast user preferences by correlating user and\nitem representations derived from historical interaction patterns. In pursuit\nof enhanced performance, many methods focus on learning robust and independent\nrepresentations by disentangling the intricate factors within interaction data\nacross various modalities in an unsupervised manner. However, such an approach\nobfuscates the discernment of how specific factors (e.g., category or brand)\ninfluence the outcomes, making it challenging to regulate their effects. In\nresponse to this challenge, we introduce a novel method called Attribute-Driven\nDisentangled Representation Learning (short for AD-DRL), which explicitly\nincorporates attributes from different modalities into the disentangled\nrepresentation learning process. By assigning a specific attribute to each\nfactor in multimodal features, AD-DRL can disentangle the factors at both\nattribute and attribute-value levels. To obtain robust and independent\nrepresentations for each factor associated with a specific attribute, we first\ndisentangle the representations of features both within and across different\nmodalities. Moreover, we further enhance the robustness of the representations\nby fusing the multimodal features of the same factor. 
Empirical evaluations\nconducted on three public real-world datasets substantiate the effectiveness of\nAD-DRL, as well as its interpretability and controllability.\n","authors":["Zhenyang Li","Fan Liu","Yinwei Wei","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2312.14433v2.pdf","comment":"ACM Multimedia 2024 Accepted"},{"id":"http://arxiv.org/abs/2407.13349v4","updated":"2024-07-31T15:59:46Z","published":"2024-07-18T09:49:13Z","title":"DCNv3: Towards Next Generation Deep Cross Network for CTR Prediction","summary":" Deep & Cross Network and its derivative models have become an important\nparadigm in click-through rate (CTR) prediction due to their effective balance\nbetween computational cost and performance. However, these models face four\nmajor limitations: (1) while most models claim to capture high-order feature\ninteractions, they often do so implicitly and non-interpretably through deep\nneural networks (DNN), which limits the trustworthiness of the model's\npredictions; (2) the performance of existing explicit feature interaction\nmethods is often weaker than that of implicit DNN, undermining their necessity;\n(3) many models fail to adaptively filter noise while enhancing the order of\nfeature interactions; (4) the fusion methods of most models cannot provide\nsuitable supervision signals for their different interaction methods.\n To address the identified limitations, this paper proposes the next\ngeneration Deep Cross Network (DCNv3) and Shallow & Deep Cross Network\n(SDCNv3). These models ensure interpretability in feature interaction modeling\nwhile exponentially increasing the order of feature interactions to achieve\ngenuine Deep Crossing rather than just Deep & Cross. Additionally, we employ a\nSelf-Mask operation to filter noise and reduce the number of parameters in the\ncross network by half. In the fusion layer, we use a simple yet effective loss\nweight calculation method called Tri-BCE to provide appropriate supervision\nsignals. Comprehensive experiments on six datasets demonstrate the\neffectiveness, efficiency, and interpretability of DCNv3 and SDCNv3. The code,\nrunning logs, and detailed hyperparameter configurations are available at:\nhttps://anonymous.4open.science/r/DCNv3-E352.\n","authors":["Honghao Li","Yiwen Zhang","Yi Zhang","Hanwei Li","Lei Sang"],"pdf_url":"https://arxiv.org/pdf/2407.13349v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16383v2","updated":"2024-07-31T15:02:07Z","published":"2024-06-24T07:52:05Z","title":"Context-augmented Retrieval: A Novel Framework for Fast Information\n Retrieval based Response Generation using Large Language Model","summary":" Generating high-quality answers consistently by providing contextual\ninformation embedded in the prompt passed to the Large Language Model (LLM) is\ndependent on the quality of information retrieval. As the corpus of contextual\ninformation grows, the answer/inference quality of Retrieval Augmented\nGeneration (RAG) based Question Answering (QA) systems declines. This work\nsolves this problem by combining classical text classification with the Large\nLanguage Model (LLM) to enable quick information retrieval from the vector\nstore and ensure the relevancy of retrieved information. For the same, this\nwork proposes a new approach Context Augmented retrieval (CAR), where\npartitioning of vector database by real-time classification of information\nflowing into the corpus is done. 
CAR demonstrates good quality answer\ngeneration along with significant reduction in information retrieval and answer\ngeneration time.\n","authors":["Sai Ganesh","Anupam Purwar","Gautam B"],"pdf_url":"https://arxiv.org/pdf/2406.16383v2.pdf","comment":"Because the dataset in which the model was trained upon wasn't\n consistent across different sections so it was preferred to delete this\n preprint"},{"id":"http://arxiv.org/abs/2310.20501v3","updated":"2024-07-31T13:08:08Z","published":"2023-10-31T14:42:23Z","title":"Neural Retrievers are Biased Towards LLM-Generated Content","summary":" Recently, the emergence of large language models (LLMs) has revolutionized\nthe paradigm of information retrieval (IR) applications, especially in web\nsearch, by generating vast amounts of human-like texts on the Internet. As a\nresult, IR systems in the LLM era are facing a new challenge: the indexed\ndocuments are now not only written by human beings but also automatically\ngenerated by the LLMs. How these LLM-generated documents influence the IR\nsystems is a pressing and still unexplored question. In this work, we conduct a\nquantitative evaluation of IR models in scenarios where both human-written and\nLLM-generated texts are involved. Surprisingly, our findings indicate that\nneural retrieval models tend to rank LLM-generated documents higher. We refer\nto this category of biases in neural retrievers towards the LLM-generated\ncontent as the \\textbf{source bias}. Moreover, we discover that this bias is\nnot confined to the first-stage neural retrievers, but extends to the\nsecond-stage neural re-rankers. Then, in-depth analyses from the perspective of\ntext compression indicate that LLM-generated texts exhibit more focused\nsemantics with less noise, making it easier for neural retrieval models to\nsemantic match. To mitigate the source bias, we also propose a plug-and-play\ndebiased constraint for the optimization objective, and experimental results\nshow its effectiveness. Finally, we discuss the potential severe concerns\nstemming from the observed source bias and hope our findings can serve as a\ncritical wake-up call to the IR community and beyond. To facilitate future\nexplorations of IR in the LLM era, the constructed two new benchmarks are\navailable at https://github.com/KID-22/Source-Bias.\n","authors":["Sunhao Dai","Yuqi Zhou","Liang Pang","Weihao Liu","Xiaolin Hu","Yong Liu","Xiao Zhang","Gang Wang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2310.20501v3.pdf","comment":"KDD 2024"},{"id":"http://arxiv.org/abs/2407.21515v1","updated":"2024-07-31T10:33:32Z","published":"2024-07-31T10:33:32Z","title":"Learning Effective Representations for Retrieval Using Self-Distillation\n with Adaptive Relevance Margins","summary":" Representation-based retrieval models, so-called biencoders, estimate the\nrelevance of a document to a query by calculating the similarity of their\nrespective embeddings. Current state-of-the-art biencoders are trained using an\nexpensive training regime involving knowledge distillation from a teacher model\nand batch-sampling. Instead of relying on a teacher model, we contribute a\nnovel parameter-free loss function for self-supervision that exploits the\npre-trained language modeling capabilities of the encoder model as a training\nsignal, eliminating the need for batch sampling by performing implicit hard\nnegative mining. 
We investigate the capabilities of our proposed approach\nthrough extensive ablation studies, demonstrating that self-distillation can\nmatch the effectiveness of teacher distillation using only 13.5% of the data,\nwhile offering a speedup in training time between 3x and 15x compared to\nparametrized losses. Code and data is made openly available.\n","authors":["Lukas Gienapp","Niklas Deckers","Martin Potthast","Harrisen Scells"],"pdf_url":"https://arxiv.org/pdf/2407.21515v1.pdf","comment":"9 Pages, 4 Tables, 6 Figures"},{"id":"http://arxiv.org/abs/2407.21488v1","updated":"2024-07-31T09:52:53Z","published":"2024-07-31T09:52:53Z","title":"Breaking the Hourglass Phenomenon of Residual Quantization: Enhancing\n the Upper Bound of Generative Retrieval","summary":" Generative retrieval (GR) has emerged as a transformative paradigm in search\nand recommender systems, leveraging numeric-based identifier representations to\nenhance efficiency and generalization. Notably, methods like TIGER employing\nResidual Quantization-based Semantic Identifiers (RQ-SID), have shown\nsignificant promise in e-commerce scenarios by effectively managing item IDs.\nHowever, a critical issue termed the \"\\textbf{Hourglass}\" phenomenon, occurs in\nRQ-SID, where intermediate codebook tokens become overly concentrated,\nhindering the full utilization of generative retrieval methods. This paper\nanalyses and addresses this problem by identifying data sparsity and\nlong-tailed distribution as the primary causes. Through comprehensive\nexperiments and detailed ablation studies, we analyze the impact of these\nfactors on codebook utilization and data distribution. Our findings reveal that\nthe \"Hourglass\" phenomenon substantially impacts the performance of RQ-SID in\ngenerative retrieval. We propose effective solutions to mitigate this issue,\nthereby significantly enhancing the effectiveness of generative retrieval in\nreal-world E-commerce applications.\n","authors":["Zhirui Kuai","Zuxu Chen","Huimu Wang","Mingming Li","Dadong Miao","Binbin Wang","Xusong Chen","Li Kuang","Yuxing Han","Jiaxing Wang","Guoyu Tang","Lin Liu","Songlin Wang","Jingwei Zhuo"],"pdf_url":"https://arxiv.org/pdf/2407.21488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13284v2","updated":"2024-07-31T09:20:02Z","published":"2024-07-18T08:36:28Z","title":"Semantic-aware Representation Learning for Homography Estimation","summary":" Homography estimation is the task of determining the transformation from an\nimage pair. Our approach focuses on employing detector-free feature matching\nmethods to address this issue. Previous work has underscored the importance of\nincorporating semantic information, however there still lacks an efficient way\nto utilize semantic information. Previous methods suffer from treating the\nsemantics as a pre-processing, causing the utilization of semantics overly\ncoarse-grained and lack adaptability when dealing with different tasks. In our\nwork, we seek another way to use the semantic information, that is\nsemantic-aware feature representation learning framework.Based on this, we\npropose SRMatcher, a new detector-free feature matching method, which\nencourages the network to learn integrated semantic feature\nrepresentation.Specifically, to capture precise and rich semantics, we leverage\nthe capabilities of recently popularized vision foundation models (VFMs)\ntrained on extensive datasets. 
Then, a cross-images Semantic-aware Fusion Block\n(SFB) is proposed to integrate its fine-grained semantic features into the\nfeature representation space. In this way, by reducing errors stemming from\nsemantic inconsistencies in matching pairs, our proposed SRMatcher is able to\ndeliver more accurate and realistic outcomes. Extensive experiments show that\nSRMatcher surpasses solid baselines and attains SOTA results on multiple\nreal-world datasets. Compared to the previous SOTA approach GeoFormer,\nSRMatcher increases the area under the cumulative curve (AUC) by about 11% on\nHPatches. Additionally, the SRMatcher could serve as a plug-and-play framework\nfor other matching methods like LoFTR, yielding substantial precision\nimprovement.\n","authors":["Yuhan Liu","Qianxin Huang","Siqi Hui","Jingwen Fu","Sanping Zhou","Kangyi Wu","Pengna Li","Jinjun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.13284v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2310.20443v2","updated":"2024-07-31T08:47:41Z","published":"2023-10-31T13:24:28Z","title":"Ontologies for Models and Algorithms in Applied Mathematics and Related\n Disciplines","summary":" In applied mathematics and related disciplines, the\nmodeling-simulation-optimization workflow is a prominent scheme, with\nmathematical models and numerical algorithms playing a crucial role. For these\ntypes of mathematical research data, the Mathematical Research Data Initiative\nhas developed, merged and implemented ontologies and knowledge graphs. This\ncontributes to making mathematical research data FAIR by introducing semantic\ntechnology and documenting the mathematical foundations accordingly. Using the\nconcrete example of microfracture analysis of porous media, it is shown how the\nknowledge of the underlying mathematical model and the corresponding numerical\nalgorithms for its solution can be represented by the ontologies.\n","authors":["Björn Schembera","Frank Wübbeling","Hendrik Kleikamp","Christine Biedinger","Jochen Fiedler","Marco Reidelbach","Aurela Shehu","Burkhard Schmidt","Thomas Koprucki","Dorothea Iglezakis","Dominik Göddeke"],"pdf_url":"https://arxiv.org/pdf/2310.20443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21430v1","updated":"2024-07-31T08:29:35Z","published":"2024-07-31T08:29:35Z","title":"ABCDE: Application-Based Cluster Diff Evals","summary":" This paper considers the problem of evaluating clusterings of very large\npopulations of items. Given two clusterings, namely a Baseline clustering and\nan Experiment clustering, the tasks are twofold: 1) characterize their\ndifferences, and 2) determine which clustering is better. ABCDE is a novel\nevaluation technique for accomplishing that. It aims to be practical: it allows\nitems to have associated importance values that are application-specific, it is\nfrugal in its use of human judgements when determining which clustering is\nbetter, and it can report metrics for arbitrary slices of items, thereby\nfacilitating understanding and debugging. The approach to measuring the delta\nin the clustering quality is novel: instead of trying to construct an expensive\nground truth up front and evaluating the each clustering with respect to that,\nwhere the ground truth must effectively pre-anticipate clustering changes,\nABCDE samples questions for judgement on the basis of the actual diffs between\nthe clusterings. 
ABCDE builds upon the pointwise metrics for clustering\nevaluation, which make the ABCDE metrics intuitive and simple to understand.\nThe mathematical elegance of the pointwise metrics equip ABCDE with rigorous\nyet practical ways to explore the clustering diffs and to estimate the quality\ndelta.\n","authors":["Stephan van Staden","Alexander Grubb"],"pdf_url":"https://arxiv.org/pdf/2407.21430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21364v1","updated":"2024-07-31T06:27:06Z","published":"2024-07-31T06:27:06Z","title":"Personalized Multi-task Training for Recommender System","summary":" In the vast landscape of internet information, recommender systems (RecSys)\nhave become essential for guiding users through a sea of choices aligned with\ntheir preferences. These systems have applications in diverse domains, such as\nnews feeds, game suggestions, and shopping recommendations. Personalization is\na key technique in RecSys, where modern methods leverage representation\nlearning to encode user/item interactions into embeddings, forming the\nfoundation for personalized recommendations. However, integrating information\nfrom multiple sources to enhance recommendation performance remains\nchallenging. This paper introduces a novel approach named PMTRec, the first\npersonalized multi-task learning algorithm to obtain comprehensive user/item\nembeddings from various information sources. Addressing challenges specific to\npersonalized RecSys, we develop modules to handle personalized task weights,\ndiverse task orientations, and variations in gradient magnitudes across tasks.\nPMTRec dynamically adjusts task weights based on gradient norms for each\nuser/item, employs a Task Focusing module to align gradient combinations with\nthe main recommendation task, and uses a Gradient Magnitude Balancing module to\nensure balanced training across tasks. Through extensive experiments on three\nreal-world datasets with different scales, we demonstrate that PMTRec\nsignificantly outperforms existing multi-task learning methods, showcasing its\neffectiveness in achieving enhanced recommendation accuracy by leveraging\nmultiple tasks simultaneously. Our contributions open new avenues for advancing\npersonalized multi-task training in recommender systems.\n","authors":["Liangwei Yang","Zhiwei Liu","Jianguo Zhang","Rithesh Murthy","Shelby Heinecke","Huan Wang","Caiming Xiong","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2407.21364v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2407.21359v1","updated":"2024-07-31T06:04:55Z","published":"2024-07-31T06:04:55Z","title":"ProSpec RL: Plan Ahead, then Execute","summary":" Imagining potential outcomes of actions before execution helps agents make\nmore informed decisions, a prospective thinking ability fundamental to human\ncognition. However, mainstream model-free Reinforcement Learning (RL) methods\nlack the ability to proactively envision future scenarios, plan, and guide\nstrategies. These methods typically rely on trial and error to adjust policy\nfunctions, aiming to maximize cumulative rewards or long-term value, even if\nsuch high-reward decisions place the environment in extremely dangerous states.\nTo address this, we propose the Prospective (ProSpec) RL method, which makes\nhigher-value, lower-risk optimal decisions by imagining future n-stream\ntrajectories. Specifically, ProSpec employs a dynamic model to predict future\nstates (termed \"imagined states\") based on the current state and a series of\nsampled actions. 
Furthermore, we integrate the concept of Model Predictive\nControl and introduce a cycle consistency constraint that allows the agent to\nevaluate and select the optimal actions from these trajectories. Moreover,\nProSpec employs cycle consistency to mitigate two fundamental issues in RL:\naugmenting state reversibility to avoid irreversible events (low risk) and\naugmenting actions to generate numerous virtual trajectories, thereby improving\ndata efficiency. We validated the effectiveness of our method on the DMControl\nbenchmarks, where our approach achieved significant performance improvements.\nCode will be open-sourced upon acceptance.\n","authors":["Liangliang Liu","Yi Guan","BoRan Wang","Rujia Shen","Yi Lin","Chaoran Kong","Lian Yan","Jingchi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.21359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15411v2","updated":"2024-07-31T05:46:09Z","published":"2024-07-22T06:37:24Z","title":"Scalable Dynamic Embedding Size Search for Streaming Recommendation","summary":" Recommender systems typically represent users and items by learning their\nembeddings, which are usually set to uniform dimensions and dominate the model\nparameters. However, real-world recommender systems often operate in streaming\nrecommendation scenarios, where the number of users and items continues to\ngrow, leading to substantial storage resource consumption for these embeddings.\nAlthough a few methods attempt to mitigate this by employing embedding size\nsearch strategies to assign different embedding dimensions in streaming\nrecommendations, they assume that the embedding size grows with the frequency\nof users/items, which eventually still exceeds the predefined memory budget\nover time. To address this issue, this paper proposes to learn Scalable\nLightweight Embeddings for streaming recommendation, called SCALL, which can\nadaptively adjust the embedding sizes of users/items within a given memory\nbudget over time. Specifically, we propose to sample embedding sizes from a\nprobabilistic distribution, with the guarantee to meet any predefined memory\nbudget. By fixing the memory budget, the proposed embedding size sampling\nstrategy can increase and decrease the embedding sizes in accordance to the\nfrequency of the corresponding users or items. Furthermore, we develop a\nreinforcement learning-based search paradigm that models each state with mean\npooling to keep the length of the state vectors fixed, invariant to the\nchanging number of users and items. As a result, the proposed method can\nprovide embedding sizes to unseen users and items. Comprehensive empirical\nevaluations on two public datasets affirm the advantageous effectiveness of our\nproposed method.\n","authors":["Yunke Qu","Liang Qu","Tong Chen","Xiangyu Zhao","Quoc Viet Hung Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2407.15411v2.pdf","comment":"accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2403.05122v2","updated":"2024-07-31T04:58:56Z","published":"2024-03-08T07:36:14Z","title":"Multi-Tower Multi-Interest Recommendation with User Representation Repel","summary":" In the era of information overload, the value of recommender systems has been\nprofoundly recognized in academia and industry alike. Multi-interest sequential\nrecommendation, in particular, is a subfield that has been receiving increasing\nattention in recent years. 
By generating multiple-user representations,\nmulti-interest learning models demonstrate superior expressiveness than\nsingle-user representation models, both theoretically and empirically. Despite\nmajor advancements in the field, three major issues continue to plague the\nperformance and adoptability of multi-interest learning methods, the difference\nbetween training and deployment objectives, the inability to access item\ninformation, and the difficulty of industrial adoption due to its single-tower\narchitecture. We address these challenges by proposing a novel multi-tower\nmulti-interest framework with user representation repel. Experimental results\nacross multiple large-scale industrial datasets proved the effectiveness and\ngeneralizability of our proposed framework.\n","authors":["Tianyu Xiong","Xiaohan Yu"],"pdf_url":"https://arxiv.org/pdf/2403.05122v2.pdf","comment":"Not accepted by conference"},{"id":"http://arxiv.org/abs/2407.21300v1","updated":"2024-07-31T03:00:59Z","published":"2024-07-31T03:00:59Z","title":"Implementing Streaming algorithm and k-means clusters to RAG","summary":" Retrieval-augmented generation (RAG) has achieved great success in\ninformation retrieval to assist large models because it builds an external\nknowledge database. However, it also has many problems: it consumes a lot of\nmemory because of the huge database. When faced with massive streaming data, it\nis unable to update the established index database in time. To save the memory\nof building the database and maintain accuracy simultaneously, we proposed a\nnew approach combining a streaming algorithm and k-means cluster with RAG. Our\napproach applies a streaming algorithm to update the index and reduce memory\nconsumption. Then use the k-means algorithm to cluster documents with high\nsimilarities together, the query time will be shortened by doing this. We\nconducted comparative experiments on four methods, and the results show that\nRAG with streaming algorithm and k-means cluster performs well in accuracy and\nmemory. For massive streaming data, we find that our method behaves better than\ntraditional RAG\n","authors":["Haoyu Kang","Yuzhou Zhu","Yukun Zhong","Ke Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21300v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.21794v1","updated":"2024-07-31T17:59:58Z","published":"2024-07-31T17:59:58Z","title":"Generalized Out-of-Distribution Detection and Beyond in Vision Language\n Model Era: A Survey","summary":" Detecting out-of-distribution (OOD) samples is crucial for ensuring the\nsafety of machine learning systems and has shaped the field of OOD detection.\nMeanwhile, several other problems are closely related to OOD detection,\nincluding anomaly detection (AD), novelty detection (ND), open set recognition\n(OSR), and outlier detection (OD). To unify these problems, a generalized OOD\ndetection framework was proposed, taxonomically categorizing these five\nproblems. However, Vision Language Models (VLMs) such as CLIP have\nsignificantly changed the paradigm and blurred the boundaries between these\nfields, again confusing researchers. In this survey, we first present a\ngeneralized OOD detection v2, encapsulating the evolution of AD, ND, OSR, OOD\ndetection, and OD in the VLM era. Our framework reveals that, with some field\ninactivity and integration, the demanding challenges have become OOD detection\nand AD. 
In addition, we also highlight the significant shift in the definition,\nproblem settings, and benchmarks; we thus feature a comprehensive review of the\nmethodology for OOD detection, including the discussion over other related\ntasks to clarify their relationship to OOD detection. Finally, we explore the\nadvancements in the emerging Large Vision Language Model (LVLM) era, such as\nGPT-4V. We conclude this survey with open challenges and future directions.\n","authors":["Atsuyuki Miyai","Jingkang Yang","Jingyang Zhang","Yifei Ming","Yueqian Lin","Qing Yu","Go Irie","Shafiq Joty","Yixuan Li","Hai Li","Ziwei Liu","Toshihiko Yamasaki","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2407.21794v1.pdf","comment":"survey paper. We welcome questions, issues, and paper requests via\n https://github.com/AtsuMiyai/Awesome-OOD-VLM"},{"id":"http://arxiv.org/abs/2407.21792v1","updated":"2024-07-31T17:59:24Z","published":"2024-07-31T17:59:24Z","title":"Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress?","summary":" As artificial intelligence systems grow more powerful, there has been\nincreasing interest in \"AI safety\" research to address emerging and future\nrisks. However, the field of AI safety remains poorly defined and\ninconsistently measured, leading to confusion about how researchers can\ncontribute. This lack of clarity is compounded by the unclear relationship\nbetween AI safety benchmarks and upstream general capabilities (e.g., general\nknowledge and reasoning). To address these issues, we conduct a comprehensive\nmeta-analysis of AI safety benchmarks, empirically analyzing their correlation\nwith general capabilities across dozens of models and providing a survey of\nexisting directions in AI safety. Our findings reveal that many safety\nbenchmarks highly correlate with upstream model capabilities, potentially\nenabling \"safetywashing\" -- where capability improvements are misrepresented as\nsafety advancements. Based on these findings, we propose an empirical\nfoundation for developing more meaningful safety metrics and define AI safety\nin a machine learning research context as a set of clearly delineated research\ngoals that are empirically separable from generic capabilities advancements. In\ndoing so, we aim to provide a more rigorous framework for AI safety research,\nadvancing the science of safety evaluations and clarifying the path towards\nmeasurable progress.\n","authors":["Richard Ren","Steven Basart","Adam Khoja","Alice Gatti","Long Phan","Xuwang Yin","Mantas Mazeika","Alexander Pan","Gabriel Mukobi","Ryan H. Kim","Stephen Fitz","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2407.21792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21791v1","updated":"2024-07-31T17:59:09Z","published":"2024-07-31T17:59:09Z","title":"Deep Learning for Options Trading: An End-To-End Approach","summary":" We introduce a novel approach to options trading strategies using a highly\nscalable and data-driven machine learning algorithm. In contrast to traditional\napproaches that often require specifications of underlying market dynamics or\nassumptions on an option pricing model, our models depart fundamentally from\nthe need for these prerequisites, directly learning non-trivial mappings from\nmarket data to optimal trading signals. 
Backtesting on more than a decade of\noption contracts for equities listed on the S&P 100, we demonstrate that deep\nlearning models trained according to our end-to-end approach exhibit\nsignificant improvements in risk-adjusted performance over existing rules-based\ntrading strategies. We find that incorporating turnover regularization into the\nmodels leads to further performance enhancements at prohibitively high levels\nof transaction costs.\n","authors":["Wee Ling Tan","Stephen Roberts","Stefan Zohren"],"pdf_url":"https://arxiv.org/pdf/2407.21791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00278v2","updated":"2024-07-31T17:57:37Z","published":"2024-06-29T02:06:01Z","title":"PerAct2: Benchmarking and Learning for Robotic Bimanual Manipulation\n Tasks","summary":" Bimanual manipulation is challenging due to precise spatial and temporal\ncoordination required between two arms. While there exist several real-world\nbimanual systems, there is a lack of simulated benchmarks with a large task\ndiversity for systematically studying bimanual capabilities across a wide range\nof tabletop tasks. This paper addresses the gap by extending RLBench to\nbimanual manipulation. We open-source our code and benchmark comprising 13 new\ntasks with 23 unique task variations, each requiring a high degree of\ncoordination and adaptability. To kickstart the benchmark, we extended several\nstate-of-the art methods to bimanual manipulation and also present a\nlanguage-conditioned behavioral cloning agent -- PerAct2, which enables the\nlearning and execution of bimanual 6-DoF manipulation tasks. Our novel network\narchitecture efficiently integrates language processing with action prediction,\nallowing robots to understand and perform complex bimanual tasks in response to\nuser-specified goals. Project website with code is available at:\nhttp://bimanual.github.io\n","authors":["Markus Grotz","Mohit Shridhar","Tamim Asfour","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2407.00278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20194v4","updated":"2024-07-31T17:57:33Z","published":"2024-05-30T15:58:22Z","title":"Occam Gradient Descent","summary":" Deep learning neural network models must be large enough to adapt to their\nproblem domain, while small enough to avoid overfitting training data during\ngradient descent. To balance these competing demands, overprovisioned deep\nlearning models such as transformers are trained for a single epoch on large\ndata sets, and hence inefficient with both computing resources and training\ndata. In response to these inefficiencies, we exploit learning theory to derive\nOccam Gradient Descent, an algorithm that interleaves adaptive reduction of\nmodel size to minimize generalization error, with gradient descent on model\nweights to minimize fitting error. In contrast, traditional gradient descent\ngreedily minimizes fitting error without regard to generalization error. Our\nalgorithm simultaneously descends the space of weights and topological size of\nany neural network without modification. 
With respect to loss, compute and\nmodel size, our experiments show (a) on image classification benchmarks, linear\nand convolutional neural networks trained with Occam Gradient Descent\noutperform traditional gradient descent with or without post-train pruning; (b)\non a range of tabular data classification tasks, neural networks trained with\nOccam Gradient Descent outperform traditional gradient descent, as well as\nRandom Forests; (c) on natural language transformers, Occam Gradient Descent\noutperforms traditional gradient descent.\n","authors":["B. N. Kausik"],"pdf_url":"https://arxiv.org/pdf/2405.20194v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21788v1","updated":"2024-07-31T17:57:32Z","published":"2024-07-31T17:57:32Z","title":"Vision-Language Model Based Handwriting Verification","summary":" Handwriting Verification is a critical in document forensics. Deep learning\nbased approaches often face skepticism from forensic document examiners due to\ntheir lack of explainability and reliance on extensive training data and\nhandcrafted features. This paper explores using Vision Language Models (VLMs),\nsuch as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By\nleveraging their Visual Question Answering capabilities and 0-shot\nChain-of-Thought (CoT) reasoning, our goal is to provide clear,\nhuman-understandable explanations for model decisions. Our experiments on the\nCEDAR handwriting dataset demonstrate that VLMs offer enhanced\ninterpretability, reduce the need for large training datasets, and adapt better\nto diverse handwriting styles. However, results show that the CNN-based\nResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach\nwith GPT-4o (Accuracy: 70%) and supervised fine-tuned PaliGemma (Accuracy:\n71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings\nhighlight the potential of VLMs in generating human-interpretable decisions\nwhile underscoring the need for further advancements to match the performance\nof specialized deep learning models.\n","authors":["Mihir Chauhan","Abhishek Satbhai","Mohammad Abuzar Hashemi","Mir Basheer Ali","Bina Ramamurthy","Mingchen Gao","Siwei Lyu","Sargur Srihari"],"pdf_url":"https://arxiv.org/pdf/2407.21788v1.pdf","comment":"4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine\n Vision and Image Processing (IMVIP) Conference"},{"id":"http://arxiv.org/abs/2407.21787v1","updated":"2024-07-31T17:57:25Z","published":"2024-07-31T17:57:25Z","title":"Large Language Monkeys: Scaling Inference Compute with Repeated Sampling","summary":" Scaling the amount of compute used to train language models has dramatically\nimproved their capabilities. However, when it comes to inference, we often\nlimit the amount of compute to only one attempt per problem. Here, we explore\ninference compute as another axis for scaling by increasing the number of\ngenerated samples. Across multiple tasks and models, we observe that coverage -\nthe fraction of problems solved by any attempt - scales with the number of\nsamples over four orders of magnitude. In domains like coding and formal\nproofs, where all answers can be automatically verified, these increases in\ncoverage directly translate into improved performance. 
When we apply repeated\nsampling to SWE-bench Lite, the fraction of issues solved with\nDeepSeek-V2-Coder-Instruct increases from 15.9% with one sample to 56% with 250\nsamples, outperforming the single-attempt state-of-the-art of 43% which uses\nmore capable frontier models. Moreover, using current API pricing, amplifying\nthe cheaper DeepSeek model with five samples is more cost-effective and solves\nmore issues than paying a premium for one sample from GPT-4o or Claude 3.5\nSonnet. Interestingly, the relationship between coverage and the number of\nsamples is often log-linear and can be modelled with an exponentiated power\nlaw, suggesting the existence of inference-time scaling laws. Finally, we find\nthat identifying correct samples out of many generations remains an important\ndirection for future research in domains without automatic verifiers. When\nsolving math word problems from GSM8K and MATH, coverage with Llama-3 models\ngrows to over 95% with 10,000 samples. However, common methods to pick correct\nsolutions from a sample collection, such as majority voting or reward models,\nplateau beyond several hundred samples and fail to fully scale with the sample\nbudget.\n","authors":["Bradley Brown","Jordan Juravsky","Ryan Ehrlich","Ronald Clark","Quoc V. Le","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2407.21787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20999v2","updated":"2024-07-31T17:56:03Z","published":"2024-07-30T17:38:24Z","title":"MoFO: Momentum-Filtered Optimizer for Mitigating Forgetting in LLM\n Fine-Tuning","summary":" Recently, large language models (LLMs) have demonstrated remarkable\ncapabilities in a wide range of tasks. Typically, an LLM is pre-trained on\nlarge corpora and subsequently fine-tuned on task-specific datasets. However,\nduring fine-tuning, LLMs may forget the knowledge acquired in the pre-training\nstage, leading to a decline in general capabilities. To address this issue, we\npropose a new fine-tuning algorithm termed Momentum-Filtered Optimizer (MoFO).\nThe key idea of MoFO is to iteratively select and update the model parameters\nwith the largest momentum magnitudes. Compared to full-parameter training, MoFO\nachieves similar fine-tuning performance while keeping parameters closer to the\npre-trained model, thereby mitigating knowledge forgetting. Unlike most\nexisting methods for forgetting mitigation, MoFO combines the following two\nadvantages. First, MoFO does not require access to pre-training data. This\nmakes MoFO particularly suitable for fine-tuning scenarios where pre-training\ndata is unavailable, such as fine-tuning checkpoint-only open-source LLMs.\nSecond, MoFO does not alter the original loss function. This could avoid\nimpairing the model performance on the fine-tuning tasks. We validate MoFO\nthrough rigorous convergence analysis and extensive experiments, demonstrating\nits superiority over existing methods in mitigating forgetting and enhancing\nfine-tuning performance.\n","authors":["Yupeng Chen","Senmiao Wang","Zhihang Lin","Zeyu Qin","Yushun Zhang","Tian Ding","Ruoyu Sun"],"pdf_url":"https://arxiv.org/pdf/2407.20999v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13989v3","updated":"2024-07-31T17:55:00Z","published":"2024-02-21T18:19:20Z","title":"FedADMM-InSa: An Inexact and Self-Adaptive ADMM for Federated Learning","summary":" Federated learning (FL) is a promising framework for learning from\ndistributed data while maintaining privacy. 
The development of efficient FL\nalgorithms encounters various challenges, including heterogeneous data and\nsystems, limited communication capacities, and constrained local computational\nresources. Recently developed FedADMM methods show great resilience to both\ndata and system heterogeneity. However, they still suffer from performance\ndeterioration if the hyperparameters are not carefully tuned. To address this\nissue, we propose an inexact and self-adaptive FedADMM algorithm, termed\nFedADMM-InSa. First, we design an inexactness criterion for the clients' local\nupdates to eliminate the need for empirically setting the local training\naccuracy. This inexactness criterion can be assessed by each client\nindependently based on its unique condition, thereby reducing the local\ncomputational cost and mitigating the undesirable straggle effect. The\nconvergence of the resulting inexact ADMM is proved under the assumption of\nstrongly convex loss functions. Additionally, we present a self-adaptive scheme\nthat dynamically adjusts each client's penalty parameter, enhancing algorithm\nrobustness by mitigating the need for empirical penalty parameter choices for\neach client. Extensive numerical experiments on both synthetic and real-world\ndatasets are conducted. As validated by some numerical tests, our proposed\nalgorithm can reduce the clients' local computational load significantly and\nalso accelerate the learning process compared to the vanilla FedADMM.\n","authors":["Yongcun Song","Ziqi Wang","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2402.13989v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21772v1","updated":"2024-07-31T17:48:14Z","published":"2024-07-31T17:48:14Z","title":"ShieldGemma: Generative AI Content Moderation Based on Gemma","summary":" We present ShieldGemma, a comprehensive suite of LLM-based safety content\nmoderation models built upon Gemma2. These models provide robust,\nstate-of-the-art predictions of safety risks across key harm types (sexually\nexplicit, dangerous content, harassment, hate speech) in both user input and\nLLM-generated output. By evaluating on both public and internal benchmarks, we\ndemonstrate superior performance compared to existing models, such as Llama\nGuard (+10.8\\% AU-PRC on public benchmarks) and WildCard (+4.3\\%).\nAdditionally, we present a novel LLM-based data curation pipeline, adaptable to\na variety of safety-related tasks and beyond. We have shown strong\ngeneralization performance for model trained mainly on synthetic data. By\nreleasing ShieldGemma, we provide a valuable resource to the research\ncommunity, advancing LLM safety and enabling the creation of more effective\ncontent moderation solutions for developers.\n","authors":["Wenjun Zeng","Yuchi Liu","Ryan Mullins","Ludovic Peran","Joe Fernandez","Hamza Harkous","Karthik Narasimhan","Drew Proud","Piyush Kumar","Bhaktipriya Radharapu","Olivia Sturman","Oscar Wahltinez"],"pdf_url":"https://arxiv.org/pdf/2407.21772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21770v1","updated":"2024-07-31T17:46:51Z","published":"2024-07-31T17:46:51Z","title":"MoMa: Efficient Early-Fusion Pre-training with Mixture of Modality-Aware\n Experts","summary":" We introduce MoMa, a novel modality-aware mixture-of-experts (MoE)\narchitecture designed for pre-training mixed-modal, early-fusion language\nmodels. MoMa processes images and text in arbitrary sequences by dividing\nexpert modules into modality-specific groups. 
These groups exclusively process\ndesignated tokens while employing learned routing within each group to maintain\nsemantically informed adaptivity. Our empirical results reveal substantial\npre-training efficiency gains through this modality-specific parameter\nallocation. Under a 1-trillion-token training budget, the MoMa 1.4B model,\nfeaturing 4 text experts and 4 image experts, achieves impressive FLOPs\nsavings: 3.7x overall, with 2.6x for text and 5.2x for image processing\ncompared to a compute-equivalent dense baseline, measured by pre-training loss.\nThis outperforms the standard expert-choice MoE with 8 mixed-modal experts,\nwhich achieves 3x overall FLOPs savings (3x for text, 2.8x for image).\nCombining MoMa with mixture-of-depths (MoD) further improves pre-training FLOPs\nsavings to 4.2x overall (text: 3.4x, image: 5.3x), although this combination\nhurts performance in causal inference due to increased sensitivity to router\naccuracy. These results demonstrate MoMa's potential to significantly advance\nthe efficiency of mixed-modal, early-fusion language model pre-training, paving\nthe way for more resource-efficient and capable multimodal AI systems.\n","authors":["Xi Victoria Lin","Akshat Shrivastava","Liang Luo","Srinivasan Iyer","Mike Lewis","Gargi Gosh","Luke Zettlemoyer","Armen Aghajanyan"],"pdf_url":"https://arxiv.org/pdf/2407.21770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21748v1","updated":"2024-07-31T17:05:10Z","published":"2024-07-31T17:05:10Z","title":"Diagnostic Runtime Monitoring with Martingales","summary":" Machine learning systems deployed in safety-critical robotics settings must\nbe robust to distribution shifts. However, system designers must understand the\ncause of a distribution shift in order to implement the appropriate\nintervention or mitigation strategy and prevent system failure. In this paper,\nwe present a novel framework for diagnosing distribution shifts in a streaming\nfashion by deploying multiple stochastic martingales simultaneously. We show\nthat knowledge of the underlying cause of a distribution shift can lead to\nproper interventions over the lifecycle of a deployed system. Our experimental\nframework can easily be adapted to different types of distribution shifts,\nmodels, and datasets. We find that our method outperforms existing work on\ndiagnosing distribution shifts in terms of speed, accuracy, and flexibility,\nand validate the efficiency of our model in both simulated and live hardware\nsettings.\n","authors":["Ali Hindy","Rachel Luo","Somrita Banerjee","Jonathan Kuck","Edward Schmerling","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2407.21748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21742v1","updated":"2024-07-31T16:55:18Z","published":"2024-07-31T16:55:18Z","title":"HGOE: Hybrid External and Internal Graph Outlier Exposure for Graph\n Out-of-Distribution Detection","summary":" With the progressive advancements in deep graph learning, out-of-distribution\n(OOD) detection for graph data has emerged as a critical challenge. While the\nefficacy of auxiliary datasets in enhancing OOD detection has been extensively\nstudied for image and text data, such approaches have not yet been explored for\ngraph data. Unlike Euclidean data, graph data exhibits greater diversity but\nlower robustness to perturbations, complicating the integration of outliers. 
To\ntackle these challenges, we propose the introduction of \\textbf{H}ybrid\nExternal and Internal \\textbf{G}raph \\textbf{O}utlier \\textbf{E}xposure (HGOE)\nto improve graph OOD detection performance. Our framework involves using\nrealistic external graph data from various domains and synthesizing internal\noutliers within ID subgroups to address the poor robustness and presence of OOD\nsamples within the ID class. Furthermore, we develop a boundary-aware OE loss\nthat adaptively assigns weights to outliers, maximizing the use of high-quality\nOOD samples while minimizing the impact of low-quality ones. Our proposed HGOE\nframework is model-agnostic and designed to enhance the effectiveness of\nexisting graph OOD detection models. Experimental results demonstrate that our\nHGOE framework can significantly improve the performance of existing OOD\ndetection models across all 8 real datasets.\n","authors":["Junwei He","Qianqian Xu","Yangbangyan Jiang","Zitai Wang","Yuchen Sun","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2407.21742v1.pdf","comment":"Proceedings of the 32nd ACM International Conference on Multimedia"},{"id":"http://arxiv.org/abs/2407.21740v1","updated":"2024-07-31T16:52:00Z","published":"2024-07-31T16:52:00Z","title":"Contrastive Factor Analysis","summary":" Factor analysis, often regarded as a Bayesian variant of matrix\nfactorization, offers superior capabilities in capturing uncertainty, modeling\ncomplex dependencies, and ensuring robustness. As the deep learning era\narrives, factor analysis is receiving less and less attention due to their\nlimited expressive ability. On the contrary, contrastive learning has emerged\nas a potent technique with demonstrated efficacy in unsupervised\nrepresentational learning. While the two methods are different paradigms,\nrecent theoretical analysis has revealed the mathematical equivalence between\ncontrastive learning and matrix factorization, providing a potential\npossibility for factor analysis combined with contrastive learning. Motivated\nby the interconnectedness of contrastive learning, matrix factorization, and\nfactor analysis, this paper introduces a novel Contrastive Factor Analysis\nframework, aiming to leverage factor analysis's advantageous properties within\nthe realm of contrastive learning. To further leverage the interpretability\nproperties of non-negative factor analysis, which can learn disentangled\nrepresentations, contrastive factor analysis is extended to a non-negative\nversion. Finally, extensive experimental validation showcases the efficacy of\nthe proposed contrastive (non-negative) factor analysis methodology across\nmultiple key properties, including expressiveness, robustness,\ninterpretability, and accurate uncertainty estimation.\n","authors":["Zhibin Duan","Tiansheng Wen","Yifei Wang","Chen Zhu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.21740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21739v1","updated":"2024-07-31T16:48:06Z","published":"2024-07-31T16:48:06Z","title":"A Federated Learning-Friendly Approach for Parameter-Efficient\n Fine-Tuning of SAM in 3D Segmentation","summary":" Adapting foundation models for medical image analysis requires finetuning\nthem on a considerable amount of data because of extreme distribution shifts\nbetween natural (source) data used for pretraining and medical (target) data.\nHowever, collecting task-specific medical data for such finetuning at a central\nlocation raises many privacy concerns. 
Although Federated learning (FL)\nprovides an effective means for training on private decentralized data,\ncommunication costs in federating large foundation models can quickly become a\nsignificant bottleneck, impacting the solution's scalability. In this work, we\naddress this problem of efficient communication while ensuring effective\nlearning in FL by combining the strengths of Parameter-Efficient Fine-tuning\n(PEFT) with FL. Specifically, we study plug-and-play Low-Rank Adapters (LoRA)\nin a federated manner to adapt the Segment Anything Model (SAM) for 3D medical\nimage segmentation. Unlike prior works that utilize LoRA and finetune the\nentire decoder, we critically analyze the contribution of each granular\ncomponent of SAM on finetuning performance. Thus, we identify specific layers\nto be federated that are very efficient in terms of communication cost while\nproducing on-par accuracy. Our experiments show that retaining the parameters\nof the SAM model (including most of the decoder) in their original state during\nadaptation is beneficial because fine-tuning on small datasets tends to distort\nthe inherent capabilities of the underlying foundation model. On Fed-KiTS, our\napproach decreases communication cost (~48x) compared to full fine-tuning while\nincreasing performance (~6% Dice score) in 3D segmentation tasks. Our approach\nperforms similar to SAMed while achieving ~2.8x reduction in communication and\nparameters to be finetuned. We further validate our approach with experiments\non Fed-IXI and Prostate MRI datasets.\n","authors":["Mothilal Asokan","Joseph Geo Benjamin","Mohammad Yaqub","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2407.21739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21738v1","updated":"2024-07-31T16:47:21Z","published":"2024-07-31T16:47:21Z","title":"Leveraging Self-Supervised Learning for Fetal Cardiac Planes\n Classification using Ultrasound Scan Videos","summary":" Self-supervised learning (SSL) methods are popular since they can address\nsituations with limited annotated data by directly utilising the underlying\ndata distribution. However, the adoption of such methods is not explored enough\nin ultrasound (US) imaging, especially for fetal assessment. We investigate the\npotential of dual-encoder SSL in utilizing unlabelled US video data to improve\nthe performance of challenging downstream Standard Fetal Cardiac Planes (SFCP)\nclassification using limited labelled 2D US images. We study 7 SSL approaches\nbased on reconstruction, contrastive loss, distillation, and information theory\nand evaluate them extensively on a large private US dataset. Our observations\nand findings are consolidated from more than 500 downstream training\nexperiments under different settings. Our primary observation shows that for\nSSL training, the variance of the dataset is more crucial than its size because\nit allows the model to learn generalisable representations, which improve the\nperformance of downstream tasks. Overall, the BarlowTwins method shows robust\nperformance, irrespective of the training settings and data variations, when\nused as an initialisation for downstream tasks. 
Notably, full fine-tuning with\n1% of labelled data outperforms ImageNet initialisation by 12% in F1-score and\noutperforms other SSL initialisations by at least 4% in F1-score, thus making\nit a promising candidate for transfer learning from US video to image data.\n","authors":["Joseph Geo Benjamin","Mothilal Asokan","Amna Alhosani","Hussain Alasmawi","Werner Gerhard Diehl","Leanne Bricker","Karthik Nandakumar","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2407.21738v1.pdf","comment":"Simplifying Medical Ultrasound: 4th International Workshop, ASMUS\n 2023, Held in Conjunction with MICCAI 2023, Vancouver, BC, Canada, October 8,\n 2023, Proceedings"},{"id":"http://arxiv.org/abs/2312.15357v2","updated":"2024-07-31T16:20:51Z","published":"2023-12-23T21:47:50Z","title":"Optimal Decision Tree and Adaptive Submodular Ranking with Noisy\n Outcomes","summary":" In pool-based active learning, the learner is given an unlabeled data set and\naims to efficiently learn the unknown hypothesis by querying the labels of the\ndata points. This can be formulated as the classical Optimal Decision Tree\n(ODT) problem: Given a set of tests, a set of hypotheses, and an outcome for\neach pair of test and hypothesis, our objective is to find a low-cost testing\nprocedure (i.e., decision tree) that identifies the true hypothesis. This\noptimization problem has been extensively studied under the assumption that\neach test generates a deterministic outcome. However, in numerous applications,\nfor example, clinical trials, the outcomes may be uncertain, which renders the\nideas from the deterministic setting invalid. In this work, we study a\nfundamental variant of the ODT problem in which some test outcomes are noisy,\neven in the more general case where the noise is persistent, i.e., repeating a\ntest gives the same noisy output. Our approximation algorithms provide\nguarantees that are nearly best possible and hold for the general case of a\nlarge number of noisy outcomes per test or per hypothesis where the performance\ndegrades continuously with this number. We numerically evaluated our algorithms\nfor identifying toxic chemicals and learning linear classifiers, and observed\nthat our algorithms have costs very close to the information-theoretic minimum.\n","authors":["Su Jia","Fatemeh Navidi","Viswanath Nagarajan","R. Ravi"],"pdf_url":"https://arxiv.org/pdf/2312.15357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16137v2","updated":"2024-07-31T16:16:12Z","published":"2024-03-24T13:10:09Z","title":"A Survey on Self-Supervised Graph Foundation Models: Knowledge-Based\n Perspective","summary":" Graph self-supervised learning (SSL) is now a go-to method for pre-training\ngraph foundation models (GFMs). There is a wide variety of knowledge patterns\nembedded in the graph data, such as node properties and clusters, which are\ncrucial to learning generalized representations for GFMs. However, existing\nsurveys of GFMs have several shortcomings: they lack comprehensiveness\nregarding the most recent progress, have unclear categorization of\nself-supervised methods, and take a limited architecture-based perspective that\nis restricted to only certain types of graph models. As the ultimate goal of\nGFMs is to learn generalized graph knowledge, we provide a comprehensive survey\nof self-supervised GFMs from a novel knowledge-based perspective. We propose a\nknowledge-based taxonomy, which categorizes self-supervised graph models by the\nspecific graph knowledge utilized. 
Our taxonomy consists of microscopic (nodes,\nlinks, etc.), mesoscopic (context, clusters, etc.), and macroscopic knowledge\n(global structure, manifolds, etc.). It covers a total of 9 knowledge\ncategories and more than 25 pretext tasks for pre-training GFMs, as well as\nvarious downstream task generalization strategies. Such a knowledge-based\ntaxonomy allows us to re-examine graph models based on new architectures more\nclearly, such as graph language models, as well as provide more in-depth\ninsights for constructing GFMs.\n","authors":["Ziwen Zhao","Yixin Su","Yuhua Li","Yixiong Zou","Ruixuan Li","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.16137v2.pdf","comment":"21 pages, 7 figures; work in progress"},{"id":"http://arxiv.org/abs/2407.21713v1","updated":"2024-07-31T16:06:34Z","published":"2024-07-31T16:06:34Z","title":"Social Learning through Interactions with Other Agents: A Survey","summary":" Social learning plays an important role in the development of human\nintelligence. As children, we imitate our parents' speech patterns until we are\nable to produce sounds; we learn from them praising us and scolding us; and as\nadults, we learn by working with others. In this work, we survey the degree to\nwhich this paradigm -- social learning -- has been mirrored in machine\nlearning. In particular, since learning socially requires interacting with\nothers, we are interested in how embodied agents can and have utilised these\ntechniques. This is especially in light of the degree to which recent advances\nin natural language processing (NLP) enable us to perform new forms of social\nlearning. We look at how behavioural cloning and next-token prediction mirror\nhuman imitation, how learning from human feedback mirrors human education, and\nhow we can go further to enable fully communicative agents that learn from each\nother. We find that while individual social learning techniques have been used\nsuccessfully, there has been little unifying work showing how to bring them\ntogether into socially embodied agents.\n","authors":["Dylan hillier","Cheston Tan","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.21713v1.pdf","comment":"To be published in IJCAI 2024, available on http://www.ijcai.org"},{"id":"http://arxiv.org/abs/2301.10369v3","updated":"2024-07-31T16:00:23Z","published":"2023-01-25T00:50:28Z","title":"Exact Fractional Inference via Re-Parametrization & Interpolation\n between Tree-Re-Weighted- and Belief Propagation- Algorithms","summary":" The computational complexity of inference -- required to compute the\npartition function, $Z$, of an Ising model over a graph of $N$''spins\" -- is\nmost likely exponential in $N$. Efficient variational methods, such as Belief\nPropagation (BP) and Tree Re-Weighted (TRW) algorithms, compute $Z$\napproximately by minimizing the respective (BP- or TRW-) free energy. We\ngeneralize the variational scheme by building a $\\lambda$-fractional\ninterpolation, $Z^{(\\lambda)}$, where $\\lambda=0$ and $\\lambda=1$ correspond to\nTRW- and BP-approximations, respectively. 
This fractional scheme -- coined\nFractional Belief Propagation (FBP) -- guarantees that in the attractive\n(ferromagnetic) case $Z^{(TRW)} \\geq Z^{(\\lambda)} \\geq Z^{(BP)}$, and there\nexists a unique (``exact\") $\\lambda_*$ such that $Z=Z^{(\\lambda_*)}$.\nGeneralizing the re-parametrization approach of\n\\citep{wainwright_tree-based_2002} and the loop series approach of\n\\citep{chertkov_loop_2006}, we show how to express $Z$ as a product, $\\forall\n\\lambda:\\ Z=Z^{(\\lambda)}{\\tilde Z}^{(\\lambda)}$, where the multiplicative\ncorrection, ${\\tilde Z}^{(\\lambda)}$, is an expectation over a node-independent\nprobability distribution built from node-wise fractional marginals. Our\ntheoretical analysis is complemented by extensive experiments with models from\nIsing ensembles over planar and random graphs of medium- and large-sizes. The\nempirical study yields a number of interesting observations, such as the\nability to estimate ${\\tilde Z}^{(\\lambda)}$ with $O(N^{2::4})$ fractional\nsamples and suppression of $\\lambda_*$ fluctuations with an increase in $N$ for\ninstances from a particular random Ising ensemble. We also verify and discuss\nthe applicability of this approach to the problem of image de-noising.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2301.10369v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21670v1","updated":"2024-07-31T15:13:39Z","published":"2024-07-31T15:13:39Z","title":"Universal Approximation Theory: Foundations for Parallelism in Neural\n Networks","summary":" Neural networks are increasingly evolving towards training large models with\nbig data, a method that has demonstrated superior performance across many\ntasks. However, this approach introduces an urgent problem: current deep\nlearning models are predominantly serial, meaning that as the number of network\nlayers increases, so do the training and inference times. This is unacceptable\nif deep learning is to continue advancing. Therefore, this paper proposes a\ndeep learning parallelization strategy based on the Universal Approximation\nTheorem (UAT). From this foundation, we designed a parallel network called\nPara-Former to test our theory. Unlike traditional serial models, the inference\ntime of Para-Former does not increase with the number of layers, significantly\naccelerating the inference speed of multi-layer networks. Experimental results\nvalidate the effectiveness of this network.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.21670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21669v1","updated":"2024-07-31T15:12:24Z","published":"2024-07-31T15:12:24Z","title":"Synth-Empathy: Towards High-Quality Synthetic Empathy Data","summary":" In recent years, with the rapid advancements in large language models (LLMs),\nachieving excellent empathetic response capabilities has become a crucial\nprerequisite. Consequently, managing and understanding empathetic datasets have\ngained increasing significance. However, empathetic data are typically\nhuman-labeled, leading to insufficient datasets and wasted human labor. In this\nwork, we present Synth-Empathy, an LLM-based data generation and quality and\ndiversity selection pipeline that automatically generates high-quality\nempathetic data while discarding low-quality data. With the data generated from\na low empathetic model, we are able to further improve empathetic response\nperformance and achieve state-of-the-art (SoTA) results across multiple\nbenchmarks. 
Moreover, our model achieves SoTA performance on various human\nevaluation benchmarks, demonstrating its effectiveness and robustness in\nreal-world applications. Furthermore, we show the trade-off between data\nquantity and quality, providing insights into empathetic data generation and\nselection.\n","authors":["Hao Liang","Linzhuang Sun","Jingxuan Wei","Xijie Huang","Linkun Sun","Bihui Yu","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21669v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.01937"},{"id":"http://arxiv.org/abs/2407.21666v1","updated":"2024-07-31T15:08:26Z","published":"2024-07-31T15:08:26Z","title":"An Explainable Vision Transformer with Transfer Learning Combined with\n Support Vector Machine Based Efficient Drought Stress Identification","summary":" Early detection of drought stress is critical for taking timely measures for\nreducing crop loss before the drought impact becomes irreversible. The subtle\nphenotypical and physiological changes in response to drought stress are\ncaptured by non-invasive imaging techniques and these imaging data serve as\nvaluable resource for machine learning methods to identify drought stress.\nWhile convolutional neural networks (CNNs) are in wide use, vision transformers\n(ViTs) present a promising alternative in capturing long-range dependencies and\nintricate spatial relationships, thereby enhancing the detection of subtle\nindicators of drought stress. We propose an explainable deep learning pipeline\nthat leverages the power of ViTs for drought stress detection in potato crops\nusing aerial imagery. We applied two distinct approaches: a synergistic\ncombination of ViT and support vector machine (SVM), where ViT extracts\nintricate spatial features from aerial images, and SVM classifies the crops as\nstressed or healthy and an end-to-end approach using a dedicated classification\nlayer within ViT to directly detect drought stress. Our key findings explain\nthe ViT model's decision-making process by visualizing attention maps. These\nmaps highlight the specific spatial features within the aerial images that the\nViT model focuses as the drought stress signature. Our findings demonstrate\nthat the proposed methods not only achieve high accuracy in drought stress\nidentification but also shedding light on the diverse subtle plant features\nassociated with drought stress. This offers a robust and interpretable solution\nfor drought stress monitoring for farmers to undertake informed decisions for\nimproved crop management.\n","authors":["Aswini Kumar Patra","Ankit Varshney","Lingaraj Sahoo"],"pdf_url":"https://arxiv.org/pdf/2407.21666v1.pdf","comment":"30 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.21665v1","updated":"2024-07-31T15:08:15Z","published":"2024-07-31T15:08:15Z","title":"A State-of-the-Art Review of Computational Models for Analyzing\n Longitudinal Wearable Sensor Data in Healthcare","summary":" Wearable devices are increasingly used as tools for biomedical research, as\nthe continuous stream of behavioral and physiological data they collect can\nprovide insights about our health in everyday contexts. Long-term tracking,\ndefined in the timescale of months of year, can provide insights of patterns\nand changes as indicators of health changes. These insights can make medicine\nand healthcare more predictive, preventive, personalized, and participative\n(The 4P's). 
However, the challenges in modeling, understanding and processing\nlongitudinal data are a significant barrier to their adoption in research\nstudies and clinical settings. In this paper, we review and discuss three\nmodels used to make sense of longitudinal data: routines, rhythms and stability\nmetrics. We present the challenges associated with the processing and analysis\nof longitudinal wearable sensor data, with a special focus on how to handle the\ndifferent temporal dynamics at various granularities. We then discuss current\nlimitations and identify directions for future work. This review is essential\nto the advancement of computational modeling and analysis of longitudinal\nsensor data for pervasive healthcare.\n","authors":["Paula Lago"],"pdf_url":"https://arxiv.org/pdf/2407.21665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14664v2","updated":"2024-07-31T15:03:57Z","published":"2024-07-19T21:01:19Z","title":"Is $F_1$ Score Suboptimal for Cybersecurity Models? Introducing\n $C_{score}$, a Cost-Aware Alternative for Model Assessment","summary":" The cost of errors related to machine learning classifiers, namely, false\npositives and false negatives, are not equal and are application dependent. For\nexample, in cybersecurity applications, the cost of not detecting an attack is\nvery different from marking a benign activity as an attack. Various design\nchoices during machine learning model building, such as hyperparameter tuning\nand model selection, allow a data scientist to trade-off between these two\nerrors. However, most of the commonly used metrics to evaluate model quality,\nsuch as $F_1$ score, which is defined in terms of model precision and recall,\ntreat both these errors equally, making it difficult for users to optimize for\nthe actual cost of these errors. In this paper, we propose a new cost-aware\nmetric, $C_{score}$ based on precision and recall that can replace $F_1$ score\nfor model evaluation and selection. It includes a cost ratio that takes into\naccount the differing costs of handling false positives and false negatives. We\nderive and characterize the new cost metric, and compare it to $F_1$ score.\nFurther, we use this metric for model thresholding for five cybersecurity\nrelated datasets for multiple cost ratios. The results show an average cost\nsavings of 49%.\n","authors":["Manish Marwah","Asad Narayanan","Stephan Jou","Martin Arlitt","Maria Pospelova"],"pdf_url":"https://arxiv.org/pdf/2407.14664v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21658v1","updated":"2024-07-31T14:59:17Z","published":"2024-07-31T14:59:17Z","title":"Beat this! Accurate beat tracking without DBN postprocessing","summary":" We propose a system for tracking beats and downbeats with two objectives:\ngenerality across a diverse music range, and high accuracy. We achieve\ngenerality by training on multiple datasets -- including solo instrument\nrecordings, pieces with time signature changes, and classical music with high\ntempo variations -- and by removing the commonly used Dynamic Bayesian Network\n(DBN) postprocessing, which introduces constraints on the meter and tempo. For\nhigh accuracy, among other improvements, we develop a loss function tolerant to\nsmall time shifts of annotations, and an architecture alternating convolutions\nwith transformers either over frequency or time. Our system surpasses the\ncurrent state of the art in F1 score despite using no DBN. 
However, it can\nstill fail, especially for difficult and underrepresented genres, and performs\nworse on continuity metrics, so we publish our model, code, and preprocessed\ndatasets, and invite others to beat this.\n","authors":["Francesco Foscarin","Jan Schlüter","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2407.21658v1.pdf","comment":"Accepted at the 25th International Society for Music Information\n Retrieval Conference (ISMIR), 2024"},{"id":"http://arxiv.org/abs/2407.21656v1","updated":"2024-07-31T14:57:23Z","published":"2024-07-31T14:57:23Z","title":"Comgra: A Tool for Analyzing and Debugging Neural Networks","summary":" Neural Networks are notoriously difficult to inspect. We introduce comgra, an\nopen source python library for use with PyTorch. Comgra extracts data about the\ninternal activations of a model and organizes it in a GUI (graphical user\ninterface). It can show both summary statistics and individual data points,\ncompare early and late stages of training, focus on individual samples of\ninterest, and visualize the flow of the gradient through the network. This\nmakes it possible to inspect the model's behavior from many different angles\nand save time by rapidly testing different hypotheses without having to rerun\nit. Comgra has applications for debugging, neural architecture design, and\nmechanistic interpretability. We publish our library through Python Package\nIndex (PyPI) and provide code, documentation, and tutorials at\nhttps://github.com/FlorianDietz/comgra.\n","authors":["Florian Dietz","Sophie Fellenz","Dietrich Klakow","Marius Kloft"],"pdf_url":"https://arxiv.org/pdf/2407.21656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19967v3","updated":"2024-07-31T14:54:25Z","published":"2023-10-30T19:30:00Z","title":"Early detection of inflammatory arthritis to improve referrals using\n multimodal machine learning from blood testing, semi-structured and\n unstructured patient records","summary":" Early detection of inflammatory arthritis (IA) is critical to efficient and\naccurate hospital referral triage for timely treatment and preventing the\ndeterioration of the IA disease course, especially under limited healthcare\nresources. The manual assessment process is the most common approach in\npractice for the early detection of IA, but it is extremely labor-intensive and\ninefficient. A large amount of clinical information needs to be assessed for\nevery referral from General Practice (GP) to the hospitals. Machine learning\nshows great potential in automating repetitive assessment tasks and providing\ndecision support for the early detection of IA. However, most machine\nlearning-based methods for IA detection rely on blood testing results. But in\npractice, blood testing data is not always available at the point of referrals,\nso we need methods to leverage multimodal data such as semi-structured and\nunstructured data for early detection of IA. In this research, we present\nfusion and ensemble learning-based methods using multimodal data to assist\ndecision-making in the early detection of IA, and a conformal prediction-based\nmethod to quantify the uncertainty of the prediction and detect any unreliable\npredictions. To the best of our knowledge, our study is the first attempt to\nutilize multimodal data to support the early detection of IA from GP referrals.\n","authors":["Bing Wang","Weizi Li","Anthony Bradlow","Antoni T. Y. 
Chan","Eghosa Bazuaye"],"pdf_url":"https://arxiv.org/pdf/2310.19967v3.pdf","comment":"We found some issues in data preprocessing, which will impact the\n final result. Therefore we would like to withdraw the paper"},{"id":"http://arxiv.org/abs/2407.21652v1","updated":"2024-07-31T14:53:41Z","published":"2024-07-31T14:53:41Z","title":"Spatial Transformer Network YOLO Model for Agricultural Object Detection","summary":" Object detection plays a crucial role in the field of computer vision by\nautonomously identifying and locating objects of interest. The You Only Look\nOnce (YOLO) model is an effective single-shot detector. However, YOLO faces\nchallenges in cluttered or partially occluded scenes and can struggle with\nsmall, low-contrast objects. We propose a new method that integrates spatial\ntransformer networks (STNs) into YOLO to improve performance. The proposed\nSTN-YOLO aims to enhance the model's effectiveness by focusing on important\nareas of the image and improving the spatial invariance of the model before the\ndetection process. Our proposed method improved object detection performance\nboth qualitatively and quantitatively. We explore the impact of different\nlocalization networks within the STN module as well as the robustness of the\nmodel across different spatial transformations. We apply the STN-YOLO on\nbenchmark datasets for Agricultural object detection as well as a new dataset\nfrom a state-of-the-art plant phenotyping greenhouse facility. Our code and\ndataset are publicly available.\n","authors":["Yash Zambre","Ekdev Rajkitkul","Akshatha Mohan","Joshua Peeples"],"pdf_url":"https://arxiv.org/pdf/2407.21652v1.pdf","comment":"7 pages, 5 figures, submitted for review"},{"id":"http://arxiv.org/abs/2407.18569v2","updated":"2024-07-31T14:53:23Z","published":"2024-07-26T07:51:11Z","title":"PP-TIL: Personalized Planning for Autonomous Driving with Instance-based\n Transfer Imitation Learning","summary":" Personalized motion planning holds significant importance within urban\nautomated driving, catering to the unique requirements of individual users.\nNevertheless, prior endeavors have frequently encountered difficulties in\nsimultaneously addressing two crucial aspects: personalized planning within\nintricate urban settings and enhancing planning performance through data\nutilization. The challenge arises from the expensive and limited nature of user\ndata, coupled with the scene state space tending towards infinity. These\nfactors contribute to overfitting and poor generalization problems during model\ntraining. Henceforth, we propose an instance-based transfer imitation learning\napproach. This method facilitates knowledge transfer from extensive expert\ndomain data to the user domain, presenting a fundamental resolution to these\nissues. We initially train a pre-trained model using large-scale expert data.\nSubsequently, during the fine-tuning phase, we feed the batch data, which\ncomprises expert and user data. Employing the inverse reinforcement learning\ntechnique, we extract the style feature distribution from user demonstrations,\nconstructing the regularization term for the approximation of user style. In\nour experiments, we conducted extensive evaluations of the proposed method.\nCompared to the baseline methods, our approach mitigates the overfitting issue\ncaused by sparse user data. 
Furthermore, we discovered that integrating the\ndriving model with a differentiable nonlinear optimizer as a safety protection\nlayer for end-to-end personalized fine-tuning results in superior planning\nperformance.\n","authors":["Fangze Lin","Ying He","Fei Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18569v2.pdf","comment":"IROS 2024 Accepted"},{"id":"http://arxiv.org/abs/2407.21642v1","updated":"2024-07-31T14:41:40Z","published":"2024-07-31T14:41:40Z","title":"Lyapunov weights to convey the meaning of time in physics-informed\n neural networks","summary":" Time is not a dimension as the others. In Physics-Informed Neural Networks\n(PINN) several proposals attempted to adapt the time sampling or time weighting\nto take into account the specifics of this special dimension. But these\nproposals are not principled and need guidance to be used. We explain here\ntheoretically why the Lyapunov exponents give actionable insights and propose a\nweighting scheme to automatically adapt to chaotic, periodic or stable\ndynamics. We characterize theoretically the best weighting scheme under\ncomputational constraints as a cumulative exponential integral of the local\nLyapunov exponent estimators and show that it performs well in practice under\nthe regimes mentioned above.\n","authors":["Gabriel Turinici"],"pdf_url":"https://arxiv.org/pdf/2407.21642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16205v2","updated":"2024-07-31T14:37:05Z","published":"2024-07-23T06:14:41Z","title":"Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models","summary":" The rapid development of Large Language Models (LLMs) has brought remarkable\ngenerative capabilities across diverse tasks. However, despite the impressive\nachievements, these models still have numerous security vulnerabilities,\nparticularly when faced with jailbreak attacks. Therefore, by investigating\njailbreak attacks, we can uncover hidden weaknesses in LLMs and guide us in\ndeveloping more robust defense mechanisms to fortify their security. In this\npaper, we further explore the boundary of jailbreak attacks on LLMs and propose\nAnalyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes\nadvantage of LLMs' growing analyzing and reasoning capability and reveals their\nunderlying vulnerabilities when facing analysis-based tasks. We conduct a\ndetailed evaluation of ABJ across various open-source and closed-source LLMs,\nwhich achieves 94.8% Attack Success Rate (ASR) and 1.06 Attack Efficiency (AE)\non GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and\nefficiency. Our research highlights the importance of prioritizing and\nenhancing the safety of LLMs to mitigate the risks of misuse.The code is\npublicly available at https://github.com/theshi-1128/ABJ-Attack.\n","authors":["Shi Lin","Rongchang Li","Xun Wang","Changting Lin","Wenpeng Xing","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2407.16205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18153v2","updated":"2024-07-31T14:34:43Z","published":"2024-05-28T13:14:26Z","title":"Practical aspects for the creation of an audio dataset from field\n recordings with optimized labeling budget with AI-assisted strategy","summary":" Machine Listening focuses on developing technologies to extract relevant\ninformation from audio signals. A critical aspect of these projects is the\nacquisition and labeling of contextualized data, which is inherently complex\nand requires specific resources and strategies. 
Despite the availability of\nsome audio datasets, many are unsuitable for commercial applications. The paper\nemphasizes the importance of Active Learning (AL) using expert labelers over\ncrowdsourcing, which often lacks detailed insights into dataset structures. AL\nis an iterative process combining human labelers and AI models to optimize the\nlabeling budget by intelligently selecting samples for human review. This\napproach addresses the challenge of handling large, constantly growing datasets\nthat exceed available computational resources and memory. The paper presents a\ncomprehensive data-centric framework for Machine Listening projects, detailing\nthe configuration of recording nodes, database structure, and labeling budget\noptimization in resource-constrained scenarios. Applied to an industrial port\nin Valencia, Spain, the framework successfully labeled 6540 ten-second audio\nsamples over five months with a small team, demonstrating its effectiveness and\nadaptability to various resource availability situations.\n Acknowledgments: The participation of Javier Naranjo-Alcazar, Jordi Grau-Haro\nand Pedro Zuccarello in this research was funded by the Valencian Institute for\nBusiness Competitiveness (IVACE) and the FEDER funds by means of project\nSoroll-IA2 (IMDEEA/2023/91).\n","authors":["Javier Naranjo-Alcazar","Jordi Grau-Haro","Ruben Ribes-Serrano","Pedro Zuccarello"],"pdf_url":"https://arxiv.org/pdf/2405.18153v2.pdf","comment":"Submitted to ICML 2024 Workshop on Data-Centric Machine Learning\n Research"},{"id":"http://arxiv.org/abs/2407.21635v1","updated":"2024-07-31T14:31:49Z","published":"2024-07-31T14:31:49Z","title":"MART: MultiscAle Relational Transformer Networks for Multi-agent\n Trajectory Prediction","summary":" Multi-agent trajectory prediction is crucial to autonomous driving and\nunderstanding the surrounding environment. Learning-based approaches for\nmulti-agent trajectory prediction, such as primarily relying on graph neural\nnetworks, graph transformers, and hypergraph neural networks, have demonstrated\noutstanding performance on real-world datasets in recent years. However, the\nhypergraph transformer-based method for trajectory prediction is yet to be\nexplored. Therefore, we present a MultiscAle Relational Transformer (MART)\nnetwork for multi-agent trajectory prediction. MART is a hypergraph transformer\narchitecture to consider individual and group behaviors in transformer\nmachinery. The core module of MART is the encoder, which comprises a Pair-wise\nRelational Transformer (PRT) and a Hyper Relational Transformer (HRT). The\nencoder extends the capabilities of a relational transformer by introducing\nHRT, which integrates hyperedge features into the transformer mechanism,\npromoting attention weights to focus on group-wise relations. In addition, we\npropose an Adaptive Group Estimator (AGE) designed to infer complex group\nrelations in real-world environments. Extensive experiments on three real-world\ndatasets (NBA, SDD, and ETH-UCY) demonstrate that our method achieves\nstate-of-the-art performance, enhancing ADE/FDE by 3.9%/11.8% on the NBA\ndataset. 
Code is available at https://github.com/gist-ailab/MART.\n","authors":["Seongju Lee","Junseok Lee","Yeonguk Yu","Taeri Kim","Kyoobin Lee"],"pdf_url":"https://arxiv.org/pdf/2407.21635v1.pdf","comment":"19 pages, 12 figures, 7 tables, 8 pages of supplementary material.\n Paper accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.20651v2","updated":"2024-07-31T14:24:20Z","published":"2024-07-30T08:48:49Z","title":"Towards Generalizable Reinforcement Learning via Causality-Guided\n Self-Adaptive Representations","summary":" General intelligence requires quick adaption across tasks. While existing\nreinforcement learning (RL) methods have made progress in generalization, they\ntypically assume only distribution changes between source and target domains.\nIn this paper, we explore a wider range of scenarios where both the\ndistribution and environment spaces may change. For example, in Atari games, we\ntrain agents to generalize to tasks with different levels of mode and\ndifficulty, where there could be new state or action variables that never\noccurred in previous environments. To address this challenging setting, we\nintroduce a causality-guided self-adaptive representation-based approach,\ncalled CSR, that equips the agent to generalize effectively and efficiently\nacross a sequence of tasks with evolving dynamics. Specifically, we employ\ncausal representation learning to characterize the latent causal variables and\nworld models within the RL system. Such compact causal representations uncover\nthe structural relationships among variables, enabling the agent to\nautonomously determine whether changes in the environment stem from\ndistribution shifts or variations in space, and to precisely locate these\nchanges. We then devise a three-step strategy to fine-tune the model under\ndifferent scenarios accordingly. Empirical experiments show that CSR\nefficiently adapts to the target domains with only a few samples and\noutperforms state-of-the-art baselines on a wide range of scenarios, including\nour simulated environments, Cartpole, and Atari games.\n","authors":["Yupei Yang","Biwei Huang","Fan Feng","Xinyue Wang","Shikui Tu","Lei Xu"],"pdf_url":"https://arxiv.org/pdf/2407.20651v2.pdf","comment":"This paper was submitted to NeurIPS24. According to the reviews,\n there are some mistakes in the Theorems in this papers. Moreover, we will\n choose some other environments for experiments, which means that it takes at\n least months to update/rewrite the Experiment & Appendix Sections. So we need\n to withdraw this paper for major revision"},{"id":"http://arxiv.org/abs/2407.21622v1","updated":"2024-07-31T14:15:42Z","published":"2024-07-31T14:15:42Z","title":"Extended Fiducial Inference: Toward an Automated Process of Statistical\n Inference","summary":" While fiducial inference was widely considered a big blunder by R.A. Fisher,\nthe goal he initially set --`inferring the uncertainty of model parameters on\nthe basis of observations' -- has been continually pursued by many\nstatisticians. To this end, we develop a new statistical inference method\ncalled extended Fiducial inference (EFI). The new method achieves the goal of\nfiducial inference by leveraging advanced statistical computing techniques\nwhile remaining scalable for big data. EFI involves jointly imputing random\nerrors realized in observations using stochastic gradient Markov chain Monte\nCarlo and estimating the inverse function using a sparse deep neural network\n(DNN). 
The consistency of the sparse DNN estimator ensures that the uncertainty\nembedded in observations is properly propagated to model parameters through the\nestimated inverse function, thereby validating downstream statistical\ninference. Compared to frequentist and Bayesian methods, EFI offers significant\nadvantages in parameter estimation and hypothesis testing. Specifically, EFI\nprovides higher fidelity in parameter estimation, especially when outliers are\npresent in the observations; and eliminates the need for theoretical reference\ndistributions in hypothesis testing, thereby automating the statistical\ninference process. EFI also provides an innovative framework for\nsemi-supervised learning.\n","authors":["Faming Liang","Sehwan Kim","Yan Sun"],"pdf_url":"https://arxiv.org/pdf/2407.21622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06298v2","updated":"2024-07-31T13:57:36Z","published":"2024-03-10T20:07:14Z","title":"Analysis of Total Variation Minimization for Clustered Federated\n Learning","summary":" A key challenge in federated learning applications is the statistical\nheterogeneity of local datasets. Clustered federated learning addresses this\nchallenge by identifying clusters of local datasets that are approximately\nhomogeneous. One recent approach to clustered federated learning is generalized\ntotal variation minimization (GTVMin). This approach requires a similarity\ngraph which can be obtained by domain expertise or in a data-driven fashion via\ngraph learning techniques. Under a widely applicable clustering assumption, we\nderive an upper bound the deviation between GTVMin solutions and their\ncluster-wise averages. This bound provides valuable insights into the\neffectiveness and robustness of GTVMin in addressing statistical heterogeneity\nwithin federated learning environments.\n","authors":["A. Jung"],"pdf_url":"https://arxiv.org/pdf/2403.06298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09175v3","updated":"2024-07-31T13:55:49Z","published":"2023-08-17T20:27:33Z","title":"Diversifying AI: Towards Creative Chess with AlphaZero","summary":" In recent years, Artificial Intelligence (AI) systems have surpassed human\nintelligence in a variety of computational tasks. However, AI systems, like\nhumans, make mistakes, have blind spots, hallucinate, and struggle to\ngeneralize to new situations. This work explores whether AI can benefit from\ncreative decision-making mechanisms when pushed to the limits of its\ncomputational rationality. In particular, we investigate whether a team of\ndiverse AI systems can outperform a single AI in challenging tasks by\ngenerating more ideas as a group and then selecting the best ones. We study\nthis question in the game of chess, the so-called drosophila of AI. We build on\nAlphaZero (AZ) and extend it to represent a league of agents via a\nlatent-conditioned architecture, which we call AZ_db. We train AZ_db to\ngenerate a wider range of ideas using behavioral diversity techniques and\nselect the most promising ones with sub-additive planning. Our experiments\nsuggest that AZ_db plays chess in diverse ways, solves more puzzles as a group\nand outperforms a more homogeneous team. Notably, AZ_db solves twice as many\nchallenging puzzles as AZ, including the challenging Penrose positions. When\nplaying chess from different openings, we notice that players in AZ_db\nspecialize in different openings, and that selecting a player for each opening\nusing sub-additive planning results in a 50 Elo improvement over AZ. 
Our\nfindings suggest that diversity bonuses emerge in teams of AI agents, just as\nthey do in teams of humans and that diversity is a valuable asset in solving\ncomputationally hard problems.\n","authors":["Tom Zahavy","Vivek Veeriah","Shaobo Hou","Kevin Waugh","Matthew Lai","Edouard Leurent","Nenad Tomasev","Lisa Schut","Demis Hassabis","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2308.09175v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21609v1","updated":"2024-07-31T13:47:53Z","published":"2024-07-31T13:47:53Z","title":"Ironing the Graphs: Toward a Correct Geometric Analysis of Large-Scale\n Graphs","summary":" Graph embedding approaches attempt to project graphs into geometric entities,\ni.e, manifolds. The idea is that the geometric properties of the projected\nmanifolds are helpful in the inference of graph properties. However, if the\nchoice of the embedding manifold is incorrectly performed, it can lead to\nincorrect geometric inference. In this paper, we argue that the classical\nembedding techniques cannot lead to correct geometric interpretation as they\nmiss the curvature at each point, of manifold. We advocate that for doing\ncorrect geometric interpretation the embedding of graph should be done over\nregular constant curvature manifolds. To this end, we present an embedding\napproach, the discrete Ricci flow graph embedding (dRfge) based on the discrete\nRicci flow that adapts the distance between nodes in a graph so that the graph\ncan be embedded onto a constant curvature manifold that is homogeneous and\nisotropic, i.e., all directions are equivalent and distances comparable,\nresulting in correct geometric interpretations. A major contribution of this\npaper is that for the first time, we prove the convergence of discrete Ricci\nflow to a constant curvature and stable distance metrics over the edges. A\ndrawback of using the discrete Ricci flow is the high computational complexity\nthat prevented its usage in large-scale graph analysis. Another contribution of\nthis paper is a new algorithmic solution that makes it feasible to calculate\nthe Ricci flow for graphs of up to 50k nodes, and beyond. The intuitions behind\nthe discrete Ricci flow make it possible to obtain new insights into the\nstructure of large-scale graphs. We demonstrate this through a case study on\nanalyzing the internet connectivity structure between countries at the BGP\nlevel.\n","authors":["Saloua Naama","Kavé Salamatian","Francesco Bronzino"],"pdf_url":"https://arxiv.org/pdf/2407.21609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05954v2","updated":"2024-07-31T13:41:15Z","published":"2023-09-11T08:39:57Z","title":"A comparison between black-, grey- and white-box modeling for the\n bidirectional Raman amplifier optimization","summary":" Designing and optimizing optical amplifiers to maximize system performance is\nbecoming increasingly important as optical communication systems strive to\nincrease throughput. Offline optimization of optical amplifiers relies on\nmodels ranging from white-box models deeply rooted in physics to black-box\ndata-driven and physics-agnostic models. Here, we compare the capabilities of\nwhite-, grey- and black-box models on the challenging test case of optimizing a\nbidirectional distributed Raman amplifier to achieve a target\nfrequency-distance signal power profile. 
We show that any of the studied\nmethods can achieve similar frequency and distance flatness of between 1 and\n3.6 dB (depending on the definition of flatness) over the C-band in an 80-km\nspan. Then, we discuss the models' applicability, advantages, and drawbacks\nbased on the target application scenario, in particular in terms of\nflexibility, optimization speed, and access to training data.\n","authors":["Metodi P. Yankov","Mehran Soltani","Andrea Carena","Darko Zibar","Francesco Da Ros"],"pdf_url":"https://arxiv.org/pdf/2310.05954v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21602v1","updated":"2024-07-31T13:37:04Z","published":"2024-07-31T13:37:04Z","title":"Higher order quantum reservoir computing for non-intrusive reduced-order\n models","summary":" Forecasting dynamical systems is of importance to numerous real-world\napplications. When possible, dynamical systems forecasts are constructed based\non first-principles-based models such as through the use of differential\nequations. When these equations are unknown, non-intrusive techniques must be\nutilized to build predictive models from data alone. Machine learning (ML)\nmethods have recently been used for such tasks. Moreover, ML methods provide\nthe added advantage of significant reductions in time-to-solution for\npredictions in contrast with first-principle based models. However, many\nstate-of-the-art ML-based methods for forecasting rely on neural networks,\nwhich may be expensive to train and necessitate requirements for large amounts\nof memory. In this work, we propose a quantum mechanics inspired ML modeling\nstrategy for learning nonlinear dynamical systems that provides data-driven\nforecasts for complex dynamical systems with reduced training time and memory\ncosts. This approach, denoted the quantum reservoir computing technique (QRC),\nis a hybrid quantum-classical framework employing an ensemble of interconnected\nsmall quantum systems via classical linear feedback connections. By mapping the\ndynamical state to a suitable quantum representation amenable to unitary\noperations, QRC is able to predict complex nonlinear dynamical systems in a\nstable and accurate manner. We demonstrate the efficacy of this framework\nthrough benchmark forecasts of the NOAA Optimal Interpolation Sea Surface\nTemperature dataset and compare the performance of QRC to other ML methods.\n","authors":["Vinamr Jain","Romit Maulik"],"pdf_url":"https://arxiv.org/pdf/2407.21602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21590v1","updated":"2024-07-31T13:26:09Z","published":"2024-07-31T13:26:09Z","title":"Measuring What Matters: Intrinsic Distance Preservation as a Robust\n Metric for Embedding Quality","summary":" Unsupervised embeddings are fundamental to numerous machine learning\napplications, yet their evaluation remains a challenging task. Traditional\nassessment methods often rely on extrinsic variables, such as performance in\ndownstream tasks, which can introduce confounding factors and mask the true\nquality of embeddings. This paper introduces the Intrinsic Distance\nPreservation Evaluation (IDPE) method, a novel approach for assessing embedding\nquality based on the preservation of Mahalanobis distances between data points\nin the original and embedded spaces. We demonstrate the limitations of\nextrinsic evaluation methods through a simple example, highlighting how they\ncan lead to misleading conclusions about embedding quality. 
IDPE addresses\nthese issues by providing a task-independent measure of how well embeddings\npreserve the intrinsic structure of the original data. Our method leverages\nefficient similarity search techniques to make it applicable to large-scale\ndatasets. We compare IDPE with established intrinsic metrics like\ntrustworthiness and continuity, as well as extrinsic metrics such as Average\nRank and Mean Reciprocal Rank. Our results show that IDPE offers a more\ncomprehensive and reliable assessment of embedding quality across various\nscenarios. We evaluate PCA and t-SNE embeddings using IDPE, revealing insights\ninto their performance that are not captured by traditional metrics. This work\ncontributes to the field by providing a robust, efficient, and interpretable\nmethod for embedding evaluation. IDPE's focus on intrinsic properties offers a\nvaluable tool for researchers and practitioners seeking to develop and assess\nhigh-quality embeddings for diverse machine learning applications.\n","authors":["Steven N. Hart","Thomas E. Tavolara"],"pdf_url":"https://arxiv.org/pdf/2407.21590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01352v2","updated":"2024-07-31T13:18:13Z","published":"2024-06-03T14:16:56Z","title":"Position: An Inner Interpretability Framework for AI Inspired by Lessons\n from Cognitive Neuroscience","summary":" Inner Interpretability is a promising emerging field tasked with uncovering\nthe inner mechanisms of AI systems, though how to develop these mechanistic\ntheories is still much debated. Moreover, recent critiques raise issues that\nquestion its usefulness to advance the broader goals of AI. However, it has\nbeen overlooked that these issues resemble those that have been grappled with\nin another field: Cognitive Neuroscience. Here we draw the relevant connections\nand highlight lessons that can be transferred productively between fields.\nBased on these, we propose a general conceptual framework and give concrete\nmethodological strategies for building mechanistic explanations in AI inner\ninterpretability research. With this conceptual framework, Inner\nInterpretability can fend off critiques and position itself on a productive\npath to explain AI systems.\n","authors":["Martina G. Vilas","Federico Adolfi","David Poeppel","Gemma Roig"],"pdf_url":"https://arxiv.org/pdf/2406.01352v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2407.12223v3","updated":"2024-07-31T12:49:17Z","published":"2024-07-17T00:25:35Z","title":"Conditional Quantile Estimation for Uncertain Watch Time in Short-Video\n Recommendation","summary":" Accurately predicting watch time is crucial for optimizing recommendations\nand user experience in short video platforms. However, existing methods that\nestimate a single average watch time often fail to capture the inherent\nuncertainty and diversity in user engagement patterns. In this paper, we\npropose the Conditional Quantile Estimation (CQE) framework to model the entire\nconditional distribution of watch time. Using quantile regression, CQE\ncharacterizes the complex watch-time distribution for each user-video pair,\nproviding a flexible and comprehensive approach to understanding user behavior.\nWe further design multiple strategies to combine the quantile estimates,\nadapting to different recommendation scenarios and user preferences. Extensive\noffline experiments and online A/B tests demonstrate the superiority of CQE in\nwatch time prediction and user engagement modeling. 
In particular, the online\ndeployment of CQE in KuaiShow has led to significant improvements in key\nevaluation metrics, including active days, active users, engagement duration,\nand video view counts. These results highlight the practical impact of our\nproposed approach in enhancing the user experience and overall performance of\nthe short video recommendation system. The code will be released after\npublication.\n","authors":["Chengzhi Lin","Shuchang Liu","Chuyuan Wang","Yongqi Liu"],"pdf_url":"https://arxiv.org/pdf/2407.12223v3.pdf","comment":"8 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.21565v1","updated":"2024-07-31T12:41:20Z","published":"2024-07-31T12:41:20Z","title":"Multi-agent reinforcement learning for the control of three-dimensional\n Rayleigh-Bénard convection","summary":" Deep reinforcement learning (DRL) has found application in numerous use-cases\npertaining to flow control. Multi-agent RL (MARL), a variant of DRL, has shown\nto be more effective than single-agent RL in controlling flows exhibiting\nlocality and translational invariance. We present, for the first time, an\nimplementation of MARL-based control of three-dimensional Rayleigh-B\\'enard\nconvection (RBC). Control is executed by modifying the temperature distribution\nalong the bottom wall divided into multiple control segments, each of which\nacts as an independent agent. Two regimes of RBC are considered at Rayleigh\nnumbers $\\mathrm{Ra}=500$ and $750$. Evaluation of the learned control policy\nreveals a reduction in convection intensity by $23.5\\%$ and $8.7\\%$ at\n$\\mathrm{Ra}=500$ and $750$, respectively. The MARL controller converts\nirregularly shaped convective patterns to regular straight rolls with lower\nconvection that resemble flow in a relatively more stable regime. We draw\ncomparisons with proportional control at both $\\mathrm{Ra}$ and show that MARL\nis able to outperform the proportional controller. The learned control strategy\nis complex, featuring different non-linear segment-wise actuator delays and\nactuation magnitudes. We also perform successful evaluations on a larger domain\nthan used for training, demonstrating that the invariant property of MARL\nallows direct transfer of the learnt policy.\n","authors":["Joel Vasanth","Jean Rabault","Francisco Alcántara-Ávila","Mikael Mortensen","Ricardo Vinuesa"],"pdf_url":"https://arxiv.org/pdf/2407.21565v1.pdf","comment":"Submitted to the special issue titled 'Machine Learning for Fluid\n Dynamics' in the journal Flow, Turbulence and Combusion. 39 pages and 20\n figures"},{"id":"http://arxiv.org/abs/2407.21553v1","updated":"2024-07-31T12:22:40Z","published":"2024-07-31T12:22:40Z","title":"CXSimulator: A User Behavior Simulation using LLM Embeddings for\n Web-Marketing Campaign Assessment","summary":" This paper presents the Customer Experience (CX) Simulator, a novel framework\ndesigned to assess the effects of untested web-marketing campaigns through user\nbehavior simulations. The proposed framework leverages large language models\n(LLMs) to represent various events in a user's behavioral history, such as\nviewing an item, applying a coupon, or purchasing an item, as semantic\nembedding vectors. We train a model to predict transitions between events from\ntheir LLM embeddings, which can even generalize to unseen events by learning\nfrom diverse training data. 
In web-marketing applications, we leverage this\ntransition prediction model to simulate how users might react differently when\nnew campaigns or products are presented to them. This allows us to eliminate\nthe need for costly online testing and enhance the marketers' abilities to\nreveal insights. Our numerical evaluation and user study, utilizing BigQuery\nPublic Datasets from the Google Merchandise Store, demonstrate the\neffectiveness of our framework.\n","authors":["Akira Kasuga","Ryo Yonetani"],"pdf_url":"https://arxiv.org/pdf/2407.21553v1.pdf","comment":"5 pages, 2 figures, 1 table, the 33rd ACM International Conference on\n Information and Knowledge Management (CIKM '24)"},{"id":"http://arxiv.org/abs/2311.08549v2","updated":"2024-07-31T12:10:56Z","published":"2023-11-14T21:21:35Z","title":"Manifold learning in Wasserstein space","summary":" This paper aims at building the theoretical foundations for manifold learning\nalgorithms in the space of absolutely continuous probability measures on a\ncompact and convex subset of $\\mathbb{R}^d$, metrized with the Wasserstein-2\ndistance $\\mathrm{W}$. We begin by introducing a construction of submanifolds\n$\\Lambda$ of probability measures equipped with metric $\\mathrm{W}_\\Lambda$,\nthe geodesic restriction of $W$ to $\\Lambda$. In contrast to other\nconstructions, these submanifolds are not necessarily flat, but still allow for\nlocal linearizations in a similar fashion to Riemannian submanifolds of\n$\\mathbb{R}^d$. We then show how the latent manifold structure of\n$(\\Lambda,\\mathrm{W}_{\\Lambda})$ can be learned from samples\n$\\{\\lambda_i\\}_{i=1}^N$ of $\\Lambda$ and pairwise extrinsic Wasserstein\ndistances $\\mathrm{W}$ only. In particular, we show that the metric space\n$(\\Lambda,\\mathrm{W}_{\\Lambda})$ can be asymptotically recovered in the sense\nof Gromov--Wasserstein from a graph with nodes $\\{\\lambda_i\\}_{i=1}^N$ and edge\nweights $W(\\lambda_i,\\lambda_j)$. In addition, we demonstrate how the tangent\nspace at a sample $\\lambda$ can be asymptotically recovered via spectral\nanalysis of a suitable \"covariance operator\" using optimal transport maps from\n$\\lambda$ to sufficiently close and diverse samples $\\{\\lambda_i\\}_{i=1}^N$.\nThe paper closes with some explicit constructions of submanifolds $\\Lambda$ and\nnumerical examples on the recovery of tangent spaces through spectral analysis.\n","authors":["Keaton Hamm","Caroline Moosmüller","Bernhard Schmitzer","Matthew Thorpe"],"pdf_url":"https://arxiv.org/pdf/2311.08549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21546v1","updated":"2024-07-31T12:09:33Z","published":"2024-07-31T12:09:33Z","title":"Black box meta-learning intrinsic rewards for sparse-reward environments","summary":" Despite the successes and progress of deep reinforcement learning over the\nlast decade, several challenges remain that hinder its broader application.\nSome fundamental aspects to improve include data efficiency, generalization\ncapability, and ability to learn in sparse-reward environments, which often\nrequire human-designed dense rewards. Meta-learning has emerged as a promising\napproach to address these issues by optimizing components of the learning\nalgorithm to meet desired characteristics. Additionally, a different line of\nwork has extensively studied the use of intrinsic rewards to enhance the\nexploration capabilities of algorithms. This work investigates how\nmeta-learning can improve the training signal received by RL agents. 
The focus\nis on meta-learning intrinsic rewards under a framework that doesn't rely on\nthe use of meta-gradients. We analyze and compare this approach to the use of\nextrinsic rewards and a meta-learned advantage function. The developed\nalgorithms are evaluated on distributions of continuous control tasks with both\nparametric and non-parametric variations, and with only sparse rewards\naccessible for the evaluation tasks.\n","authors":["Octavio Pappalardo","Rodrigo Ramele","Juan Miguel Santos"],"pdf_url":"https://arxiv.org/pdf/2407.21546v1.pdf","comment":"This work is part of OP Bachelor's Degree Thesis"},{"id":"http://arxiv.org/abs/2407.21535v1","updated":"2024-07-31T11:44:54Z","published":"2024-07-31T11:44:54Z","title":"Probabilistic Scoring Lists for Interpretable Machine Learning","summary":" A scoring system is a simple decision model that checks a set of features,\nadds a certain number of points to a total score for each feature that is\nsatisfied, and finally makes a decision by comparing the total score to a\nthreshold. Scoring systems have a long history of active use in safety-critical\ndomains such as healthcare and justice, where they provide guidance for making\nobjective and accurate decisions. Given their genuine interpretability, the\nidea of learning scoring systems from data is obviously appealing from the\nperspective of explainable AI. In this paper, we propose a practically\nmotivated extension of scoring systems called probabilistic scoring lists\n(PSL), as well as a method for learning PSLs from data. Instead of making a\ndeterministic decision, a PSL represents uncertainty in the form of probability\ndistributions, or, more generally, probability intervals. Moreover, in the\nspirit of decision lists, a PSL evaluates features one by one and stops as soon\nas a decision can be made with enough confidence. To evaluate our approach, we\nconduct a case study in the medical domain.\n","authors":["Jonas Hanselle","Stefan Heid","Johannes Fürnkranz","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2407.21535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21530v1","updated":"2024-07-31T11:26:57Z","published":"2024-07-31T11:26:57Z","title":"Data Contamination Report from the 2024 CONDA Shared Task","summary":" The 1st Workshop on Data Contamination (CONDA 2024) focuses on all relevant\naspects of data contamination in natural language processing, where data\ncontamination is understood as situations where evaluation data is included in\npre-training corpora used to train large scale models, compromising evaluation\nresults. The workshop fostered a shared task to collect evidence on data\ncontamination in current available datasets and models. The goal of the shared\ntask and associated database is to assist the community in understanding the\nextent of the problem and to assist researchers in avoiding reporting\nevaluation results on known contaminated resources. The shared task provides a\nstructured, centralized public database for the collection of contamination\nevidence, open to contributions from the community via GitHub pool requests.\nThis first compilation paper is based on 566 reported entries over 91\ncontaminated sources from a total of 23 contributors. The details of the\nindividual contamination events are available in the platform. 
The platform\ncontinues to be online, open to contributions from the community.\n","authors":["Oscar Sainz","Iker García-Ferrero","Alon Jacovi","Jon Ander Campos","Yanai Elazar","Eneko Agirre","Yoav Goldberg","Wei-Lin Chen","Jenny Chim","Leshem Choshen","Luca D'Amico-Wong","Melissa Dell","Run-Ze Fan","Shahriar Golchin","Yucheng Li","Pengfei Liu","Bhavish Pahwa","Ameya Prabhu","Suryansh Sharma","Emily Silcock","Kateryna Solonko","David Stap","Mihai Surdeanu","Yu-Min Tseng","Vishaal Udandarao","Zengzhi Wang","Ruijie Xu","Jinglin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21530v1.pdf","comment":"https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database"},{"id":"http://arxiv.org/abs/2405.19024v2","updated":"2024-07-31T11:06:56Z","published":"2024-05-29T12:07:17Z","title":"Inverse Concave-Utility Reinforcement Learning is Inverse Game Theory","summary":" We consider inverse reinforcement learning problems with concave utilities.\nConcave Utility Reinforcement Learning (CURL) is a generalisation of the\nstandard RL objective, which employs a concave function of the state occupancy\nmeasure, rather than a linear function. CURL has garnered recent attention for\nits ability to represent instances of many important applications including the\nstandard RL such as imitation learning, pure exploration, constrained MDPs,\noffline RL, human-regularized RL, and others. Inverse reinforcement learning is\na powerful paradigm that focuses on recovering an unknown reward function that\ncan rationalize the observed behaviour of an agent. There has been recent\ntheoretical advances in inverse RL where the problem is formulated as\nidentifying the set of feasible reward functions. However, inverse RL for CURL\nproblems has not been considered previously. In this paper we show that most of\nthe standard IRL results do not apply to CURL in general, since CURL\ninvalidates the classical Bellman equations. This calls for a new theoretical\nframework for the inverse CURL problem. Using a recent equivalence result\nbetween CURL and Mean-field Games, we propose a new definition for the feasible\nrewards for I-CURL by proving that this problem is equivalent to an inverse\ngame theory problem in a subclass of mean-field games. We present initial query\nand sample complexity results for the I-CURL problem under assumptions such as\nLipschitz-continuity. Finally, we outline future directions and applications in\nhuman--AI collaboration enabled by our results.\n","authors":["Mustafa Mert Çelikok","Frans A. Oliehoek","Jan-Willem van de Meent"],"pdf_url":"https://arxiv.org/pdf/2405.19024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21523v1","updated":"2024-07-31T10:56:20Z","published":"2024-07-31T10:56:20Z","title":"Tabular Data Augmentation for Machine Learning: Progress and Prospects\n of Embracing Generative AI","summary":" Machine learning (ML) on tabular data is ubiquitous, yet obtaining abundant\nhigh-quality tabular data for model training remains a significant obstacle.\nNumerous works have focused on tabular data augmentation (TDA) to enhance the\noriginal table with additional data, thereby improving downstream ML tasks.\nRecently, there has been a growing interest in leveraging the capabilities of\ngenerative AI for TDA. Therefore, we believe it is time to provide a\ncomprehensive review of the progress and future prospects of TDA, with a\nparticular emphasis on the trending generative AI. 
Specifically, we present an\narchitectural view of the TDA pipeline, comprising three main procedures:\npre-augmentation, augmentation, and post-augmentation. Pre-augmentation\nencompasses preparation tasks that facilitate subsequent TDA, including error\nhandling, table annotation, table simplification, table representation, table\nindexing, table navigation, schema matching, and entity matching. Augmentation\nsystematically analyzes current TDA methods, categorized into retrieval-based\nmethods, which retrieve external data, and generation-based methods, which\ngenerate synthetic data. We further subdivide these methods based on the\ngranularity of the augmentation process at the row, column, cell, and table\nlevels. Post-augmentation focuses on the datasets, evaluation and optimization\naspects of TDA. We also summarize current trends and future directions for TDA,\nhighlighting promising opportunities in the era of generative AI. In addition,\nthe accompanying papers and related resources are continuously updated and\nmaintained in the GitHub repository at\nhttps://github.com/SuDIS-ZJU/awesome-tabular-data-augmentation to reflect\nongoing advancements in the field.\n","authors":["Lingxi Cui","Huan Li","Ke Chen","Lidan Shou","Gang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.21523v1.pdf","comment":"repository maintained at\n https://github.com/SuDIS-ZJU/awesome-tabular-data-augmentation"},{"id":"http://arxiv.org/abs/2312.00125v3","updated":"2024-07-31T10:51:03Z","published":"2023-11-30T19:00:02Z","title":"Scalable Bayesian uncertainty quantification with data-driven priors for\n radio interferometric imaging","summary":" Next-generation radio interferometers like the Square Kilometer Array have\nthe potential to unlock scientific discoveries thanks to their unprecedented\nangular resolution and sensitivity. One key to unlocking their potential\nresides in handling the deluge and complexity of incoming data. This challenge\nrequires building radio interferometric imaging methods that can cope with the\nmassive data sizes and provide high-quality image reconstructions with\nuncertainty quantification (UQ). This work proposes a method coined QuantifAI\nto address UQ in radio-interferometric imaging with data-driven (learned)\npriors for high-dimensional settings. Our model, rooted in the Bayesian\nframework, uses a physically motivated model for the likelihood. The model\nexploits a data-driven convex prior, which can encode complex information\nlearned implicitly from simulations and guarantee the log-concavity of the\nposterior. We leverage probability concentration phenomena of high-dimensional\nlog-concave posteriors that let us obtain information about the posterior,\navoiding MCMC sampling techniques. We rely on convex optimisation methods to\ncompute the MAP estimation, which is known to be faster and better scale with\ndimension than MCMC sampling strategies. Our method allows us to compute local\ncredible intervals, i.e., Bayesian error bars, and perform hypothesis testing\nof structure on the reconstructed image. In addition, we propose a novel\nblazing-fast method to compute pixel-wise uncertainties at different scales. We\ndemonstrate our method by reconstructing radio-interferometric images in a\nsimulated setting and carrying out fast and scalable UQ, which we validate with\nMCMC sampling. 
Our method shows an improved image quality and more meaningful\nuncertainties than the benchmark method based on a sparsity-promoting prior.\nQuantifAI's source code: https://github.com/astro-informatics/QuantifAI.\n","authors":["Tobías I. Liaudat","Matthijs Mars","Matthew A. Price","Marcelo Pereyra","Marta M. Betcke","Jason D. McEwen"],"pdf_url":"https://arxiv.org/pdf/2312.00125v3.pdf","comment":"30 pages, 14 figures, 10 tables, code available at\n https://github.com/astro-informatics/QuantifAI"},{"id":"http://arxiv.org/abs/2407.15425v2","updated":"2024-07-31T10:27:37Z","published":"2024-07-22T07:02:15Z","title":"Empirical Capacity Model for Self-Attention Neural Networks","summary":" Large pretrained self-attention neural networks, or transformers, have been\nvery successful in various tasks recently. The performance of a model on a\ngiven task depends on its ability to memorize and generalize the training data.\nLarge transformer models, which may have billions of parameters, in theory have\na huge capacity to memorize content. However, the current algorithms for the\noptimization fall short of the theoretical capacity, and the capacity is also\nhighly dependent on the content. In this paper, we focus on the memory capacity\nof these models obtained using common training algorithms and synthetic\ntraining data. Based on the results, we derive an empirical capacity model\n(ECM) for a generic transformer. The ECM can be used to design task-specific\ntransformer models with an optimal number of parameters in cases where the\ntarget memorization capability of the task can be defined.\n","authors":["Aki Härmä","Marcin Pietrasik","Anna Wilbik"],"pdf_url":"https://arxiv.org/pdf/2407.15425v2.pdf","comment":"Submitted to BNAIC'24, 14 pages + refs"},{"id":"http://arxiv.org/abs/2104.05467v2","updated":"2024-07-31T10:26:55Z","published":"2021-04-12T13:42:50Z","title":"Understanding Prediction Discrepancies in Machine Learning Classifiers","summary":" A multitude of classifiers can be trained on the same data to achieve similar\nperformances during test time, while having learned significantly different\nclassification patterns. This phenomenon, which we call prediction\ndiscrepancies, is often associated with the blind selection of one model\ninstead of another with similar performances. When making a choice, the machine\nlearning practitioner has no understanding on the differences between models,\ntheir limits, where they agree and where they don't. But his/her choice will\nresult in concrete consequences for instances to be classified in the\ndiscrepancy zone, since the final decision will be based on the selected\nclassification pattern. Besides the arbitrary nature of the result, a bad\nchoice could have further negative consequences such as loss of opportunity or\nlack of fairness. This paper proposes to address this question by analyzing the\nprediction discrepancies in a pool of best-performing models trained on the\nsame data. A model-agnostic algorithm, DIG, is proposed to capture and explain\ndiscrepancies locally, to enable the practitioner to make the best educated\ndecision when selecting a model by anticipating its potential undesired\nconsequences. 
All the code to reproduce the experiments is available.\n","authors":["Xavier Renard","Thibault Laugel","Marcin Detyniecki"],"pdf_url":"https://arxiv.org/pdf/2104.05467v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21507v1","updated":"2024-07-31T10:25:24Z","published":"2024-07-31T10:25:24Z","title":"FSSC: Federated Learning of Transformer Neural Networks for Semantic\n Image Communication","summary":" In this paper, we address the problem of image semantic communication in a\nmulti-user deployment scenario and propose a federated learning (FL) strategy\nfor a Swin Transformer-based semantic communication system (FSSC). Firstly, we\ndemonstrate that the adoption of a Swin Transformer for joint source-channel\ncoding (JSCC) effectively extracts semantic information in the communication\nsystem. Next, the FL framework is introduced to collaboratively learn a global\nmodel by aggregating local model parameters, rather than directly sharing\nclients' data. This approach enhances user privacy protection and reduces the\nworkload on the server or mobile edge. Simulation evaluations indicate that our\nmethod outperforms the typical JSCC algorithm and traditional separate-based\ncommunication algorithms. Particularly after integrating local semantics, the\nglobal aggregation model has further increased the Peak Signal-to-Noise Ratio\n(PSNR) by more than 2dB, thoroughly proving the effectiveness of our algorithm.\n","authors":["Yuna Yan","Xin Zhang","Lixin Li","Wensheng Lin","Rui Li","Wenchi Cheng","Zhu Han"],"pdf_url":"https://arxiv.org/pdf/2407.21507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21503v1","updated":"2024-07-31T10:21:20Z","published":"2024-07-31T10:21:20Z","title":"Root Cause Analysis Of Productivity Losses In Manufacturing Systems\n Utilizing Ensemble Machine Learning","summary":" In today's rapidly evolving landscape of automation and manufacturing\nsystems, the efficient resolution of productivity losses is paramount. This\nstudy introduces a data-driven ensemble approach, utilizing the cyclic\nmultivariate time series data from binary sensors and signals from Programmable\nLogic Controllers (PLCs) within these systems. The objective is to\nautomatically analyze productivity losses per cycle and pinpoint their root\ncauses by assigning the loss to a system element. The ensemble approach\nintroduced in this publication integrates various methods, including\ninformation theory and machine learning behavior models, to provide a robust\nanalysis for each production cycle. To expedite the resolution of productivity\nlosses and ensure short response times, stream processing becomes a necessity.\nAddressing this, the approach is implemented as data-stream analysis and can be\ntransferred to batch processing, seamlessly integrating into existing systems\nwithout the need for extensive historical data analysis. This method has two\npositive effects. Firstly, the result of the analysis ensures that the period\nof lower productivity is reduced by identifying the likely root cause of the\nproductivity loss. Secondly, these results are more reliable due to the\nensemble approach and therefore avoid dependency on technical experts. The\napproach is validated using a semi-automated welding manufacturing system, an\ninjection molding automation system, and a synthetically generated test PLC\ndataset. 
The results demonstrate the method's efficacy in offering a\ndata-driven understanding of process behavior and mark an advancement in\nautonomous manufacturing system analysis.\n","authors":["Jonas Gram","Brandon K. Sai","Thomas Bauernhansl"],"pdf_url":"https://arxiv.org/pdf/2407.21503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11301v2","updated":"2024-07-31T10:18:50Z","published":"2024-06-17T08:08:11Z","title":"Enhancing and Assessing Instruction-Following with Fine-Grained\n Instruction Variants","summary":" The effective alignment of Large Language Models (LLMs) with precise\ninstructions is essential for their application in diverse real-world\nscenarios. Current methods focus on enhancing the diversity and complexity of\ntraining and evaluation samples, yet they fall short in accurately assessing\nLLMs' ability to follow similar instruction variants. We introduce an effective\ndata augmentation technique that decomposes complex instructions into simpler\nsub-components, modifies these, and reconstructs them into new variants,\nthereby preserves the original instruction's context and complexity while\nintroducing variability, which is critical for training and evaluating LLMs'\ninstruction-following precision. We developed the DeMoRecon dataset using this\nmethod to both fine-tune and evaluate LLMs. Our findings show that LLMs\nfine-tuned with DeMoRecon will gain significant performance boost on both ours\nand commonly used instructions-following benchmarks.\n","authors":["Jiuding Yang","Weidong Guo","Kaitong Yang","Xiangyang Li","Zhuwei Rao","Yu Xu","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2406.11301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07822v2","updated":"2024-07-31T10:00:58Z","published":"2024-05-13T15:07:52Z","title":"Synthetic Tabular Data Validation: A Divergence-Based Approach","summary":" The ever-increasing use of generative models in various fields where tabular\ndata is used highlights the need for robust and standardized validation metrics\nto assess the similarity between real and synthetic data. Current methods lack\na unified framework and rely on diverse and often inconclusive statistical\nmeasures. Divergences, which quantify discrepancies between data distributions,\noffer a promising avenue for validation. However, traditional approaches\ncalculate divergences independently for each feature due to the complexity of\njoint distribution modeling. This paper addresses this challenge by proposing a\nnovel approach that uses divergence estimation to overcome the limitations of\nmarginal comparisons. Our core contribution lies in applying a divergence\nestimator to build a validation metric considering the joint distribution of\nreal and synthetic data. We leverage a probabilistic classifier to approximate\nthe density ratio between datasets, allowing the capture of complex\nrelationships. We specifically calculate two divergences: the well-known\nKullback-Leibler (KL) divergence and the Jensen-Shannon (JS) divergence. KL\ndivergence offers an established use in the field, while JS divergence is\nsymmetric and bounded, providing a reliable metric. The efficacy of this\napproach is demonstrated through a series of experiments with varying\ndistribution complexities. The initial phase involves comparing estimated\ndivergences with analytical solutions for simple distributions, setting a\nbenchmark for accuracy. 
Finally, we validate our method on a real-world dataset\nand its corresponding synthetic counterpart, showcasing its effectiveness in\npractical applications. This research offers a significant contribution with\napplicability beyond tabular data and the potential to improve synthetic data\nvalidation in various fields.\n","authors":["Patricia A. Apellániz","Ana Jiménez","Borja Arroyo Galende","Juan Parras","Santiago Zazo"],"pdf_url":"https://arxiv.org/pdf/2405.07822v2.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2407.21490v1","updated":"2024-07-31T09:59:20Z","published":"2024-07-31T09:59:20Z","title":"Explainable and Controllable Motion Curve Guided Cardiac Ultrasound\n Video Generation","summary":" Echocardiography video is a primary modality for diagnosing heart diseases,\nbut the limited data poses challenges for both clinical teaching and machine\nlearning training. Recently, video generative models have emerged as a\npromising strategy to alleviate this issue. However, previous methods often\nrelied on holistic conditions during generation, hindering the flexible\nmovement control over specific cardiac structures. In this context, we propose\nan explainable and controllable method for echocardiography video generation,\ntaking an initial frame and a motion curve as guidance. Our contributions are\nthree-fold. First, we extract motion information from each heart substructure\nto construct motion curves, enabling the diffusion model to synthesize\ncustomized echocardiography videos by modifying these curves. Second, we\npropose the structure-to-motion alignment module, which can map semantic\nfeatures onto motion curves across cardiac structures. Third, The\nposition-aware attention mechanism is designed to enhance video consistency\nutilizing Gaussian masks with structural position information. Extensive\nexperiments on three echocardiography datasets show that our method outperforms\nothers regarding fidelity and consistency. The full code will be released at\nhttps://github.com/mlmi-2024-72/ECM.\n","authors":["Junxuan Yu","Rusi Chen","Yongsong Zhou","Yanlin Chen","Yaofei Duan","Yuhao Huang","Han Zhou","Tan Tao","Xin Yang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2407.21490v1.pdf","comment":"Accepted by MICCAI MLMI 2024"},{"id":"http://arxiv.org/abs/2407.21476v1","updated":"2024-07-31T09:37:27Z","published":"2024-07-31T09:37:27Z","title":"On the Problem of Text-To-Speech Model Selection for Synthetic Data\n Generation in Automatic Speech Recognition","summary":" The rapid development of neural text-to-speech (TTS) systems enabled its\nusage in other areas of natural language processing such as automatic speech\nrecognition (ASR) or spoken language translation (SLT). Due to the large number\nof different TTS architectures and their extensions, selecting which TTS\nsystems to use for synthetic data creation is not an easy task. We use the\ncomparison of five different TTS decoder architectures in the scope of\nsynthetic data generation to show the impact on CTC-based speech recognition\ntraining. We compare the recognition results to computable metrics like NISQA\nMOS and intelligibility, finding that there are no clear relations to the ASR\nperformance. 
We also observe that for data generation auto-regressive decoding\nperforms better than non-autoregressive decoding, and propose an approach to\nquantify TTS generalization capabilities.\n","authors":["Nick Rossenbach","Ralf Schlüter","Sakriani Sakti"],"pdf_url":"https://arxiv.org/pdf/2407.21476v1.pdf","comment":"Accepted at the SynData4GenAI 2024 workshop"},{"id":"http://arxiv.org/abs/2311.12550v5","updated":"2024-07-31T09:20:43Z","published":"2023-11-21T11:59:16Z","title":"Explainable Time Series Anomaly Detection using Masked Latent Generative\n Modeling","summary":" We present a novel time series anomaly detection method that achieves\nexcellent detection accuracy while offering a superior level of explainability.\nOur proposed method, TimeVQVAE-AD, leverages masked generative modeling adapted\nfrom the cutting-edge time series generation method known as TimeVQVAE. The\nprior model is trained on the discrete latent space of a time-frequency domain.\nNotably, the dimensional semantics of the time-frequency domain are preserved\nin the latent space, enabling us to compute anomaly scores across different\nfrequency bands, which provides a better insight into the detected anomalies.\nAdditionally, the generative nature of the prior model allows for sampling\nlikely normal states for detected anomalies, enhancing the explainability of\nthe detected anomalies through counterfactuals. Our experimental evaluation on\nthe UCR Time Series Anomaly archive demonstrates that TimeVQVAE-AD\nsignificantly surpasses the existing methods in terms of detection accuracy and\nexplainability. We provide our implementation on GitHub:\nhttps://github.com/ML4ITS/TimeVQVAE-AnomalyDetection.\n","authors":["Daesoo Lee","Sara Malacarne","Erlend Aune"],"pdf_url":"https://arxiv.org/pdf/2311.12550v5.pdf","comment":"Published in Pattern Recognition"},{"id":"http://arxiv.org/abs/2402.12539v2","updated":"2024-07-31T09:17:54Z","published":"2024-02-19T21:01:11Z","title":"Impact of data for forecasting on performance of model predictive\n control in buildings with smart energy storage","summary":" Data is required to develop forecasting models for use in Model Predictive\nControl (MPC) schemes in building energy systems. However, data is costly to\nboth collect and exploit. Determining cost optimal data usage strategies\nrequires understanding of the forecast accuracy and resulting MPC operational\nperformance it enables. This study investigates the performance of both simple\nand state-of-the-art machine learning prediction models for MPC in\nmulti-building energy systems using a simulated case study with historic\nbuilding energy data. The impact on forecast accuracy of measures to improve\nmodel data efficiency are quantified, specifically for: reuse of prediction\nmodels, reduction of training data duration, reduction of model data features,\nand online model training. A simple linear multi-layer perceptron model is\nshown to provide equivalent forecast accuracy to state-of-the-art models, with\ngreater data efficiency and generalisability. The use of more than 2 years of\ntraining data for load prediction models provided no significant improvement in\nforecast accuracy. Forecast accuracy and data efficiency were improved\nsimultaneously by using change-point analysis to screen training data. 
Reused\nmodels and those trained with 3 months of data had on average 10% higher error\nthan baseline, indicating that deploying MPC systems without prior data\ncollection may be economic.\n","authors":["Max Langtry","Vijja Wichitwechkarn","Rebecca Ward","Chaoqun Zhuang","Monika J. Kreitmair","Nikolas Makasis","Zack Xuereb Conti","Ruchi Choudhary"],"pdf_url":"https://arxiv.org/pdf/2402.12539v2.pdf","comment":"36 pages, 22 figures"},{"id":"http://arxiv.org/abs/2407.21460v1","updated":"2024-07-31T09:17:09Z","published":"2024-07-31T09:17:09Z","title":"Multi-agent Assessment with QoS Enhancement for HD Map Updates in a\n Vehicular Network","summary":" Reinforcement Learning (RL) algorithms have been used to address the\nchallenging problems in the offloading process of vehicular ad hoc networks\n(VANET). More recently, they have been utilized to improve the dissemination of\nhigh-definition (HD) Maps. Nevertheless, implementing solutions such as deep\nQ-learning (DQN) and Actor-critic at the autonomous vehicle (AV) may lead to an\nincrease in the computational load, causing a heavy burden on the computational\ndevices and higher costs. Moreover, their implementation might raise\ncompatibility issues between technologies due to the required modifications to\nthe standards. Therefore, in this paper, we assess the scalability of an\napplication utilizing a Q-learning single-agent solution in a distributed\nmulti-agent environment. This application improves the network performance by\ntaking advantage of a smaller state, and action space whilst using a\nmulti-agent approach. The proposed solution is extensively evaluated with\ndifferent test cases involving reward function considering individual or\noverall network performance, number of agents, and centralized and distributed\nlearning comparison. The experimental results demonstrate that the time\nlatencies of our proposed solution conducted in voice, video, HD Map, and\nbest-effort cases have significant improvements, with 40.4%, 36%, 43%, and 12%\nrespectively, compared to the performances with the single-agent approach.\n","authors":["Jeffrey Redondo","Nauman Aslam","Juan Zhang","Zhenhui Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.21460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15017v2","updated":"2024-07-31T09:14:29Z","published":"2024-07-22T06:15:59Z","title":"Knowledge Mechanisms in Large Language Models: A Survey and Perspective","summary":" Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. 
We hope this work can help understand knowledge in LLMs\nand provide insights for future research.\n","authors":["Mengru Wang","Yunzhi Yao","Ziwen Xu","Shuofei Qiao","Shumin Deng","Peng Wang","Xiang Chen","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15017v2.pdf","comment":"Ongoing work (v2); add Section 5: Application of Knowledge Mechanism;\n revise Section 6 and 7; fix typos"},{"id":"http://arxiv.org/abs/2407.21453v1","updated":"2024-07-31T08:57:42Z","published":"2024-07-31T08:57:42Z","title":"TinyChirp: Bird Song Recognition Using TinyML Models on Low-power\n Wireless Acoustic Sensors","summary":" Monitoring biodiversity at scale is challenging. Detecting and identifying\nspecies in fine grained taxonomies requires highly accurate machine learning\n(ML) methods. Training such models requires large high quality data sets. And\ndeploying these models to low power devices requires novel compression\ntechniques and model architectures. While species classification methods have\nprofited from novel data sets and advances in ML methods, in particular neural\nnetworks, deploying these state of the art models to low power devices remains\ndifficult. Here we present a comprehensive empirical comparison of various\ntinyML neural network architectures and compression techniques for species\nclassification. We focus on the example of bird song detection, more concretely\na data set curated for studying the corn bunting bird species. The data set is\nreleased along with all code and experiments of this study. In our experiments\nwe compare predictive performance, memory and time complexity of classical\nspectrogram based methods and recent approaches operating on raw audio signal.\nOur results indicate that individual bird species can be robustly detected with\nrelatively simple architectures that can be readily deployed to low power\ndevices.\n","authors":["Zhaolan Huang","Adrien Tousnakhoff","Polina Kozyr","Roman Rehausen","Felix Bießmann","Robert Lachlan","Cedric Adjih","Emmanuel Baccelli"],"pdf_url":"https://arxiv.org/pdf/2407.21453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21439v1","updated":"2024-07-31T08:43:17Z","published":"2024-07-31T08:43:17Z","title":"MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented\n Generation via Knowledge-enhanced Reranking and Noise-injected Training","summary":" Multimodal Large Language Models (MLLMs) have demonstrated remarkable\ncapabilities in processing and generating content across multiple data\nmodalities, including text, images, audio, and video. However, a significant\ndrawback of MLLMs is their reliance on static training data, leading to\noutdated information and limited contextual awareness. This static nature\nhampers their ability to provide accurate, up-to-date responses, particularly\nin dynamic or rapidly evolving contexts. Integrating Multimodal\nRetrieval-augmented Generation (Multimodal RAG) offers a promising solution,\nbut the system would inevitably encounter the multi-granularity noisy\ncorrespondence (MNC) problem, which involves two types of noise: coarse-grained\n(query-caption) and fine-grained (query-image). This noise hinders accurate\nretrieval and generation. In this work, we propose \\textbf{RagLLaVA}, a novel\nframework with knowledge-enhanced reranking and noise-injected training, to\naddress these limitations. 
We instruction-tune the MLLM with a simple yet\neffective instruction template to induce its ranking ability and serve it as a\nreranker to precisely filter the top-k retrieved images. For generation, we\ninject visual noise during training at the data and token levels to enhance the\ngenerator's robustness. Extensive experiments are conducted on the subsets of\ntwo datasets that require retrieving and reasoning over images to answer a\ngiven query. Our results demonstrate the superiority of RagLLaVA in retrieving\naccurately and generating robustly. Code and models are available at\nhttps://github.com/IDEA-FinAI/RagLLaVA.\n","authors":["Zhanpeng Chen","Chengjin Xu","Yiyan Qi","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21435v1","updated":"2024-07-31T08:38:39Z","published":"2024-07-31T08:38:39Z","title":"Transient anisotropic kernel for probabilistic learning on manifolds","summary":" PLoM (Probabilistic Learning on Manifolds) is a method introduced in 2016 for\nhandling small training datasets by projecting an It\\^o equation from a\nstochastic dissipative Hamiltonian dynamical system, acting as the MCMC\ngenerator, for which the KDE-estimated probability measure with the training\ndataset is the invariant measure. PLoM performs a projection on a reduced-order\nvector basis related to the training dataset, using the diffusion maps (DMAPS)\nbasis constructed with a time-independent isotropic kernel. In this paper, we\npropose a new ISDE projection vector basis built from a transient anisotropic\nkernel, providing an alternative to the DMAPS basis to improve statistical\nsurrogates for stochastic manifolds with heterogeneous data. The construction\nensures that for times near the initial time, the DMAPS basis coincides with\nthe transient basis. For larger times, the differences between the two bases\nare characterized by the angle of their spanned vector subspaces. The optimal\ninstant yielding the optimal transient basis is determined using an estimation\nof mutual information from Information Theory, which is normalized by the\nentropy estimation to account for the effects of the number of realizations\nused in the estimations. Consequently, this new vector basis better represents\nstatistical dependencies in the learned probability measure for any dimension.\nThree applications with varying levels of statistical complexity and data\nheterogeneity validate the proposed theory, showing that the transient\nanisotropic kernel improves the learned probability measure.\n","authors":["Christian Soize","Roger Ghanem"],"pdf_url":"https://arxiv.org/pdf/2407.21435v1.pdf","comment":"44 pages, 14 figures"},{"id":"http://arxiv.org/abs/2405.09802v3","updated":"2024-07-31T08:28:48Z","published":"2024-05-16T04:21:09Z","title":"Analysis and Predictive Modeling of Solar Coronal Holes Using Computer\n Vision and ARIMA-LSTM Networks","summary":" In the era of space exploration, coronal holes on the sun play a significant\nrole due to their impact on satellites and aircraft through their open magnetic\nfields and increased solar wind emissions. This study employs computer vision\ntechniques to detect coronal hole regions and estimate their sizes using\nimagery from the Solar Dynamics Observatory (SDO). 
Additionally, we utilize\nhybrid time series prediction model, specifically combination of Long\nShort-Term Memory (LSTM) networks and ARIMA, to analyze trends in the area of\ncoronal holes and predict their areas across various solar regions over a span\nof seven days. By examining time series data, we aim to identify patterns in\ncoronal hole behavior and understand their potential effects on space weather.\n","authors":["Juyoung Yun","Jungmin Shin"],"pdf_url":"https://arxiv.org/pdf/2405.09802v3.pdf","comment":"Accepted to the first joint European Space Agency SPAICE Conference\n 2024"},{"id":"http://arxiv.org/abs/2407.21424v1","updated":"2024-07-31T08:19:06Z","published":"2024-07-31T08:19:06Z","title":"Cost-Effective Hallucination Detection for LLMs","summary":" Large language models (LLMs) can be prone to hallucinations - generating\nunreliable outputs that are unfaithful to their inputs, external facts or\ninternally inconsistent. In this work, we address several challenges for\npost-hoc hallucination detection in production settings. Our pipeline for\nhallucination detection entails: first, producing a confidence score\nrepresenting the likelihood that a generated answer is a hallucination; second,\ncalibrating the score conditional on attributes of the inputs and candidate\nresponse; finally, performing detection by thresholding the calibrated score.\nWe benchmark a variety of state-of-the-art scoring methods on different\ndatasets, encompassing question answering, fact checking, and summarization\ntasks. We employ diverse LLMs to ensure a comprehensive assessment of\nperformance. We show that calibrating individual scoring methods is critical\nfor ensuring risk-aware downstream decision making. Based on findings that no\nindividual score performs best in all situations, we propose a multi-scoring\nframework, which combines different scores and achieves top performance across\nall datasets. We further introduce cost-effective multi-scoring, which can\nmatch or even outperform more expensive detection methods, while significantly\nreducing computational overhead.\n","authors":["Simon Valentin","Jinmiao Fu","Gianluca Detommaso","Shaoyuan Xu","Giovanni Zappella","Bryan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21418v1","updated":"2024-07-31T08:05:33Z","published":"2024-07-31T08:05:33Z","title":"FTuner: A Fast Dynamic Shape Tensors Program Auto-Tuner for Deep\n Learning Compilers","summary":" Many artificial intelligence models process input data of different lengths\nand resolutions, making the shape of the tensors dynamic. The performance of\nthese models depends on the shape of the tensors, which makes it difficult to\noptimize the tensors before the model runs. There are two common solutions to\nthis problem. The first is to add useless data to the input to match a\npre-optimized tensor library. The second is to use small basic tensors to\ncreate a tensor that is closest in size to the input data and then tune it to\nminimize padding. However, this second solution can be time-consuming.\n This paper proposes a new technique for deep learning compilers called\nFTuner. Instead of using a large design space or training a cost model, we use\nan abstract computational unit called the uKernel to patch together small,\nvarious-sized tensors to match the shape of the input tensor. We determine the\nshape of the uKernel using an analytic hardware information model. 
Experiments\nshow that the FTuner can achieve comparable operators and end-to-end\nperformance to vendor libraries and achieves 3\\% speedup on existing auto-tuner\nwith the model-training compiler while reducing tuning time by two orders of\nmagnitude.\n","authors":["Pengyu Mu","Linquan Wei","Yi Liu","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21418v1.pdf","comment":"14 pages, 16 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.21407v1","updated":"2024-07-31T07:54:14Z","published":"2024-07-31T07:54:14Z","title":"Deep Fréchet Regression","summary":" Advancements in modern science have led to the increasing availability of\nnon-Euclidean data in metric spaces. This paper addresses the challenge of\nmodeling relationships between non-Euclidean responses and multivariate\nEuclidean predictors. We propose a flexible regression model capable of\nhandling high-dimensional predictors without imposing parametric assumptions.\nTwo primary challenges are addressed: the curse of dimensionality in\nnonparametric regression and the absence of linear structure in general metric\nspaces. The former is tackled using deep neural networks, while for the latter\nwe demonstrate the feasibility of mapping the metric space where responses\nreside to a low-dimensional Euclidean space using manifold learning. We\nintroduce a reverse mapping approach, employing local Fr\\'echet regression, to\nmap the low-dimensional manifold representations back to objects in the\noriginal metric space. We develop a theoretical framework, investigating the\nconvergence rate of deep neural networks under dependent sub-Gaussian noise\nwith bias. The convergence rate of the proposed regression model is then\nobtained by expanding the scope of local Fr\\'echet regression to accommodate\nmultivariate predictors in the presence of errors in predictors. Simulations\nand case studies show that the proposed model outperforms existing methods for\nnon-Euclidean responses, focusing on the special cases of probability measures\nand networks.\n","authors":["Su I Iao","Yidong Zhou","Hans-Georg Müller"],"pdf_url":"https://arxiv.org/pdf/2407.21407v1.pdf","comment":"66 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.00891v2","updated":"2024-07-31T07:51:56Z","published":"2024-07-01T01:28:14Z","title":"ZeroDDI: A Zero-Shot Drug-Drug Interaction Event Prediction Method with\n Semantic Enhanced Learning and Dual-Modal Uniform Alignment","summary":" Drug-drug interactions (DDIs) can result in various pharmacological changes,\nwhich can be categorized into different classes known as DDI events (DDIEs). In\nrecent years, previously unobserved/unseen DDIEs have been emerging, posing a\nnew classification task when unseen classes have no labelled instances in the\ntraining stage, which is formulated as a zero-shot DDIE prediction (ZS-DDIE)\ntask. However, existing computational methods are not directly applicable to\nZS-DDIE, which has two primary challenges: obtaining suitable DDIE\nrepresentations and handling the class imbalance issue. To overcome these\nchallenges, we propose a novel method named ZeroDDI for the ZS-DDIE task.\nSpecifically, we design a biological semantic enhanced DDIE representation\nlearning module, which emphasizes the key biological semantics and distills\ndiscriminative molecular substructure-related semantics for DDIE representation\nlearning. 
Furthermore, we propose a dual-modal uniform alignment strategy to\ndistribute drug pair representations and DDIE semantic representations\nuniformly in a unit sphere and align the matched ones, which can mitigate the\nissue of class imbalance. Extensive experiments showed that ZeroDDI surpasses\nthe baselines and indicate that it is a promising tool for detecting unseen\nDDIEs. Our code has been released in https://github.com/wzy-Sarah/ZeroDDI.\n","authors":["Ziyan Wang","Zhankun Xiong","Feng Huang","Xuan Liu","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.00891v2.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2209.03075v4","updated":"2024-07-31T07:39:48Z","published":"2022-09-07T11:28:17Z","title":"A learning theory for quantum photonic processors and beyond","summary":" We consider the tasks of learning quantum states, measurements and channels\ngenerated by continuous-variable (CV) quantum circuits. This family of circuits\nis suited to describe optical quantum technologies and in particular it\nincludes state-of-the-art photonic processors capable of showing quantum\nadvantage. We define classes of functions that map classical variables, encoded\ninto the CV circuit parameters, to outcome probabilities evaluated on those\ncircuits. We then establish efficient learnability guarantees for such classes,\nby computing bounds on their pseudo-dimension or covering numbers, showing that\nCV quantum circuits can be learned with a sample complexity that scales\npolynomially with the circuit's size, i.e., the number of modes. Our results\nshow that CV circuits can be trained efficiently using a number of training\nsamples that, unlike their finite-dimensional counterpart, does not scale with\nthe circuit depth.\n","authors":["Matteo Rosati"],"pdf_url":"https://arxiv.org/pdf/2209.03075v4.pdf","comment":"27+5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.09904v3","updated":"2024-07-31T07:35:42Z","published":"2024-06-14T10:23:45Z","title":"QQQ: Quality Quattuor-Bit Quantization for Large Language Models","summary":" Quantization is a proven effective method for compressing large language\nmodels. Although popular techniques like W8A8 and W4A16 effectively maintain\nmodel performance, they often fail to concurrently speed up the prefill and\ndecoding stages of inference. W4A8 is a promising strategy to accelerate both\nof them while usually leads to a significant performance degradation. To\naddress these issues, we present QQQ, a Quality Quattuor-bit Quantization\nmethod with 4-bit weights and 8-bit activations. QQQ employs adaptive smoothing\nand Hessian-based compensation, significantly enhancing the performance of\nquantized models without extensive training. Furthermore, we meticulously\nengineer W4A8 GEMM kernels to increase inference speed. Our specialized\nper-channel W4A8 GEMM and per-group W4A8 GEMM achieve impressive speed\nincreases of 3.67$\\times$ and 3.29 $\\times$ over FP16 GEMM. 
Our extensive\nexperiments show that QQQ achieves performance on par with existing\nstate-of-the-art LLM quantization methods while significantly accelerating\ninference, achieving speed boosts up to 2.24 $\\times$, 2.10$\\times$, and\n1.25$\\times$ compared to FP16, W8A8, and W4A16, respectively.\n","authors":["Ying Zhang","Peng Zhang","Mincong Huang","Jingyang Xiang","Yujie Wang","Chao Wang","Yineng Zhang","Lei Yu","Chuan Liu","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2406.09904v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12684v4","updated":"2024-07-31T07:27:56Z","published":"2024-05-21T11:19:50Z","title":"Model Free Prediction with Uncertainty Assessment","summary":" Deep nonparametric regression, characterized by the utilization of deep\nneural networks to learn target functions, has emerged as a focus of research\nattention in recent years. Despite considerable progress in understanding\nconvergence rates, the absence of asymptotic properties hinders rigorous\nstatistical inference. To address this gap, we propose a novel framework that\ntransforms the deep estimation paradigm into a platform conducive to\nconditional mean estimation, leveraging the conditional diffusion model.\nTheoretically, we develop an end-to-end convergence rate for the conditional\ndiffusion model and establish the asymptotic normality of the generated\nsamples. Consequently, we are equipped to construct confidence regions,\nfacilitating robust statistical inference. Furthermore, through numerical\nexperiments, we empirically validate the efficacy of our proposed methodology.\n","authors":["Yuling Jiao","Lican Kang","Jin Liu","Heng Peng","Heng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.12684v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21385v1","updated":"2024-07-31T07:16:40Z","published":"2024-07-31T07:16:40Z","title":"SmileyNet -- Towards the Prediction of the Lottery by Reading Tea Leaves\n with AI","summary":" We introduce SmileyNet, a novel neural network with psychic abilities. It is\ninspired by the fact that a positive mood can lead to improved cognitive\ncapabilities including classification tasks. The network is hence presented in\na first phase with smileys and an encouraging loss function is defined to bias\nit into a good mood. SmileyNet is then used to forecast the flipping of a coin\nbased on an established method of Tasseology, namely by reading tea leaves.\nTraining and testing in this second phase are done with a high-fidelity\nsimulation based on real-world pixels sampled from a professional tea-reading\ncup. SmileyNet has an amazing accuracy of 72% to correctly predict the flip of\na coin. Resnet-34, respectively YOLOv5 achieve only 49%, respectively 53%. It\nis then shown how multiple SmileyNets can be combined to win the lottery.\n","authors":["Andreas Birk"],"pdf_url":"https://arxiv.org/pdf/2407.21385v1.pdf","comment":"This is a satirical accumulation of misconceptions, mistakes, and\n flawed reasoning I have encountered in recent times as a reviewer and\n sometimes even as a reader of published papers. 
I hope it is entertaining and\n useful in the context of the education of BSc, MSc, and PhD students in\n Machine Learning, Artificial Intelligence, and Cognitive Science"},{"id":"http://arxiv.org/abs/2407.19429v2","updated":"2024-07-31T07:15:15Z","published":"2024-07-28T08:39:28Z","title":"FTF-ER: Feature-Topology Fusion-Based Experience Replay Method for\n Continual Graph Learning","summary":" Continual graph learning (CGL) is an important and challenging task that aims\nto extend static GNNs to dynamic task flow scenarios. As one of the mainstream\nCGL methods, the experience replay (ER) method receives widespread attention\ndue to its superior performance. However, existing ER methods focus on\nidentifying samples by feature significance or topological relevance, which\nlimits their utilization of comprehensive graph data. In addition, the\ntopology-based ER methods only consider local topological information and add\nneighboring nodes to the buffer, which ignores the global topological\ninformation and increases memory overhead. To bridge these gaps, we propose a\nnovel method called Feature-Topology Fusion-based Experience Replay (FTF-ER) to\neffectively mitigate the catastrophic forgetting issue with enhanced\nefficiency. Specifically, from an overall perspective to maximize the\nutilization of the entire graph data, we propose a highly complementary\napproach including both feature and global topological information, which can\nsignificantly improve the effectiveness of the sampled nodes. Moreover, to\nfurther utilize global topological information, we propose Hodge Potential\nScore (HPS) as a novel module to calculate the topological importance of nodes.\nHPS derives a global node ranking via Hodge decomposition on graphs, providing\nmore accurate global topological information compared to neighbor sampling. By\nexcluding neighbor sampling, HPS significantly reduces buffer storage costs for\nacquiring topological information and simultaneously decreases training time.\nCompared with state-of-the-art methods, FTF-ER achieves a significant\nimprovement of 3.6% in AA and 7.1% in AF on the OGB-Arxiv dataset,\ndemonstrating its superior performance in the class-incremental learning\nsetting.\n","authors":["Jinhui Pang","Changqing Lin","Xiaoshuai Hao","Rong Yin","Zixuan Wang","Zhihui Zhang","Jinglin He","Huang Tai Sheng"],"pdf_url":"https://arxiv.org/pdf/2407.19429v2.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2407.21374v1","updated":"2024-07-31T06:56:46Z","published":"2024-07-31T06:56:46Z","title":"Dynamic Gesture Recognition in Ultra-Range Distance for Effective\n Human-Robot Interaction","summary":" This paper presents a novel approach for ultra-range gesture recognition,\naddressing Human-Robot Interaction (HRI) challenges over extended distances. By\nleveraging human gestures in video data, we propose the Temporal-Spatiotemporal\nFusion Network (TSFN) model that surpasses the limitations of current methods,\nenabling robots to understand gestures from long distances. With applications\nin service robots, search and rescue operations, and drone-based interactions,\nour approach enhances HRI in expansive environments. 
Experimental validation\ndemonstrates significant advancements in gesture recognition accuracy,\nparticularly in prolonged gesture sequences.\n","authors":["Eran Bamani Beeri","Eden Nissinman","Avishai Sintov"],"pdf_url":"https://arxiv.org/pdf/2407.21374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21372v1","updated":"2024-07-31T06:54:24Z","published":"2024-07-31T06:54:24Z","title":"Two Completely Parameter-Free Alternating Gradient Projection Algorithms\n for Nonconvex-(strongly) Concave Minimax Problems","summary":" Due to their importance in various emerging applications, efficient\nalgorithms for solving minimax problems have recently received increasing\nattention. However, many existing algorithms require prior knowledge of the\nproblem parameters in order to achieve optimal iteration complexity. In this\npaper, we propose a completely parameter-free alternating gradient projection\n(PF-AGP) algorithm to solve the smooth nonconvex-(strongly) concave minimax\nproblems using a backtracking strategy, which does not require prior knowledge\nof parameters such as the Lipschitz constant $L$ or the strong concavity\nconstant $\\mu$. The PF-AGP algorithm utilizes a parameter-free gradient\nprojection step to alternately update the outer and inner variables in each\niteration. We show that the total number of gradient calls of the PF-AGP\nalgorithm to obtain an $\\varepsilon$-stationary point for nonconvex-strongly\nconcave minimax problems is upper bounded by $\\mathcal{O}\\left(\nL\\kappa^3\\varepsilon^{-2} \\right)$ where $\\kappa$ is the condition number,\nwhile the total number of gradient calls to obtain an $\\varepsilon$-stationary\npoint for nonconvex-concave minimax problems is upper bounded by\n$\\mathcal{O}\\left( L^4\\varepsilon^{-4} \\right)$. As far as we know, this is the\nfirst completely parameter-free algorithm for solving nonconvex-strongly\nconcave minimax problems, and it is also a completely parameter-free\nalgorithm that achieves the best iteration complexity among single-loop methods\nfor solving nonconvex-concave minimax problems. Numerical results validate the\nefficiency of the proposed PF-AGP algorithm.\n","authors":["Junnan Yang","Huiling Zhang","Zi Xu"],"pdf_url":"https://arxiv.org/pdf/2407.21372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01853v3","updated":"2024-07-31T06:46:16Z","published":"2023-12-04T12:35:43Z","title":"Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing","summary":" Executing contact-rich manipulation tasks necessitates the fusion of tactile\nand visual feedback. However, the distinct nature of these modalities poses\nsignificant challenges. In this paper, we introduce a system that leverages\nvisual and tactile sensory inputs to enable dexterous in-hand manipulation.\nSpecifically, we propose Robot Synesthesia, a novel point cloud-based tactile\nrepresentation inspired by human tactile-visual synesthesia. This approach\nallows for the simultaneous and seamless integration of both sensory inputs,\noffering richer spatial information and facilitating better reasoning about\nrobot actions. The method, trained in a simulated environment and then deployed\nto a real robot, is applicable to various in-hand object rotation tasks.\nComprehensive ablations are performed on how the integration of vision and\ntouch can improve reinforcement learning and Sim2Real performance. 
Our project\npage is available at https://yingyuan0414.github.io/visuotactile/ .\n","authors":["Ying Yuan","Haichuan Che","Yuzhe Qin","Binghao Huang","Zhao-Heng Yin","Kang-Won Lee","Yi Wu","Soo-Chul Lim","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01853v3.pdf","comment":"Project page: https://yingyuan0414.github.io/visuotactile/"},{"id":"http://arxiv.org/abs/2407.21368v1","updated":"2024-07-31T06:34:38Z","published":"2024-07-31T06:34:38Z","title":"Prompting Medical Large Vision-Language Models to Diagnose Pathologies\n by Visual Question Answering","summary":" Large Vision-Language Models (LVLMs) have achieved significant success in\nrecent years, and they have been extended to the medical domain. Although\ndemonstrating satisfactory performance on medical Visual Question Answering\n(VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem,\nwhich makes them fail to diagnose complex pathologies. Moreover, they readily\nfail to learn minority pathologies due to imbalanced training data. We propose\ntwo prompting strategies for MLVLMs that reduce hallucination and improve VQA\nperformance. In the first strategy, we provide a detailed explanation of the\nqueried pathology. In the second strategy, we fine-tune a cheap, weak learner\nto achieve high performance on a specific metric, and textually provide its\njudgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our\nmethods significantly improve the diagnostic F1 score, with the highest\nincrease being 0.27. We also demonstrate that our prompting strategies can be\nextended to general LVLM domains. Based on POPE metrics, it effectively\nsuppresses the false negative predictions of existing LVLMs and improves Recall\nby approximately 0.07.\n","authors":["Danfeng Guo","Demetri Terzopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.21368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21359v1","updated":"2024-07-31T06:04:55Z","published":"2024-07-31T06:04:55Z","title":"ProSpec RL: Plan Ahead, then Execute","summary":" Imagining potential outcomes of actions before execution helps agents make\nmore informed decisions, a prospective thinking ability fundamental to human\ncognition. However, mainstream model-free Reinforcement Learning (RL) methods\nlack the ability to proactively envision future scenarios, plan, and guide\nstrategies. These methods typically rely on trial and error to adjust policy\nfunctions, aiming to maximize cumulative rewards or long-term value, even if\nsuch high-reward decisions place the environment in extremely dangerous states.\nTo address this, we propose the Prospective (ProSpec) RL method, which makes\nhigher-value, lower-risk optimal decisions by imagining future n-stream\ntrajectories. Specifically, ProSpec employs a dynamic model to predict future\nstates (termed \"imagined states\") based on the current state and a series of\nsampled actions. Furthermore, we integrate the concept of Model Predictive\nControl and introduce a cycle consistency constraint that allows the agent to\nevaluate and select the optimal actions from these trajectories. Moreover,\nProSpec employs cycle consistency to mitigate two fundamental issues in RL:\naugmenting state reversibility to avoid irreversible events (low risk) and\naugmenting actions to generate numerous virtual trajectories, thereby improving\ndata efficiency. 
We validated the effectiveness of our method on the DMControl\nbenchmarks, where our approach achieved significant performance improvements.\nCode will be open-sourced upon acceptance.\n","authors":["Liangliang Liu","Yi Guan","BoRan Wang","Rujia Shen","Yi Lin","Chaoran Kong","Lian Yan","Jingchi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.21359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06082v2","updated":"2024-07-31T05:59:31Z","published":"2024-03-10T04:01:49Z","title":"FrameQuant: Flexible Low-Bit Quantization for Transformers","summary":" Transformers are the backbone of powerful foundation models for many Vision\nand Natural Language Processing tasks. But their compute and memory/storage\nfootprint is large, and so, serving such models is expensive often requiring\nhigh-end hardware. To mitigate this difficulty, Post-Training Quantization\nseeks to modify a pre-trained model and quantize it to eight bits or lower,\nsignificantly boosting compute/memory/latency efficiency. Such models have been\nsuccessfully quantized to four bits with some performance loss. In this work,\nwe outline a simple scheme to quantize Transformer-based models to just two\nbits (plus some overhead) with only a small drop in accuracy. Key to our\nformulation is a concept borrowed from Harmonic analysis called Fusion Frames.\nOur main finding is that the quantization must take place not in the original\nweight space, but instead in the Fusion Frame representations. If quantization\nis interpreted as the addition of noise, our casting of the problem allows\ninvoking an extensive body of known consistent recovery and noise robustness\nguarantees. Further, if desired, de-noising filters are known in closed form.\nWe show empirically, via a variety of experiments, that (almost) two-bit\nquantization for Transformer models promises sizable efficiency gains. The code\nis available at https://github.com/vsingh-group/FrameQuant\n","authors":["Harshavardhan Adepu","Zhanpeng Zeng","Li Zhang","Vikas Singh"],"pdf_url":"https://arxiv.org/pdf/2403.06082v2.pdf","comment":"25 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.17040v2","updated":"2024-07-31T05:39:34Z","published":"2024-07-24T07:02:16Z","title":"Time Series Imputation with Multivariate Radial Basis Function Neural\n Network","summary":" Researchers have been persistently working to address the issue of missing\nvalues in time series data. Numerous models have been proposed, striving to\nestimate the distribution of the data. The Radial Basis Functions Neural\nNetwork (RBFNN) has recently exhibited exceptional performance in estimating\ndata distribution. In this paper, we propose a time series imputation model\nbased on RBFNN. Our imputation model learns local information from timestamps\nto create a continuous function. Additionally, we incorporate time gaps to\nfacilitate learning information considering the missing terms of missing\nvalues. We name this model the Missing Imputation Multivariate RBFNN\n(MIM-RBFNN). However, MIM-RBFNN relies on a local information-based learning\napproach, which presents difficulties in utilizing temporal information.\nTherefore, we propose an extension called the Missing Value Imputation\nRecurrent Neural Network with Continuous Function (MIRNN-CF) using the\ncontinuous function generated by MIM-RBFNN. 
We evaluate the performance using\ntwo real-world datasets with non-random missing and random missing patterns,\nand conduct an ablation study comparing MIM-RBFNN and MIRNN-CF.\n","authors":["Chanyoung Jung","Yun Jang"],"pdf_url":"https://arxiv.org/pdf/2407.17040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21347v1","updated":"2024-07-31T05:32:37Z","published":"2024-07-31T05:32:37Z","title":"Differentially Private Block-wise Gradient Shuffle for Deep Learning","summary":" Traditional Differentially Private Stochastic Gradient Descent (DP-SGD)\nintroduces statistical noise on top of gradients drawn from a Gaussian\ndistribution to ensure privacy. This paper introduces the novel Differentially\nPrivate Block-wise Gradient Shuffle (DP-BloGS) algorithm for deep learning.\nBloGS builds off of existing private deep learning literature, but makes a\ndefinitive shift by taking a probabilistic approach to gradient noise\nintroduction through shuffling modeled after information theoretic privacy\nanalyses. The theoretical results presented in this paper show that the\ncombination of shuffling, parameter-specific block size selection, batch layer\nclipping, and gradient accumulation allows DP-BloGS to achieve training times\nclose to that of non-private training while maintaining similar privacy and\nutility guarantees to DP-SGD. DP-BloGS is found to be significantly more\nresistant to data extraction attempts than DP-SGD. The theoretical results are\nvalidated by the experimental findings.\n","authors":["David Zagardo"],"pdf_url":"https://arxiv.org/pdf/2407.21347v1.pdf","comment":"43 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2407.21343v1","updated":"2024-07-31T05:17:31Z","published":"2024-07-31T05:17:31Z","title":"MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation\n Framework","summary":" Medical imaging segmentation is a highly active area of research, with deep\nlearning-based methods achieving state-of-the-art results in several\nbenchmarks. However, the lack of standardized tools for training, testing, and\nevaluating new methods makes the comparison of methods difficult. To address\nthis, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple,\nmodular, and end-to-end medical imaging segmentation framework designed to\nfacilitate consistent training, testing, and evaluation of deep learning-based\nmedical imaging segmentation methods. MIST standardizes data analysis,\npreprocessing, and evaluation pipelines, accommodating multiple architectures\nand loss functions. This standardization ensures reproducible and fair\ncomparisons across different methods. We detail MIST's data format\nrequirements, pipelines, and auxiliary features and demonstrate its efficacy\nusing the BraTS Adult Glioma Post-Treatment Challenge dataset. 
Our results\nhighlight MIST's ability to produce accurate segmentation masks and its\nscalability across multiple GPUs, showcasing its potential as a powerful tool\nfor future medical imaging research and development.\n","authors":["Adrian Celaya","Evan Lim","Rachel Glenn","Brayden Mi","Alex Balsells","Tucker Netherton","Caroline Chung","Beatrice Riviere","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2407.21343v1.pdf","comment":"Submitted to BraTS 2024"},{"id":"http://arxiv.org/abs/2407.21338v1","updated":"2024-07-31T05:11:06Z","published":"2024-07-31T05:11:06Z","title":"Image-Based Deep Reinforcement Learning with Intrinsically Motivated\n Stimuli: On the Execution of Complex Robotic Tasks","summary":" Reinforcement Learning (RL) has been widely used to solve tasks where the\nenvironment consistently provides a dense reward value. However, in real-world\nscenarios, rewards can often be poorly defined or sparse. Auxiliary signals are\nindispensable for discovering efficient exploration strategies and aiding the\nlearning process. In this work, inspired by intrinsic motivation theory, we\npostulate that the intrinsic stimuli of novelty and surprise can assist in\nimproving exploration in complex, sparsely rewarded environments. We introduce\na novel sample-efficient method able to learn directly from pixels, an\nimage-based extension of TD3 with an autoencoder called \\textit{NaSA-TD3}. The\nexperiments demonstrate that NaSA-TD3 is easy to train and an efficient method\nfor tackling complex continuous-control robotic tasks, both in simulated\nenvironments and real-world settings. NaSA-TD3 outperforms existing\nstate-of-the-art RL image-based methods in terms of final performance without\nrequiring pre-trained models or human demonstrations.\n","authors":["David Valencia","Henry Williams","Yuning Xing","Trevor Gee","Minas Liarokapis","Bruce A. MacDonald"],"pdf_url":"https://arxiv.org/pdf/2407.21338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05122v2","updated":"2024-07-31T04:58:56Z","published":"2024-03-08T07:36:14Z","title":"Multi-Tower Multi-Interest Recommendation with User Representation Repel","summary":" In the era of information overload, the value of recommender systems has been\nprofoundly recognized in academia and industry alike. Multi-interest sequential\nrecommendation, in particular, is a subfield that has been receiving increasing\nattention in recent years. By generating multiple-user representations,\nmulti-interest learning models demonstrate superior expressiveness than\nsingle-user representation models, both theoretically and empirically. Despite\nmajor advancements in the field, three major issues continue to plague the\nperformance and adoptability of multi-interest learning methods, the difference\nbetween training and deployment objectives, the inability to access item\ninformation, and the difficulty of industrial adoption due to its single-tower\narchitecture. We address these challenges by proposing a novel multi-tower\nmulti-interest framework with user representation repel. 
Experimental results\nacross multiple large-scale industrial datasets proved the effectiveness and\ngeneralizability of our proposed framework.\n","authors":["Tianyu Xiong","Xiaohan Yu"],"pdf_url":"https://arxiv.org/pdf/2403.05122v2.pdf","comment":"Not accepted by conference"},{"id":"http://arxiv.org/abs/2308.06709v2","updated":"2024-07-31T04:24:55Z","published":"2023-08-13T07:56:01Z","title":"The Hard-Constraint PINNs for Interface Optimal Control Problems","summary":" We show that the physics-informed neural networks (PINNs), in combination\nwith some recently developed discontinuity capturing neural networks, can be\napplied to solve optimal control problems subject to partial differential\nequations (PDEs) with interfaces and some control constraints. The resulting\nalgorithm is mesh-free and scalable to different PDEs, and it ensures the\ncontrol constraints rigorously. Since the boundary and interface conditions, as\nwell as the PDEs, are all treated as soft constraints by lumping them into a\nweighted loss function, it is necessary to learn them simultaneously and there\nis no guarantee that the boundary and interface conditions can be satisfied\nexactly. This immediately causes difficulties in tuning the weights in the\ncorresponding loss function and training the neural networks. To tackle these\ndifficulties and guarantee the numerical accuracy, we propose to impose the\nboundary and interface conditions as hard constraints in PINNs by developing a\nnovel neural network architecture. The resulting hard-constraint PINNs approach\nguarantees that both the boundary and interface conditions can be satisfied\nexactly or with a high degree of accuracy, and they are decoupled from the\nlearning of the PDEs. Its efficiency is promisingly validated by some elliptic\nand parabolic interface optimal control problems.\n","authors":["Ming-Chih Lai","Yongcun Song","Xiaoming Yuan","Hangrui Yue","Tianyou Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.06709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21319v1","updated":"2024-07-31T03:59:14Z","published":"2024-07-31T03:59:14Z","title":"Big Cooperative Learning","summary":" Cooperation plays a pivotal role in the evolution of human intelligence;\nmoreover, it also underlies the recent revolutionary advancement of artificial\nintelligence (AI) that is driven by foundation models. Specifically, we reveal\nthat the training of foundation models can be interpreted as a form of big\ncooperative learning (\\textit{abbr.} big learning), where massive learning\nindividuals/tasks \\emph{cooperate} to approach the unique essence of data from\ndiverse perspectives of data prediction, leveraging a universal model. The\npresented big learning therefore unifies most training objectives of foundation\nmodels within a consistent framework, where their underlying assumptions are\nexposed simultaneously. We design tailored simulations to demonstrate the\nprinciple of big learning, based on which we provide learning-perspective\njustifications for the successes of foundation models, with interesting\nside-products. Furthermore, we reveal that big learning is a new dimension for\nupgrading conventional machine learning paradigms, valuable for endowing\nreinvigorations to associated applications; as an illustrative example, we\npropose the BigLearn-GAN, which is a novel adversarially-trained foundation\nmodel with versatile data sampling capabilities. 
Code is available at\n\\texttt{https://github.com/YulaiCong/BigCooperativeLearning}.\n","authors":["Yulai Cong"],"pdf_url":"https://arxiv.org/pdf/2407.21319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21316v1","updated":"2024-07-31T03:54:41Z","published":"2024-07-31T03:54:41Z","title":"Diff-Cleanse: Identifying and Mitigating Backdoor Attacks in Diffusion\n Models","summary":" Diffusion models (DMs) represent one of the most advanced generative models\ntoday, yet recent studies suggest that DMs are vulnerable to backdoor attacks.\nBackdoor attacks establish hidden associations between particular input\npatterns and model behaviors, compromising model integrity by triggering\nundesirable actions with manipulated input data. This vulnerability poses\nsubstantial risks, including reputational damage to model owners and the\ndissemination of harmful content. To mitigate the threat of backdoor attacks,\nthere have been some investigations on backdoor detection and model repair.\nHowever, previous work fails to purify the backdoored DMs created by\nstate-of-the-art attacks, rendering the field much underexplored. To bridge\nthis gap, we introduce \\textbf{Diff-Cleanse}, a novel two-stage backdoor\ndefense framework specifically designed for DMs. The first stage employs an\ninnovative trigger inversion technique to detect the backdoor and reconstruct\nthe trigger, and the second stage utilizes a structural pruning method to\neliminate the backdoor. We evaluate our framework on hundreds of DMs attacked\nby 3 existing backdoor attack methods. Extensive experiments demonstrate that\nDiff-Cleanse achieves nearly 100\\% detection accuracy and effectively mitigates\nbackdoor impacts, preserving the model's benign performance with minimal\ncompromise. Our code is available at https://github.com/shymuel/diff-cleanse.\n","authors":["Jiang Hao","Xiao Jin","Hu Xiaoguang","Chen Tianyou"],"pdf_url":"https://arxiv.org/pdf/2407.21316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21314v1","updated":"2024-07-31T03:47:20Z","published":"2024-07-31T03:47:20Z","title":"State-observation augmented diffusion model for nonlinear assimilation","summary":" Data assimilation has become a crucial technique aiming to combine physical\nmodels with observational data to estimate state variables. Traditional\nassimilation algorithms often face challenges of high nonlinearity brought by\nboth the physical and observational models. In this work, we propose a novel\ndata-driven assimilation algorithm based on generative models to address such\nconcerns. Our State-Observation Augmented Diffusion (SOAD) model is designed to\nhandle nonlinear physical and observational models more effectively. The\nmarginal posterior associated with SOAD has been derived and then proved to\nmatch the real posterior under mild assumptions, which shows theoretical\nsuperiority over previous score-based assimilation works. 
Experimental results\nalso indicate that our SOAD model may offer improved accuracy over existing\ndata-driven methods.\n","authors":["Zhuoyuan Li","Bin Dong","Pingwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21311v1","updated":"2024-07-31T03:29:28Z","published":"2024-07-31T03:29:28Z","title":"EUDA: An Efficient Unsupervised Domain Adaptation via Self-Supervised\n Vision Transformer","summary":" Unsupervised domain adaptation (UDA) aims to mitigate the domain shift issue,\nwhere the distribution of training (source) data differs from that of testing\n(target) data. Many models have been developed to tackle this problem, and\nrecently vision transformers (ViTs) have shown promising results. However, the\ncomplexity and large number of trainable parameters of ViTs restrict their\ndeployment in practical applications. This underscores the need for an\nefficient model that not only reduces trainable parameters but also allows for\nadjustable complexity based on specific needs while delivering comparable\nperformance. To achieve this, in this paper we introduce an Efficient\nUnsupervised Domain Adaptation (EUDA) framework. EUDA employs the DINOv2, which\nis a self-supervised ViT, as a feature extractor followed by a simplified\nbottleneck of fully connected layers to refine features for enhanced domain\nadaptation. Additionally, EUDA employs the synergistic domain alignment loss\n(SDAL), which integrates cross-entropy (CE) and maximum mean discrepancy (MMD)\nlosses, to balance adaptation by minimizing classification errors in the source\ndomain while aligning the source and target domain distributions. The\nexperimental results indicate the effectiveness of EUDA in producing comparable\nresults as compared with other state-of-the-art methods in domain adaptation\nwith significantly fewer trainable parameters, between 42% to 99.7% fewer. This\nshowcases the ability to train the model in a resource-limited environment. The\ncode of the model is available at: https://github.com/A-Abedi/EUDA.\n","authors":["Ali Abedi","Q. M. Jonathan Wu","Ning Zhang","Farhad Pourpanah"],"pdf_url":"https://arxiv.org/pdf/2407.21311v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.21310v1","updated":"2024-07-31T03:26:14Z","published":"2024-07-31T03:26:14Z","title":"MSMA: Multi-agent Trajectory Prediction in Connected and Autonomous\n Vehicle Environment with Multi-source Data Integration","summary":" The prediction of surrounding vehicle trajectories is crucial for\ncollision-free path planning. In this study, we focus on a scenario where a\nconnected and autonomous vehicle (CAV) serves as the central agent, utilizing\nboth sensors and communication technologies to perceive its surrounding\ntraffics consisting of autonomous vehicles (AVs), connected vehicles (CVs), and\nhuman-driven vehicles (HDVs). Our trajectory prediction task is aimed at all\nthe detected surrounding vehicles. To effectively integrate the multi-source\ndata from both sensor and communication technologies, we propose a deep\nlearning framework called MSMA utilizing a cross-attention module for\nmulti-source data fusion. Vector map data is utilized to provide contextual\ninformation. The trajectory dataset is collected in CARLA simulator with\nsynthesized data errors introduced. Numerical experiments demonstrate that in a\nmixed traffic flow scenario, the integration of data from different sources\nenhances our understanding of the environment. 
This notably improves trajectory\nprediction accuracy, particularly in situations with a high CV market\npenetration rate. The code is available at: https://github.com/xichennn/MSMA.\n","authors":["Xi Chen","Rahul Bhadani","Zhanbo Sun","Larry Head"],"pdf_url":"https://arxiv.org/pdf/2407.21310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11271v3","updated":"2024-07-31T03:06:40Z","published":"2024-06-17T07:21:36Z","title":"MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal\n Dataset with One Trillion Tokens","summary":" Multimodal interleaved datasets featuring free-form interleaved sequences of\nimages and text are crucial for training frontier large multimodal models\n(LMMs). Despite the rapid progression of open-source LMMs, there remains a\npronounced scarcity of large-scale, diverse open-source multimodal interleaved\ndatasets. In response, we introduce MINT-1T, the most extensive and diverse\nopen-source Multimodal INTerleaved dataset to date. MINT-1T comprises one\ntrillion text tokens and 3.4 billion images, a 10x scale-up from existing\nopen-source datasets. Additionally, we include previously untapped sources such\nas PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires\nsubstantial engineering effort, sharing the data curation process and releasing\nthe dataset greatly benefits the community. Our experiments show that LMMs\ntrained on MINT-1T rival the performance of models trained on the previous\nleading dataset, OBELICS. Our data and code will be released at\nhttps://github.com/mlfoundations/MINT-1T.\n","authors":["Anas Awadalla","Le Xue","Oscar Lo","Manli Shu","Hannah Lee","Etash Kumar Guha","Matt Jordan","Sheng Shen","Mohamed Awadalla","Silvio Savarese","Caiming Xiong","Ran Xu","Yejin Choi","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2406.11271v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21299v1","updated":"2024-07-31T02:57:21Z","published":"2024-07-31T02:57:21Z","title":"Who should I trust? A Visual Analytics Approach for Comparing Net Load\n Forecasting Models","summary":" Net load forecasting is crucial for energy planning and facilitating informed\ndecision-making regarding trade and load distributions. However, evaluating\nforecasting models' performance against benchmark models remains challenging,\nthereby impeding experts' trust in the model's performance. In this context,\nthere is a demand for technological interventions that allow scientists to\ncompare models across various timeframes and solar penetration levels. This\npaper introduces a visual analytics-based application designed to compare the\nperformance of deep-learning-based net load forecasting models with other\nmodels for probabilistic net load forecasting. This application employs\ncarefully selected visual analytic interventions, enabling users to discern\ndifferences in model performance across different solar penetration levels,\ndataset resolutions, and hours of the day over multiple months. 
We also present\nobservations made using our application through a case study, demonstrating the\neffectiveness of visualizations in aiding scientists in making informed\ndecisions and enhancing trust in net load forecasting models.\n","authors":["Kaustav Bhattacharjee","Soumya Kundu","Indrasis Chakraborty","Aritra Dasgupta"],"pdf_url":"https://arxiv.org/pdf/2407.21299v1.pdf","comment":"Accepted for publication in the proceedings of 2025 IEEE PES Grid\n Edge Technologies Conference & Exposition (Grid Edge)"},{"id":"http://arxiv.org/abs/2407.21298v1","updated":"2024-07-31T02:55:01Z","published":"2024-07-31T02:55:01Z","title":"A Vectorization Method Induced By Maximal Margin Classification For\n Persistent Diagrams","summary":" Persistent homology is an effective method for extracting topological\ninformation, represented as persistent diagrams, from spatial structure data.\nHence it is well-suited for the study of protein structures. Attempts to\nincorporate persistent homology in machine learning methods for protein function\nprediction have resulted in several techniques for vectorizing persistent\ndiagrams. However, current vectorization methods are excessively artificial and\ncannot ensure the effective utilization of information or the rationality of\nthe methods. To address this problem, we propose a more geometrical\nvectorization method for persistent diagrams based on maximal margin\nclassification in Banach space, and additionally propose a framework that\nutilizes topological data analysis to identify proteins with specific\nfunctions. We evaluated our vectorization method using a binary classification\ntask on proteins and compared it with the statistical methods that exhibit the\nbest performance among thirteen commonly used vectorization methods. The\nexperimental results indicate that our approach surpasses the statistical\nmethods in both robustness and precision.\n","authors":["An Wu","Yu Pan","Fuqi Zhou","Jinghui Yan","Chuanlu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.21298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12231v2","updated":"2024-07-31T02:52:20Z","published":"2024-01-18T09:59:00Z","title":"Disentangled Condensation for Large-scale Graphs","summary":" Graph condensation has emerged as an intriguing technique to save the\nexpensive training costs of Graph Neural Networks (GNNs) by substituting the\noriginal graph with a condensed small graph. Despite the promising results\nachieved, previous methods usually employ an entangled paradigm of redundant\nparameters (nodes, edges, GNNs), which incurs complex joint optimization during\ncondensation. This paradigm has considerably impeded the scalability of graph\ncondensation, making it challenging to condense extremely large-scale graphs\nand generate high-fidelity condensed graphs. Therefore, we propose to\ndisentangle the condensation process into a two-stage GNN-free paradigm,\nindependently condensing nodes and generating edges while eliminating the need\nto optimize GNNs at the same time. The node condensation module avoids the\ncomplexity of GNNs by focusing on node feature alignment with anchors of the\noriginal graph, while the edge translation module constructs the edges of the\ncondensed nodes by transferring the original structure knowledge with\nneighborhood anchors. This simple yet effective approach is at least 10\ntimes faster than state-of-the-art methods with comparable accuracy on\nmedium-scale graphs. 
Moreover, the proposed DisCo can successfully scale up to\nthe Ogbn-papers100M graph with flexible reduction rates. Extensive downstream\ntasks and ablation study on five common datasets further demonstrate the\neffectiveness of the proposed DisCo framework. The source code will be made\npublicly available.\n","authors":["Zhenbang Xiao","Shunyu Liu","Yu Wang","Tongya Zheng","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2401.12231v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.21294v1","updated":"2024-07-31T02:36:14Z","published":"2024-07-31T02:36:14Z","title":"Decentralized and Uncoordinated Learning of Stable Matchings: A\n Game-Theoretic Approach","summary":" We consider the problem of learning stable matchings in a fully decentralized\nand uncoordinated manner. In this problem, there are $n$ men and $n$ women,\neach having preference over the other side. It is assumed that women know their\npreferences over men, but men are not aware of their preferences over women,\nand they only learn them if they propose and successfully get matched to women.\nA matching is called stable if no man and woman prefer each other over their\ncurrent matches. When all the preferences are known a priori, the celebrated\nDeferred-Acceptance algorithm proposed by Gale and Shapley provides a\ndecentralized and uncoordinated algorithm to obtain a stable matching. However,\nwhen the preferences are unknown, developing such an algorithm faces major\nchallenges due to a lack of coordination. We achieve this goal by making a\nconnection between stable matchings and learning Nash equilibria (NE) in\nnoncooperative games. First, we provide a complete information game formulation\nfor the stable matching problem with known preferences such that its set of\npure NE coincides with the set of stable matchings, while its mixed NE can be\nrounded in a decentralized manner to a stable matching. Relying on such a\ngame-theoretic formulation, we show that for hierarchical markets, adopting the\nexponential weight (EXP) learning algorithm for the stable matching game\nachieves logarithmic regret with polynomial dependence on the number of\nplayers, thus answering a question posed in previous literature. Moreover, we\nshow that the same EXP learning algorithm converges locally and exponentially\nfast to a stable matching in general matching markets. We complement this\nresult by introducing another decentralized and uncoordinated learning\nalgorithm that globally converges to a stable matching with arbitrarily high\nprobability, leveraging the weak acyclicity property of the stable matching\ngame.\n","authors":["S. Rasoul Etesami","R. Srikant"],"pdf_url":"https://arxiv.org/pdf/2407.21294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12712v2","updated":"2024-07-31T02:33:31Z","published":"2024-03-19T13:19:41Z","title":"Saliency Guided Image Warping for Unsupervised Domain Adaptation","summary":" Driving is challenging in conditions like night, rain, and snow. The lack of\ngood labeled datasets has hampered progress in scene understanding under such\nconditions. Unsupervised domain adaptation (UDA) using large labeled clear-day\ndatasets is a promising research direction in such cases. Current UDA methods,\nhowever, treat all image pixels uniformly, leading to over-reliance on the\ndominant scene backgrounds (e.g., roads, sky, sidewalks) that appear\ndramatically different across domains. 
As a result, they struggle to learn\neffective features of smaller and often sparse foreground objects (e.g.,\npeople, vehicles, signs).\n In this work, we improve UDA training by using in-place image warping to\nfocus on salient object regions. Our insight is that while backgrounds vary\nsignificantly across domains (e.g., snowy night vs. clear day), object\nappearances vary to a lesser extent. Therefore, we design instance-level\nsaliency guidance to adaptively oversample object regions, which reduces\nadverse effects from background context and enhances backbone feature learning.\nWe then unwarp the better learned features while adapting from source to\ntarget. Our approach improves adaptation across geographies, lighting, and\nweather conditions, and is agnostic to the task (segmentation, detection),\ndomain adaptation algorithm, saliency guidance, and underlying model\narchitecture. Result highlights include +6.1 mAP50 for BDD100K Clear\n$\\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\\rightarrow$ Night, +3.0\nmAP50 for BDD100K Clear $\\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes\n$\\rightarrow$ ACDC. Our method adds minimal training memory and incurs no\nadditional inference latency. Please see Appendix for more results and\nanalysis.\n","authors":["Shen Zheng","Anurag Ghosh","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.12712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21290v1","updated":"2024-07-31T02:27:57Z","published":"2024-07-31T02:27:57Z","title":"TrackSorter: A Transformer-based sorting algorithm for track finding in\n High Energy Physics","summary":" Track finding in particle data is a challenging pattern recognition problem\nin High Energy Physics. It takes as inputs a point cloud of space points and\nlabels them so that space points created by the same particle have the same\nlabel. The list of space points with the same label is a track candidate. We\nargue that this pattern recognition problem can be formulated as a sorting\nproblem, of which the inputs are a list of space points sorted by their\ndistances away from the collision points and the outputs are the space points\nsorted by their labels. In this paper, we propose the TrackSorter algorithm: a\nTransformer-based algorithm for pattern recognition in particle data.\nTrackSorter uses a simple tokenization scheme to convert space points into\ndiscrete tokens. It then uses the tokenized space points as inputs and sorts\nthe input tokens into track candidates. TrackSorter is a novel end-to-end track\nfinding algorithm that leverages Transformer-based models to solve pattern\nrecognition problems. It is evaluated on the TrackML dataset and has good track\nfinding performance.\n","authors":["Yash Melkani","Xiangyang Ju"],"pdf_url":"https://arxiv.org/pdf/2407.21290v1.pdf","comment":"6 pages, 3 figures, to be included in Proceedings of the 22nd\n International Workshop on Advanced Computing and Analysis Techniques in\n Physics Research (ACAT 2024)"},{"id":"http://arxiv.org/abs/2407.21284v1","updated":"2024-07-31T02:16:28Z","published":"2024-07-31T02:16:28Z","title":"Robust Box Prompt based SAM for Medical Image Segmentation","summary":" The Segment Anything Model (SAM) can achieve satisfactory segmentation\nperformance under high-quality box prompts. However, SAM's robustness is\ncompromised by the decline in box quality, limiting its practicality in\nclinical reality. 
In this study, we propose a novel Robust Box prompt based SAM\n(\\textbf{RoBox-SAM}) to ensure SAM's segmentation performance under prompts\nwith different qualities. Our contribution is three-fold. First, we propose a\nprompt refinement module to implicitly perceive the potential targets, and\noutput the offsets to directly transform the low-quality box prompt into a\nhigh-quality one. We then provide an online iterative strategy for further\nprompt refinement. Second, we introduce a prompt enhancement module to\nautomatically generate point prompts to assist the box-promptable segmentation\neffectively. Last, we build a self-information extractor to encode the prior\ninformation from the input image. These features can optimize the image\nembeddings and attention calculation, thus, the robustness of SAM can be\nfurther enhanced. Extensive experiments on the large medical segmentation\ndataset including 99,299 images, 5 modalities, and 25 organs/targets validated\nthe efficacy of our proposed RoBox-SAM.\n","authors":["Yuhao Huang","Xin Yang","Han Zhou","Yan Cao","Haoran Dou","Fajin Dong","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2407.21284v1.pdf","comment":"Accepted by MICCAI MLMI 2024"},{"id":"http://arxiv.org/abs/2308.04430v2","updated":"2024-07-31T02:15:31Z","published":"2023-08-08T17:58:15Z","title":"SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore","summary":" The legality of training language models (LMs) on copyrighted or otherwise\nrestricted data is under intense debate. However, as we show, model performance\nsignificantly degrades if trained only on low-risk text (e.g., out-of-copyright\nbooks or government documents), due to its limited size and domain coverage. We\npresent SILO, a new language model that manages this risk-performance tradeoff\nduring inference. SILO is built by (1) training a parametric LM on Open License\nCorpus (OLC), a new corpus we curate with 228B tokens of public domain and\npermissively licensed text and (2) augmenting it with a more general and easily\nmodifiable nonparametric datastore (e.g., containing copyrighted books or news)\nthat is only queried during inference. The datastore allows use of high-risk\ndata without training on it, supports sentence-level data attribution, and\nenables data producers to opt out from the model by removing content from the\nstore. These capabilities can foster compliance with data-use regulations such\nas the fair use doctrine in the United States and the GDPR in the European\nUnion. Our experiments show that the parametric LM struggles on domains not\ncovered by OLC. However, access to the datastore greatly improves out of domain\nperformance, closing 90% of the performance gap with an LM trained on the Pile,\na more diverse corpus with mostly high-risk text. We also analyze which\nnonparametric approach works best, where the remaining errors lie, and how\nperformance scales with datastore size. Our results suggest that it is possible\nto build high quality language models while mitigating their legal risk.\n","authors":["Sewon Min","Suchin Gururangan","Eric Wallace","Weijia Shi","Hannaneh Hajishirzi","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2308.04430v2.pdf","comment":"29 pages; 7 figures. Published as a conference paper at ICLR 2024\n (spotlight). 
Code, models, and data available at\n https://github.com/kernelmachine/silo-lm"},{"id":"http://arxiv.org/abs/2407.21282v1","updated":"2024-07-31T02:12:05Z","published":"2024-07-31T02:12:05Z","title":"FedBChain: A Blockchain-enabled Federated Learning Framework for\n Improving DeepConvLSTM with Comparative Strategy Insights","summary":" Recent research in the field of Human Activity Recognition has shown that an\nimprovement in prediction performance can be achieved by reducing the number of\nLSTM layers. However, this kind of enhancement is only significant on\nmonolithic architectures, and when it runs on large-scale distributed training,\ndata security and privacy issues will need to be reconsidered, and its prediction\nperformance is unknown. In this paper, we introduce a novel framework:\nFedBChain, which integrates the federated learning paradigm based on a modified\nDeepConvLSTM architecture with a single LSTM layer. This framework performs\ncomparative tests of prediction performance on three different real-world\ndatasets based on three different hidden layer units (128, 256, and 512)\ncombined with five different federated learning strategies, respectively. The\nresults show that our architecture has significant improvements in Precision,\nRecall and F1-score compared to the centralized training approach on all\ndatasets with all hidden layer units for all strategies: the FedAvg strategy\nimproves on average by 4.54%, FedProx improves on average by 4.57%,\nFedTrimmedAvg improves on average by 4.35%, Krum improves by 4.18% on average,\nand FedAvgM improves by 4.46% on average. Based on our results, it can be seen\nthat FedBChain not only improves in performance, but also guarantees the\nsecurity and privacy of user data compared to centralized training methods\nduring the training process. The code for our experiments is publicly available\n(https://github.com/Glen909/FedBChain).\n","authors":["Gaoxuan Li","Chern Hong Lim","Qiyao Ma","Xinyu Tang","Hwa Hui Tew"],"pdf_url":"https://arxiv.org/pdf/2407.21282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20485v2","updated":"2024-07-31T02:02:40Z","published":"2024-07-30T01:13:42Z","title":"A2SF: Accumulative Attention Scoring with Forgetting Factor for Token\n Pruning in Transformer Decoder","summary":" Recently, large language models (LLMs) based on transformers are facing memory\nbottleneck issues due to the KV cache, especially in long sequence handling.\nPrevious research proposed KV cache compression techniques that identify\ninsignificant tokens based on Accumulative Attention Scores and remove their\nitems from the KV cache, noting that only a few tokens play an important role in\nattention operations. However, we have observed that the existing Accumulative\nAttention Score is not suitable for the transformer decoder structure. In the\ndecoder model, the number of times the Attention Score accumulates varies\ndepending on the order of token appearance due to the effect of masking,\ncausing an uneven comparison between tokens. To solve this, we propose the\nAccumulative Attention Score with Forgetting Factor (A2SF) technique, which\nintroduces a Forgetting Factor into the Attention Score accumulation process.\nA2SF applies a penalty to the past Attention Score generated from old tokens by\nrepeatedly multiplying the Attention Score by the Forgetting Factor over time.\nTherefore, older tokens receive a larger penalty, providing fairness among\ndifferent ages of tokens. 
Through the fair comparison among tokens, we can more\neffectively select important tokens. We have verified the accuracy improvement\nthrough A2SF in the OPT and LLaMA models and A2SF improves the accuracy of\nLLaMA 2 by up to 7.8% and 5.1% on 1-shot and 0-shot.\n","authors":["Hyun-rae Jo","Dongkun Shin"],"pdf_url":"https://arxiv.org/pdf/2407.20485v2.pdf","comment":"11 pages(9 pages + reference 2 pages), 6 figures"},{"id":"http://arxiv.org/abs/2407.21273v1","updated":"2024-07-31T01:36:47Z","published":"2024-07-31T01:36:47Z","title":"Enhanced Uncertainty Estimation in Ultrasound Image Segmentation with\n MSU-Net","summary":" Efficient intravascular access in trauma and critical care significantly\nimpacts patient outcomes. However, the availability of skilled medical\npersonnel in austere environments is often limited. Autonomous robotic\nultrasound systems can aid in needle insertion for medication delivery and\nsupport non-experts in such tasks. Despite advances in autonomous needle\ninsertion, inaccuracies in vessel segmentation predictions pose risks.\nUnderstanding the uncertainty of predictive models in ultrasound imaging is\ncrucial for assessing their reliability. We introduce MSU-Net, a novel\nmultistage approach for training an ensemble of U-Nets to yield accurate\nultrasound image segmentation maps. We demonstrate substantial improvements,\n18.1% over a single Monte Carlo U-Net, enhancing uncertainty evaluations, model\ntransparency, and trustworthiness. By highlighting areas of model certainty,\nMSU-Net can guide safe needle insertions, empowering non-experts to accomplish\nsuch tasks.\n","authors":["Rohini Banerjee","Cecilia G. Morales","Artur Dubrawski"],"pdf_url":"https://arxiv.org/pdf/2407.21273v1.pdf","comment":"Accepted for the 5th International Workshop of Advances in\n Simplifying Medical UltraSound (ASMUS), held in conjunction with MICCAI 2024,\n the 27th International Conference on Medical Image Computing and Computer\n Assisted Intervention"},{"id":"http://arxiv.org/abs/2407.21266v1","updated":"2024-07-31T01:07:21Z","published":"2024-07-31T01:07:21Z","title":"DDU-Net: A Domain Decomposition-based CNN on Multiple GPUs","summary":" The segmentation of ultra-high resolution images poses challenges such as\nloss of spatial information or computational inefficiency. In this work, a\nnovel approach that combines encoder-decoder architectures with domain\ndecomposition strategies to address these challenges is proposed. Specifically,\na domain decomposition-based U-Net (DDU-Net) architecture is introduced, which\npartitions input images into non-overlapping patches that can be processed\nindependently on separate devices. A communication network is added to\nfacilitate inter-patch information exchange to enhance the understanding of\nspatial context. Experimental validation is performed on a synthetic dataset\nthat is designed to measure the effectiveness of the communication network.\nThen, the performance is tested on the DeepGlobe land cover classification\ndataset as a real-world benchmark data set. The results demonstrate that the\napproach, which includes inter-patch communication for images divided into\n$16\\times16$ non-overlapping subimages, achieves a $2-3\\,\\%$ higher\nintersection over union (IoU) score compared to the same network without\ninter-patch communication. 
The performance of the network which includes\ncommunication is equivalent to that of a baseline U-Net trained on the full\nimage, showing that our model provides an effective solution for segmenting\nultra-high-resolution images while preserving spatial context. The code is\navailable at https://github.com/corne00/HiRes-Seg-CNN.\n","authors":["Corné Verburg","Alexander Heinlein","Eric C. Cyr"],"pdf_url":"https://arxiv.org/pdf/2407.21266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21260v1","updated":"2024-07-31T00:43:51Z","published":"2024-07-31T00:43:51Z","title":"Tractable and Provably Efficient Distributional Reinforcement Learning\n with General Value Function Approximation","summary":" Distributional reinforcement learning improves performance by effectively\ncapturing environmental stochasticity, but a comprehensive theoretical\nunderstanding of its effectiveness remains elusive. In this paper, we present a\nregret analysis for distributional reinforcement learning with general value\nfunction approximation in a finite episodic Markov decision process setting. We\nfirst introduce a key notion of Bellman unbiasedness for a tractable and\nexactly learnable update via statistical functional dynamic programming. Our\ntheoretical results show that approximating the infinite-dimensional return\ndistribution with a finite number of moment functionals is the only method to\nlearn the statistical information unbiasedly, including nonlinear statistical\nfunctionals. Second, we propose a provably efficient algorithm,\n$\\texttt{SF-LSVI}$, achieving a regret bound of $\\tilde{O}(d_E\nH^{\\frac{3}{2}}\\sqrt{K})$ where $H$ is the horizon, $K$ is the number of\nepisodes, and $d_E$ is the eluder dimension of a function class.\n","authors":["Taehyun Cho","Seungyub Han","Kyungjae Lee","Seokhun Ju","Dohyeong Kim","Jungwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2407.21260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16944v2","updated":"2024-07-31T00:37:20Z","published":"2024-07-24T02:23:18Z","title":"An Adaptive Gradient Regularization Method","summary":" Optimizer plays an important role in neural network training with high\nefficiency and performance. Weight update based on its gradient is the central\npart of the optimizer. It has been shown that normalization and standardization\noperation on weight and gradient can accelerate the training process and\nimprove performance such as Weight Standardization (WS), weight normalization\n(WN) and gradient normalization (GN); there is also gradient centralization\n(GC). In this work, we introduce a new optimization technique based on the\ngradient magnitude in a gradient vector named adaptive gradient regularization\n(AGR), which normalizes the gradient vector in all dimensions as a coefficient\nvector and subtracts the product of the gradient and its coefficient vector by\nthe vanilla gradient. It can be viewed as an adaptive gradient clipping method.\nWe show that the AGR can improve the loss function Lipschitzness with a more\nstable training process and better generalization performance. AGR is very\nsimple to be embedded into vanilla optimizers such as Adan and AdamW with only\nthree lines of code. 
Our experiments are conducted on image generation, image\nclassification and language representation tasks, showing that AGR improves\ntraining results.\n","authors":["Huixiu Jiang","Ling Yang","Yu Bao","Rutong Si"],"pdf_url":"https://arxiv.org/pdf/2407.16944v2.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2403.07187v3","updated":"2024-07-31T00:37:11Z","published":"2024-03-11T22:00:39Z","title":"UPS: Efficiently Building Foundation Models for PDE Solving via\n Cross-Modal Adaptation","summary":" We present Unified PDE Solvers (UPS), a data- and compute-efficient approach\nto developing unified neural operators for diverse families of spatiotemporal\nPDEs from various domains, dimensions, and resolutions. UPS embeds different\nPDEs into a shared representation space and processes them using an\nFNO-transformer architecture. Rather than training the network from scratch,\nwhich is data-demanding and computationally expensive, we warm-start the\ntransformer from pretrained LLMs and perform explicit alignment to reduce the\nmodality gap while improving data and compute efficiency. The cross-modal UPS\nachieves state-of-the-art results on a wide range of 1D and 2D PDE families\nfrom PDEBench, outperforming existing unified models using 4 times less data\nand 26 times less compute. Meanwhile, it is capable of few-shot transfer to\nunseen PDE families and coefficients.\n","authors":["Junhong Shen","Tanya Marwah","Ameet Talwalkar"],"pdf_url":"https://arxiv.org/pdf/2403.07187v3.pdf","comment":"ICML 2024 AI for Science Workshop (Spotlight)"},{"id":"http://arxiv.org/abs/2406.17931v3","updated":"2024-07-31T00:31:45Z","published":"2024-06-25T20:43:15Z","title":"CAT: Interpretable Concept-based Taylor Additive Models","summary":" As an emerging interpretable technique, Generalized Additive Models (GAMs)\nadopt neural networks to individually learn non-linear functions for each\nfeature, which are then combined through a linear model for final predictions.\nAlthough GAMs can explain deep neural networks (DNNs) at the feature level,\nthey require large numbers of model parameters and are prone to overfitting,\nmaking them hard to train and scale. Additionally, in real-world datasets with\nmany features, the interpretability of feature-based explanations diminishes\nfor humans. To tackle these issues, recent research has shifted towards\nconcept-based interpretable methods. These approaches try to integrate concept\nlearning as an intermediate step before making predictions, explaining the\npredictions in terms of human-understandable concepts. However, these methods\nrequire domain experts to extensively label concepts with relevant names and\ntheir ground-truth values. In response, we propose CAT, a novel interpretable\nConcept-bAsed Taylor additive model to simplify this process. CAT does not\nrequire domain experts to annotate concepts and their ground-truth values.\nInstead, it only requires users to simply categorize input features into broad\ngroups, which can be easily accomplished through a quick metadata review.\nSpecifically, CAT first embeds each group of input features into a\none-dimensional high-level concept representation, and then feeds the concept\nrepresentations into a new white-box Taylor Neural Network (TaylorNet). The\nTaylorNet aims to learn the non-linear relationship between the inputs and\noutputs using polynomials. 
Evaluation results across multiple benchmarks\ndemonstrate that CAT can outperform or compete with the baselines while\nreducing the need of extensive model parameters. Importantly, it can explain\nmodel predictions through high-level concepts that human can understand.\n","authors":["Viet Duong","Qiong Wu","Zhengyi Zhou","Hongjue Zhao","Chenxiang Luo","Eric Zavesky","Huaxiu Yao","Huajie Shao"],"pdf_url":"https://arxiv.org/pdf/2406.17931v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.21757v1","updated":"2024-07-31T17:23:57Z","published":"2024-07-31T17:23:57Z","title":"Learning Video Context as Interleaved Multimodal Sequences","summary":" Narrative videos, such as movies, pose significant challenges in video\nunderstanding due to their rich contexts (characters, dialogues, storylines)\nand diverse demands (identify who, relationship, and reason). In this paper, we\nintroduce MovieSeq, a multimodal language model developed to address the wide\nrange of challenges in understanding video contexts. Our core idea is to\nrepresent videos as interleaved multimodal sequences (including images, plots,\nvideos, and subtitles), either by linking external knowledge databases or using\noffline models (such as whisper for subtitles). Through instruction-tuning,\nthis approach empowers the language model to interact with videos using\ninterleaved multimodal instructions. For example, instead of solely relying on\nvideo as input, we jointly provide character photos alongside their names and\ndialogues, allowing the model to associate these elements and generate more\ncomprehensive responses. To demonstrate its effectiveness, we validate\nMovieSeq's performance on six datasets (LVU, MAD, Movienet, CMD, TVC, MovieQA)\nacross five settings (video classification, audio description, video-text\nretrieval, video captioning, and video question-answering). The code will be\npublic at https://github.com/showlab/MovieSeq.\n","authors":["Kevin Qinghong Lin","Pengchuan Zhang","Difei Gao","Xide Xia","Joya Chen","Ziteng Gao","Jinheng Xie","Xuhong Xiao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2407.21757v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21721v1","updated":"2024-07-31T16:14:09Z","published":"2024-07-31T16:14:09Z","title":"Open-Vocabulary Audio-Visual Semantic Segmentation","summary":" Audio-visual semantic segmentation (AVSS) aims to segment and classify\nsounding objects in videos with acoustic cues. However, most approaches operate\non the close-set assumption and only identify pre-defined categories from\ntraining data, lacking the generalization ability to detect novel categories in\npractical applications. In this paper, we introduce a new task: open-vocabulary\naudio-visual semantic segmentation, extending AVSS task to open-world scenarios\nbeyond the annotated label space. This is a more challenging task that requires\nrecognizing all categories, even those that have never been seen nor heard\nduring training. Moreover, we propose the first open-vocabulary AVSS framework,\nOV-AVSS, which mainly consists of two parts: 1) a universal sound source\nlocalization module to perform audio-visual fusion and locate all potential\nsounding objects and 2) an open-vocabulary classification module to predict\ncategories with the help of the prior knowledge from large-scale pre-trained\nvision-language models. 
To properly evaluate the open-vocabulary AVSS, we split\nzero-shot training and testing subsets based on the AVSBench-semantic\nbenchmark, namely AVSBench-OV. Extensive experiments demonstrate the strong\nsegmentation and zero-shot generalization ability of our model on all\ncategories. On the AVSBench-OV dataset, OV-AVSS achieves 55.43% mIoU on base\ncategories and 29.14% mIoU on novel categories, exceeding the state-of-the-art\nzero-shot method by 41.88%/20.61% and open-vocabulary method by 10.2%/11.6%.\nThe code is available at https://github.com/ruohaoguo/ovavss.\n","authors":["Ruohao Guo","Liao Qu","Dantong Niu","Yanyu Qi","Wenzhen Yue","Ji Shi","Bowei Xing","Xianghua Ying"],"pdf_url":"https://arxiv.org/pdf/2407.21721v1.pdf","comment":"Accepted by ACM MM 2024 (Oral)"},{"id":"http://arxiv.org/abs/2312.14433v2","updated":"2024-07-31T16:03:43Z","published":"2023-12-22T04:46:21Z","title":"Attribute-driven Disentangled Representation Learning for Multimodal\n Recommendation","summary":" Recommendation algorithms forecast user preferences by correlating user and\nitem representations derived from historical interaction patterns. In pursuit\nof enhanced performance, many methods focus on learning robust and independent\nrepresentations by disentangling the intricate factors within interaction data\nacross various modalities in an unsupervised manner. However, such an approach\nobfuscates the discernment of how specific factors (e.g., category or brand)\ninfluence the outcomes, making it challenging to regulate their effects. In\nresponse to this challenge, we introduce a novel method called Attribute-Driven\nDisentangled Representation Learning (short for AD-DRL), which explicitly\nincorporates attributes from different modalities into the disentangled\nrepresentation learning process. By assigning a specific attribute to each\nfactor in multimodal features, AD-DRL can disentangle the factors at both\nattribute and attribute-value levels. To obtain robust and independent\nrepresentations for each factor associated with a specific attribute, we first\ndisentangle the representations of features both within and across different\nmodalities. Moreover, we further enhance the robustness of the representations\nby fusing the multimodal features of the same factor. Empirical evaluations\nconducted on three public real-world datasets substantiate the effectiveness of\nAD-DRL, as well as its interpretability and controllability.\n","authors":["Zhenyang Li","Fan Liu","Yinwei Wei","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2312.14433v2.pdf","comment":"ACM Multimedia 2024 Accepted"},{"id":"http://arxiv.org/abs/2407.21531v1","updated":"2024-07-31T11:29:46Z","published":"2024-07-31T11:29:46Z","title":"Can LLMs \"Reason\" in Music? An Evaluation of LLMs' Capability of Music\n Understanding and Generation","summary":" Symbolic Music, akin to language, can be encoded in discrete symbols. Recent\nresearch has extended the application of large language models (LLMs) such as\nGPT-4 and Llama2 to the symbolic music domain including understanding and\ngeneration. Yet scant research explores the details of how these LLMs perform\non advanced music understanding and conditioned generation, especially from the\nmulti-step reasoning perspective, which is a critical aspect in the\nconditioned, editable, and interactive human-computer co-creation process. This\nstudy conducts a thorough investigation of LLMs' capability and limitations in\nsymbolic music processing. 
We identify that current LLMs exhibit poor\nperformance in song-level multi-step music reasoning, and typically fail to\nleverage learned music knowledge when addressing complex musical tasks. An\nanalysis of LLMs' responses highlights distinctly their pros and cons. Our\nfindings suggest achieving advanced musical capability is not intrinsically\nobtained by LLMs, and future research should focus more on bridging the gap\nbetween music knowledge and reasoning, to improve the co-creation experience\nfor musicians.\n","authors":["Ziya Zhou","Yuhang Wu","Zhiyue Wu","Xinyue Zhang","Ruibin Yuan","Yinghao Ma","Lu Wang","Emmanouil Benetos","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2407.21531v1.pdf","comment":"Accepted by ISMIR2024"},{"id":"http://arxiv.org/abs/2311.12751v4","updated":"2024-07-31T08:24:16Z","published":"2023-11-21T17:52:30Z","title":"Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with\n Spatial Relation Matching","summary":" Navigating drones through natural language commands remains challenging due\nto the dearth of accessible multi-modal datasets and the stringent precision\nrequirements for aligning visual and textual data. To address this pressing\nneed, we introduce GeoText-1652, a new natural language-guided geo-localization\nbenchmark. This dataset is systematically constructed through an interactive\nhuman-computer process leveraging Large Language Model (LLM) driven annotation\ntechniques in conjunction with pre-trained vision models. GeoText-1652 extends\nthe established University-1652 image dataset with spatial-aware text\nannotations, thereby establishing one-to-one correspondences between image,\ntext, and bounding box elements. We further introduce a new optimization\nobjective to leverage fine-grained spatial associations, called blending\nspatial matching, for region-level spatial relation matching. Extensive\nexperiments reveal that our approach maintains a competitive recall rate\ncomparing other prevailing cross-modality methods. This underscores the\npromising potential of our approach in elevating drone control and navigation\nthrough the seamless integration of natural language commands in real-world\nscenarios.\n","authors":["Meng Chu","Zhedong Zheng","Wei Ji","Tingyu Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2311.12751v4.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.21391v1","updated":"2024-07-31T07:31:13Z","published":"2024-07-31T07:31:13Z","title":"Design and Development of Laughter Recognition System Based on\n Multimodal Fusion and Deep Learning","summary":" This study aims to design and implement a laughter recognition system based\non multimodal fusion and deep learning, leveraging image and audio processing\ntechnologies to achieve accurate laughter recognition and emotion analysis.\nFirst, the system loads video files and uses the OpenCV library to extract\nfacial information while employing the Librosa library to process audio\nfeatures such as MFCC. Then, multimodal fusion techniques are used to integrate\nimage and audio features, followed by training and prediction using deep\nlearning models. Evaluation results indicate that the model achieved 80%\naccuracy, precision, and recall on the test dataset, with an F1 score of 80%,\ndemonstrating robust performance and the ability to handle real-world data\nvariability. 
This study not only verifies the effectiveness of multimodal\nfusion methods in laughter recognition but also highlights their potential\napplications in affective computing and human-computer interaction. Future work\nwill focus on further optimizing feature extraction and model architecture to\nimprove recognition accuracy and expand application scenarios, promoting the\ndevelopment of laughter recognition technology in fields such as mental health\nmonitoring and educational activity evaluation\n","authors":["Fuzheng Zhao","Yu Bai"],"pdf_url":"https://arxiv.org/pdf/2407.21391v1.pdf","comment":"7 pages,2 figures"},{"id":"http://arxiv.org/abs/2407.21363v1","updated":"2024-07-31T06:20:21Z","published":"2024-07-31T06:20:21Z","title":"ESIQA: Perceptual Quality Assessment of Vision-Pro-based Egocentric\n Spatial Images","summary":" With the development of eXtended Reality (XR), head-mounted shooting and\ndisplay technology have experienced significant advancement and gained\nconsiderable attention. Egocentric spatial images and videos are emerging as a\ncompelling form of stereoscopic XR content. Different from traditional 2D\nimages, egocentric spatial images present challenges for perceptual quality\nassessment due to their special shooting, processing methods, and stereoscopic\ncharacteristics. However, the corresponding image quality assessment (IQA)\nresearch for egocentric spatial images is still lacking. In this paper, we\nestablish the Egocentric Spatial Images Quality Assessment Database (ESIQAD),\nthe first IQA database dedicated for egocentric spatial images as far as we\nknow. Our ESIQAD includes 500 egocentric spatial images, containing 400 images\ncaptured with the Apple Vision Pro and 100 images generated via an iPhone's\n\"Spatial Camera\" app. The corresponding mean opinion scores (MOSs) are\ncollected under three viewing modes, including 2D display, 3D-window display,\nand 3D-immersive display. Furthermore, based on our database, we conduct a\nbenchmark experiment and evaluate the performance of 22 state-of-the-art IQA\nmodels under three different viewing modes. We hope this research can\nfacilitate future IQA research on egocentric spatial images. 
The database is\navailable at https://github.com/IntMeGroup/ESIQA.\n","authors":["Xilei Zhu","Liu Yang","Huiyu Duan","Xiongkuo Min","Guangtao Zhai","Patrick Le Callet"],"pdf_url":"https://arxiv.org/pdf/2407.21363v1.pdf","comment":"8 pages, 8 figures"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ 
zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 53 + +
+
+
+ + ☆ Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress? + + +
+ As artificial intelligence systems grow more powerful, there has been +increasing interest in "AI safety" research to address emerging and future +risks. However, the field of AI safety remains poorly defined and +inconsistently measured, leading to confusion about how researchers can +contribute. This lack of clarity is compounded by the unclear relationship +between AI safety benchmarks and upstream general capabilities (e.g., general +knowledge and reasoning). To address these issues, we conduct a comprehensive +meta-analysis of AI safety benchmarks, empirically analyzing their correlation +with general capabilities across dozens of models and providing a survey of +existing directions in AI safety. Our findings reveal that many safety +benchmarks highly correlate with upstream model capabilities, potentially +enabling "safetywashing" -- where capability improvements are misrepresented as +safety advancements. Based on these findings, we propose an empirical +foundation for developing more meaningful safety metrics and define AI safety +in a machine learning research context as a set of clearly delineated research +goals that are empirically separable from generic capabilities advancements. In +doing so, we aim to provide a more rigorous framework for AI safety research, +advancing the science of safety evaluations and clarifying the path towards +measurable progress. + +
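The abstract above describes an empirical check of how strongly safety benchmark scores track general capability scores across many models. A rough, illustrative sketch of that kind of correlation check (not code from the paper; all model names and numbers below are made up):

```python
# Illustrative sketch (not from the paper): check how strongly a "safety"
# benchmark tracks general capabilities across models. Data is made up.
import numpy as np
from scipy.stats import spearmanr

# rows = models, columns = benchmark scores (hypothetical values)
capability_scores = np.array([55.0, 62.3, 70.1, 48.9, 81.4])  # e.g. an aggregate capability score
safety_scores     = np.array([41.2, 50.8, 63.5, 39.0, 77.9])  # a candidate "safety" benchmark

rho, p_value = spearmanr(capability_scores, safety_scores)
print(f"Spearman rho = {rho:.2f} (p = {p_value:.3f})")
# A very high correlation would suggest the benchmark mostly measures upstream
# capability (a "safetywashing" risk) rather than a distinct safety property.
```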
+
+
+
+
+ + ☆ Vision-Language Model Based Handwriting Verification + + +
+ Handwriting Verification is a critical task in document forensics. Deep learning +based approaches often face skepticism from forensic document examiners due to +their lack of explainability and reliance on extensive training data and +handcrafted features. This paper explores using Vision Language Models (VLMs), +such as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By +leveraging their Visual Question Answering capabilities and 0-shot +Chain-of-Thought (CoT) reasoning, our goal is to provide clear, +human-understandable explanations for model decisions. Our experiments on the +CEDAR handwriting dataset demonstrate that VLMs offer enhanced +interpretability, reduce the need for large training datasets, and adapt better +to diverse handwriting styles. However, results show that the CNN-based +ResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach +with GPT-4o (Accuracy: 70%) and supervised fine-tuned PaliGemma (Accuracy: +71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings +highlight the potential of VLMs in generating human-interpretable decisions +while underscoring the need for further advancements to match the performance +of specialized deep learning models. + +
+
+ comment: 4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine + Vision and Image Processing (IMVIP) Conference +
+
+
+
+
+ + ☆ The Llama 3 Herd of Models + + +
+ Modern artificial intelligence (AI) systems are powered by foundation models. +This paper presents a new set of foundation models, called Llama 3. It is a +herd of language models that natively support multilinguality, coding, +reasoning, and tool usage. Our largest model is a dense Transformer with 405B +parameters and a context window of up to 128K tokens. This paper presents an +extensive empirical evaluation of Llama 3. We find that Llama 3 delivers +comparable quality to leading language models such as GPT-4 on a plethora of +tasks. We publicly release Llama 3, including pre-trained and post-trained +versions of the 405B parameter language model and our Llama Guard 3 model for +input and output safety. The paper also presents the results of experiments in +which we integrate image, video, and speech capabilities into Llama 3 via a +compositional approach. We observe this approach performs competitively with +the state-of-the-art on image, video, and speech recognition tasks. The +resulting models are not yet being broadly released as they are still under +development. + +
+
+
+
+
+ + ☆ ShieldGemma: Generative AI Content Moderation Based on Gemma + + +
+ We present ShieldGemma, a comprehensive suite of LLM-based safety content +moderation models built upon Gemma2. These models provide robust, +state-of-the-art predictions of safety risks across key harm types (sexually +explicit, dangerous content, harassment, hate speech) in both user input and +LLM-generated output. By evaluating on both public and internal benchmarks, we +demonstrate superior performance compared to existing models, such as Llama +Guard (+10.8\% AU-PRC on public benchmarks) and WildCard (+4.3\%). +Additionally, we present a novel LLM-based data curation pipeline, adaptable to +a variety of safety-related tasks and beyond. We have shown strong +generalization performance for models trained mainly on synthetic data. By +releasing ShieldGemma, we provide a valuable resource to the research +community, advancing LLM safety and enabling the creation of more effective +content moderation solutions for developers. + +
+
+
+
+
+ + ☆ Adaptive Retrieval-Augmented Generation for Conversational Systems + + +
+ Despite the success of integrating large language models into the development +of conversational systems, many studies have shown the effectiveness of +retrieving and augmenting external knowledge for informative responses. Hence, +many existing studies commonly assume that Retrieval Augmented +Generation (RAG) is always needed in a conversational system, without explicit control. This +raises the research question of whether such augmentation is always necessary. In this study, we propose to +investigate the need for each turn of system response to be augmented with +external knowledge. In particular, by leveraging human judgements on the binary +choice of adaptive augmentation, we develop RAGate, a gating model, which +models conversation context and relevant inputs to predict if a conversational +system requires RAG for improved responses. We conduct extensive experiments on +devising and applying RAGate to conversational models and well-rounded analyses +of different conversational scenarios. Our experimental results and analysis +indicate the effective application of RAGate in RAG-based conversational +systems in identifying system responses for appropriate RAG with high-quality +responses and a high generation confidence. This study also identifies the +correlation between the generation's confidence level and the relevance of the +augmented knowledge. + +
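RAGate, as described above, is a per-turn gate that decides whether a response needs retrieval augmentation. The sketch below only illustrates that control flow under assumed interfaces; `gate`, `retriever`, and `generator` are hypothetical placeholders, not the paper's components:

```python
# Hypothetical sketch of turn-level adaptive RAG gating; the components are
# placeholders illustrating the control flow described above, not RAGate itself.
def respond(context: list[str], user_turn: str, gate, retriever, generator) -> str:
    """Decide per turn whether to augment generation with retrieved knowledge."""
    needs_rag = gate.predict(context, user_turn)         # binary decision on the dialogue context
    if needs_rag:
        passages = retriever.search(user_turn, top_k=3)  # fetch external knowledge
        prompt = "\n".join(passages + context + [user_turn])
    else:
        prompt = "\n".join(context + [user_turn])        # plain conversational generation
    return generator.generate(prompt)
```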
+
+ comment: 12 pages, under review +
+
+
+
+
+ + ☆ Synth-Empathy: Towards High-Quality Synthetic Empathy Data + + +
+ In recent years, with the rapid advancements in large language models (LLMs), +achieving excellent empathetic response capabilities has become a crucial +prerequisite. Consequently, managing and understanding empathetic datasets have +gained increasing significance. However, empathetic data are typically +human-labeled, leading to insufficient datasets and wasted human labor. In this +work, we present Synth-Empathy, an LLM-based data generation and quality and +diversity selection pipeline that automatically generates high-quality +empathetic data while discarding low-quality data. With the data generated from +a low empathetic model, we are able to further improve empathetic response +performance and achieve state-of-the-art (SoTA) results across multiple +benchmarks. Moreover, our model achieves SoTA performance on various human +evaluation benchmarks, demonstrating its effectiveness and robustness in +real-world applications. Furthermore, we show the trade-off between data +quantity and quality, providing insights into empathetic data generation and +selection. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.01937 +
+
+
+
+
+ + ☆ Defending Jailbreak Attack in VLMs via Cross-modality Information + Detector ACL + + +
+ Vision Language Models (VLMs) extend the capacity of LLMs to comprehensively +understand vision information, achieving remarkable performance in many +vision-centric tasks. Despite that, recent studies have shown that these models +are susceptible to jailbreak attacks, which refer to an exploitative technique +where malicious users can break the safety alignment of the target model and +generate misleading and harmful answers. This potential threat is caused by +both the inherent vulnerabilities of LLM and the larger attack scope introduced +by vision input. To enhance the security of VLMs against jailbreak attacks, +researchers have developed various defense techniques. However, these methods +either require modifications to the model's internal structure or demand +significant computational resources during the inference phase. Multimodal +information is a double-edged sword. While it increases the risk of attacks, it +also provides additional data that can enhance safeguards. Inspired by this, we +propose $\underline{\textbf{C}}$ross-modality +$\underline{\textbf{I}}$nformation +$\underline{\textbf{DE}}$tecto$\underline{\textbf{R}}$ ($\textit{CIDER})$, a +plug-and-play jailbreaking detector designed to identify maliciously perturbed +image inputs, utilizing the cross-modal similarity between harmful queries and +adversarial images. This simple yet effective cross-modality information +detector, $\textit{CIDER}$, is independent of the target VLMs and requires less +computation cost. Extensive experimental results demonstrate the effectiveness +and efficiency of $\textit{CIDER}$, as well as its transferability to both +white-box and black-box VLMs. + +
+
+ comment: 12 pages, 9 figures, ACL ARR 2024 June Submission +
+
+
+
+
+ + ☆ Towards Achieving Human Parity on End-to-end Simultaneous Speech + Translation via LLM Agent + + +
+ In this paper, we present Cross Language Agent -- Simultaneous +Interpretation, CLASI, a high-quality and human-like Simultaneous Speech +Translation (SiST) System. Inspired by professional human interpreters, we +utilize a novel data-driven read-write strategy to balance the translation +quality and latency. To address the challenge of translating in-domain +terminologies, CLASI employs a multi-modal retrieving module to obtain relevant +information to augment the translation. Supported by LLMs, our approach can +generate error-tolerated translation by considering the input audio, historical +context, and retrieved information. Experimental results show that our system +outperforms other systems by significant margins. Aligned with professional +human interpreters, we evaluate CLASI with a better human evaluation metric, +valid information proportion (VIP), which measures the amount of information +that can be successfully conveyed to the listeners. In the real-world +scenarios, where the speeches are often disfluent, informal, and unclear, CLASI +achieves VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese +translation directions, respectively. In contrast, state-of-the-art commercial +or open-source systems only achieve 35.4% and 41.6%. On the extremely hard +dataset, where other systems achieve under 13% VIP, CLASI can still achieve 70% +VIP. + +
+
+ comment: Authors are listed in alphabetical order by last name. Demonstrations + and human-annotated test sets are available at + https://byteresearchcla.github.io/clasi +
+
+
+
+
+ + ☆ Zero-Shot Cross-Domain Dialogue State Tracking via Dual Low-Rank + Adaptation ACL 2024 + + +
+ Zero-shot dialogue state tracking (DST) seeks to enable dialogue systems to +transition to unfamiliar domains without manual annotation or extensive +retraining. Prior research has approached this objective by embedding prompts +into language models (LMs). Common methodologies include integrating prompts at +the input layer or introducing learnable variables at each transformer layer. +Nonetheless, each strategy exhibits inherent limitations. Prompts integrated at +the input layer risk underutilization, with their impact potentially +diminishing across successive transformer layers. Conversely, the addition of +learnable variables to each layer can complicate the training process and +increase inference latency. To tackle the issues mentioned above, this paper +proposes Dual Low-Rank Adaptation (DualLoRA), a plug-and-play architecture +designed for zero-shot DST. DualLoRA incorporates two distinct Low-Rank +Adaptation (LoRA) components, targeting both dialogue context processing and +prompt optimization, to ensure the comprehensive influence of prompts +throughout the transformer model layers. This is achieved without incurring +additional inference latency, showcasing an efficient integration into existing +architectures. Through rigorous evaluation on the MultiWOZ and SGD datasets, +DualLoRA demonstrates notable improvements across multiple domains, +outperforming traditional baseline methods in zero-shot settings. Our code is +accessible at: \url{https://github.com/suntea233/DualLoRA}. + +
+
+ comment: Accepted by ACL 2024 +
+
+
+
+
+ + ☆ TAROT: Task-Oriented Authorship Obfuscation Using Policy Optimization + Methods + + +
+ Authorship obfuscation aims to disguise the identity of an author within a +text by altering the writing style, vocabulary, syntax, and other linguistic +features associated with the text author. This alteration needs to balance +privacy and utility. While strong obfuscation techniques can effectively hide +the author's identity, they often degrade the quality and usefulness of the +text for its intended purpose. Conversely, maintaining high utility tends to +provide insufficient privacy, making it easier for an adversary to de-anonymize +the author. Thus, achieving an optimal trade-off between these two conflicting +objectives is crucial. In this paper, we propose TAROT: Task-Oriented +Authorship Obfuscation Using Policy Optimization, a new unsupervised authorship +obfuscation method whose goal is to optimize the privacy-utility trade-off by +regenerating the entire text considering its downstream utility. Our approach +leverages policy optimization as a fine-tuning paradigm over small language +models in order to rewrite texts by preserving author identity and downstream +task utility. We show that our approach largely reduces the accuracy of +attackers while preserving utility. We make our code and models publicly +available. + +
+
+
+
+
+ + ☆ PMoE: Progressive Mixture of Experts with Asymmetric Transformer for + Continual Learning + + +
+ Large Language Models (LLMs) encounter significant challenges in continual +learning due to catastrophic forgetting, where new information overwrites +previously acquired knowledge. This limitation leads to substantial +environmental and economic waste. In this study, we introduce the PMoE, +Progressive Mixture of Experts with Asymmetric Transformer, which aims to +minimize forgetting by utilizing an asymmetric design with shallow layers +dedicated to general knowledge and deep layers for new knowledge. PMoE +incorporates progressively added experts in deep layers and a router that +allocates new knowledge to the appropriate experts efficiently. The router, +positioned adjacent to the deep layers, utilizes deep features aggregating +consolidated information. This enables the router to perform efficiently, +allocating new knowledge to the appropriate experts, which progressively +increase in the deep layers. Extensive experiments on TRACE datasets and +general language understanding datasets demonstrate that the proposed PMoE +outperforms previous state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Generative Sentiment Analysis via Latent Category Distribution and + Constrained Decoding + + +
+ Fine-grained sentiment analysis involves extracting and organizing sentiment +elements from textual data. However, existing approaches often overlook issues +of category semantic inclusion and overlap, as well as inherent structural +patterns within the target sequence. This study introduces a generative +sentiment analysis model. To address the challenges related to category +semantic inclusion and overlap, a latent category distribution variable is +introduced. By reconstructing the input of a variational autoencoder, the model +learns the intensity of the relationship between categories and text, thereby +improving sequence generation. Additionally, a trie data structure and +constrained decoding strategy are utilized to exploit structural patterns, +which in turn reduces the search space and regularizes the generation process. +Experimental results on the Restaurant-ACOS and Laptop-ACOS datasets +demonstrate a significant performance improvement compared to baseline models. +Ablation experiments further confirm the effectiveness of latent category +distribution and constrained decoding strategy. + +
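The trie plus constrained-decoding idea mentioned above is a generic technique: keep a trie of valid target sequences and, at each decoding step, only allow tokens that stay inside it. A minimal generic sketch of that idea (the vocabulary below is made up, and this is not the paper's code):

```python
# Generic sketch of trie-constrained decoding: at each step, only tokens that
# keep the output inside the trie of valid sequences are allowed.
# The example vocabulary is illustrative, not taken from the paper.
def build_trie(sequences):
    root = {}
    for seq in sequences:
        node = root
        for tok in seq:
            node = node.setdefault(tok, {})
    return root

def allowed_next_tokens(trie, prefix):
    node = trie
    for tok in prefix:
        if tok not in node:
            return []          # prefix left the space of valid sequences
        node = node[tok]
    return list(node.keys())   # tokens that legally extend the prefix

valid = [["food", "quality", "positive"], ["food", "price", "negative"],
         ["service", "general", "positive"]]
trie = build_trie(valid)
print(allowed_next_tokens(trie, ["food"]))  # ['quality', 'price']
```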
+
+
+
+
+ + ☆ Tracing Intricate Cues in Dialogue: Joint Graph Structure and Sentiment + Dynamics for Multimodal Emotion Recognition + + +
+ Multimodal emotion recognition in conversation (MERC) has garnered +substantial research attention recently. Existing MERC methods face several +challenges: (1) they fail to fully harness direct inter-modal cues, possibly +leading to less-than-thorough cross-modal modeling; (2) they concurrently +extract information from the same and different modalities at each network +layer, potentially triggering conflicts from the fusion of multi-source data; +(3) they lack the agility required to detect dynamic sentimental changes, +perhaps resulting in inaccurate classification of utterances with abrupt +sentiment shifts. To address these issues, a novel approach named GraphSmile is +proposed for tracking intricate emotional cues in multimodal dialogues. +GraphSmile comprises two key components, i.e., GSF and SDP modules. GSF +ingeniously leverages graph structures to alternately assimilate inter-modal +and intra-modal emotional dependencies layer by layer, adequately capturing +cross-modal cues while effectively circumventing fusion conflicts. SDP is an +auxiliary task to explicitly delineate the sentiment dynamics between +utterances, promoting the model's ability to distinguish sentimental +discrepancies. Furthermore, GraphSmile is effortlessly applied to multimodal +sentiment analysis in conversation (MSAC), forging a unified multimodal +affective model capable of executing MERC and MSAC tasks. Empirical results on +multiple benchmarks demonstrate that GraphSmile can handle complex emotional +and sentimental patterns, significantly outperforming baseline models. + +
+
+ comment: Submitted +
+
+
+
+
+ + ☆ Can LLMs "Reason" in Music? An Evaluation of LLMs' Capability of Music + Understanding and Generation + + +
+ Symbolic Music, akin to language, can be encoded in discrete symbols. Recent +research has extended the application of large language models (LLMs) such as +GPT-4 and Llama2 to the symbolic music domain including understanding and +generation. Yet scant research explores the details of how these LLMs perform +on advanced music understanding and conditioned generation, especially from the +multi-step reasoning perspective, which is a critical aspect in the +conditioned, editable, and interactive human-computer co-creation process. This +study conducts a thorough investigation of LLMs' capabilities and limitations in +symbolic music processing. We identify that current LLMs exhibit poor +performance in song-level multi-step music reasoning, and typically fail to +leverage learned music knowledge when addressing complex musical tasks. An +analysis of LLMs' responses distinctly highlights their pros and cons. Our +findings suggest that advanced musical capability is not intrinsically +acquired by LLMs, and that future research should focus more on bridging the gap +between music knowledge and reasoning to improve the co-creation experience +for musicians. + +
+
+ comment: Accepted by ISMIR2024 +
+
+
+
+
+ + ☆ Data Contamination Report from the 2024 CONDA Shared Task + + +
+ The 1st Workshop on Data Contamination (CONDA 2024) focuses on all relevant +aspects of data contamination in natural language processing, where data +contamination is understood as situations where evaluation data is included in +pre-training corpora used to train large-scale models, compromising evaluation +results. The workshop fostered a shared task to collect evidence on data +contamination in currently available datasets and models. The goal of the shared +task and associated database is to assist the community in understanding the +extent of the problem and to assist researchers in avoiding reporting +evaluation results on known contaminated resources. The shared task provides a +structured, centralized public database for the collection of contamination +evidence, open to contributions from the community via GitHub pull requests. +This first compilation paper is based on 566 reported entries over 91 +contaminated sources from a total of 23 contributors. The details of the +individual contamination events are available on the platform. The platform +continues to be online, open to contributions from the community. + +
+
+ comment: https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database +
+
+
+
+
+ + ☆ Interpreting and learning voice commands with a Large Language Model for + a robot system + + +
+ Robots are increasingly common in industry and daily life, such as in nursing +homes where they can assist staff. A key challenge is developing intuitive +interfaces for easy communication. The use of Large Language Models (LLMs) like +GPT-4 has enhanced robot capabilities, allowing for real-time interaction and +decision-making. This integration improves robots' adaptability and +functionality. This project focuses on merging LLMs with databases to improve +decision-making and enable knowledge acquisition for request interpretation +problems. + +
+
+ comment: PP-RAI 2024, 5th Polish Conference on Artificial Intelligence, + 18-20.04.2024 Warsaw, Poland +
+
+
+
+
+ + ☆ Generative Expressive Conversational Speech Synthesis ACM MM 2024 + + +
+ Conversational Speech Synthesis (CSS) aims to express a target utterance with +the proper speaking style in a user-agent conversation setting. Existing CSS +methods employ effective multi-modal context modeling techniques to achieve +empathy understanding and expression. However, they often need to design +complex network architectures and meticulously optimize the modules within +them. In addition, due to the limitations of small-scale datasets containing +scripted recording styles, they often fail to simulate real natural +conversational styles. To address the above issues, we propose a novel +generative expressive CSS system, termed GPT-Talker. We transform the multimodal +information of the multi-turn dialogue history into discrete token sequences +and seamlessly integrate them to form a comprehensive user-agent dialogue +context. Leveraging the power of GPT, we predict the agent's response token +sequence, which includes both semantic and style knowledge. After +that, the expressive conversational speech is synthesized by the +conversation-enriched VITS to deliver feedback to the user. Furthermore, we +propose a large-scale Natural CSS Dataset called NCSSD, which includes both +naturally recorded conversational speech in improvised styles and dialogues +extracted from TV shows. It encompasses both Chinese and English languages, +with a total duration of 236 hours. We conducted comprehensive experiments on +the reliability of the NCSSD and the effectiveness of our GPT-Talker. Both +subjective and objective evaluations demonstrate that our model outperforms +other state-of-the-art CSS systems significantly in terms of naturalness and +expressiveness. The Code, Dataset, and Pre-trained Model are available at: +https://github.com/AI-S2-Lab/GPT-Talker. + +
+
+ comment: 14 pages, 6 figures, 8 tables. Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Maverick: Efficient and Accurate Coreference Resolution Defying Recent + Trends ACL 2024 + + +
+ Large autoregressive generative models have emerged as the cornerstone for +achieving the highest performance across several Natural Language Processing +tasks. However, the urge to attain superior results has, at times, led to the +premature replacement of carefully designed task-specific approaches without +exhaustive experimentation. The Coreference Resolution task is no exception; +all recent state-of-the-art solutions adopt large generative autoregressive +models that outperform encoder-based discriminative systems. In this work, we +challenge this recent trend by introducing Maverick, a carefully designed - yet +simple - pipeline, which enables running a state-of-the-art Coreference +Resolution system within the constraints of an academic budget, outperforming +models with up to 13 billion parameters with as few as 500 million parameters. +Maverick achieves state-of-the-art performance on the CoNLL-2012 benchmark, +training with up to 0.006x the memory resources and obtaining 170x faster +inference compared to previous state-of-the-art systems. We extensively +validate the robustness of the Maverick framework with an array of diverse +experiments, reporting improvements over prior systems in data-scarce, +long-document, and out-of-domain settings. We release our code and models for +research purposes at https://github.com/SapienzaNLP/maverick-coref. + +
+
+ comment: Accepted at main conference of ACL 2024. 15 pages +
+
+
+
+
+ + ☆ On the Problem of Text-To-Speech Model Selection for Synthetic Data + Generation in Automatic Speech Recognition + + +
+ The rapid development of neural text-to-speech (TTS) systems enabled its +usage in other areas of natural language processing such as automatic speech +recognition (ASR) or spoken language translation (SLT). Due to the large number +of different TTS architectures and their extensions, selecting which TTS +systems to use for synthetic data creation is not an easy task. We use the +comparison of five different TTS decoder architectures in the scope of +synthetic data generation to show the impact on CTC-based speech recognition +training. We compare the recognition results to computable metrics like NISQA +MOS and intelligibility, finding that there are no clear relations to the ASR +performance. We also observe that for data generation auto-regressive decoding +performs better than non-autoregressive decoding, and propose an approach to +quantify TTS generalization capabilities. + +
+
+ comment: Accepted at the SynData4GenAI 2024 workshop +
+
+
+
+
+ + ☆ Navigating Beyond Instructions: Vision-and-Language Navigation in + Obstructed Environments + + +
+ Real-world navigation often involves dealing with unexpected obstructions +such as closed doors, moved objects, and unpredictable entities. However, +mainstream Vision-and-Language Navigation (VLN) tasks typically assume +instructions perfectly align with the fixed and predefined navigation graphs +without any obstructions. This assumption overlooks potential discrepancies in +actual navigation graphs and given instructions, which can cause major failures +for both indoor and outdoor agents. To address this issue, we integrate diverse +obstructions into the R2R dataset by modifying both the navigation graphs and +visual observations, introducing an innovative dataset and task, R2R with +UNexpected Obstructions (R2R-UNO). R2R-UNO contains various types and numbers +of path obstructions to generate instruction-reality mismatches for VLN +research. Experiments on R2R-UNO reveal that state-of-the-art VLN methods +inevitably encounter significant challenges when facing such mismatches, +indicating that they rigidly follow instructions rather than navigate +adaptively. Therefore, we propose a novel method called ObVLN (Obstructed VLN), +which includes a curriculum training strategy and virtual graph construction to +help agents effectively adapt to obstructed environments. Empirical results +show that ObVLN not only maintains robust performance in unobstructed scenarios +but also achieves a substantial performance advantage with unexpected +obstructions. + +
+
+ comment: Accepted to MM 2024 +
+
+
+
+
+ + ☆ Improving Faithfulness of Large Language Models in Summarization via + Sliding Generation and Self-Consistency LREC + + +
+ Although large language models (LLMs) have demonstrated impressive performance +in various tasks, they still suffer from the factual inconsistency +problem known as hallucination. For instance, LLMs occasionally generate content +that diverges from the source article, and prefer to extract information that +appears at the beginning and end of the context, especially in long document +summarization. Inspired by these findings, we propose to improve the +faithfulness of LLMs in summarization by impelling them to process the entire +article more fairly and faithfully. We present a novel summary generation +strategy, namely SliSum, which exploits the ideas of sliding windows and +self-consistency. Specifically, SliSum divides the source article into +overlapping windows, and utilizes an LLM to generate local summaries for the +content in the windows. Finally, SliSum aggregates all local summaries using +clustering and a majority voting algorithm to produce a more faithful summary of +the entire article. Extensive experiments demonstrate that SliSum significantly +improves the faithfulness of diverse LLMs including LLaMA-2, Claude-2 and +GPT-3.5 in both short and long text summarization, while maintaining their +fluency and informativeness and without additional fine-tuning and resources. +We further conduct qualitative and quantitative studies to investigate why +SliSum works and the impact of its hyperparameters on performance. + +
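As a rough sketch of the sliding-window procedure described above (the summarizer and the clustering/majority-voting aggregation are left as caller-supplied placeholders; this is not the paper's implementation):

```python
# Rough sketch of the sliding-window idea described above; `summarize` and
# `aggregate` are placeholders, not the paper's components.
def sliding_windows(sentences, window=8, stride=4):
    """Yield overlapping windows of sentences, covering the tail as well."""
    if len(sentences) <= window:
        yield sentences
        return
    for start in range(0, len(sentences) - window + 1, stride):
        yield sentences[start:start + window]
    if (len(sentences) - window) % stride != 0:
        yield sentences[-window:]   # ensure the final sentences are covered

def slisum_like(sentences, summarize, aggregate):
    local_summaries = [summarize(" ".join(w)) for w in sliding_windows(sentences)]
    # The paper aggregates local summaries with clustering + majority voting;
    # here that step is delegated to a caller-supplied function.
    return aggregate(local_summaries)
```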
+
+ comment: Long paper accepted at LREC-COLING 2024 (oral) +
+
+
+
+
+ + ☆ QuestGen: Effectiveness of Question Generation Methods for Fact-Checking + Applications CIKM 2024 + + +
+ Verifying fact-checking claims poses a significant challenge, even for +humans. Recent approaches have demonstrated that decomposing claims into +relevant questions to gather evidence enhances the efficiency of the +fact-checking process. In this paper, we provide empirical evidence showing +that this question decomposition can be effectively automated. We demonstrate +that smaller generative models, fine-tuned for the question generation task +using data augmentation from various datasets, outperform large language models +by up to 8%. Surprisingly, in some cases, the evidence retrieved using +machine-generated questions proves to be significantly more effective for +fact-checking than that obtained from human-written questions. We also perform +manual evaluation of the decomposed questions to assess the quality of the +questions generated. + +
+
+ comment: Accepted in CIKM 2024 as a short paper 4 pages and 1 page references +
+
+
+
+
+ + ☆ MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented + Generation via Knowledge-enhanced Reranking and Noise-injected Training + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated remarkable +capabilities in processing and generating content across multiple data +modalities, including text, images, audio, and video. However, a significant +drawback of MLLMs is their reliance on static training data, leading to +outdated information and limited contextual awareness. This static nature +hampers their ability to provide accurate, up-to-date responses, particularly +in dynamic or rapidly evolving contexts. Integrating Multimodal +Retrieval-augmented Generation (Multimodal RAG) offers a promising solution, +but the system would inevitably encounter the multi-granularity noisy +correspondence (MNC) problem, which involves two types of noise: coarse-grained +(query-caption) and fine-grained (query-image). This noise hinders accurate +retrieval and generation. In this work, we propose \textbf{RagLLaVA}, a novel +framework with knowledge-enhanced reranking and noise-injected training, to +address these limitations. We instruction-tune the MLLM with a simple yet +effective instruction template to induce its ranking ability and serve it as a +reranker to precisely filter the top-k retrieved images. For generation, we +inject visual noise during training at the data and token levels to enhance the +generator's robustness. Extensive experiments are conducted on the subsets of +two datasets that require retrieving and reasoning over images to answer a +given query. Our results demonstrate the superiority of RagLLaVA in retrieving +accurately and generating robustly. Code and models are available at +https://github.com/IDEA-FinAI/RagLLaVA. + +
+
+
+
+
+ + ☆ Cost-Effective Hallucination Detection for LLMs + + +
+ Large language models (LLMs) can be prone to hallucinations - generating +unreliable outputs that are unfaithful to their inputs, external facts or +internally inconsistent. In this work, we address several challenges for +post-hoc hallucination detection in production settings. Our pipeline for +hallucination detection entails: first, producing a confidence score +representing the likelihood that a generated answer is a hallucination; second, +calibrating the score conditional on attributes of the inputs and candidate +response; finally, performing detection by thresholding the calibrated score. +We benchmark a variety of state-of-the-art scoring methods on different +datasets, encompassing question answering, fact checking, and summarization +tasks. We employ diverse LLMs to ensure a comprehensive assessment of +performance. We show that calibrating individual scoring methods is critical +for ensuring risk-aware downstream decision making. Based on findings that no +individual score performs best in all situations, we propose a multi-scoring +framework, which combines different scores and achieves top performance across +all datasets. We further introduce cost-effective multi-scoring, which can +match or even outperform more expensive detection methods, while significantly +reducing computational overhead. + +
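The pipeline described above is essentially score, calibrate, then threshold, with several scorers combined. A toy sketch under those assumptions; the choice of logistic regression as the combiner/calibrator is an illustrative assumption, not necessarily the paper's method:

```python
# Toy sketch of the score -> calibrate -> threshold pipeline described above.
# The combiner/calibrator choice is illustrative, not the paper's method.
import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_multi_score_detector(score_matrix, labels):
    """score_matrix: (n_examples, n_scorers) raw hallucination scores;
    labels: 1 if the answer was a hallucination. Logistic regression both
    combines the scorers and maps the result to a calibrated probability."""
    model = LogisticRegression()
    model.fit(score_matrix, labels)
    return model

def detect(model, scores, threshold=0.5):
    """Return the calibrated probability and the thresholded decision."""
    prob = model.predict_proba(np.asarray(scores).reshape(1, -1))[0, 1]
    return prob, prob >= threshold
```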
+
+
+
+
+ + ☆ Dancing in Chains: Reconciling Instruction Following and Faithfulness in + Language Models + + +
+ Modern language models (LMs) need to follow human instructions while being +faithful; yet, they often fail to achieve both. Here, we provide concrete +evidence of a trade-off between instruction following (i.e., follow open-ended +instructions) and faithfulness (i.e., ground responses in given context) when +training LMs with these objectives. For instance, fine-tuning LLaMA-7B on +instruction following datasets renders it less faithful. Conversely, +instruction-tuned Vicuna-7B shows degraded performance at following +instructions when further optimized on tasks that require contextual grounding. +One common remedy is multi-task learning (MTL) with data mixing, yet it remains +far from achieving a synergic outcome. We propose a simple yet effective method +that relies on Rejection Sampling for Continued Self-instruction Tuning +(ReSet), which significantly outperforms vanilla MTL. Surprisingly, we find +that less is more, as training ReSet with high-quality, yet substantially +smaller data (three-fold less) yields superior results. Our findings offer a +better understanding of objective discrepancies in alignment training of LMs. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Towards interfacing large language models with ASR systems using + confidence measures and prompting + + +
+ As large language models (LLMs) grow in parameter size and capabilities, such +as interaction through prompting, they open up new ways of interfacing with +automatic speech recognition (ASR) systems beyond rescoring n-best lists. This +work investigates post-hoc correction of ASR transcripts with LLMs. To avoid +introducing errors into likely accurate transcripts, we propose a range of +confidence-based filtering methods. Our results indicate that this can improve +the performance of less competitive ASR systems. + +
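The confidence-based filtering idea above can be pictured as: only hand low-confidence ASR hypotheses to the LLM for correction and leave confident ones untouched. A minimal sketch with an assumed threshold and a placeholder LLM call (neither comes from the paper):

```python
# Minimal sketch of confidence-based filtering before LLM post-correction.
# `correct_with_llm` is a placeholder; the threshold value is illustrative.
def postprocess(segments, correct_with_llm, conf_threshold=0.85):
    """segments: list of (text, confidence) pairs from the ASR system."""
    output = []
    for text, conf in segments:
        if conf < conf_threshold:
            # only low-confidence hypotheses are rewritten, to avoid
            # introducing errors into transcripts that are likely correct
            text = correct_with_llm(text)
        output.append(text)
    return " ".join(output)
```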
+
+ comment: 5 pages, 3 figures, 5 tables. Accepted to Interspeech 2024 +
+
+
+
+
+ + ☆ GEGA: Graph Convolutional Networks and Evidence Retrieval Guided + Attention for Enhanced Document-level Relation Extraction + + +
+ Document-level relation extraction (DocRE) aims to extract relations between +entities from unstructured document text. Compared to sentence-level relation +extraction, it requires more complex semantic understanding from a broader text +context. Currently, some studies are utilizing logical rules within evidence +sentences to enhance the performance of DocRE. However, in the data without +provided evidence sentences, researchers often obtain a list of evidence +sentences for the entire document through evidence retrieval (ER). Therefore, +DocRE suffers from two challenges: firstly, the relevance between evidence and +entity pairs is weak; secondly, there is insufficient extraction of complex +cross-relations between long-distance multi-entities. To overcome these +challenges, we propose GEGA, a novel model for DocRE. The model leverages graph +neural networks to construct multiple weight matrices, guiding attention +allocation to evidence sentences. It also employs multi-scale representation +aggregation to enhance ER. Subsequently, we integrate the most efficient +evidence information to implement both fully supervised and weakly supervised +training processes for the model. We evaluate the GEGA model on three widely +used benchmark datasets: DocRED, Re-DocRED, and Revisit-DocRED. The +experimental results indicate that our model has achieved comprehensive +improvements compared to the existing SOTA model. + +
+
+
+
+
+ + ☆ Prompting Medical Large Vision-Language Models to Diagnose Pathologies + by Visual Question Answering + + +
+ Large Vision-Language Models (LVLMs) have achieved significant success in +recent years, and they have been extended to the medical domain. Although +demonstrating satisfactory performance on medical Visual Question Answering +(VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem, +which makes them fail to diagnose complex pathologies. Moreover, they readily +fail to learn minority pathologies due to imbalanced training data. We propose +two prompting strategies for MLVLMs that reduce hallucination and improve VQA +performance. In the first strategy, we provide a detailed explanation of the +queried pathology. In the second strategy, we fine-tune a cheap, weak learner +to achieve high performance on a specific metric, and textually provide its +judgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our +methods significantly improve the diagnostic F1 score, with the highest +increase being 0.27. We also demonstrate that our prompting strategies can be +extended to general LVLM domains. Based on POPE metrics, it effectively +suppresses the false negative predictions of existing LVLMs and improves Recall +by approximately 0.07. + +
+
+
+
+
+ + ☆ Performance of Recent Large Language Models for a Low-Resourced Language + + +
+ Large Language Models (LLMs) have shown significant advances in the past +year. In addition to new versions of GPT and Llama, several other LLMs have +been introduced recently. Some of these are open models available for download +and modification. + Although multilingual large language models have been available for some +time, their performance on low-resourced languages such as Sinhala has been +poor. We evaluated four recent LLMs on their performance directly in the +Sinhala language, and by translation to and from English. We also evaluated +their fine-tunability with a small amount of fine-tuning data. Claude and GPT +4o perform well out-of-the-box and do significantly better than previous +versions. Llama and Mistral perform poorly but show some promise of improvement +with fine-tuning. + +
+
+
+
+
+ + ☆ Beyond Silent Letters: Amplifying LLMs in Emotion Recognition with Vocal + Nuances + + +
+ This paper introduces a novel approach to emotion detection in speech using +Large Language Models (LLMs). We address the limitation of LLMs in processing +audio inputs by translating speech characteristics into natural language +descriptions. Our method integrates these descriptions into text prompts, +enabling LLMs to perform multimodal emotion analysis without architectural +modifications. We evaluate our approach on two datasets: IEMOCAP and MELD, +demonstrating significant improvements in emotion recognition accuracy, +particularly for high-quality audio data. Our experiments show that +incorporating speech descriptions yields a 2 percentage point increase in +weighted F1 score on IEMOCAP (from 70.111\% to 72.596\%). We also compare +various LLM architectures and explore the effectiveness of different feature +representations. Our findings highlight the potential of this approach in +enhancing emotion detection capabilities of LLMs and underscore the importance +of audio quality in speech-based emotion recognition tasks. We'll release the +source code on Github. + +
+
+
+
+
+ + ☆ Multi-Level Querying using A Knowledge Pyramid + + +
+ This paper addresses the need for improved precision in existing +Retrieval-Augmented Generation (RAG) methods that primarily focus on enhancing +recall. We propose a multi-layer knowledge pyramid approach within the RAG +framework to achieve a better balance between precision and recall. The +knowledge pyramid consists of three layers: Ontologies, Knowledge Graphs (KGs), +and chunk-based raw text. We employ cross-layer augmentation techniques for +comprehensive knowledge coverage and dynamic updates of the Ontology schema and +instances. To ensure compactness, we utilize cross-layer filtering methods for +knowledge condensation in KGs. Our approach, named PolyRAG, follows a waterfall +model for retrieval, starting from the top of the pyramid and progressing down +until a confident answer is obtained. We introduce two benchmarks for +domain-specific knowledge retrieval, one in the academic domain and the other +in the financial domain. The effectiveness of the methods has been validated +through comprehensive experiments by outperforming 19 SOTA methods. An +encouraging observation is that the proposed method has augmented the GPT-4, +providing 395\% F1 gain by improving its performance from 0.1636 to 0.8109. + +
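The waterfall retrieval described above starts at the most structured layer of the pyramid (Ontologies, then Knowledge Graphs, then raw text chunks) and only falls through to the next layer if no confident answer is found. A schematic sketch of that control flow with assumed layer interfaces (not the paper's API):

```python
# Schematic sketch of waterfall retrieval over a knowledge pyramid, as described
# above. The layer objects and `is_confident` are assumed interfaces.
def waterfall_answer(query, layers, answer_with, is_confident):
    """layers: ordered from most structured (ontology) down to raw text chunks."""
    answer = None
    for layer in layers:
        evidence = layer.retrieve(query)
        answer = answer_with(query, evidence)
        if is_confident(answer):
            return answer        # stop at the first confident layer
    return answer                # otherwise fall through to the last layer's answer
```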
+
+
+
+
+ + ☆ Model Attribution in Machine-Generated Disinformation: A Domain + Generalization Approach with Supervised Contrastive Learning + + +
+ Model attribution for machine-generated disinformation poses a significant +challenge in understanding its origins and mitigating its spread. This task is +especially challenging because modern large language models (LLMs) produce +disinformation with human-like quality. Additionally, the diversity in +prompting methods used to generate disinformation complicates accurate source +attribution. These methods introduce domain-specific features that can mask the +fundamental characteristics of the models. In this paper, we introduce the +concept of model attribution as a domain generalization problem, where each +prompting method represents a unique domain. We argue that an effective +attribution model must be invariant to these domain-specific features. It +should also be proficient in identifying the originating models across all +scenarios, reflecting real-world detection challenges. To address this, we +introduce a novel approach based on Supervised Contrastive Learning. This +method is designed to enhance the model's robustness to variations in prompts +and focuses on distinguishing between different source LLMs. We evaluate our +model through rigorous experiments involving three common prompting methods: +``open-ended'', ``rewriting'', and ``paraphrasing'', and three advanced LLMs: +``llama 2'', ``chatgpt'', and ``vicuna''. Our results demonstrate the +effectiveness of our approach in model attribution tasks, achieving +state-of-the-art performance across diverse and unseen datasets. + +
+
+ comment: 10 pages, 2 figures, accepted at DSAA 2024 +
+
+
+
+
+ + ♻ ☆ Can Editing LLMs Inject Harm? + + +
+ Knowledge editing techniques have been increasingly adopted to efficiently +correct the false or outdated knowledge in Large Language Models (LLMs), due to +the high cost of retraining from scratch. Meanwhile, one critical but +under-explored question is: can knowledge editing be used to inject harm into +LLMs? In this paper, we propose to reformulate knowledge editing as a new type +of safety threat for LLMs, namely Editing Attack, and conduct a systematic +investigation with a newly constructed dataset EditAttack. Specifically, we +focus on two typical safety risks of Editing Attack including Misinformation +Injection and Bias Injection. For the risk of misinformation injection, we +first categorize it into commonsense misinformation injection and long-tail +misinformation injection. Then, we find that editing attacks can inject both +types of misinformation into LLMs, and the effectiveness is particularly high +for commonsense misinformation injection. For the risk of bias injection, we +discover that not only can biased sentences be injected into LLMs with high +effectiveness, but also one single biased sentence injection can cause a bias +increase in general outputs of LLMs, which are even highly irrelevant to the +injected sentence, indicating a catastrophic impact on the overall fairness of +LLMs. Then, we further illustrate the high stealthiness of editing attacks, +measured by their impact on the general knowledge and reasoning capacities of +LLMs, and show the hardness of defending editing attacks with empirical +evidence. Our discoveries demonstrate the emerging misuse risks of knowledge +editing techniques on compromising the safety alignment of LLMs. + +
+
+ comment: The first two authors contributed equally. 9 pages for main paper, 36 + pages including appendix. The code, results, dataset for this paper and more + resources are on the project website: https://llm-editing.github.io +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Understand Conversational Implicature -- A case + study with a chinese sitcom + + +
+ Understanding the non-literal meaning of an utterance is critical for large +language models (LLMs) to become human-like social communicators. In this work, +we introduce SwordsmanImp, the first Chinese multi-turn-dialogue-based dataset +aimed at conversational implicature, sourced from dialogues in the Chinese +sitcom $\textit{My Own Swordsman}$. It includes 200 carefully handcrafted +questions, all annotated on which Gricean maxims have been violated. We test +eight close-source and open-source LLMs under two tasks: a multiple-choice +question task and an implicature explanation task. Our results show that GPT-4 +attains human-level accuracy (94%) on multiple-choice questions. CausalLM +demonstrates a 78.5% accuracy following GPT-4. Other models, including GPT-3.5 +and several open-source models, demonstrate a lower accuracy ranging from 20% +to 60% on multiple-choice questions. Human raters were asked to rate the +explanation of the implicatures generated by LLMs on their reasonability, logic +and fluency. While all models generate largely fluent and self-consistent text, +their explanations score low on reasonability except for GPT-4, suggesting that +most LLMs cannot produce satisfactory explanations of the implicatures in the +conversation. Moreover, we find LLMs' performance does not vary significantly +by Gricean maxims, suggesting that LLMs do not seem to process implicatures +derived from different maxims differently. Our data and code are available at +https://github.com/sjtu-compling/llm-pragmatics. + +
+
+ comment: 14 pages, 8 tables and 5 figures +
+
+
+
+
+ + ♻ ☆ Definition generation for lexical semantic change detection ACL 2024 + + +
+ We use contextualized word definitions generated by large language models as semantic representations in the task of diachronic lexical semantic change detection (LSCD). In short, generated definitions are used as `senses', and the change score of a target word is retrieved by comparing their distributions in the two time periods under comparison. On the material of five datasets and three languages, we show that generated definitions are indeed specific and general enough to convey a signal sufficient to rank sets of words by the degree of their semantic change over time. Our approach is on par with or outperforms prior non-supervised sense-based LSCD methods. At the same time, it preserves interpretability and allows one to inspect the reasons behind a specific shift in terms of discrete definitions-as-senses. This is another step in the direction of explainable semantic change modeling.
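An illustrative sketch (not the authors' exact pipeline) of how a change score can be derived once definitions are treated as senses: cluster definition embeddings into senses and compare the two periods' sense distributions, here with the Jensen-Shannon distance.

```python
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import jensenshannon

def change_score(defs_t1: np.ndarray, defs_t2: np.ndarray, n_senses: int = 5) -> float:
    """defs_t1 / defs_t2: (n_i, d) embeddings of generated definitions per period."""
    all_defs = np.vstack([defs_t1, defs_t2])
    senses = KMeans(n_clusters=n_senses, n_init=10).fit_predict(all_defs)
    s1, s2 = senses[: len(defs_t1)], senses[len(defs_t1):]
    p = np.bincount(s1, minlength=n_senses) / len(s1)   # sense distribution, period 1
    q = np.bincount(s2, minlength=n_senses) / len(s2)   # sense distribution, period 2
    return float(jensenshannon(p, q))                    # higher = more semantic change
```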
+
+ comment: Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) has brought remarkable generative capabilities across diverse tasks. However, despite these impressive achievements, these models still have numerous security vulnerabilities, particularly when faced with jailbreak attacks. Therefore, by investigating jailbreak attacks, we can uncover hidden weaknesses in LLMs and guide the development of more robust defense mechanisms to fortify their security. In this paper, we further explore the boundary of jailbreak attacks on LLMs and propose Analyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes advantage of LLMs' growing analysis and reasoning capability and reveals their underlying vulnerabilities when facing analysis-based tasks. We conduct a detailed evaluation of ABJ across various open-source and closed-source LLMs, achieving a 94.8% Attack Success Rate (ASR) and 1.06 Attack Efficiency (AE) on GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and efficiency. Our research highlights the importance of prioritizing and enhancing the safety of LLMs to mitigate the risks of misuse. The code is publicly available at https://github.com/theshi-1128/ABJ-Attack.
+
+
+
+
+ + ♻ ☆ Investigating and Mitigating the Multimodal Hallucination Snowballing in + Large Vision-Language Models ACL 2024 + + +
+ Though advanced in understanding visual information with human languages, +Large Vision-Language Models (LVLMs) still suffer from multimodal +hallucinations. A natural concern is that during multimodal interaction, the +generated hallucinations could influence the LVLMs' subsequent generation. +Thus, we raise a question: When presented with a query relevant to the +previously generated hallucination, will LVLMs be misled and respond +incorrectly, even though the ground visual information exists? To answer this, +we propose a framework called MMHalSnowball to evaluate LVLMs' behaviors when +encountering generated hallucinations, where LVLMs are required to answer +specific visual questions within a curated hallucinatory conversation. +Crucially, our experiment shows that the performance of open-source LVLMs drops +by at least $31\%$, indicating that LVLMs are prone to accept the generated +hallucinations and make false claims that they would not have supported without +distractions. We term this phenomenon Multimodal Hallucination Snowballing. To +mitigate this, we further propose a training-free method called Residual Visual +Decoding, where we revise the output distribution of LVLMs with the one derived +from the residual visual input, providing models with direct access to the +visual information. Experiments show that our method can mitigate more than +$24\%$ of the snowballed multimodal hallucination while maintaining +capabilities. + +
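A hedged sketch of the decoding-time idea described above: the next-token distribution from the full (hallucination-contaminated) context is revised toward a distribution conditioned directly on the visual input. The specific mixing rule and the weight alpha are assumptions for illustration, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def residual_visual_decoding(logits_full: torch.Tensor,
                             logits_visual: torch.Tensor,
                             alpha: float = 0.5) -> torch.Tensor:
    """Both tensors have shape (vocab,); returns revised next-token log-probs."""
    log_p_full = F.log_softmax(logits_full, dim=-1)       # contaminated conversation
    log_p_visual = F.log_softmax(logits_visual, dim=-1)   # residual visual evidence
    # pull the distribution toward tokens that the visual input actually supports
    revised = log_p_full + alpha * (log_p_visual - log_p_full)
    return F.log_softmax(revised, dim=-1)
```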
+
+ comment: Accepted to ACL 2024 Main Conference. 21 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Neural Retrievers are Biased Towards LLM-Generated Content KDD 2024 + + +
+ Recently, the emergence of large language models (LLMs) has revolutionized the paradigm of information retrieval (IR) applications, especially in web search, by generating vast amounts of human-like texts on the Internet. As a result, IR systems in the LLM era face a new challenge: the indexed documents are no longer written only by human beings but are also automatically generated by LLMs. How these LLM-generated documents influence IR systems is a pressing and still unexplored question. In this work, we conduct a quantitative evaluation of IR models in scenarios where both human-written and LLM-generated texts are involved. Surprisingly, our findings indicate that neural retrieval models tend to rank LLM-generated documents higher. We refer to this category of biases in neural retrievers towards LLM-generated content as \textbf{source bias}. Moreover, we discover that this bias is not confined to first-stage neural retrievers, but extends to second-stage neural re-rankers. In-depth analyses from the perspective of text compression indicate that LLM-generated texts exhibit more focused semantics with less noise, making it easier for neural retrieval models to perform semantic matching. To mitigate source bias, we also propose a plug-and-play debiased constraint for the optimization objective, and experimental results show its effectiveness. Finally, we discuss the potentially severe concerns stemming from the observed source bias and hope our findings can serve as a critical wake-up call to the IR community and beyond. To facilitate future explorations of IR in the LLM era, the two newly constructed benchmarks are available at https://github.com/KID-22/Source-Bias.
+
+ comment: KDD 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Plan for Language Modeling from Unlabeled Data + + +
+ By training to predict the next token in an unlabeled corpus, large language +models learn to perform many tasks without any labeled data. However, their +next-token-prediction objective arguably limits their performance in scenarios +that require planning, such as writing a coherent article. In this paper, we +train a module for planning the future writing process via a self-supervised +learning objective. Given the textual context, this planning module learns to +predict future abstract writing actions, which correspond to centroids in a +clustered text embedding space. By conditioning on these actions, our model +extends the successful language model formula to more abstract planning in an +unsupervised way. Empirically, we demonstrate that our method improves language +modeling performance in general, particularly with respect to the text +structure. Because our framework uses a planner module that is unsupervised and +external to the language model, new planner modules can be trained at large +scale and easily be shared with the community. + +
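A minimal sketch of the self-supervised planning setup described above: abstract "writing actions" are k-means centroids of sentence embeddings, and a small planner learns to predict the next sentence's cluster from the current context. Model sizes and names are placeholders, not the paper's architecture.

```python
import torch
import torch.nn as nn
from sklearn.cluster import KMeans

def make_actions(sentence_embs: torch.Tensor, k: int = 64) -> torch.Tensor:
    """Cluster unlabeled sentence embeddings; each centroid is one abstract action."""
    km = KMeans(n_clusters=k, n_init=10).fit(sentence_embs.numpy())
    return torch.tensor(km.cluster_centers_, dtype=torch.float32)

class Planner(nn.Module):
    """Predicts the id of the next writing action from a context embedding."""
    def __init__(self, dim: int, k: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, 512), nn.ReLU(), nn.Linear(512, k))

    def forward(self, context_emb: torch.Tensor) -> torch.Tensor:
        return self.net(context_emb)   # logits over the k abstract actions

# Training signal (self-supervised): cross-entropy between the planner's logits
# and the cluster id of the sentence that actually follows the context. The
# language model is then conditioned on the predicted action.
```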
+
+ comment: Published at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Investigating the Timescales of Language Processing with EEG and + Language Models + + +
+ This study explores the temporal dynamics of language processing by examining +the alignment between word representations from a pre-trained transformer-based +language model, and EEG data. Using a Temporal Response Function (TRF) model, +we investigate how neural activity corresponds to model representations across +different layers, revealing insights into the interaction between artificial +language models and brain responses during language comprehension. Our analysis +reveals patterns in TRFs from distinct layers, highlighting varying +contributions to lexical and compositional processing. Additionally, we used +linear discriminant analysis (LDA) to isolate part-of-speech (POS) +representations, offering insights into their influence on neural responses and +the underlying mechanisms of syntactic processing. These findings underscore +EEG's utility for probing language processing dynamics with high temporal +resolution. By bridging artificial language models and neural activity, this +study advances our understanding of their interaction at fine timescales. + +
+
+ comment: Accepted at the 2024 Conference on Cognitive Computational + Neuroscience (CCN 2024) +
+
+
+
+
+ + ♻ ☆ Empirical Capacity Model for Self-Attention Neural Networks + + +
+ Large pretrained self-attention neural networks, or transformers, have been +very successful in various tasks recently. The performance of a model on a +given task depends on its ability to memorize and generalize the training data. +Large transformer models, which may have billions of parameters, in theory have +a huge capacity to memorize content. However, the current algorithms for the +optimization fall short of the theoretical capacity, and the capacity is also +highly dependent on the content. In this paper, we focus on the memory capacity +of these models obtained using common training algorithms and synthetic +training data. Based on the results, we derive an empirical capacity model +(ECM) for a generic transformer. The ECM can be used to design task-specific +transformer models with an optimal number of parameters in cases where the +target memorization capability of the task can be defined. + +
+
+ comment: Submitted to BNAIC'24, 14 pages + refs +
+
+
+
+
+ + ♻ ☆ Enhancing and Assessing Instruction-Following with Fine-Grained + Instruction Variants + + +
+ The effective alignment of Large Language Models (LLMs) with precise instructions is essential for their application in diverse real-world scenarios. Current methods focus on enhancing the diversity and complexity of training and evaluation samples, yet they fall short in accurately assessing LLMs' ability to follow similar instruction variants. We introduce an effective data augmentation technique that decomposes complex instructions into simpler sub-components, modifies these, and reconstructs them into new variants, thereby preserving the original instruction's context and complexity while introducing variability, which is critical for training and evaluating LLMs' instruction-following precision. We developed the DeMoRecon dataset using this method to both fine-tune and evaluate LLMs. Our findings show that LLMs fine-tuned with DeMoRecon gain a significant performance boost on both our benchmark and commonly used instruction-following benchmarks.
+
+
+
+
+ + ♻ ☆ Knowledge Mechanisms in Large Language Models: A Survey and Perspective + + +
+ Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial +for advancing towards trustworthy AGI. This paper reviews knowledge mechanism +analysis from a novel taxonomy including knowledge utilization and evolution. +Knowledge utilization delves into the mechanism of memorization, comprehension +and application, and creation. Knowledge evolution focuses on the dynamic +progression of knowledge within individual and group LLMs. Moreover, we discuss +what knowledge LLMs have learned, the reasons for the fragility of parametric +knowledge, and the potential dark knowledge (hypothesis) that will be +challenging to address. We hope this work can help understand knowledge in LLMs +and provide insights for future research. + +
+
+ comment: Ongoing work (v2); add Section 5: Application of Knowledge Mechanism; + revise Section 6 and 7; fix typos +
+
+
+
+
+ + ♻ ☆ Contrastive Feedback Mechanism for Simultaneous Speech Translation + + +
+ Recent advances in simultaneous speech translation (SST) focus on the +decision policies that enable the use of offline-trained ST models for +simultaneous inference. These decision policies not only control the +quality-latency trade-off in SST but also mitigate the impact of unstable +predictions on translation quality by delaying translation for more context or +discarding these predictions through stable hypothesis detection. However, +these policies often overlook the potential benefits of utilizing unstable +predictions. We introduce the contrastive feedback mechanism (CFM) for SST, a +novel method that leverages these unstable predictions as feedback to improve +translation quality. CFM guides the system to eliminate undesired model +behaviors from these predictions through a contrastive objective. The +experiments on 3 state-of-the-art decision policies across 8 languages in the +MuST-C v1.0 dataset show that CFM effectively improves the performance of SST. + +
+
+ comment: Accepted to Interspeech 2024 main conference +
+
+
+
+
+ + ♻ ☆ A Role-specific Guided Large Language Model for Ophthalmic Consultation + Based on Stylistic Differentiation + + +
+ Ophthalmology consultations are crucial for diagnosing, treating, and preventing eye diseases. However, the growing demand for consultations exceeds the availability of ophthalmologists. By leveraging large pre-trained language models, we can design effective dialogues for specific scenarios, aiding in consultations. Traditional fine-tuning strategies for question-answering tasks are impractical due to increasing model size and often ignore the patient-doctor role function during consultations. In this paper, we propose EyeDoctor, an ophthalmic medical question-answering large language model that enhances accuracy through doctor-patient role perception guidance and an augmented knowledge base with external disease information. Experimental results show that EyeDoctor achieves higher question-answering precision in ophthalmology consultations. Notably, EyeDoctor demonstrated a 7.25% improvement in Rouge-1 scores and a 10.16% improvement in F1 scores on multi-round datasets compared to the second-best model, ChatGPT, highlighting the importance of doctor-patient role differentiation and dynamic knowledge base expansion for intelligent medical consultations. EyeDoc is also available as a free web-based service, and the source code is available at https://github.com/sperfu/EyeDoc.
+
+
+
+
+ + ♻ ☆ AttackEval: How to Evaluate the Effectiveness of Jailbreak Attacking on + Large Language Models + + +
+ Ensuring the security of large language models (LLMs) against attacks has +become increasingly urgent, with jailbreak attacks representing one of the most +sophisticated threats. To deal with such risks, we introduce an innovative +framework that can help evaluate the effectiveness of jailbreak attacks on +LLMs. Unlike traditional binary evaluations focusing solely on the robustness +of LLMs, our method assesses the effectiveness of the attacking prompts +themselves. We present two distinct evaluation frameworks: a coarse-grained +evaluation and a fine-grained evaluation. Each framework uses a scoring range +from 0 to 1, offering unique perspectives and allowing for the assessment of +attack effectiveness in different scenarios. Additionally, we develop a +comprehensive ground truth dataset specifically tailored for jailbreak prompts. +This dataset serves as a crucial benchmark for our current study and provides a +foundational resource for future research. By comparing with traditional +evaluation methods, our study shows that the current results align with +baseline metrics while offering a more nuanced and fine-grained assessment. It +also helps identify potentially harmful attack prompts that might appear +harmless in traditional evaluations. Overall, our work establishes a solid +foundation for assessing a broader range of attack prompts in the area of +prompt injection. + +
+
+ comment: 34 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ FrameQuant: Flexible Low-Bit Quantization for Transformers + + +
+ Transformers are the backbone of powerful foundation models for many Vision +and Natural Language Processing tasks. But their compute and memory/storage +footprint is large, and so, serving such models is expensive often requiring +high-end hardware. To mitigate this difficulty, Post-Training Quantization +seeks to modify a pre-trained model and quantize it to eight bits or lower, +significantly boosting compute/memory/latency efficiency. Such models have been +successfully quantized to four bits with some performance loss. In this work, +we outline a simple scheme to quantize Transformer-based models to just two +bits (plus some overhead) with only a small drop in accuracy. Key to our +formulation is a concept borrowed from Harmonic analysis called Fusion Frames. +Our main finding is that the quantization must take place not in the original +weight space, but instead in the Fusion Frame representations. If quantization +is interpreted as the addition of noise, our casting of the problem allows +invoking an extensive body of known consistent recovery and noise robustness +guarantees. Further, if desired, de-noising filters are known in closed form. +We show empirically, via a variety of experiments, that (almost) two-bit +quantization for Transformer models promises sizable efficiency gains. The code +is available at https://github.com/vsingh-group/FrameQuant + +
+
+ comment: 25 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Latent Causal Probing: A Formal Perspective on Probing with Causal + Models of Data + + +
+ As language models (LMs) deliver increasing performance on a range of NLP +tasks, probing classifiers have become an indispensable technique in the effort +to better understand their inner workings. A typical setup involves (1) +defining an auxiliary task consisting of a dataset of text annotated with +labels, then (2) supervising small classifiers to predict the labels from the +representations of a pretrained LM as it processed the dataset. A high probing +accuracy is interpreted as evidence that the LM has learned to perform the +auxiliary task as an unsupervised byproduct of its original pretraining +objective. Despite the widespread usage of probes, however, the robust design +and analysis of probing experiments remains a challenge. We develop a formal +perspective on probing using structural causal models (SCM). Specifically, +given an SCM which explains the distribution of tokens observed during +training, we frame the central hypothesis as whether the LM has learned to +represent the latent variables of the SCM. Empirically, we extend a recent +study of LMs in the context of a synthetic grid-world navigation task, where +having an exact model of the underlying causal structure allows us to draw +strong inferences from the result of probing experiments. Our techniques +provide robust empirical evidence for the ability of LMs to induce the latent +concepts underlying text. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ LAPIS: Language Model-Augmented Police Investigation System + + +
+ Crime situations are a race against time. An AI-assisted criminal investigation system that provides prompt but precise legal counsel is needed by police officers. We introduce LAPIS (Language Model Augmented Police Investigation System), an automated system that assists police officers in performing rational and legal investigative actions. We constructed a finetuning dataset and retrieval knowledgebase specialized in the crime investigation legal reasoning task. We extended the dataset's quality by incorporating manual curation efforts done by a group of domain experts. We then finetuned the pretrained weights of a smaller Korean language model on the newly constructed dataset and integrated it with the crime investigation knowledgebase retrieval approach. Experimental results show LAPIS' potential in providing reliable legal guidance for police officers, even better than the proprietary GPT-4 model. Qualitative analysis of the rationales generated by LAPIS demonstrates the model's reasoning ability to leverage the premises and derive legally correct conclusions.
+
+
+
+
+ + ♻ ☆ Human vs. Machine: Behavioral Differences Between Expert Humans and + Language Models in Wargame Simulations + + +
+ To some, the advent of AI promises better decision-making and increased +military effectiveness while reducing the influence of human error and +emotions. However, there is still debate about how AI systems, especially large +language models (LLMs) that can be applied to many tasks, behave compared to +humans in high-stakes military decision-making scenarios with the potential for +increased risks towards escalation and unnecessary conflicts. To test this +potential and scrutinize the use of LLMs for such purposes, we use a new +wargame experiment with 107 national security experts designed to examine +crisis escalation in a fictional US-China scenario and compare the behavior of +human player teams to LLM-simulated team responses in separate simulations. +Here, we find that the LLM-simulated responses can be more aggressive and +significantly affected by changes in the scenario. We show a considerable +high-level agreement in the LLM and human responses and significant +quantitative and qualitative differences in individual actions and strategic +tendencies. These differences depend on intrinsic biases in LLMs regarding the +appropriate level of violence following strategic instructions, the choice of +LLM, and whether the LLMs are tasked to decide for a team of players directly +or first to simulate dialog between a team of players. When simulating the +dialog, the discussions lack quality and maintain a farcical harmony. The LLM +simulations cannot account for human player characteristics, showing no +significant difference even for extreme traits, such as "pacifist" or +"aggressive sociopath." When probing behavioral consistency across individual +moves of the simulation, the tested LLMs deviated from each other but generally +showed somewhat consistent behavior. Our results motivate policymakers to be +cautious before granting autonomy or following AI-based strategy +recommendations. + +
+
+ comment: Updated based on reviewer feedback to match AIES accepted + camera-ready version +
+
+
+
+
+ + ♻ ☆ Explainable Natural Language Processing for Corporate Sustainability + Analysis + + +
+ Sustainability commonly refers to entities, such as individuals, companies, +and institutions, having a non-detrimental (or even positive) impact on the +environment, society, and the economy. With sustainability becoming a synonym +of acceptable and legitimate behaviour, it is being increasingly demanded and +regulated. Several frameworks and standards have been proposed to measure the +sustainability impact of corporations, including United Nations' sustainable +development goals and the recently introduced global sustainability reporting +framework, amongst others. However, the concept of corporate sustainability is +complex due to the diverse and intricate nature of firm operations (i.e. +geography, size, business activities, interlinks with other stakeholders). As a +result, corporate sustainability assessments are plagued by subjectivity both +within data that reflect corporate sustainability efforts (i.e. corporate +sustainability disclosures) and the analysts evaluating them. This subjectivity +can be distilled into distinct challenges, such as incompleteness, ambiguity, +unreliability and sophistication on the data dimension, as well as limited +resources and potential bias on the analyst dimension. Put together, +subjectivity hinders effective cost attribution to entities non-compliant with +prevailing sustainability expectations, potentially rendering sustainability +efforts and its associated regulations futile. To this end, we argue that +Explainable Natural Language Processing (XNLP) can significantly enhance +corporate sustainability analysis. Specifically, linguistic understanding +algorithms (lexical, semantic, syntactic), integrated with XAI capabilities +(interpretability, explainability, faithfulness), can bridge gaps in analyst +resources and mitigate subjectivity problems within data. + +
+
+
+
+
+ + ♻ ☆ SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore ICLR 2024 + + +
+ The legality of training language models (LMs) on copyrighted or otherwise +restricted data is under intense debate. However, as we show, model performance +significantly degrades if trained only on low-risk text (e.g., out-of-copyright +books or government documents), due to its limited size and domain coverage. We +present SILO, a new language model that manages this risk-performance tradeoff +during inference. SILO is built by (1) training a parametric LM on Open License +Corpus (OLC), a new corpus we curate with 228B tokens of public domain and +permissively licensed text and (2) augmenting it with a more general and easily +modifiable nonparametric datastore (e.g., containing copyrighted books or news) +that is only queried during inference. The datastore allows use of high-risk +data without training on it, supports sentence-level data attribution, and +enables data producers to opt out from the model by removing content from the +store. These capabilities can foster compliance with data-use regulations such +as the fair use doctrine in the United States and the GDPR in the European +Union. Our experiments show that the parametric LM struggles on domains not +covered by OLC. However, access to the datastore greatly improves out of domain +performance, closing 90% of the performance gap with an LM trained on the Pile, +a more diverse corpus with mostly high-risk text. We also analyze which +nonparametric approach works best, where the remaining errors lie, and how +performance scales with datastore size. Our results suggest that it is possible +to build high quality language models while mitigating their legal risk. + +
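One natural way to combine a parametric LM with an inference-time-only datastore, as described above, is a kNN-LM-style interpolation; the sketch below illustrates that family of nonparametric approaches. The retrieval interface is stubbed and the interpolation weight is a tunable assumption, not SILO's exact configuration.

```python
import torch

def knn_lm_next_token(lm_probs: torch.Tensor,
                      knn_neighbors: list[tuple[int, float]],
                      vocab_size: int,
                      lmbda: float = 0.25,
                      temperature: float = 1.0) -> torch.Tensor:
    """lm_probs: (vocab,) probabilities from the parametric LM.
    knn_neighbors: (next_token_id, distance) pairs retrieved from the datastore."""
    knn_probs = torch.zeros(vocab_size)
    if knn_neighbors:
        ids = torch.tensor([t for t, _ in knn_neighbors])
        dists = torch.tensor([d for _, d in knn_neighbors])
        weights = torch.softmax(-dists / temperature, dim=0)   # closer = heavier
        knn_probs.index_add_(0, ids, weights)
    return (1 - lmbda) * lm_probs + lmbda * knn_probs           # final distribution
```

Because the datastore is only queried at inference, removing a document from it immediately removes its influence, which is what enables the opt-out behavior the abstract mentions.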
+
+ comment: 29 pages; 7 figures. Published as a conference paper at ICLR 2024 + (spotlight). Code, models, and data available at + https://github.com/kernelmachine/silo-lm +
+
+
+
+
+ + ♻ ☆ A2SF: Accumulative Attention Scoring with Forgetting Factor for Token + Pruning in Transformer Decoder + + +
+ Recently, large language models (LLMs) based on transformers have been facing memory bottleneck issues due to the KV cache, especially in long sequence handling. Previous research proposed KV cache compression techniques that identify insignificant tokens based on Accumulative Attention Scores and remove their items from the KV cache, noting that only a few tokens play an important role in attention operations. However, we have observed that the existing Accumulative Attention Score is not suitable for the transformer decoder structure. In the decoder model, the number of times the Attention Score accumulates varies depending on the order of token appearance due to the effect of masking, causing an uneven comparison between tokens. To solve this, we propose the Accumulative Attention Score with Forgetting Factor (A2SF) technique, which introduces a Forgetting Factor into the Attention Score accumulation process. A2SF applies a penalty to the past Attention Scores generated from old tokens by repeatedly multiplying the Forgetting Factor into the Attention Score over time. Therefore, older tokens receive a larger penalty, providing fairness among tokens of different ages. Through this fair comparison among tokens, we can more effectively select important tokens. We have verified the accuracy improvement of A2SF in the OPT and LLaMA models; A2SF improves the accuracy of LLaMA 2 by up to 7.8% and 5.1% in the 1-shot and 0-shot settings, respectively.
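A hedged sketch of the accumulation-with-forgetting idea: older contributions are decayed before each new attention row is added, so early tokens are not favored simply for having been attended more often. The eviction policy shown is a simplified illustration, not the paper's full pipeline.

```python
import torch

def a2sf_update(acc_scores: torch.Tensor,
                new_attn_row: torch.Tensor,
                forgetting: float = 0.95) -> torch.Tensor:
    """acc_scores: (seq,) accumulated importance per cached token;
    new_attn_row: (seq,) attention weights of the newest query over the cache."""
    return forgetting * acc_scores + new_attn_row   # decay old evidence, add new

def tokens_to_evict(acc_scores: torch.Tensor, cache_budget: int) -> torch.Tensor:
    """Indices of the least important tokens once the KV cache exceeds its budget."""
    n_evict = max(0, acc_scores.numel() - cache_budget)
    if n_evict == 0:
        return torch.empty(0, dtype=torch.long)
    return torch.topk(acc_scores, n_evict, largest=False).indices
```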
+
+ comment: 11 pages(9 pages + reference 2 pages), 6 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 122 + +
+
+
+ + ☆ Generalized Out-of-Distribution Detection and Beyond in Vision Language + Model Era: A Survey + + +
+ Detecting out-of-distribution (OOD) samples is crucial for ensuring the +safety of machine learning systems and has shaped the field of OOD detection. +Meanwhile, several other problems are closely related to OOD detection, +including anomaly detection (AD), novelty detection (ND), open set recognition +(OSR), and outlier detection (OD). To unify these problems, a generalized OOD +detection framework was proposed, taxonomically categorizing these five +problems. However, Vision Language Models (VLMs) such as CLIP have +significantly changed the paradigm and blurred the boundaries between these +fields, again confusing researchers. In this survey, we first present a +generalized OOD detection v2, encapsulating the evolution of AD, ND, OSR, OOD +detection, and OD in the VLM era. Our framework reveals that, with some field +inactivity and integration, the demanding challenges have become OOD detection +and AD. In addition, we also highlight the significant shift in the definition, +problem settings, and benchmarks; we thus feature a comprehensive review of the +methodology for OOD detection, including the discussion over other related +tasks to clarify their relationship to OOD detection. Finally, we explore the +advancements in the emerging Large Vision Language Model (LVLM) era, such as +GPT-4V. We conclude this survey with open challenges and future directions. + +
+
+ comment: survey paper. We welcome questions, issues, and paper requests via + https://github.com/AtsuMiyai/Awesome-OOD-VLM +
+
+
+
+
+ + ☆ Vision-Language Model Based Handwriting Verification + + +
+ Handwriting verification is a critical task in document forensics. Deep learning based approaches often face skepticism from forensic document examiners due to their lack of explainability and reliance on extensive training data and handcrafted features. This paper explores using Vision Language Models (VLMs), such as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By leveraging their Visual Question Answering capabilities and 0-shot Chain-of-Thought (CoT) reasoning, our goal is to provide clear, human-understandable explanations for model decisions. Our experiments on the CEDAR handwriting dataset demonstrate that VLMs offer enhanced interpretability, reduce the need for large training datasets, and adapt better to diverse handwriting styles. However, results show that the CNN-based ResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach with GPT-4o (accuracy: 70%) and the supervised fine-tuned PaliGemma (accuracy: 71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings highlight the potential of VLMs in generating human-interpretable decisions while underscoring the need for further advancements to match the performance of specialized deep learning models.
+
+ comment: 4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine + Vision and Image Processing (IMVIP) Conference +
+
+
+
+
+ + ☆ The Llama 3 Herd of Models + + +
+ Modern artificial intelligence (AI) systems are powered by foundation models. +This paper presents a new set of foundation models, called Llama 3. It is a +herd of language models that natively support multilinguality, coding, +reasoning, and tool usage. Our largest model is a dense Transformer with 405B +parameters and a context window of up to 128K tokens. This paper presents an +extensive empirical evaluation of Llama 3. We find that Llama 3 delivers +comparable quality to leading language models such as GPT-4 on a plethora of +tasks. We publicly release Llama 3, including pre-trained and post-trained +versions of the 405B parameter language model and our Llama Guard 3 model for +input and output safety. The paper also presents the results of experiments in +which we integrate image, video, and speech capabilities into Llama 3 via a +compositional approach. We observe this approach performs competitively with +the state-of-the-art on image, video, and speech recognition tasks. The +resulting models are not yet being broadly released as they are still under +development. + +
+
+
+
+
+ + ☆ RainMamba: Enhanced Locality Learning with State Space Models for Video + Deraining + + +
+ Outdoor vision systems are frequently contaminated by rain streaks and raindrops, which significantly degrade the performance of visual tasks and multimedia applications. The nature of videos provides redundant temporal cues for rain removal with higher stability. Traditional video deraining methods heavily rely on optical flow estimation and kernel-based approaches, which have a limited receptive field, while transformer architectures, though enabling long-term dependencies, bring about a significant increase in computational complexity. Recently, the linear-complexity operator of state space models (SSMs) has, by contrast, facilitated efficient long-term temporal modeling, which is crucial for removing rain streaks and raindrops in videos. Unexpectedly, its uni-dimensional sequential processing of videos destroys the local correlations across the spatio-temporal dimension by distancing adjacent pixels. To address this, we present an improved SSM-based video deraining network (RainMamba) with a novel Hilbert scanning mechanism to better capture sequence-level local information. We also introduce a difference-guided dynamic contrastive locality learning strategy to enhance the patch-level self-similarity learning ability of the proposed network. Extensive experiments on four synthesized video deraining datasets and real-world rainy videos demonstrate the superiority of our network in the removal of rain streaks and raindrops.
+
+ comment: ACM Multimedia 2024 +
+
+
+
+
+ + ☆ Paying More Attention to Image: A Training-Free Method for Alleviating + Hallucination in LVLMs + + +
+ Existing Large Vision-Language Models (LVLMs) primarily align image features from a vision encoder with Large Language Models (LLMs) to leverage their superior text generation capabilities. However, the scale disparity between the vision encoder and the language model may lead to the LLM assuming a predominant role in multi-modal comprehension. This imbalance in LVLMs may result in instances of hallucination. Concretely, LVLMs may generate consistent descriptions with or without visual input, indicating that certain outputs are influenced solely by the context text. We refer to this phenomenon as "text inertia." To counteract this issue, we introduce a training-free algorithm to find an equilibrium point between image comprehension and language inference. Specifically, we adaptively adjust and amplify the attention weights assigned to image tokens, thereby granting greater prominence to visual elements. Meanwhile, we subtract the logits of multi-modal inputs from those of pure text input, which helps keep LVLMs from being biased towards the LLM. By enhancing image tokens and reducing the stubborn output of the LLM, we let the LVLM pay more attention to images, alleviating text inertia and reducing hallucination in LVLMs. Our extensive experiments show that this method substantially reduces the frequency of hallucinatory outputs in various LVLMs in terms of different metrics. The project page is available at https://lalbj.github.io/projects/PAI/.
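A hedged sketch of the two training-free ingredients described above: scaling up attention on image-token positions before the softmax, and contrasting multimodal logits against text-only logits at decoding time. The scaling factors and the exact form of the logit contrast are illustrative assumptions, not the paper's precise equations.

```python
import torch

def amplify_image_attention(attn_logits: torch.Tensor,
                            image_token_mask: torch.Tensor,
                            alpha: float = 0.5) -> torch.Tensor:
    """attn_logits: (..., seq) pre-softmax attention scores;
    image_token_mask: (seq,) bool marking image-token positions."""
    boost = torch.where(image_token_mask,
                        attn_logits.abs() * alpha,
                        torch.zeros_like(attn_logits))
    return attn_logits + boost          # image positions get relatively more weight

def contrast_with_language_prior(logits_multimodal: torch.Tensor,
                                 logits_text_only: torch.Tensor,
                                 gamma: float = 1.1) -> torch.Tensor:
    """One common form of logit contrast: tokens supported only by the text
    context (the language prior) lose probability mass."""
    return gamma * logits_multimodal - (gamma - 1.0) * logits_text_only
```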
+
+
+
+
+ + ☆ Learning Video Context as Interleaved Multimodal Sequences ECCV 2024 + + +
+ Narrative videos, such as movies, pose significant challenges in video +understanding due to their rich contexts (characters, dialogues, storylines) +and diverse demands (identify who, relationship, and reason). In this paper, we +introduce MovieSeq, a multimodal language model developed to address the wide +range of challenges in understanding video contexts. Our core idea is to +represent videos as interleaved multimodal sequences (including images, plots, +videos, and subtitles), either by linking external knowledge databases or using +offline models (such as whisper for subtitles). Through instruction-tuning, +this approach empowers the language model to interact with videos using +interleaved multimodal instructions. For example, instead of solely relying on +video as input, we jointly provide character photos alongside their names and +dialogues, allowing the model to associate these elements and generate more +comprehensive responses. To demonstrate its effectiveness, we validate +MovieSeq's performance on six datasets (LVU, MAD, Movienet, CMD, TVC, MovieQA) +across five settings (video classification, audio description, video-text +retrieval, video captioning, and video question-answering). The code will be +public at https://github.com/showlab/MovieSeq. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Contrastive Factor Analysis + + +
+ Factor analysis, often regarded as a Bayesian variant of matrix +factorization, offers superior capabilities in capturing uncertainty, modeling +complex dependencies, and ensuring robustness. As the deep learning era +arrives, factor analysis is receiving less and less attention due to their +limited expressive ability. On the contrary, contrastive learning has emerged +as a potent technique with demonstrated efficacy in unsupervised +representational learning. While the two methods are different paradigms, +recent theoretical analysis has revealed the mathematical equivalence between +contrastive learning and matrix factorization, providing a potential +possibility for factor analysis combined with contrastive learning. Motivated +by the interconnectedness of contrastive learning, matrix factorization, and +factor analysis, this paper introduces a novel Contrastive Factor Analysis +framework, aiming to leverage factor analysis's advantageous properties within +the realm of contrastive learning. To further leverage the interpretability +properties of non-negative factor analysis, which can learn disentangled +representations, contrastive factor analysis is extended to a non-negative +version. Finally, extensive experimental validation showcases the efficacy of +the proposed contrastive (non-negative) factor analysis methodology across +multiple key properties, including expressiveness, robustness, +interpretability, and accurate uncertainty estimation. + +
+
+
+
+
+ + ☆ A Federated Learning-Friendly Approach for Parameter-Efficient + Fine-Tuning of SAM in 3D Segmentation + + +
+ Adapting foundation models for medical image analysis requires finetuning +them on a considerable amount of data because of extreme distribution shifts +between natural (source) data used for pretraining and medical (target) data. +However, collecting task-specific medical data for such finetuning at a central +location raises many privacy concerns. Although Federated learning (FL) +provides an effective means for training on private decentralized data, +communication costs in federating large foundation models can quickly become a +significant bottleneck, impacting the solution's scalability. In this work, we +address this problem of efficient communication while ensuring effective +learning in FL by combining the strengths of Parameter-Efficient Fine-tuning +(PEFT) with FL. Specifically, we study plug-and-play Low-Rank Adapters (LoRA) +in a federated manner to adapt the Segment Anything Model (SAM) for 3D medical +image segmentation. Unlike prior works that utilize LoRA and finetune the +entire decoder, we critically analyze the contribution of each granular +component of SAM on finetuning performance. Thus, we identify specific layers +to be federated that are very efficient in terms of communication cost while +producing on-par accuracy. Our experiments show that retaining the parameters +of the SAM model (including most of the decoder) in their original state during +adaptation is beneficial because fine-tuning on small datasets tends to distort +the inherent capabilities of the underlying foundation model. On Fed-KiTS, our +approach decreases communication cost (~48x) compared to full fine-tuning while +increasing performance (~6% Dice score) in 3D segmentation tasks. Our approach +performs similar to SAMed while achieving ~2.8x reduction in communication and +parameters to be finetuned. We further validate our approach with experiments +on Fed-IXI and Prostate MRI datasets. + +
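A sketch of the communication pattern implied above: only the LoRA adapter weights (a tiny fraction of SAM's parameters) travel between clients and the server and are federated-averaged, while the frozen backbone never leaves the clients. Layer selection, rank, and shapes are illustrative, not the paper's exact configuration.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Wraps a frozen linear layer with a trainable low-rank update."""
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                      # keep SAM weights frozen
        self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))

    def forward(self, x):
        return self.base(x) + x @ self.lora_a.t() @ self.lora_b.t()

def fedavg_lora(client_states: list[dict], weights: list[float]) -> dict:
    """Federated averaging over only the LoRA tensors returned by each client."""
    total = sum(weights)
    lora_keys = [k for k in client_states[0] if "lora_" in k]
    return {k: sum(w * s[k] for w, s in zip(weights, client_states)) / total
            for k in lora_keys}
```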
+
+
+
+
+ + ☆ Leveraging Self-Supervised Learning for Fetal Cardiac Planes + Classification using Ultrasound Scan Videos MICCAI 2023 + + +
+ Self-supervised learning (SSL) methods are popular since they can address +situations with limited annotated data by directly utilising the underlying +data distribution. However, the adoption of such methods is not explored enough +in ultrasound (US) imaging, especially for fetal assessment. We investigate the +potential of dual-encoder SSL in utilizing unlabelled US video data to improve +the performance of challenging downstream Standard Fetal Cardiac Planes (SFCP) +classification using limited labelled 2D US images. We study 7 SSL approaches +based on reconstruction, contrastive loss, distillation, and information theory +and evaluate them extensively on a large private US dataset. Our observations +and findings are consolidated from more than 500 downstream training +experiments under different settings. Our primary observation shows that for +SSL training, the variance of the dataset is more crucial than its size because +it allows the model to learn generalisable representations, which improve the +performance of downstream tasks. Overall, the BarlowTwins method shows robust +performance, irrespective of the training settings and data variations, when +used as an initialisation for downstream tasks. Notably, full fine-tuning with +1% of labelled data outperforms ImageNet initialisation by 12% in F1-score and +outperforms other SSL initialisations by at least 4% in F1-score, thus making +it a promising candidate for transfer learning from US video to image data. + +
+
+ comment: Simplifying Medical Ultrasound: 4th International Workshop, ASMUS + 2023, Held in Conjunction with MICCAI 2023, Vancouver, BC, Canada, October 8, + 2023, Proceedings +
+
+
+
+
+ + ☆ Unifying Event-based Flow, Stereo and Depth Estimation via Feature + Similarity Matching + + +
+ As an emerging vision sensor, the event camera has gained popularity in +various vision tasks such as optical flow estimation, stereo matching, and +depth estimation due to its high-speed, sparse, and asynchronous event streams. +Unlike traditional approaches that use specialized architectures for each +specific task, we propose a unified framework, EventMatch, that reformulates +these tasks as an event-based dense correspondence matching problem, allowing +them to be solved with a single model by directly comparing feature +similarities. By utilizing a shared feature similarities module, which +integrates knowledge from other event flows via temporal or spatial +interactions, and distinct task heads, our network can concurrently perform +optical flow estimation from temporal inputs (e.g., two segments of event +streams in the temporal domain) and stereo matching from spatial inputs (e.g., +two segments of event streams from different viewpoints in the spatial domain). +Moreover, we further demonstrate that our unified model inherently supports +cross-task transfer since the architecture and parameters are shared across +tasks. Without the need for retraining on each task, our model can effectively +handle both optical flow and disparity estimation simultaneously. The +experiment conducted on the DSEC benchmark demonstrates that our model exhibits +superior performance in both optical flow and disparity estimation tasks, +outperforming existing state-of-the-art methods. Our unified approach not only +advances event-based models but also opens new possibilities for cross-task +transfer and inter-task fusion in both spatial and temporal dimensions. Our +code will be available later. + +
+
+
+
+
+ + ☆ Detecting, Explaining, and Mitigating Memorization in Diffusion Models ICLR 2024 + + +
+ Recent breakthroughs in diffusion models have exhibited exceptional image-generation capabilities. However, studies show that some outputs are merely replications of training data. Such replications present potential legal challenges for model owners, especially when the generated content contains proprietary information. In this work, we introduce a straightforward yet effective method for detecting memorized prompts by inspecting the magnitude of text-conditional predictions. Our proposed method integrates seamlessly without disrupting sampling algorithms, and delivers high accuracy even at the first generation step, with a single generation per prompt. Building on our detection strategy, we unveil an explainable approach that shows the contribution of individual words or tokens to memorization. This offers an interactive medium for users to adjust their prompts. Moreover, we propose two strategies to mitigate memorization by leveraging the magnitude of text-conditional predictions: minimization during inference and filtering during training. These proposed strategies effectively counteract memorization while maintaining high generation quality. Code is available at https://github.com/YuxinWenRick/diffusion_memorization.
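A hedged sketch of the detection signal described above: for memorized prompts, the magnitude of the text-conditional noise prediction (relative to the unconditional one) tends to be unusually large, already at the first denoising step. The code assumes a diffusers-style UNet and scheduler; the decision threshold is something you would calibrate, not a value from the paper.

```python
import torch

@torch.no_grad()
def memorization_score(unet, scheduler, text_emb, uncond_emb,
                       latent_shape=(1, 4, 64, 64)) -> float:
    """Higher scores suggest the prompt is more likely to be memorized."""
    scheduler.set_timesteps(50)
    latents = torch.randn(latent_shape)
    t = scheduler.timesteps[0]                               # first denoising step only
    eps_cond = unet(latents, t, encoder_hidden_states=text_emb).sample
    eps_uncond = unet(latents, t, encoder_hidden_states=uncond_emb).sample
    return (eps_cond - eps_uncond).flatten(1).norm(dim=1).mean().item()

# usage sketch: flag the prompt if the score exceeds a threshold calibrated on
# prompts that are known not to be memorized.
```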
+
+ comment: 16 pages, 9 figures, accepted as oral presentation in ICLR 2024 +
+
+
+
+
+ + ☆ Tora: Trajectory-oriented Diffusion Transformer for Video Generation + + +
+ Recent advancements in Diffusion Transformer (DiT) have demonstrated +remarkable proficiency in producing high-quality video content. Nonetheless, +the potential of transformer-based diffusion models for effectively generating +videos with controllable motion remains an area of limited exploration. This +paper introduces Tora, the first trajectory-oriented DiT framework that +integrates textual, visual, and trajectory conditions concurrently for video +generation. Specifically, Tora consists of a Trajectory Extractor~(TE), a +Spatial-Temporal DiT, and a Motion-guidance Fuser~(MGF). The TE encodes +arbitrary trajectories into hierarchical spacetime motion patches with a 3D +video compression network. The MGF integrates the motion patches into the DiT +blocks to generate consistent videos following trajectories. Our design aligns +seamlessly with DiT's scalability, allowing precise control of video content's +dynamics with diverse durations, aspect ratios, and resolutions. Extensive +experiments demonstrate Tora's excellence in achieving high motion fidelity, +while also meticulously simulating the movement of the physical world. Page can +be found at https://ali-videoai.github.io/tora_video. + +
+
+
+
+
+ + ☆ Hyper-parameter tuning for text guided image editing + + +
+ The test-time finetuning text-guided image editing method Forgedit is capable of tackling general and complex image editing problems given only the input image itself and the target text prompt. During the finetuning stage, using the same set of finetuning hyper-parameters every time for every given image, Forgedit remembers and understands the input image in 30 seconds. During the editing stage, the workflow of Forgedit might seem complicated. However, the editing process of Forgedit is in fact no more complex than the previous SOTA, Imagic, yet it completely solves the overfitting problem of Imagic. In this paper, we elaborate on the workflow of the Forgedit editing stage with examples. We show how to tune the hyper-parameters in an efficient way to obtain ideal editing results.
+
+ comment: Codes are available at https://github.com/witcherofresearch/Forgedit/ +
+
+
+
+
+ + ☆ Explainable Artificial Intelligence for Quantifying Interfering and + High-Risk Behaviors in Autism Spectrum Disorder in a Real-World Classroom + Environment Using Privacy-Preserving Video Analysis + + +
+ Rapid identification and accurate documentation of interfering and high-risk +behaviors in ASD, such as aggression, self-injury, disruption, and restricted +repetitive behaviors, are important in daily classroom environments for +tracking intervention effectiveness and allocating appropriate resources to +manage care needs. However, having a staff dedicated solely to observing is +costly and uncommon in most educational settings. Recently, multiple research +studies have explored developing automated, continuous, and objective tools +using machine learning models to quantify behaviors in ASD. However, the +majority of the work was conducted under a controlled environment and has not +been validated for real-world conditions. In this work, we demonstrate that the +latest advances in video-based group activity recognition techniques can +quantify behaviors in ASD in real-world activities in classroom environments +while preserving privacy. Our explainable model could detect the episode of +problem behaviors with a 77% F1-score and capture distinctive behavior features +in different types of behaviors in ASD. To the best of our knowledge, this is +the first work that shows the promise of objectively quantifying behaviors in +ASD in a real-world environment, which is an important step toward the +development of a practical tool that can ease the burden of data collection for +classroom staff. + +
+
+
+
+
+ + ☆ Dynamic Object Queries for Transformer-based Incremental Object + Detection + + +
+ Incremental object detection (IOD) aims to sequentially learn new classes, +while maintaining the capability to locate and identify old ones. As the +training data arrives with annotations only with new classes, IOD suffers from +catastrophic forgetting. Prior methodologies mainly tackle the forgetting issue +through knowledge distillation and exemplar replay, ignoring the conflict +between limited model capacity and increasing knowledge. In this paper, we +explore \textit{dynamic object queries} for incremental object detection built +on Transformer architecture. We propose the \textbf{Dy}namic object +\textbf{Q}uery-based \textbf{DE}tection \textbf{TR}ansformer (DyQ-DETR), which +incrementally expands the model representation ability to achieve +stability-plasticity tradeoff. First, a new set of learnable object queries are +fed into the decoder to represent new classes. These new object queries are +aggregated with those from previous phases to adapt both old and new knowledge +well. Second, we propose the isolated bipartite matching for object queries in +different phases, based on disentangled self-attention. The interaction among +the object queries at different phases is eliminated to reduce inter-class +confusion. Thanks to the separate supervision and computation over object +queries, we further present the risk-balanced partial calibration for effective +exemplar replay. Extensive experiments demonstrate that DyQ-DETR significantly +surpasses the state-of-the-art methods, with limited parameter overhead. Code +will be made publicly available. + +
+
+
+
+
+ + ☆ Expressive Whole-Body 3D Gaussian Avatar ECCV 2024 + + +
+ Facial expressions and hand motions are necessary to express our emotions and interact with the world. Nevertheless, most 3D human avatars modeled from a casually captured video only support body motions without facial expressions and hand motions. In this work, we present ExAvatar, an expressive whole-body 3D human avatar learned from a short monocular video. We design ExAvatar as a combination of the whole-body parametric mesh model (SMPL-X) and 3D Gaussian Splatting (3DGS). The main challenges are 1) a limited diversity of facial expressions and poses in the video and 2) the absence of 3D observations, such as 3D scans and RGBD images. The limited diversity in the video makes animations with novel facial expressions and poses non-trivial. In addition, the absence of 3D observations could cause significant ambiguity in human parts that are not observed in the video, which can result in noticeable artifacts under novel motions. To address these challenges, we introduce a hybrid representation of the mesh and 3D Gaussians. Our hybrid representation treats each 3D Gaussian as a vertex on the surface with pre-defined connectivity information (i.e., triangle faces) between them, following the mesh topology of SMPL-X. This makes ExAvatar animatable with novel facial expressions, driven by the facial expression space of SMPL-X. In addition, by using connectivity-based regularizers, we significantly reduce artifacts in novel facial expressions and poses.
+
+ comment: Accepted to ECCV 2024. Project page: + https://mks0601.github.io/ExAvatar/ +
+
+
+
+
+ + ☆ Synthetic Simplicity: Unveiling Bias in Medical Data Augmentation + + +
+ Synthetic data is becoming increasingly integral in data-scarce fields such +as medical imaging, serving as a substitute for real data. However, its +inherent statistical characteristics can significantly impact downstream tasks, +potentially compromising deployment performance. In this study, we empirically +investigate this issue and uncover a critical phenomenon: downstream neural +networks often exploit spurious distinctions between real and synthetic data +when there is a strong correlation between the data source and the task label. +This exploitation manifests as \textit{simplicity bias}, where models overly +rely on superficial features rather than genuine task-related complexities. +Through principled experiments, we demonstrate that the source of data (real +vs.\ synthetic) can introduce spurious correlating factors leading to poor +performance during deployment when the correlation is absent. We first +demonstrate this vulnerability on a digit classification task, where the model +spuriously utilizes the source of data instead of the digit to provide an +inference. We provide further evidence of this phenomenon in a medical imaging +problem related to cardiac view classification in echocardiograms, particularly +distinguishing between 2-chamber and 4-chamber views. Given the increasing role +of utilizing synthetic datasets, we hope that our experiments serve as +effective guidelines for the utilization of synthetic datasets in model +training. + +
+
+
+
+
+ + ☆ An Explainable Vision Transformer with Transfer Learning Combined with + Support Vector Machine Based Efficient Drought Stress Identification + + +
+ Early detection of drought stress is critical for taking timely measures to +reduce crop loss before the drought impact becomes irreversible. The subtle +phenotypical and physiological changes in response to drought stress are +captured by non-invasive imaging techniques, and these imaging data serve as a +valuable resource for machine learning methods to identify drought stress. +While convolutional neural networks (CNNs) are in wide use, vision transformers +(ViTs) present a promising alternative in capturing long-range dependencies and +intricate spatial relationships, thereby enhancing the detection of subtle +indicators of drought stress. We propose an explainable deep learning pipeline +that leverages the power of ViTs for drought stress detection in potato crops +using aerial imagery. We applied two distinct approaches: a synergistic +combination of ViT and support vector machine (SVM), where ViT extracts +intricate spatial features from aerial images and SVM classifies the crops as +stressed or healthy; and an end-to-end approach using a dedicated classification +layer within ViT to directly detect drought stress. Our key findings explain +the ViT model's decision-making process by visualizing attention maps. These +maps highlight the specific spatial features within the aerial images that the +ViT model focuses on as the drought stress signature. Our findings demonstrate +that the proposed methods not only achieve high accuracy in drought stress +identification but also shed light on the diverse subtle plant features +associated with drought stress. This offers a robust and interpretable solution +for drought stress monitoring, helping farmers make informed decisions for +improved crop management. + +
+
+ comment: 30 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ MTA-CLIP: Language-Guided Semantic Segmentation with Mask-Text Alignment ECCV 2024 + + +
+ Recent approaches have shown that large-scale vision-language models such as +CLIP can improve semantic segmentation performance. These methods typically aim +for pixel-level vision-language alignment, but often rely on low-resolution +image features from CLIP, resulting in class ambiguities along boundaries. +Moreover, the global scene representations in CLIP text embeddings do not +directly correlate with the local and detailed pixel-level features, making +meaningful alignment more difficult. To address these limitations, we introduce +MTA-CLIP, a novel framework employing mask-level vision-language alignment. +Specifically, we first propose the Mask-Text Decoder, which enhances the mask +representations using rich textual data with the CLIP language model. +Subsequently, it aligns mask representations with text embeddings using +Mask-to-Text Contrastive Learning. Furthermore, we introduce MaskText Prompt +Learning, utilizing multiple context-specific prompts for text embeddings to +capture diverse class representations across masks. Overall, MTA-CLIP achieves +state-of-the-art performance, surpassing prior works by an average of 2.8% and 1.3% on +the standard benchmark datasets ADE20k and Cityscapes, respectively. + +
+
+ comment: accepted at ECCV 2024 +
+
+
+
+
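A compact way to picture the mask-to-text contrastive learning described in the MTA-CLIP abstract is an InfoNCE-style loss between mask embeddings and class-text embeddings. The sketch below is only a hedged illustration under assumed shapes and temperature, not the paper's implementation:

```python
import torch
import torch.nn.functional as F

def mask_text_contrastive(mask_emb, text_emb, labels, tau=0.07):
    """mask_emb: (M, D) mask-query embeddings, text_emb: (C, D) class-text embeddings,
    labels: (M,) ground-truth class index for each mask."""
    mask_emb = F.normalize(mask_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = mask_emb @ text_emb.t() / tau      # (M, C) similarity of every mask to every class text
    return F.cross_entropy(logits, labels)      # pull each mask toward its own class text

loss = mask_text_contrastive(torch.randn(10, 512), torch.randn(150, 512),
                             torch.randint(0, 150, (10,)))
```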
+ + ☆ Spatial Transformer Network YOLO Model for Agricultural Object Detection + + +
+ Object detection plays a crucial role in the field of computer vision by +autonomously identifying and locating objects of interest. The You Only Look +Once (YOLO) model is an effective single-shot detector. However, YOLO faces +challenges in cluttered or partially occluded scenes and can struggle with +small, low-contrast objects. We propose a new method that integrates spatial +transformer networks (STNs) into YOLO to improve performance. The proposed +STN-YOLO aims to enhance the model's effectiveness by focusing on important +areas of the image and improving the spatial invariance of the model before the +detection process. Our proposed method improves object detection performance +both qualitatively and quantitatively. We explore the impact of different +localization networks within the STN module as well as the robustness of the +model across different spatial transformations. We apply STN-YOLO to +benchmark datasets for agricultural object detection as well as to a new dataset +from a state-of-the-art plant phenotyping greenhouse facility. Our code and +dataset are publicly available. + +
+
+ comment: 7 pages, 5 figures, submitted for review +
+
+
+
+
+ + ☆ MSA2Net: Multi-scale Adaptive Attention-guided Network for Medical Image + Segmentation BMVC 2024 + + +
+ Medical image segmentation involves identifying and separating object +instances in a medical image to delineate various tissues and structures, a +task complicated by the significant variations in size, shape, and density of +these features. Convolutional neural networks (CNNs) have traditionally been +used for this task but have limitations in capturing long-range dependencies. +Transformers, equipped with self-attention mechanisms, aim to address this +problem. However, in medical image segmentation it is beneficial to merge both +local and global features to effectively integrate feature maps across various +scales, capturing both detailed features and broader semantic elements for +dealing with variations in structures. In this paper, we introduce MSA2Net, a +new deep segmentation framework featuring an expedient design of +skip-connections. These connections facilitate feature fusion by dynamically +weighting and combining coarse-grained encoder features with fine-grained +decoder feature maps. Specifically, we propose a Multi-Scale Adaptive Spatial +Attention Gate (MASAG), which dynamically adjusts the receptive field (Local +and Global contextual information) to ensure that spatially relevant features +are selectively highlighted while minimizing background distractions. Extensive +evaluations on dermatology and radiology datasets demonstrate that +our MSA2Net outperforms state-of-the-art (SOTA) works or matches their +performance. The source code is publicly available at +https://github.com/xmindflow/MSA-2Net. + +
+
+ comment: Accepted at BMVC 2024. Supplementary materials included at the end of + the main paper (3 pages, 2 figures, 1 table) +
+
+
+
+
+ + ☆ Quality Control for Radiology Report Generation Models via Auxiliary + Auditing Components MICCAI + + +
+ Automation of medical image interpretation could alleviate bottlenecks in +diagnostic workflows, and has become of particular interest in recent years due +to advancements in natural language processing. Great strides have been made +towards automated radiology report generation via AI, yet ensuring clinical +accuracy in generated reports is a significant challenge, hindering deployment +of such methods in clinical practice. In this work we propose a quality control +framework for assessing the reliability of AI-generated radiology reports with +respect to semantics of diagnostic importance using modular auxiliary auditing +components (AC). Evaluating our pipeline on the MIMIC-CXR dataset, our findings +show that incorporating ACs in the form of disease-classifiers can enable +auditing that identifies more reliable reports, resulting in higher F1 scores +compared to unfiltered generated reports. Additionally, leveraging the +confidence of the AC labels further improves the audit's effectiveness. + +
+
+ comment: Accepted to MICCAI UNSURE Workshop +
+
+
+
+
+ + ☆ RoadFormer+: Delivering RGB-X Scene Parsing through Scale-Aware + Information Decoupling and Advanced Heterogeneous Feature Fusion + + +
+ Task-specific data-fusion networks have marked considerable achievements in +urban scene parsing. Among these networks, our recently proposed RoadFormer +successfully extracts heterogeneous features from RGB images and surface normal +maps and fuses these features through attention mechanisms, demonstrating +compelling efficacy in RGB-Normal road scene parsing. However, its performance +significantly deteriorates when handling other types/sources of data or +performing more universal, all-category scene parsing tasks. To overcome these +limitations, this study introduces RoadFormer+, an efficient, robust, and +adaptable model capable of effectively fusing RGB-X data, where ``X'' +represents additional types/modalities of data such as depth, thermal, surface +normal, and polarization. Specifically, we propose a novel hybrid feature +decoupling encoder to extract heterogeneous features and decouple them into +global and local components. These decoupled features are then fused through a +dual-branch multi-scale heterogeneous feature fusion block, which employs +parallel Transformer attentions and convolutional neural network modules to +merge features across different scales and receptive fields. The +fused features are subsequently fed into a decoder to generate the final +semantic predictions. Notably, our proposed RoadFormer+ ranks first on the +KITTI Road benchmark and achieves state-of-the-art performance in mean +intersection over union on the Cityscapes, MFNet, FMB, and ZJU datasets. +Moreover, it reduces the number of learnable parameters by 65\% compared to +RoadFormer. Our source code will be publicly available at +mias.group/RoadFormerPlus. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ EZSR: Event-based Zero-Shot Recognition + + +
+ This paper studies zero-shot object recognition using event camera data. +Guided by CLIP, which is pre-trained on RGB images, existing approaches achieve +zero-shot object recognition by maximizing embedding similarities between event +data encoded by an event encoder and RGB images encoded by the CLIP image +encoder. Alternatively, several methods learn RGB frame reconstructions from +event data for the CLIP image encoder. However, these approaches often result +in suboptimal zero-shot performance. + This study develops an event encoder without relying on additional +reconstruction networks. We theoretically analyze the performance bottlenecks +of previous approaches: the global similarity-based objective (i.e., maximizing the +embedding similarities) causes semantic misalignments between the learned event +embedding space and the CLIP text embedding space due to the remaining degrees of freedom. +To mitigate the issue, we explore a scalar-wise regularization strategy. +Furthermore, to scale up the number of events and RGB data pairs for training, +we also propose a pipeline for synthesizing event data from static RGB images. + Experimentally, our data synthesis strategy exhibits an attractive scaling +property, and our method achieves superior zero-shot object recognition +performance on extensive standard benchmark datasets, even compared with past +supervised learning approaches. For example, we achieve 47.84% zero-shot +accuracy on the N-ImageNet dataset. + +
+
+
+
+
+ + ☆ MicroMIL: Graph-based Contextual Multiple Instance Learning for Patient + Diagnosis Using Microscopy Images + + +
+ Current histopathology research has primarily focused on using whole-slide +images (WSIs) produced by scanners with weakly-supervised multiple instance +learning (MIL). However, WSIs are costly, memory-intensive, and require +extensive analysis time. As an alternative, microscopy-based analysis offers +cost and memory efficiency, though microscopy images face issues with unknown +absolute positions and redundant images due to multiple captures from the +subjective perspectives of pathologists. To this end, we introduce MicroMIL, a +weakly-supervised MIL framework specifically built to address these challenges +by dynamically clustering images using deep cluster embedding (DCE) and Gumbel +Softmax for representative image extraction. Graph edges are then constructed +from the upper triangular similarity matrix, with nodes connected to their most +similar neighbors, and a graph neural network (GNN) is utilized to capture +local and diverse areas of contextual information. Unlike existing graph-based +MIL methods designed for WSIs that require absolute positions, MicroMIL +efficiently handles the graph edges without this need. Extensive evaluations on +real-world colon cancer (Seegene) and public BreakHis datasets demonstrate that +MicroMIL outperforms state-of-the-art (SOTA) methods, offering a robust and +efficient solution for patient diagnosis using microscopy images. The code is +available at https://anonymous.4open.science/r/MicroMIL-6C7C + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
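The MicroMIL abstract describes building graph edges from a similarity matrix by connecting each representative image to its most similar neighbours. A small sketch of that step (cosine similarity and a top-k neighbourhood are assumptions on my part, not the paper's exact recipe) could be:

```python
import torch
import torch.nn.functional as F

def build_edges(features, k=4):
    """features: (N, D) embeddings of representative images -> (2, E) edge index for a GNN."""
    feats = F.normalize(features, dim=-1)
    sim = feats @ feats.t()                          # pairwise cosine similarity
    sim.fill_diagonal_(-float("inf"))                # forbid self-loops
    nbrs = sim.topk(k, dim=-1).indices               # k most similar neighbours per node
    src = torch.arange(features.size(0)).repeat_interleave(k)
    dst = nbrs.reshape(-1)
    return torch.stack([src, dst], dim=0)

edge_index = build_edges(torch.randn(16, 256))       # e.g. feed to a graph neural network layer
```

Because the edges depend only on pairwise similarity, no absolute slide positions are needed, which matches the motivation given in the abstract.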
+ + ☆ Robust Simultaneous Multislice MRI Reconstruction Using Deep Generative + Priors + + +
+ Simultaneous multislice (SMS) imaging is a powerful technique for +accelerating magnetic resonance imaging (MRI) acquisitions. However, SMS +reconstruction remains challenging due to the complex signal interactions +between and within the excited slices. This study presents a robust SMS MRI +reconstruction method using deep generative priors. Starting from Gaussian +noise, we leverage denoising diffusion probabilistic models (DDPM) to gradually +recover the individual slices through reverse diffusion iterations while +imposing data consistency from the measured k-space under a readout-concatenation +framework. The posterior sampling procedure is designed such that the DDPM +training can be performed on single-slice images without special adjustments +for SMS tasks. Additionally, our method integrates a low-frequency enhancement +(LFE) module to address the practical issue that SMS-accelerated fast spin echo +(FSE) and echo-planar imaging (EPI) sequences cannot easily embed +autocalibration signals. Extensive experiments demonstrate that our approach +consistently outperforms existing methods and generalizes well to unseen +datasets. The code is available at https://github.com/Solor-pikachu/ROGER after +the review process. + +
+
+
+
+
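The loop sketched below illustrates the general pattern the SMS abstract describes: reverse diffusion with a denoiser trained on single-slice images, interleaved with a data-consistency correction toward the measured k-space. The linear operator `A`, the DDIM-style update, and the step size `lam` are simplifying assumptions; the actual readout-concatenation operator and sampler in the paper are more involved.

```python
import torch

def reverse_diffusion_dc(eps_model, A, y, betas, lam=1.0):
    """eps_model(x, t) -> predicted noise; A: (M, N) forward operator; y: (M,) measured data."""
    alpha_bar = torch.cumprod(1.0 - betas, dim=0)
    x = torch.randn(A.shape[1])                       # start from pure Gaussian noise
    for t in reversed(range(len(betas))):
        eps = eps_model(x, t)
        # clean-image estimate implied by the current noisy sample
        x0 = (x - torch.sqrt(1 - alpha_bar[t]) * eps) / torch.sqrt(alpha_bar[t])
        # data consistency: gradient step toward the measurements y = A x
        x0 = x0 - lam * (A.t() @ (A @ x0 - y))
        if t > 0:
            # DDIM-style (eta = 0) move of the corrected estimate back to step t-1
            x = torch.sqrt(alpha_bar[t - 1]) * x0 + torch.sqrt(1 - alpha_bar[t - 1]) * eps
        else:
            x = x0
    return x
```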
+ + ☆ Evaluating SAM2's Role in Camouflaged Object Detection: From SAM to SAM2 + + +
+ The Segment Anything Model (SAM), introduced by Meta AI Research as a generic +object segmentation model, quickly garnered widespread attention and +significantly influenced the academic community. To extend its application to +video, Meta further develops Segment Anything Model 2 (SAM2), a unified model +capable of both video and image segmentation. SAM2 shows notable improvements +over its predecessor in terms of applicable domains, promptable segmentation +accuracy, and running speed. However, this report reveals a decline in SAM2's +ability to perceive different objects in images without prompts in its auto +mode, compared to SAM. Specifically, we employ the challenging task of +camouflaged object detection to assess this performance decrease, hoping to +inspire further exploration of the SAM model family by researchers. The results +of this paper are provided in \url{https://github.com/luckybird1994/SAMCOD}. + +
+
+
+
+
+ + ☆ Adaptive Mix for Semi-Supervised Medical Image Segmentation + + +
+ Mix-up is a key technique for consistency regularization-based +semi-supervised learning methods, generating strongly perturbed samples for +strong-weak pseudo-supervision. Existing mix-up operations are performed either +randomly or with predefined rules, such as replacing low-confidence patches +with high-confidence ones. The former lacks control over the perturbation +degree, leading to overfitting on randomly perturbed samples, while the latter +tends to generate images with trivial perturbations, both of which limit the +effectiveness of consistency learning. This paper aims to answer the following +question: How can image mix-up perturbation be adaptively performed during +training? To this end, we propose an Adaptive Mix algorithm (AdaMix) for image +mix-up in a self-paced learning manner. Given that, in general, a model's +performance gradually improves during training, AdaMix is equipped with a +self-paced curriculum that, in the initial training stage, provides relatively +simple perturbed samples and then gradually increases the difficulty of +perturbed images by adaptively controlling the perturbation degree based on the +model's learning state estimated by a self-paced regularizer. We develop three +frameworks with our AdaMix, i.e., AdaMix-ST, AdaMix-MT, and AdaMix-CT, for +semi-supervised medical image segmentation. Extensive experiments on three +public datasets, including both 2D and 3D modalities, show that the proposed +frameworks are capable of achieving superior performance. For example, compared +with the state-of-the-art, AdaMix-CT achieves relative improvements of 2.62% in +Dice and 48.25% in average surface distance on the ACDC dataset with 10% +labeled data. The results demonstrate that mix-up operations with dynamically +adjusted perturbation strength based on the segmentation model's state can +significantly enhance the effectiveness of consistency regularization. + +
+
+
+
+
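As a rough illustration of the self-paced idea in AdaMix, the mixing strength can be tied to a scalar difficulty signal that grows as the model improves. The function below is only a sketch under that assumption; the paper's actual curriculum and self-paced regularizer differ.

```python
import torch

def adaptive_mixup(x_a, x_b, difficulty):
    """x_a, x_b: two batches of images to blend; difficulty: scalar in [0, 1]
    derived from the model's current learning state (assumed, not AdaMix's exact signal)."""
    # early in training difficulty ~ 0 -> mild perturbation; later -> stronger mixing
    max_lambda = 0.5 * difficulty
    lam = torch.rand(x_a.size(0), 1, 1, 1, device=x_a.device) * max_lambda
    return (1 - lam) * x_a + lam * x_b

mixed = adaptive_mixup(torch.rand(4, 3, 224, 224), torch.rand(4, 3, 224, 224), difficulty=0.3)
```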
+ + ☆ InScope: A New Real-world 3D Infrastructure-side Collaborative + Perception Dataset for Open Traffic Scenarios + + +
+ Perception systems of autonomous vehicles are susceptible to occlusion, +especially when examined from a vehicle-centric perspective. Such occlusion can +lead to overlooked object detections, e.g., larger vehicles such as trucks or +buses may create blind spots where cyclists or pedestrians could be obscured, +accentuating the safety concerns associated with such perception system +limitations. To mitigate these challenges, the vehicle-to-everything (V2X) +paradigm suggests employing an infrastructure-side perception system (IPS) to +complement autonomous vehicles with a broader perceptual scope. Nevertheless, +the scarcity of real-world 3D infrastructure-side datasets constrains the +advancement of V2X technologies. To bridge these gaps, this paper introduces a +new 3D infrastructure-side collaborative perception dataset, abbreviated as +InScope. Notably, InScope is the first dataset dedicated to addressing +occlusion challenges by strategically deploying multiple-position Light +Detection and Ranging (LiDAR) systems on the infrastructure side. Specifically, +InScope encapsulates a 20-day capture duration with 303 tracking trajectories +and 187,787 3D bounding boxes annotated by experts. Four different benchmarks +are presented for open traffic scenarios, +including collaborative 3D object detection, multisource data fusion, data +domain transfer, and 3D multiobject tracking tasks. Additionally, a new metric +is designed to quantify the impact of occlusion, facilitating the evaluation of +detection degradation ratios among various algorithms. The experimental +findings showcase the enhanced performance of leveraging InScope to assist in +detecting and tracking multiple 3D objects in real-world scenarios, particularly in +tracking obscured, small, and distant objects. The dataset and benchmarks are +available at https://github.com/xf-zh/InScope. + +
+
+
+
+
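The InScope abstract mentions a metric for quantifying detection degradation under occlusion. One plausible reading (an assumption on my part, not the dataset's official definition) is the relative AP drop on occluded objects versus all objects:

```python
def degradation_ratio(ap_all: float, ap_occluded: float) -> float:
    """Relative AP drop caused by occlusion (hypothetical formulation)."""
    if ap_all == 0:
        return 0.0
    return (ap_all - ap_occluded) / ap_all

print(degradation_ratio(0.62, 0.41))  # ~0.34 -> roughly a 34% relative drop under occlusion
```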
+ + ☆ Voxel Scene Graph for Intracranial Hemorrhage + + +
+ Patients with Intracranial Hemorrhage (ICH) face a potentially +life-threatening condition, and patient-centered individualized treatment +remains challenging due to possible clinical complications. Deep-Learning-based +methods can efficiently analyze the routinely acquired head CTs to support the +clinical decision-making. The majority of early work focuses on the detection +and segmentation of ICH, but does not model the complex relations between ICH and +adjacent brain structures. In this work, we design a tailored object detection +method for ICH, which we unite with segmentation-grounded Scene Graph +Generation (SGG) methods to learn a holistic representation of the clinical +cerebral scene. To the best of our knowledge, this is the first application of +SGG for 3D voxel images. We evaluate our method on two head-CT datasets and +demonstrate that our model can recall up to 74% of clinically relevant +relations. This work lays the foundation towards SGG for 3D voxel data. The +generated Scene Graphs can already provide insights for the clinician, but are +also valuable for all downstream tasks as a compact and interpretable +representation. + +
+
+
+
+
+ + ☆ Multi-Site Class-Incremental Learning with Weighted Experts in + Echocardiography MICCAI + + +
+ Building an echocardiography view classifier that maintains performance in +real-life cases requires diverse multi-site data, and frequent updates with +newly available data to mitigate model drift. Simply fine-tuning on new +datasets results in "catastrophic forgetting", and cannot adapt to variations +of view labels between sites. Alternatively, collecting all data on a single +server and re-training may not be feasible as data sharing agreements may +restrict image transfer, or datasets may only become available at different +times. Furthermore, time and cost associated with re-training grows with every +new dataset. We propose a class-incremental learning method which learns an +expert network for each dataset, and combines all expert networks with a score +fusion model. The influence of ``unqualified experts'' is minimised by +weighting each contribution with a learnt in-distribution score. These weights +promote transparency as the contribution of each expert is known during +inference. Instead of using the original images, we use learned features from +each dataset, which are easier to share and raise fewer licensing and privacy +concerns. We validate our work on six datasets from multiple sites, +demonstrating significant reductions in training time while improving view +classification performance. + +
+
+ comment: Accepted for Oral at MICCAI workshop ASMUS-2024 +
+
+
+
+
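The score-fusion idea in the multi-site expert entry above can be pictured as a weighted sum of per-expert predictions, with weights derived from learned in-distribution scores so that "unqualified" experts are down-weighted. The snippet below is a hedged sketch with assumed shapes, not the authors' fusion model:

```python
import torch
import torch.nn.functional as F

def fuse_experts(expert_logits, id_scores):
    """expert_logits: list of E tensors, each (B, C); id_scores: (B, E) in-distribution scores."""
    weights = F.softmax(id_scores, dim=-1)                                       # (B, E) transparent per-expert weights
    probs = torch.stack([F.softmax(l, dim=-1) for l in expert_logits], dim=1)    # (B, E, C)
    return (weights.unsqueeze(-1) * probs).sum(dim=1)                            # (B, C) fused view prediction

fused = fuse_experts([torch.randn(2, 5) for _ in range(3)], torch.randn(2, 3))
```

A nice side effect, also noted in the abstract, is that the weights themselves reveal which site-specific expert drove each prediction.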
+ + ☆ Conditioned Prompt-Optimization for Continual Deepfake Detection ICPR 2024 + + +
+ The rapid advancement of generative models has significantly enhanced the +realism and customization of digital content creation. The increasing power of +these tools, coupled with their ease of access, fuels the creation of +photorealistic fake content, termed deepfakes, that raises substantial concerns +about their potential misuse. In response, there has been notable progress in +developing detection mechanisms to identify content produced by these advanced +systems. However, existing methods often struggle to adapt to the continuously +evolving landscape of deepfake generation. This paper introduces Prompt2Guard, +a novel solution for exemplar-free continual deepfake detection of images, which +leverages Vision-Language Models (VLMs) and domain-specific multimodal prompts. +Compared to previous VLM-based approaches that are either bounded by prompt +selection accuracy or necessitate multiple forward passes, we leverage a +prediction ensembling technique with read-only prompts. Read-only prompts do +not interact with the VLM's internal representation, mitigating the need for +multiple forward passes. Thus, we enhance efficiency and accuracy in detecting +generated content. Additionally, our method exploits a text-prompt conditioning +tailored to deepfake detection, which we demonstrate is beneficial in our +setting. We evaluate Prompt2Guard on CDDB-Hard, a continual deepfake detection +benchmark composed of five deepfake detection datasets spanning multiple +domains and generators, achieving a new state-of-the-art. Additionally, our +results underscore the effectiveness of our approach in addressing the +challenges posed by continual deepfake detection, paving the way for more +robust and adaptable solutions in deepfake detection. + +
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ☆ ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large + Language Models + + +
+ In this work, we propose a training-free method to inject visual referring +into Multimodal Large Language Models (MLLMs) through learnable visual token +optimization. We observe the relationship between text prompt tokens and visual +tokens in MLLMs, where attention layers model the connection between them. Our +approach involves adjusting visual tokens from the MLP output during inference, +controlling which text prompt tokens attend to which visual tokens. We optimize +a learnable visual token based on an energy function, enhancing the strength of +referential regions in the attention map. This enables detailed region +description and reasoning without the need for substantial training costs or +model retraining. Our method offers a promising direction for integrating +referential abilities into MLLMs. Our method supports referring with boxes, masks, +scribbles, and points. The results demonstrate that our method exhibits +controllability and interpretability. + +
+
+ comment: Code:https://github.com/mrwu-mac/ControlMLLM +
+
+
+
+
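The ControlMLLM entry describes optimizing a learnable visual token so that attention concentrates on a referred region. The toy sketch below captures that pattern with a generic differentiable attention function; the energy, optimizer, and shapes are assumptions rather than the released implementation at the repository above.

```python
import torch

def optimize_visual_tokens(attn_fn, visual_tokens, region_mask, steps=20, lr=1e-2):
    """attn_fn(tokens) -> (num_text_tokens, num_visual_tokens) attention map.
    region_mask: (num_visual_tokens,) with 1 for tokens inside the referred box/mask."""
    delta = torch.zeros_like(visual_tokens, requires_grad=True)
    opt = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        attn = attn_fn(visual_tokens + delta)
        # energy: negative attention mass falling inside the referred region
        energy = -(attn * region_mask).sum()
        opt.zero_grad()
        energy.backward()
        opt.step()
    return (visual_tokens + delta).detach()

# toy usage with a dummy bilinear text-to-visual attention
text_emb = torch.randn(4, 8)
attn_fn = lambda v: torch.softmax(text_emb @ v.t(), dim=-1)
mask = torch.zeros(16); mask[:4] = 1.0
tuned = optimize_visual_tokens(attn_fn, torch.randn(16, 8), mask)
```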
+ + ☆ Skeleton-Based Action Recognition with Spatial-Structural Graph + Convolution + + +
+ Human Activity Recognition (HAR) is a field of study that focuses on +identifying and classifying human activities. Skeleton-based Human Activity +Recognition has received much attention in recent years, where Graph +Convolutional Network (GCN) based methods are widely used and have achieved +remarkable results. However, the representation of skeleton data and the issue +of over-smoothing in GCN still need to be studied. 1). Compared to central +nodes, edge nodes can only aggregate limited neighbor information, and +different edge nodes of the human body are always structurally related. +However, the information from edge nodes is crucial for fine-grained activity +recognition. 2). The Graph Convolutional Network suffers from a significant +over-smoothing issue, causing nodes to become increasingly similar as the +number of network layers increases. Based on these two ideas, we propose a +two-stream graph convolution method called Spatial-Structural GCN (SpSt-GCN). +Spatial GCN performs information aggregation based on the topological structure +of the human body, and structural GCN performs differentiation based on the +similarity of edge node sequences. The spatial connection is fixed, and the +human skeleton naturally maintains this topology regardless of the actions +performed by humans. However, the structural connection is dynamic and depends +on the type of movement the human body is performing. Based on this idea, we +also propose an entirely data-driven structural connection, which greatly +increases flexibility. We evaluate our method on two large-scale datasets, +i.e., NTU RGB+D and NTU RGB+D 120. The proposed method achieves good results +while being efficient. + +
+
+
+
+
+ + ☆ PhysFlow: Skin tone transfer for remote heart rate estimation through + conditional normalizing flows + + +
+ In recent years, deep learning methods have shown impressive results for +camera-based remote physiological signal estimation, clearly surpassing +traditional methods. However, the performance and generalization ability of +Deep Neural Networks heavily depends on rich training data truly representing +different factors of variation encountered in real applications. Unfortunately, +many current remote photoplethysmography (rPPG) datasets lack diversity, +particularly in darker skin tones, leading to biased performance of existing +rPPG approaches. To mitigate this bias, we introduce PhysFlow, a novel method +for augmenting skin diversity in remote heart rate estimation using conditional +normalizing flows. PhysFlow adopts end-to-end training optimization, enabling +simultaneous training of supervised rPPG approaches on both original and +generated data. Additionally, we condition our model using CIELAB color space +skin features directly extracted from the facial videos without the need for +skin-tone labels. We validate PhysFlow on publicly available datasets, +UCLA-rPPG and MMPD, demonstrating reduced heart rate error, particularly in +dark skin tones. Furthermore, we demonstrate its versatility and adaptability +across different data-driven rPPG methods. + +
+
+
+
+
+ + ☆ A Simple Low-bit Quantization Framework for Video Snapshot Compressive + Imaging ECCV 2024 + + +
+ Video Snapshot Compressive Imaging (SCI) aims to use a low-speed 2D camera to +capture a high-speed scene as snapshot compressed measurements, followed by a +reconstruction algorithm to reconstruct the high-speed video frames. +State-of-the-art (SOTA) deep learning-based algorithms have achieved impressive +performance, yet with a heavy computational workload. Network quantization is a +promising way to reduce computational cost. However, a direct low-bit +quantization will bring a large performance drop. To address this challenge, in +this paper, we propose a simple low-bit quantization framework (dubbed Q-SCI) +for end-to-end deep learning-based video SCI reconstruction methods, which +usually consist of feature extraction, feature enhancement, and video +reconstruction modules. Specifically, we first design a high-quality feature +extraction module and a precise video reconstruction module to extract and +propagate high-quality features in the low-bit quantized model. In addition, to +alleviate the information distortion of the Transformer branch in the quantized +feature enhancement module, we introduce a shift operation on the query and key +distributions to further bridge the performance gap. Comprehensive experimental +results manifest that our Q-SCI framework can achieve superior performance, +e.g., the 4-bit quantized EfficientSCI-S derived by our Q-SCI framework can +theoretically accelerate the real-valued EfficientSCI-S by 7.8X with only a 2.3% +performance gap on the simulation testing datasets. Code is available at +https://github.com/mcao92/QuantizedSCI. + +
+
+ comment: 18 pages, Accepted by ECCV 2024 +
+
+
+
+
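To make the "shift operation on the query and key distributions" from the Q-SCI entry concrete, here is a hedged sketch of uniform low-bit quantization with a mean shift applied to the Q/K activations before quantizing; the exact shift and quantizer used in Q-SCI may differ.

```python
import torch

def quantize(x, bits=4):
    """Uniform symmetric fake-quantization to `bits` bits."""
    qmax = 2 ** (bits - 1) - 1
    scale = x.abs().max() / qmax
    return torch.clamp((x / scale).round(), -qmax - 1, qmax) * scale

def quantize_qk(q, k, bits=4):
    # centre the distributions first so the narrow low-bit range is not wasted on a DC offset
    q_mu, k_mu = q.mean(), k.mean()
    return quantize(q - q_mu, bits) + q_mu, quantize(k - k_mu, bits) + k_mu

q_hat, k_hat = quantize_qk(torch.randn(64, 128) + 2.0, torch.randn(64, 128) - 1.0)
```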
+ + ☆ Expanding the Medical Decathlon dataset: segmentation of colon and + colorectal cancer from computed tomography images + + +
+ Colorectal cancer is the third-most common cancer in the Western Hemisphere. +The segmentation of the colon and colorectal cancer from computed tomography images is +an urgent problem in medicine. Indeed, a system capable of solving this problem +will enable the detection of colorectal cancer at early stages of the disease, +facilitate the search for pathology by the radiologist, and significantly +accelerate the process of diagnosing the disease. However, scientific +publications on medical image processing mostly use closed, non-public data. +This paper presents an extension of the Medical Decathlon dataset with +colorectal markups in order to improve the quality of segmentation algorithms. +An experienced radiologist validated the data, categorized it into subsets by +quality, and published it in the public domain. Based on the obtained results, +we trained neural network models of the UNet architecture with 5-fold +cross-validation and achieved a Dice metric quality of $0.6988 \pm 0.3$. The +published markups will improve the quality of colorectal cancer detection and +simplify the radiologist's work when describing studies. + +
+
+ comment: 8 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ PEAR: Phrase-Based Hand-Object Interaction Anticipation + + +
+ First-person hand-object interaction anticipation aims to predict the +interaction process over a forthcoming period based on current scenes and +prompts. This capability is crucial for embodied intelligence and human-robot +collaboration. The complete interaction process involves both pre-contact +interaction intention (i.e., hand motion trends and interaction hotspots) and +post-contact interaction manipulation (i.e., manipulation trajectories and hand +poses with contact). Existing research typically anticipates only interaction +intention while neglecting manipulation, resulting in incomplete predictions +and an increased likelihood of intention errors due to the lack of manipulation +constraints. To address this, we propose a novel model, PEAR (Phrase-Based +Hand-Object Interaction Anticipation), which jointly anticipates interaction +intention and manipulation. To handle uncertainties in the interaction process, +we employ a twofold approach. Firstly, we perform cross-alignment of verbs, +nouns, and images to reduce the diversity of hand movement patterns and object +functional attributes, thereby mitigating intention uncertainty. Secondly, we +establish bidirectional constraints between intention and manipulation using +dynamic integration and residual connections, ensuring consistency among +elements and thus overcoming manipulation uncertainty. To rigorously evaluate +the performance of the proposed model, we collect a new task-relevant dataset, +EGO-HOIP, with comprehensive annotations. Extensive experimental results +demonstrate the superiority of our method. + +
+
+ comment: 22 pages, 10 figures, 4 tables +
+
+
+
+
+ + ☆ MaskUno: Switch-Split Block For Enhancing Instance Segmentation + + +
+ Instance segmentation is an advanced form of image segmentation which, beyond +traditional segmentation, requires identifying individual instances of +repeating objects in a scene. Mask R-CNN is the most common architecture for +instance segmentation, and improvements to this architecture include steps such +as benefiting from bounding box refinements, adding semantics, or backbone +enhancements. In all the proposed variations to date, the problem of competing +kernels (each class aims to maximize its own accuracy) persists when models try +to simultaneously learn numerous classes. In this paper, we propose mitigating +this problem by replacing mask prediction with a Switch-Split block that +processes refined ROIs, classifies them, and assigns them to specialized mask +predictors. We name the method MaskUno and test it on various models from the +literature, which are then trained on multiple classes using the benchmark COCO +dataset. An increase in the mean Average Precision (mAP) of 2.03% was observed +for the high-performing DetectoRS when trained on 80 classes. MaskUno proved to +enhance the mAP of instance segmentation models regardless of the number and +type of classes. + +
+
+
+
+
+ + ☆ Mitral Regurgitation Recogniton based on Unsupervised + Out-of-Distribution Detection with Residual Diffusion Amplification MICCAI + + +
+ Mitral regurgitation (MR) is a serious heart valve disease. Early and +accurate diagnosis of MR via ultrasound video is critical for timely clinical +decision-making and surgical intervention. However, manual MR diagnosis heavily +relies on the operator's experience, which may cause misdiagnosis and +inter-observer variability. Since MR data is limited and has large intra-class +variability, we propose an unsupervised out-of-distribution (OOD) detection +method to identify MR rather than building a deep classifier. To our knowledge, +we are the first to explore OOD in MR ultrasound videos. Our method consists of +a feature extractor, a feature reconstruction model, and a residual +accumulation amplification algorithm. The feature extractor obtains features +from the video clips and feeds them into the feature reconstruction model to +restore the original features. The residual accumulation amplification +algorithm then iteratively performs noise feature reconstruction, amplifying +the reconstructed error of OOD features. This algorithm is straightforward yet +efficient and can seamlessly integrate as a plug-and-play component in +reconstruction-based OOD detection methods. We validated the proposed method on +a large ultrasound dataset containing 893 non-MR and 267 MR videos. +Experimental results show that our OOD detection method can effectively +identify MR samples. + +
+
+ comment: Accepted by MICCAI MLMI 2024, 11 pages, 3 figures +
+
+
+
+
+ + ☆ Explainable and Controllable Motion Curve Guided Cardiac Ultrasound + Video Generation MICCAI + + +
+ Echocardiography video is a primary modality for diagnosing heart diseases, +but the limited data poses challenges for both clinical teaching and machine +learning training. Recently, video generative models have emerged as a +promising strategy to alleviate this issue. However, previous methods often +relied on holistic conditions during generation, hindering the flexible +movement control over specific cardiac structures. In this context, we propose +an explainable and controllable method for echocardiography video generation, +taking an initial frame and a motion curve as guidance. Our contributions are +three-fold. First, we extract motion information from each heart substructure +to construct motion curves, enabling the diffusion model to synthesize +customized echocardiography videos by modifying these curves. Second, we +propose the structure-to-motion alignment module, which can map semantic +features onto motion curves across cardiac structures. Third, the +position-aware attention mechanism is designed to enhance video consistency +utilizing Gaussian masks with structural position information. Extensive +experiments on three echocardiography datasets show that our method outperforms +others regarding fidelity and consistency. The full code will be released at +https://github.com/mlmi-2024-72/ECM. + +
+
+ comment: Accepted by MICCAI MLMI 2024 +
+
+
+
+
+ + ☆ Fine-gained Zero-shot Video Sampling + + +
+ Incorporating a temporal dimension into pretrained image diffusion models for +video generation is a prevalent approach. However, this method is +computationally demanding and necessitates large-scale video datasets. More +critically, the heterogeneity between image and video datasets often results in +catastrophic forgetting of the image expertise. Recent attempts to directly +extract video snippets from image diffusion models have somewhat mitigated +these problems. Nevertheless, these methods can only generate brief video clips +with simple movements and fail to capture fine-grained motion or non-grid +deformation. In this paper, we propose a novel Zero-Shot video Sampling +algorithm, denoted as $\mathcal{ZS}^2$, capable of directly sampling +high-quality video clips from existing image synthesis methods, such as Stable +Diffusion, without any training or optimization. Specifically, $\mathcal{ZS}^2$ +utilizes the dependency noise model and temporal momentum attention to ensure +content consistency and animation coherence, respectively. This ability enables +it to excel in related tasks, such as conditional and context-specialized video +generation and instruction-guided video editing. Experimental results +demonstrate that $\mathcal{ZS}^2$ achieves state-of-the-art performance in +zero-shot video generation, occasionally outperforming recent supervised +methods. + Homepage: \url{https://densechen.github.io/zss/}. + +
+
+
+
+
+ + ☆ Deep Learning-Based Longitudinal Prediction of Childhood Myopia + Progression Using Fundus Image Sequences and Baseline Refraction Data + + +
+ Childhood myopia constitutes a significant global health concern. It exhibits +an escalating prevalence and has the potential to evolve into severe, +irreversible conditions that detrimentally impact familial well-being and +create substantial economic costs. Contemporary research underscores the +importance of precisely predicting myopia progression to enable timely and +effective interventions, thereby averting severe visual impairment in children. +Such predictions predominantly rely on subjective clinical assessments, which +are inherently biased and resource-intensive, thus hindering their widespread +application. In this study, we introduce a novel, high-accuracy method for +quantitatively predicting the myopic trajectory and myopia risk in children +using only fundus images and baseline refraction data. This approach was +validated through a six-year longitudinal study of 3,408 children in Henan, +utilizing 16,211 fundus images and corresponding refractive data. Our deep +learning-based method demonstrated predictive accuracy with an error margin of +0.311D per year and AUC scores of 0.944 and 0.995 for forecasting the risks of +developing myopia and high myopia, respectively. These findings confirm the +utility of our model in supporting early intervention strategies and in +significantly reducing healthcare costs, particularly by obviating the need for +additional metadata and repeated consultations. Furthermore, our method was +designed to rely only on fundus images and refractive error data, without the +need for metadata or multiple inquiries from doctors, greatly reducing the +associated medical costs and facilitating large-scale screening. Our model can +even provide good predictions based on only a single time measurement. +Consequently, the proposed method is an important means to reduce medical +inequities caused by economic disparities. + +
+
+
+
+
+ + ☆ MarvelOVD: Marrying Object Recognition and Vision-Language Models for + Robust Open-Vocabulary Object Detection + + +
+ Learning from pseudo-labels generated by VLMs (Vision Language Models) +has been shown to be a promising solution to assist open vocabulary detection +(OVD) in recent studies. However, due to the domain gap between VLM and +vision-detection tasks, pseudo-labels produced by the VLMs are prone to be +noisy, while the training design of the detector further amplifies the bias. In +this work, we investigate the root cause of VLMs' biased prediction under the +OVD context. Our observations lead to a simple yet effective paradigm, named +MarvelOVD, that generates significantly better training targets and optimizes +the learning procedure in an online manner by marrying the capability of the +detector with the vision-language model. Our key insight is that the detector +itself can act as a strong auxiliary guidance to accommodate the VLM's inability +to understand both the ``background'' and the context of a proposal within the +image. Based on it, we greatly purify the noisy pseudo-labels via Online Mining +and propose Adaptive Reweighting to effectively suppress the biased training +boxes that are not well aligned with the target object. In addition, we also +identify a neglected ``base-novel-conflict'' problem and introduce stratified +label assignments to prevent it. Extensive experiments on COCO and LVIS +datasets demonstrate that our method outperforms other state-of-the-art methods by +significant margins. Codes are available at https://github.com/wkfdb/MarvelOVD + +
+
+ comment: Codes are available at https://github.com/wkfdb/MarvelOVD +
+
+
+
+
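One way to picture the adaptive reweighting described in the MarvelOVD entry is to weight each pseudo-labelled box by a blend of VLM and detector confidence, so that boxes the two models disagree on are down-weighted rather than discarded. The rule below is purely illustrative, not the paper's scheme:

```python
import torch

def pseudo_label_weights(vlm_scores, det_scores, alpha=0.5, min_w=0.1):
    """vlm_scores, det_scores: (N,) confidences for N candidate pseudo boxes (assumed inputs)."""
    w = alpha * vlm_scores + (1 - alpha) * det_scores
    return torch.clamp(w, min=min_w)   # never fully silence a box, only down-weight it

weights = pseudo_label_weights(torch.tensor([0.9, 0.4]), torch.tensor([0.2, 0.8]))
```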
+ + ☆ StreetSurfaceVis: a dataset of crowdsourced street-level imagery with + semi-automated annotations of road surface type and quality + + +
+ Road unevenness significantly impacts the safety and comfort of various +traffic participants, especially vulnerable road users such as cyclists and +wheelchair users. This paper introduces StreetSurfaceVis, a novel dataset +comprising 9,122 street-level images collected from a crowdsourcing platform +and manually annotated by road surface type and quality. The dataset is +intended to train models for comprehensive surface assessments of road +networks. Existing open datasets are constrained by limited geospatial coverage +and camera setups, typically excluding cycleways and footways. By crafting a +heterogeneous dataset, we aim to fill this gap and enable robust models that +maintain high accuracy across diverse image sources. However, the frequency +distribution of road surface types and qualities is highly imbalanced. We +address the challenge of ensuring sufficient images per class while reducing +manual annotation by proposing a sampling strategy that incorporates various +external label prediction resources. More precisely, we estimate the impact of +(1) enriching the image data with OpenStreetMap tags, (2) iterative training +and application of a custom surface type classification model, (3) amplifying +underrepresented classes through prompt-based classification with GPT-4o or +similarity search using image embeddings. We show that utilizing a combination +of these strategies effectively reduces manual annotation workload while +ensuring sufficient class representation. + +
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ Navigating Beyond Instructions: Vision-and-Language Navigation in + Obstructed Environments + + +
+ Real-world navigation often involves dealing with unexpected obstructions +such as closed doors, moved objects, and unpredictable entities. However, +mainstream Vision-and-Language Navigation (VLN) tasks typically assume +instructions perfectly align with the fixed and predefined navigation graphs +without any obstructions. This assumption overlooks potential discrepancies in +actual navigation graphs and given instructions, which can cause major failures +for both indoor and outdoor agents. To address this issue, we integrate diverse +obstructions into the R2R dataset by modifying both the navigation graphs and +visual observations, introducing an innovative dataset and task, R2R with +UNexpected Obstructions (R2R-UNO). R2R-UNO contains various types and numbers +of path obstructions to generate instruction-reality mismatches for VLN +research. Experiments on R2R-UNO reveal that state-of-the-art VLN methods +inevitably encounter significant challenges when facing such mismatches, +indicating that they rigidly follow instructions rather than navigate +adaptively. Therefore, we propose a novel method called ObVLN (Obstructed VLN), +which includes a curriculum training strategy and virtual graph construction to +help agents effectively adapt to obstructed environments. Empirical results +show that ObVLN not only maintains robust performance in unobstructed scenarios +but also achieves a substantial performance advantage with unexpected +obstructions. + +
+
+ comment: Accepted to MM 2024 +
+
+
+
+
+ + ☆ Forecasting Future Videos from Novel Views via Disentangled 3D Scene + Representation ECCV 2024 + + +
+ Video extrapolation in space and time (VEST) enables viewers to forecast a 3D +scene into the future and view it from novel viewpoints. Recent methods propose +to learn an entangled representation, aiming to model layered scene geometry, +motion forecasting and novel view synthesis together, while assuming simplified +affine motion and homography-based warping at each scene layer, leading to +inaccurate video extrapolation. Instead of entangled scene representation and +rendering, our approach chooses to disentangle scene geometry from scene +motion, via lifting the 2D scene to 3D point clouds, which enables high quality +rendering of future videos from novel views. To model future 3D scene motion, +we propose a disentangled two-stage approach that initially forecasts +ego-motion and subsequently the residual motion of dynamic objects (e.g., cars, +people). This approach ensures more precise motion predictions by reducing +inaccuracies from entanglement of ego-motion with dynamic object motion, where +better ego-motion forecasting could significantly enhance the visual outcomes. +Extensive experimental analysis on two urban scene datasets demonstrate +superior performance of our proposed method in comparison to strong baselines. + +
+
+ comment: Accepted to ECCV 2024. Project Page: + https://skrya.github.io/projects/ffn-dsr/ +
+
+
+
+
+ + ☆ Accelerating Image Super-Resolution Networks with Pixel-Level + Classification ECCV 2024 + + +
+ In recent times, the need for effective super-resolution (SR) techniques has +surged, especially for large-scale images ranging from 2K to 8K resolution. For +DNN-based SISR, decomposing images into overlapping patches is typically +necessary due to computational constraints. In such a patch-decomposing scheme, +one can allocate computational resources differently based on each patch's +difficulty to further improve efficiency while maintaining SR performance. +However, this approach has a limitation: computational resources are uniformly +allocated within a patch, leading to lower efficiency when the patch contains +pixels with varying levels of restoration difficulty. To address the issue, we +propose the Pixel-level Classifier for Single Image Super-Resolution (PCSR), a +novel method designed to distribute computational resources adaptively at the +pixel level. A PCSR model comprises a backbone, a pixel-level classifier, and a +set of pixel-level upsamplers with varying capacities. The pixel-level +classifier assigns each pixel to an appropriate upsampler based on its +restoration difficulty, thereby optimizing computational resource usage. Our +method allows the balance between performance and computational cost to be adjusted +during inference without re-training. Our experiments demonstrate PCSR's advantage over existing +patch-distributing methods in PSNR-FLOP trade-offs across different backbone +models and benchmarks. The code is available at +https://github.com/3587jjh/PCSR. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
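A minimal sketch of the pixel-level routing idea from the PCSR entry: a per-pixel classifier decides, for every output pixel, whether a light or a heavy upsampler head produces it. The module below is an assumption-laden toy (two heads, soft blending) rather than the released code linked above.

```python
import torch
import torch.nn as nn

class PixelRouter(nn.Module):
    def __init__(self, ch=64, scale=2):
        super().__init__()
        # a cheap head and a more expensive head with the same output resolution
        self.light = nn.Sequential(nn.Conv2d(ch, 3 * scale**2, 3, padding=1), nn.PixelShuffle(scale))
        self.heavy = nn.Sequential(nn.Conv2d(ch, ch, 3, padding=1), nn.ReLU(),
                                   nn.Conv2d(ch, 3 * scale**2, 3, padding=1), nn.PixelShuffle(scale))
        # per-pixel classifier producing routing probabilities at the upsampled resolution
        self.classifier = nn.Sequential(nn.Conv2d(ch, 2 * scale**2, 3, padding=1), nn.PixelShuffle(scale))

    def forward(self, feat):
        p = torch.softmax(self.classifier(feat), dim=1)   # (B, 2, H*s, W*s) routing weights
        easy, hard = self.light(feat), self.heavy(feat)
        return p[:, :1] * easy + p[:, 1:] * hard          # soft blend; hard routing could skip computation

out = PixelRouter()(torch.randn(1, 64, 32, 32))           # -> (1, 3, 64, 64)
```

With hard (arg-max) routing at inference, the heavy head only needs to run on the pixels assigned to it, which is where the FLOP savings come from.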
+ + ☆ A Plug-and-Play Method for Rare Human-Object Interactions Detection by + Bridging Domain Gap + + +
+ Human-object interactions (HOI) detection aims at capturing human-object +pairs in images and corresponding actions. It is an important step toward +high-level visual reasoning and scene understanding. However, due to the +natural bias from the real world, existing methods mostly struggle with rare +human-object pairs and lead to sub-optimal results. Recently, with the +development of the generative model, a straightforward approach is to construct +a more balanced dataset based on a group of supplementary samples. +Unfortunately, there is a significant domain gap between the generated data and +the original data, and simply merging the generated images into the original +dataset cannot significantly boost the performance. To alleviate the above +problem, we present a novel model-agnostic framework called +\textbf{C}ontext-\textbf{E}nhanced \textbf{F}eature \textbf{A}lignment (CEFA) +module, which can effectively align the generated data with the original data +at the feature level and bridge the domain gap. Specifically, CEFA consists of +a feature alignment module and a context enhancement module. On one hand, +considering the crucial role of human-object pairs information in HOI tasks, +the feature alignment module aligns the human-object pairs by aggregating +instance information. On the other hand, to mitigate the issue of losing +important context information caused by the traditional discriminator-style +alignment method, we employ a context-enhanced image reconstruction module to +improve the model's learning ability of contextual cues. Extensive experiments +have shown that our method can serve as a plug-and-play module to improve the +detection performance of HOI models on rare +categories\footnote{https://github.com/LijunZhang01/CEFA}. + +
+
+
+
+
+ + ☆ Enriching thermal point clouds of buildings using semantic 3D building + models + + +
+ Thermal point clouds integrate thermal radiation and laser point clouds +effectively. However, the semantic information needed to interpret +building thermal point clouds can hardly be inferred precisely. Transferring +the semantics encapsulated in 3D building models at LoD3 has the potential to +fill this gap. In this work, we propose a workflow that enriches thermal point +clouds with the geo-position and semantics of LoD3 building models, utilizing +features of both modalities: the proposed method automatically +co-registers the point clouds from different sources and enriches the thermal +point cloud with facade-detailed semantics. The enriched thermal point cloud +supports thermal analysis and can facilitate the development of currently +scarce deep learning models operating directly on thermal point clouds. + +
+
+ comment: Accepted to the 3D GeoInfo 2024 +
+
+
+
+
+ + ☆ Analyzing the impact of semantic LoD3 building models on image-based + vehicle localization + + +
+ Numerous navigation applications rely on data from global navigation +satellite systems (GNSS), even though their accuracy is compromised in urban +areas, posing a significant challenge, particularly for precise autonomous car +localization. Extensive research has focused on enhancing localization accuracy +by integrating various sensor types to address this issue. This paper +introduces a novel approach for car localization, leveraging image features +that correspond with highly detailed semantic 3D building models. The core +concept involves augmenting positioning accuracy by incorporating prior +geometric and semantic knowledge into calculations. The work assesses outcomes +using Level of Detail 2 (LoD2) and Level of Detail 3 (LoD3) models, analyzing +whether facade-enriched models yield superior accuracy. This comprehensive +analysis encompasses diverse methods, including off-the-shelf feature matching +and deep learning, facilitating thorough discussion. Our experiments +corroborate that LoD3 enables detecting up to 69\% more features than using +LoD2 models. We believe that this study will contribute to the research of +enhancing positioning accuracy in GNSS-denied urban canyons. It also shows a +practical application of under-explored LoD3 building models on map-based car +positioning. + +
+
+ comment: Accepted to the 3D GeoInfo 2024 +
+
+
+
+
+ + ☆ Generalized Tampered Scene Text Detection in the era of Generative AI + + +
+ The rapid advancements of generative AI have fueled the potential of +generative text image editing while simultaneously escalating the threat of +misinformation spreading. However, existing forensics methods struggle to +detect unseen forgery types that they have not been trained on, leaving the +development of a model capable of generalized detection of tampered scene text +as an unresolved issue. To tackle this, we propose a novel task: open-set +tampered scene text detection, which evaluates forensics models on their +ability to identify both seen and previously unseen forgery types. We have +curated a comprehensive, high-quality dataset, featuring the texts tampered by +eight text editing models, to thoroughly assess the open-set generalization +capabilities. Further, we introduce a novel and effective pre-training paradigm +that subtly alters the texture of selected texts within an image and trains the +model to identify these regions. This approach not only mitigates the scarcity +of high-quality training data but also enhances models' fine-grained perception +and open-set generalization abilities. Additionally, we present DAF, a novel +framework that improves open-set generalization by distinguishing between the +features of authentic and tampered text, rather than focusing solely on the +tampered text's features. Our extensive experiments validate the remarkable +efficacy of our methods. For example, our zero-shot performance can even beat +the previous state-of-the-art full-shot model by a large margin. Our dataset +and code will be open-source. + +
+
+
+
+
+ + ☆ VIPeR: Visual Incremental Place Recognition with Adaptive Mining and + Lifelong Learning + + +
+ Visual place recognition (VPR) is an essential component of many autonomous +and augmented/virtual reality systems. It enables the systems to robustly +localize themselves in large-scale environments. Existing VPR methods +demonstrate attractive performance at the cost of heavy pre-training and +limited generalizability. When deployed in unseen environments, these methods +exhibit significant performance drops. Targeting this issue, we present VIPeR, +a novel approach for visual incremental place recognition with the ability to +adapt to new environments while retaining the performance of previous +environments. We first introduce an adaptive mining strategy that balances the +performance within a single environment and the generalizability across +multiple environments. Then, to prevent catastrophic forgetting in lifelong +learning, we draw inspiration from human memory systems and design a novel +memory bank for our VIPeR. Our memory bank contains a sensory memory, a working +memory and a long-term memory, with the first two focusing on the current +environment and the last one for all previously visited environments. +Additionally, we propose a probabilistic knowledge distillation to explicitly +safeguard the previously learned knowledge. We evaluate our proposed VIPeR on +three large-scale datasets, namely Oxford Robotcar, Nordland, and TartanAir. +For comparison, we first set a baseline performance with naive finetuning. +Then, several more recent lifelong learning methods are compared. Our VIPeR +achieves better performance in almost all aspects with the biggest improvement +of 13.65% in average performance. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Benchmarking AIGC Video Quality Assessment: A Dataset and Unified Model + + +
+    In recent years, artificial intelligence (AI) driven video generation has +garnered significant attention due to advancements in stable diffusion and +large language model techniques. Thus, there is a great demand for accurate +video quality assessment (VQA) models to measure the perceptual quality of +AI-generated content (AIGC) videos as well as optimize video generation +techniques. However, assessing the quality of AIGC videos is quite challenging +due to the highly complex distortions they exhibit (e.g., unnatural action, +irrational objects, etc.). Therefore, in this paper, we try to systematically +investigate the AIGC-VQA problem from both subjective and objective quality +assessment perspectives. For the subjective perspective, we construct a +Large-scale Generated Video Quality assessment (LGVQ) dataset, consisting of +2,808 AIGC videos generated by 6 video generation models using 468 carefully +selected text prompts. Unlike previous subjective VQA experiments, we evaluate +the perceptual quality of AIGC videos from three dimensions: spatial quality, +temporal quality, and text-to-video alignment, which hold utmost importance for +current video generation techniques. For the objective perspective, we +establish a benchmark for evaluating existing quality assessment metrics on the +LGVQ dataset, which reveals that current metrics perform poorly on the LGVQ +dataset. Thus, we propose a Unified Generated Video Quality assessment (UGVQ) +model to comprehensively and accurately evaluate the quality of AIGC videos +across three aspects using a unified model, which uses visual, textual and +motion features of video and corresponding prompt, and integrates key features +to enhance feature expression. We hope that our benchmark can promote the +development of quality evaluation metrics for AIGC videos. The LGVQ dataset and +the UGVQ metric will be publicly released.
+
+
+
+
+ + ☆ DD-rPPGNet: De-interfering and Descriptive Feature Learning for + Unsupervised rPPG Estimation + + +
+ Remote Photoplethysmography (rPPG) aims to measure physiological signals and +Heart Rate (HR) from facial videos. Recent unsupervised rPPG estimation methods +have shown promising potential in estimating rPPG signals from facial regions +without relying on ground truth rPPG signals. However, these methods seem +oblivious to interference existing in rPPG signals and still result in +unsatisfactory performance. In this paper, we propose a novel De-interfered and +Descriptive rPPG Estimation Network (DD-rPPGNet) to eliminate the interference +within rPPG features for learning genuine rPPG signals. First, we investigate +the characteristics of local spatial-temporal similarities of interference and +design a novel unsupervised model to estimate the interference. Next, we +propose an unsupervised de-interfered method to learn genuine rPPG signals with +two stages. In the first stage, we estimate the initial rPPG signals by +contrastive learning from both the training data and their augmented +counterparts. In the second stage, we use the estimated interference features +to derive de-interfered rPPG features and encourage the rPPG signals to be +distinct from the interference. In addition, we propose an effective +descriptive rPPG feature learning by developing a strong 3D Learnable +Descriptive Convolution (3DLDC) to capture the subtle chrominance changes for +enhancing rPPG estimation. Extensive experiments conducted on five rPPG +benchmark datasets demonstrate that the proposed DD-rPPGNet outperforms +previous unsupervised rPPG estimation methods and achieves competitive +performances with state-of-the-art supervised rPPG methods. + +
+
+
+
+
+ + ☆ Force Sensing Guided Artery-Vein Segmentation via Sequential Ultrasound + Images + + +
+ Accurate identification of arteries and veins in ultrasound images is crucial +for vascular examinations and interventions in robotics-assisted surgeries. +However, current methods for ultrasound vessel segmentation face challenges in +distinguishing between arteries and veins due to their morphological +similarities. To address this challenge, this study introduces a novel force +sensing guided segmentation approach to enhance artery-vein segmentation +accuracy by leveraging their distinct deformability. Our proposed method +utilizes force magnitude to identify key frames with the most significant +vascular deformation in a sequence of ultrasound images. These key frames are +then integrated with the current frame through attention mechanisms, with +weights assigned in accordance with force magnitude. Our proposed force sensing +guided framework can be seamlessly integrated into various segmentation +networks and achieves significant performance improvements in multiple U-shaped +networks such as U-Net, Swin-unet and Transunet. Furthermore, we contribute the +first multimodal ultrasound artery-vein segmentation dataset, Mus-V, which +encompasses both force and image data simultaneously. The dataset comprises +3114 ultrasound images of carotid and femoral vessels extracted from 105 +videos, with corresponding force data recorded by the force sensor mounted on +the US probe. Our code and dataset will be publicly available. + +
+
+
+
+
+ + ☆ Design and Development of Laughter Recognition System Based on + Multimodal Fusion and Deep Learning + + +
+ This study aims to design and implement a laughter recognition system based +on multimodal fusion and deep learning, leveraging image and audio processing +technologies to achieve accurate laughter recognition and emotion analysis. +First, the system loads video files and uses the OpenCV library to extract +facial information while employing the Librosa library to process audio +features such as MFCC. Then, multimodal fusion techniques are used to integrate +image and audio features, followed by training and prediction using deep +learning models. Evaluation results indicate that the model achieved 80% +accuracy, precision, and recall on the test dataset, with an F1 score of 80%, +demonstrating robust performance and the ability to handle real-world data +variability. This study not only verifies the effectiveness of multimodal +fusion methods in laughter recognition but also highlights their potential +applications in affective computing and human-computer interaction. Future work +will focus on further optimizing feature extraction and model architecture to +improve recognition accuracy and expand application scenarios, promoting the +development of laughter recognition technology in fields such as mental health +monitoring and educational activity evaluation + +
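As a rough illustration of the feature-extraction pipeline described above (face frames via OpenCV, MFCC audio features via Librosa, then a naive concatenation standing in for the fusion step), here is a hedged sketch; the file names, cascade choice, and all parameters are illustrative assumptions rather than the authors' configuration.

```python
# Illustrative sketch only: file names, cascade, and parameters are assumptions.
import cv2
import librosa
import numpy as np

def extract_face_frames(video_path, max_frames=64):
    """Collect grayscale face crops from a video with OpenCV's Haar cascade."""
    cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    cap = cv2.VideoCapture(video_path)
    faces = []
    while cap.isOpened() and len(faces) < max_frames:
        ok, frame = cap.read()
        if not ok:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        for (x, y, w, h) in cascade.detectMultiScale(gray, 1.1, 5)[:1]:
            faces.append(cv2.resize(gray[y:y + h, x:x + w], (64, 64)))
    cap.release()
    return np.asarray(faces, dtype=np.float32) / 255.0

def extract_mfcc(audio_path, n_mfcc=13):
    """Mean-pooled MFCCs as a compact audio descriptor."""
    signal, sr = librosa.load(audio_path, sr=None)
    return librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

face_feats = extract_face_frames("laughter_clip.mp4")
audio_feats = extract_mfcc("laughter_clip.wav")
fused = np.concatenate([face_feats.mean(axis=0).ravel(), audio_feats])  # naive fusion
```

The fused vector would then feed a downstream classifier; the actual fusion and model architecture in the paper may differ.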
+
+ comment: 7 pages,2 figures +
+
+
+
+
+ + ☆ SmileyNet -- Towards the Prediction of the Lottery by Reading Tea Leaves + with AI + + +
+ We introduce SmileyNet, a novel neural network with psychic abilities. It is +inspired by the fact that a positive mood can lead to improved cognitive +capabilities including classification tasks. The network is hence presented in +a first phase with smileys and an encouraging loss function is defined to bias +it into a good mood. SmileyNet is then used to forecast the flipping of a coin +based on an established method of Tasseology, namely by reading tea leaves. +Training and testing in this second phase are done with a high-fidelity +simulation based on real-world pixels sampled from a professional tea-reading +cup. SmileyNet has an amazing accuracy of 72% to correctly predict the flip of +a coin. Resnet-34, respectively YOLOv5 achieve only 49%, respectively 53%. It +is then shown how multiple SmileyNets can be combined to win the lottery. + +
+
+ comment: This is a satirical accumulation of misconceptions, mistakes, and + flawed reasoning I have encountered in recent times as a reviewer and + sometimes even as a reader of published papers. I hope it is entertaining and + useful in the context of the education of BSc, MSc, and PhD students in + Machine Learning, Artificial Intelligence, and Cognitive Science +
+
+
+
+
+ + ☆ Identity-Consistent Diffusion Network for Grading Knee Osteoarthritis + Progression in Radiographic Imaging ECCV 2024 + + +
+ Knee osteoarthritis (KOA), a common form of arthritis that causes physical +disability, has become increasingly prevalent in society. Employing +computer-aided techniques to automatically assess the severity and progression +of KOA can greatly benefit KOA treatment and disease management. Particularly, +the advancement of X-ray technology in KOA demonstrates its potential for this +purpose. Yet, existing X-ray prognosis research generally yields a singular +progression severity grade, overlooking the potential visual changes for +understanding and explaining the progression outcome. Therefore, in this study, +a novel generative model is proposed, namely Identity-Consistent Radiographic +Diffusion Network (IC-RDN), for multifaceted KOA prognosis encompassing a +predicted future knee X-ray scan conditioned on the baseline scan. +Specifically, an identity prior module for the diffusion and a downstream +generation-guided progression prediction module are introduced. Compared to +conventional image-to-image generative models, identity priors regularize and +guide the diffusion to focus more on the clinical nuances of the prognosis +based on a contrastive learning strategy. The progression prediction module +utilizes both forecasted and baseline knee scans, and a more comprehensive +formulation of KOA severity progression grading is expected. Extensive +experiments on a widely used public dataset, OAI, demonstrate the effectiveness +of the proposed method. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Dynamic Gesture Recognition in Ultra-Range Distance for Effective + Human-Robot Interaction + + +
+ This paper presents a novel approach for ultra-range gesture recognition, +addressing Human-Robot Interaction (HRI) challenges over extended distances. By +leveraging human gestures in video data, we propose the Temporal-Spatiotemporal +Fusion Network (TSFN) model that surpasses the limitations of current methods, +enabling robots to understand gestures from long distances. With applications +in service robots, search and rescue operations, and drone-based interactions, +our approach enhances HRI in expansive environments. Experimental validation +demonstrates significant advancements in gesture recognition accuracy, +particularly in prolonged gesture sequences. + +
+
+
+
+
+ + ☆ Prompting Medical Large Vision-Language Models to Diagnose Pathologies + by Visual Question Answering + + +
+ Large Vision-Language Models (LVLMs) have achieved significant success in +recent years, and they have been extended to the medical domain. Although +demonstrating satisfactory performance on medical Visual Question Answering +(VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem, +which makes them fail to diagnose complex pathologies. Moreover, they readily +fail to learn minority pathologies due to imbalanced training data. We propose +two prompting strategies for MLVLMs that reduce hallucination and improve VQA +performance. In the first strategy, we provide a detailed explanation of the +queried pathology. In the second strategy, we fine-tune a cheap, weak learner +to achieve high performance on a specific metric, and textually provide its +judgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our +methods significantly improve the diagnostic F1 score, with the highest +increase being 0.27. We also demonstrate that our prompting strategies can be +extended to general LVLM domains. Based on POPE metrics, it effectively +suppresses the false negative predictions of existing LVLMs and improves Recall +by approximately 0.07. + +
+
+
+
+
+ + ☆ ESIQA: Perceptual Quality Assessment of Vision-Pro-based Egocentric + Spatial Images + + +
+ With the development of eXtended Reality (XR), head-mounted shooting and +display technology have experienced significant advancement and gained +considerable attention. Egocentric spatial images and videos are emerging as a +compelling form of stereoscopic XR content. Different from traditional 2D +images, egocentric spatial images present challenges for perceptual quality +assessment due to their special shooting, processing methods, and stereoscopic +characteristics. However, the corresponding image quality assessment (IQA) +research for egocentric spatial images is still lacking. In this paper, we +establish the Egocentric Spatial Images Quality Assessment Database (ESIQAD), +the first IQA database dedicated for egocentric spatial images as far as we +know. Our ESIQAD includes 500 egocentric spatial images, containing 400 images +captured with the Apple Vision Pro and 100 images generated via an iPhone's +"Spatial Camera" app. The corresponding mean opinion scores (MOSs) are +collected under three viewing modes, including 2D display, 3D-window display, +and 3D-immersive display. Furthermore, based on our database, we conduct a +benchmark experiment and evaluate the performance of 22 state-of-the-art IQA +models under three different viewing modes. We hope this research can +facilitate future IQA research on egocentric spatial images. The database is +available at https://github.com/IntMeGroup/ESIQA. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Small Object Few-shot Segmentation for Vision-based Industrial + Inspection + + +
+    Vision-based industrial inspection (VII) aims to locate defects quickly and +accurately. Supervised learning under a close-set setting and industrial +anomaly detection, as two common paradigms in VII, face different problems in +practical applications. The former is that various and sufficient defects are +difficult to obtain, while the latter is that specific defects cannot be +located. To solve these problems, in this paper, we focus on the few-shot +semantic segmentation (FSS) method, which can locate unseen defects conditioned +on a few annotations without retraining. Compared to common objects in natural +images, the defects in VII are small. This brings two problems to current FSS +methods: (1) distortion of target semantics and (2) many false positives for +backgrounds. To alleviate these problems, we propose a small object few-shot +segmentation (SOFS) model. The key idea for alleviating (1) is to avoid the +resizing of the original image and correctly indicate the intensity of target +semantics. SOFS achieves this idea via the non-resizing procedure and the +prototype intensity downsampling of support annotations. To alleviate (2), we +design an abnormal prior map in SOFS to guide the model to reduce false +positives and propose a mixed normal Dice loss to preferentially prevent the +model from predicting false positives. SOFS can achieve FSS and few-shot +anomaly detection determined by support masks. Diverse experiments substantiate +the superior performance of SOFS. Code is available at +https://github.com/zhangzilongc/SOFS.
+
+
+
+
+ + ☆ MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation + Framework + + +
+ Medical imaging segmentation is a highly active area of research, with deep +learning-based methods achieving state-of-the-art results in several +benchmarks. However, the lack of standardized tools for training, testing, and +evaluating new methods makes the comparison of methods difficult. To address +this, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple, +modular, and end-to-end medical imaging segmentation framework designed to +facilitate consistent training, testing, and evaluation of deep learning-based +medical imaging segmentation methods. MIST standardizes data analysis, +preprocessing, and evaluation pipelines, accommodating multiple architectures +and loss functions. This standardization ensures reproducible and fair +comparisons across different methods. We detail MIST's data format +requirements, pipelines, and auxiliary features and demonstrate its efficacy +using the BraTS Adult Glioma Post-Treatment Challenge dataset. Our results +highlight MIST's ability to produce accurate segmentation masks and its +scalability across multiple GPUs, showcasing its potential as a powerful tool +for future medical imaging research and development. + +
+
+ comment: Submitted to BraTS 2024 +
+
+
+
+
+ + ☆ High-throughput 3D shape completion of potato tubers on a harvester + + +
+ Potato yield is an important metric for farmers to further optimize their +cultivation practices. Potato yield can be estimated on a harvester using an +RGB-D camera that can estimate the three-dimensional (3D) volume of individual +potato tubers. A challenge, however, is that the 3D shape derived from RGB-D +images is only partially completed, underestimating the actual volume. To +address this issue, we developed a 3D shape completion network, called CoRe++, +which can complete the 3D shape from RGB-D images. CoRe++ is a deep learning +network that consists of a convolutional encoder and a decoder. The encoder +compresses RGB-D images into latent vectors that are used by the decoder to +complete the 3D shape using the deep signed distance field network (DeepSDF). +To evaluate our CoRe++ network, we collected partial and complete 3D point +clouds of 339 potato tubers on an operational harvester in Japan. On the 1425 +RGB-D images in the test set (representing 51 unique potato tubers), our +network achieved a completion accuracy of 2.8 mm on average. For volumetric +estimation, the root mean squared error (RMSE) was 22.6 ml, and this was better +than the RMSE of the linear regression (31.1 ml) and the base model (36.9 ml). +We found that the RMSE can be further reduced to 18.2 ml when performing the 3D +shape completion in the center of the RGB-D image. With an average 3D shape +completion time of 10 milliseconds per tuber, we can conclude that CoRe++ is +both fast and accurate enough to be implemented on an operational harvester for +high-throughput potato yield estimation. Our code, network weights and dataset +are publicly available at +https://github.com/UTokyo-FieldPhenomics-Lab/corepp.git. + +
+
+ comment: 18 pages, 11 figures, 6 tables +
+
+
+
+
+ + ☆ On-the-fly Point Feature Representation for Point Clouds Analysis ACM MM 2024 + + +
+ Point cloud analysis is challenging due to its unique characteristics of +unorderness, sparsity and irregularity. Prior works attempt to capture local +relationships by convolution operations or attention mechanisms, exploiting +geometric information from coordinates implicitly. These methods, however, are +insufficient to describe the explicit local geometry, e.g., curvature and +orientation. In this paper, we propose On-the-fly Point Feature Representation +(OPFR), which captures abundant geometric information explicitly through Curve +Feature Generator module. This is inspired by Point Feature Histogram (PFH) +from computer vision community. However, the utilization of vanilla PFH +encounters great difficulties when applied to large datasets and dense point +clouds, as it demands considerable time for feature generation. In contrast, we +introduce the Local Reference Constructor module, which approximates the local +coordinate systems based on triangle sets. Owing to this, our OPFR only +requires extra 1.56ms for inference (65x faster than vanilla PFH) and 0.012M +more parameters, and it can serve as a versatile plug-and-play module for +various backbones, particularly MLP-based and Transformer-based backbones +examined in this study. Additionally, we introduce the novel Hierarchical +Sampling module aimed at enhancing the quality of triangle sets, thereby +ensuring robustness of the obtained geometric features. Our proposed method +improves overall accuracy (OA) on ModelNet40 from 90.7% to 94.5% (+3.8%) for +classification, and OA on S3DIS Area-5 from 86.4% to 90.0% (+3.6%) for semantic +segmentation, respectively, building upon PointNet++ backbone. When integrated +with Point Transformer backbone, we achieve state-of-the-art results on both +tasks: 94.8% OA on ModelNet40 and 91.7% OA on S3DIS Area-5. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Chat2Layout: Interactive 3D Furniture Layout with a Multimodal LLM + + +
+ Automatic furniture layout is long desired for convenient interior design. +Leveraging the remarkable visual reasoning capabilities of multimodal large +language models (MLLMs), recent methods address layout generation in a static +manner, lacking the feedback-driven refinement essential for interactive user +engagement. We introduce Chat2Layout, a novel interactive furniture layout +generation system that extends the functionality of MLLMs into the realm of +interactive layout design. To achieve this, we establish a unified +vision-question paradigm for in-context learning, enabling seamless +communication with MLLMs to steer their behavior without altering model +weights. Within this framework, we present a novel training-free visual +prompting mechanism. This involves a visual-text prompting technique that +assist MLLMs in reasoning about plausible layout plans, followed by an +Offline-to-Online search (O2O-Search) method, which automatically identifies +the minimal set of informative references to provide exemplars for visual-text +prompting. By employing an agent system with MLLMs as the core controller, we +enable bidirectional interaction. The agent not only comprehends the 3D +environment and user requirements through linguistic and visual perception but +also plans tasks and reasons about actions to generate and arrange furniture +within the virtual space. Furthermore, the agent iteratively updates based on +visual feedback from execution results. Experimental results demonstrate that +our approach facilitates language-interactive generation and arrangement for +diverse and complex 3D furniture. + +
+
+ comment: Main paper with supplemental materials +
+
+
+
+
+ + ☆ CAMAv2: A Vision-Centric Approach for Static Map Element Annotation + + +
+    The recent development of online static map element (a.k.a. HD map) +construction algorithms has raised a vast demand for data with ground truth +annotations. However, available public datasets currently cannot provide +high-quality training data regarding consistency and accuracy. For instance, +the manually labelled nuScenes (a low-efficiency process) still contains misalignment and +inconsistency between the HD maps and images (e.g., around 8.03 pixels +reprojection error on average). To this end, we present CAMAv2: a +vision-centric approach for Consistent and Accurate Map Annotation. Without +LiDAR inputs, our proposed framework can still generate high-quality 3D +annotations of static map elements. Specifically, the annotation can achieve +high reprojection accuracy across all surrounding cameras and is +spatio-temporally consistent across the whole sequence. We apply our proposed +framework to the popular nuScenes dataset to provide efficient and highly +accurate annotations. Compared with the original nuScenes static map element +annotations, our CAMAv2 annotations achieve lower reprojection errors (e.g., 4.96 vs. 8.03 +pixels). Models trained with annotations from CAMAv2 also achieve lower +reprojection errors (e.g., 5.62 vs. 8.43 pixels).
+
+ comment: arXiv admin note: text overlap with arXiv:2309.11754 +
+
+
+
+
+ + ☆ Knowledge-Guided Prompt Learning for Lifespan Brain MR Image + Segmentation + + +
+ Automatic and accurate segmentation of brain MR images throughout the human +lifespan into tissue and structure is crucial for understanding brain +development and diagnosing diseases. However, challenges arise from the +intricate variations in brain appearance due to rapid early brain development, +aging, and disorders, compounded by the limited availability of +manually-labeled datasets. In response, we present a two-step segmentation +framework employing Knowledge-Guided Prompt Learning (KGPL) for brain MRI. +Specifically, we first pre-train segmentation models on large-scale datasets +with sub-optimal labels, followed by the incorporation of knowledge-driven +embeddings learned from image-text alignment into the models. The introduction +of knowledge-wise prompts captures semantic relationships between anatomical +variability and biological processes, enabling models to learn structural +feature embeddings across diverse age groups. Experimental findings demonstrate +the superiority and robustness of our proposed method, particularly noticeable +when employing Swin UNETR as the backbone. Our approach achieves average DSC +values of 95.17% and 94.19% for brain tissue and structure segmentation, +respectively. Our code is available at https://github.com/TL9792/KGPL. + +
+
+
+
+
+ + ☆ STANet: A Novel Spatio-Temporal Aggregation Network for Depression + Classification with Small and Unbalanced FMRI Data + + +
+ Accurate diagnosis of depression is crucial for timely implementation of +optimal treatments, preventing complications and reducing the risk of suicide. +Traditional methods rely on self-report questionnaires and clinical assessment, +lacking objective biomarkers. Combining fMRI with artificial intelligence can +enhance depression diagnosis by integrating neuroimaging indicators. However, +the specificity of fMRI acquisition for depression often results in unbalanced +and small datasets, challenging the sensitivity and accuracy of classification +models. In this study, we propose the Spatio-Temporal Aggregation Network +(STANet) for diagnosing depression by integrating CNN and RNN to capture both +temporal and spatial features of brain activity. STANet comprises the following +steps:(1) Aggregate spatio-temporal information via ICA. (2) Utilize +multi-scale deep convolution to capture detailed features. (3) Balance data +using the SMOTE to generate new samples for minority classes. (4) Employ the +AFGRU classifier, which combines Fourier transformation with GRU, to capture +long-term dependencies, with an adaptive weight assignment mechanism to enhance +model generalization. The experimental results demonstrate that STANet achieves +superior depression diagnostic performance with 82.38% accuracy and a 90.72% +AUC. The STFA module enhances classification by capturing deeper features at +multiple scales. The AFGRU classifier, with adaptive weights and stacked GRU, +attains higher accuracy and AUC. SMOTE outperforms other oversampling methods. +Additionally, spatio-temporal aggregated features achieve better performance +compared to using only temporal or spatial features. STANet outperforms +traditional or deep learning classifiers, and functional connectivity-based +classifiers, as demonstrated by ten-fold cross-validation. + +
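The SMOTE balancing step mentioned above is a standard oversampling technique; a minimal sketch with the imbalanced-learn package follows, where the feature matrix and labels are toy placeholders rather than fMRI-derived features from the paper.

```python
# Hedged sketch of SMOTE-style class balancing; X and y are toy placeholders.
import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 128))            # e.g. aggregated spatio-temporal features
y = np.array([0] * 45 + [1] * 15)         # unbalanced: few minority-class subjects

smote = SMOTE(k_neighbors=5, random_state=0)
X_res, y_res = smote.fit_resample(X, y)   # minority class is synthetically upsampled
print(np.bincount(y), "->", np.bincount(y_res))   # [45 15] -> [45 45]
```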
+
+
+
+
+ + ☆ Pathology Foundation Models + + +
+ Pathology has played a crucial role in the diagnosis and evaluation of +patient tissue samples obtained from surgeries and biopsies for many years. The +advent of Whole Slide Scanners and the development of deep learning +technologies have significantly advanced the field, leading to extensive +research and development in pathology AI (Artificial Intelligence). These +advancements have contributed to reducing the workload of pathologists and +supporting decision-making in treatment plans. Recently, large-scale AI models +known as Foundation Models (FMs), which are more accurate and applicable to a +wide range of tasks compared to traditional AI, have emerged, and expanded +their application scope in the healthcare field. Numerous FMs have been +developed in pathology, and there are reported cases of their application in +various tasks, such as disease diagnosis, rare cancer diagnosis, patient +survival prognosis prediction, biomarker expression prediction, and the scoring +of immunohistochemical expression intensity. However, several challenges remain +for the clinical application of FMs, which healthcare professionals, as users, +must be aware of. Research is ongoing to address these challenges. In the +future, it is expected that the development of Generalist Medical AI, which +integrates pathology FMs with FMs from other medical domains, will progress, +leading to the effective utilization of AI in real clinical settings to promote +precision and personalized medicine. + +
+
+ comment: 19 pages, 1 figure, 3 tables +
+
+
+
+
+ + ☆ EUDA: An Efficient Unsupervised Domain Adaptation via Self-Supervised + Vision Transformer + + +
+ Unsupervised domain adaptation (UDA) aims to mitigate the domain shift issue, +where the distribution of training (source) data differs from that of testing +(target) data. Many models have been developed to tackle this problem, and +recently vision transformers (ViTs) have shown promising results. However, the +complexity and large number of trainable parameters of ViTs restrict their +deployment in practical applications. This underscores the need for an +efficient model that not only reduces trainable parameters but also allows for +adjustable complexity based on specific needs while delivering comparable +performance. To achieve this, in this paper we introduce an Efficient +Unsupervised Domain Adaptation (EUDA) framework. EUDA employs the DINOv2, which +is a self-supervised ViT, as a feature extractor followed by a simplified +bottleneck of fully connected layers to refine features for enhanced domain +adaptation. Additionally, EUDA employs the synergistic domain alignment loss +(SDAL), which integrates cross-entropy (CE) and maximum mean discrepancy (MMD) +losses, to balance adaptation by minimizing classification errors in the source +domain while aligning the source and target domain distributions. The +experimental results indicate the effectiveness of EUDA in producing comparable +results as compared with other state-of-the-art methods in domain adaptation +with significantly fewer trainable parameters, between 42% to 99.7% fewer. This +showcases the ability to train the model in a resource-limited environment. The +code of the model is available at: https://github.com/A-Abedi/EUDA. + +
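Since SDAL combines cross-entropy with maximum mean discrepancy, a hedged PyTorch sketch of such a combined objective is given below; the RBF kernel, bandwidth, weighting `lam`, and tensor shapes are assumptions and not EUDA's exact formulation.

```python
# Sketch of a CE + MMD objective in the spirit of SDAL; all details are assumptions.
import torch
import torch.nn.functional as F

def rbf_mmd(x, y, sigma=1.0):
    """Biased MMD^2 estimate between two feature batches with a Gaussian kernel."""
    def kernel(a, b):
        d2 = torch.cdist(a, b).pow(2)
        return torch.exp(-d2 / (2 * sigma ** 2))
    return kernel(x, x).mean() + kernel(y, y).mean() - 2 * kernel(x, y).mean()

def combined_loss(src_logits, src_labels, src_feats, tgt_feats, lam=0.5):
    ce = F.cross_entropy(src_logits, src_labels)   # source classification error
    mmd = rbf_mmd(src_feats, tgt_feats)            # source/target distribution gap
    return ce + lam * mmd

# toy shapes: batch of 8, 4 classes, 256-d bottleneck features
loss = combined_loss(torch.randn(8, 4), torch.randint(0, 4, (8,)),
                     torch.randn(8, 256), torch.randn(8, 256))
```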
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Enhanced Self-Checkout System for Retail Based on Improved YOLOv10 + + +
+    With the rapid advancement of deep learning technologies, computer vision has +shown immense potential in retail automation. This paper presents a novel +self-checkout system for retail based on an improved YOLOv10 network, aimed at +enhancing checkout efficiency and reducing labor costs. We propose targeted +optimizations to the YOLOv10 model by incorporating the detection head +structure from YOLOv8, which significantly improves product recognition +accuracy. Additionally, we develop a post-processing algorithm tailored for +self-checkout scenarios, to further enhance the practical applicability of the system. +Experimental results demonstrate that our system outperforms existing methods +in both product recognition accuracy and checkout speed. This research not only +provides a new technical solution for retail automation but also offers valuable +insights into optimizing deep learning models for real-world applications.
+
+
+
+
+ + ☆ SimpleLLM4AD: An End-to-End Vision-Language Model with Graph Visual + Question Answering for Autonomous Driving + + +
+    Many fields could benefit from the rapid development of large language +models (LLMs). End-to-end autonomous driving (e2eAD) is one of the +typical fields facing new opportunities as LLMs have come to support more and +more modalities. Here, by utilizing a vision-language model (VLM), we propose an +e2eAD method called SimpleLLM4AD. In our method, the e2eAD task is divided +into four stages, which are perception, prediction, planning, and behavior. +Each stage consists of several visual question answering (VQA) pairs, and the VQA +pairs interconnect with each other, constructing a graph called Graph VQA +(GVQA). By reasoning over each VQA pair in the GVQA through the VLM stage by stage, our +method can achieve e2e driving with language. In our method, vision +transformer (ViT) models are employed to process nuScenes visual data, while the +VLM is utilized to interpret and reason about the information extracted from +the visual inputs. In the perception stage, the system identifies and +classifies objects from the driving environment. The prediction stage involves +forecasting the potential movements of these objects. The planning stage +utilizes the gathered information to develop a driving strategy, ensuring the +safety and efficiency of the autonomous vehicle. Finally, the behavior stage +translates the planned actions into executable commands for the vehicle. Our +experiments demonstrate that SimpleLLM4AD achieves competitive performance in +complex driving scenarios.
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ Fine-grained Metrics for Point Cloud Semantic Segmentation + + +
+ Two forms of imbalances are commonly observed in point cloud semantic +segmentation datasets: (1) category imbalances, where certain objects are more +prevalent than others; and (2) size imbalances, where certain objects occupy +more points than others. Because of this, the majority of categories and large +objects are favored in the existing evaluation metrics. This paper suggests +fine-grained mIoU and mAcc for a more thorough assessment of point cloud +segmentation algorithms in order to address these issues. Richer statistical +information is provided for models and datasets by these fine-grained metrics, +which also lessen the bias of current semantic segmentation metrics towards +large objects. The proposed metrics are used to train and assess various +semantic segmentation algorithms on three distinct indoor and outdoor semantic +segmentation datasets. + +
+
+ comment: PRCV 2024 +
+
+
+
+
+ + ☆ Robust Box Prompt based SAM for Medical Image Segmentation MICCAI + + +
+    The Segment Anything Model (SAM) can achieve satisfactory segmentation +performance under high-quality box prompts. However, SAM's robustness is +compromised by the decline in box quality, limiting its practicality in +clinical reality. In this study, we propose a novel Robust Box prompt based SAM +(RoBox-SAM) to ensure SAM's segmentation performance under prompts +with different qualities. Our contribution is three-fold. First, we propose a +prompt refinement module to implicitly perceive the potential targets, and +output the offsets to directly transform the low-quality box prompt into a +high-quality one. We then provide an online iterative strategy for further +prompt refinement. Second, we introduce a prompt enhancement module to +automatically generate point prompts to assist the box-promptable segmentation +effectively. Last, we build a self-information extractor to encode the prior +information from the input image. These features can optimize the image +embeddings and attention calculation; thus, the robustness of SAM can be +further enhanced. Extensive experiments on a large medical segmentation +dataset including 99,299 images, 5 modalities, and 25 organs/targets validated +the efficacy of our proposed RoBox-SAM.
+
+ comment: Accepted by MICCAI MLMI 2024 +
+
+
+
+
+ + ☆ Enhanced Uncertainty Estimation in Ultrasound Image Segmentation with + MSU-Net MICCAI 2024 + + +
+ Efficient intravascular access in trauma and critical care significantly +impacts patient outcomes. However, the availability of skilled medical +personnel in austere environments is often limited. Autonomous robotic +ultrasound systems can aid in needle insertion for medication delivery and +support non-experts in such tasks. Despite advances in autonomous needle +insertion, inaccuracies in vessel segmentation predictions pose risks. +Understanding the uncertainty of predictive models in ultrasound imaging is +crucial for assessing their reliability. We introduce MSU-Net, a novel +multistage approach for training an ensemble of U-Nets to yield accurate +ultrasound image segmentation maps. We demonstrate substantial improvements, +18.1% over a single Monte Carlo U-Net, enhancing uncertainty evaluations, model +transparency, and trustworthiness. By highlighting areas of model certainty, +MSU-Net can guide safe needle insertions, empowering non-experts to accomplish +such tasks. + +
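A minimal sketch of the general idea of ensemble-based per-pixel uncertainty (averaging member probabilities and using their disagreement as an uncertainty map) follows; this is a generic illustration, not MSU-Net's multistage training scheme, and the threshold values are assumptions.

```python
# Generic ensemble-uncertainty sketch; thresholds and shapes are assumptions.
import torch

def ensemble_uncertainty(models, image):
    """models: iterable of segmentation nets returning logits of shape (B, 1, H, W)."""
    probs = torch.stack([torch.sigmoid(m(image)) for m in models], dim=0)
    mean_prob = probs.mean(dim=0)       # consensus vessel probability
    uncertainty = probs.var(dim=0)      # high where ensemble members disagree
    return mean_prob, uncertainty

# usage idea: only trust pixels that are both confident and low-variance
# mean_p, unc = ensemble_uncertainty(unet_ensemble, us_frame)
# safe_region = (mean_p > 0.5) & (unc < 0.02)
```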
+
+ comment: Accepted for the 5th International Workshop of Advances in + Simplifying Medical UltraSound (ASMUS), held in conjunction with MICCAI 2024, + the 27th International Conference on Medical Image Computing and Computer + Assisted Intervention +
+
+
+
+
+ + ☆ Automated Quantification of Hyperreflective Foci in SD-OCT With Diabetic + Retinopathy + + +
+    The presence of hyperreflective foci (HFs) is related to retinal disease +progression, and the quantity has proven to be a prognostic factor of visual +and anatomical outcome in various retinal diseases. However, the lack of efficient +quantitative tools for evaluating HFs has prevented ophthalmologists from +assessing the volume of HFs. For this reason, we propose an automated +quantification algorithm to segment and quantify HFs in spectral domain optical +coherence tomography (SD-OCT). The proposed algorithm consists of two parallel +processes, namely: region of interest (ROI) generation and HFs estimation. To +generate the ROI, we use morphological reconstruction to obtain the +reconstructed image and histogram constructed for data distributions and +clustering. In parallel, we estimate the HFs by extracting the extremal regions +from the connected regions obtained from a component tree. Finally, both the +ROI and the HFs estimation process are merged to obtain the segmented HFs. The +proposed algorithm was tested on 40 3D SD-OCT volumes from 40 patients +diagnosed with non-proliferative diabetic retinopathy (NPDR), proliferative +diabetic retinopathy (PDR), and diabetic macular edema (DME). The average dice +similarity coefficient (DSC) and correlation coefficient (r) are 69.70%, 0.99 +for NPDR, 70.31%, 0.99 for PDR, and 71.30%, 0.99 for DME, respectively. The +proposed algorithm can provide ophthalmologists with useful quantitative +information about HFs, such as their volume, size, and location.
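For reference, the Dice similarity coefficient used in the evaluation above has a standard definition; a small NumPy sketch on toy binary masks follows.

```python
# Standard Dice similarity coefficient (DSC) on binary masks; toy data only.
import numpy as np

def dice_coefficient(pred, gt, eps=1e-8):
    """DSC = 2 * |A ∩ B| / (|A| + |B|) for binary masks."""
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    intersection = np.logical_and(pred, gt).sum()
    return 2.0 * intersection / (pred.sum() + gt.sum() + eps)

# two toy 3D masks of segmented foci, offset by one voxel in one direction
a = np.zeros((4, 8, 8), dtype=bool); a[1:3, 2:6, 2:6] = True
b = np.zeros_like(a);                b[1:3, 3:7, 2:6] = True
print(round(dice_coefficient(a, b), 3))   # 0.75
```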
+
+ comment: IEEE Journal of Biomedical and Health Informatics, Volume: 24, Issue: + 4, pp. 1125 - 1136, 2020 +
+
+
+
+
+ + ☆ DEF-oriCORN: efficient 3D scene understanding for robust + language-directed manipulation without demonstrations + + +
+ We present DEF-oriCORN, a framework for language-directed manipulation tasks. +By leveraging a novel object-based scene representation and +diffusion-model-based state estimation algorithm, our framework enables +efficient and robust manipulation planning in response to verbal commands, even +in tightly packed environments with sparse camera views without any +demonstrations. Unlike traditional representations, our representation affords +efficient collision checking and language grounding. Compared to +state-of-the-art baselines, our framework achieves superior estimation and +motion planning performance from sparse RGB images and zero-shot generalizes to +real-world scenarios with diverse materials, including transparent and +reflective objects, despite being trained exclusively in simulation. Our code +for data generation, training, inference, and pre-trained weights are publicly +available at: https://sites.google.com/view/def-oricorn/home. + +
+
+
+
+
+ + ☆ DDU-Net: A Domain Decomposition-based CNN on Multiple GPUs + + +
+    The segmentation of ultra-high resolution images poses challenges such as +loss of spatial information or computational inefficiency. In this work, a +novel approach that combines encoder-decoder architectures with domain +decomposition strategies to address these challenges is proposed. Specifically, +a domain decomposition-based U-Net (DDU-Net) architecture is introduced, which +partitions input images into non-overlapping patches that can be processed +independently on separate devices. A communication network is added to +facilitate inter-patch information exchange to enhance the understanding of +spatial context. Experimental validation is performed on a synthetic dataset +that is designed to measure the effectiveness of the communication network. +Then, the performance is tested on the DeepGlobe land cover classification +dataset as a real-world benchmark dataset. The results demonstrate that the +approach, which includes inter-patch communication for images divided into +16×16 non-overlapping subimages, achieves a 2-3% higher +intersection over union (IoU) score compared to the same network without +inter-patch communication. The performance of the network which includes +communication is equivalent to that of a baseline U-Net trained on the full +image, showing that our model provides an effective solution for segmenting +ultra-high-resolution images while preserving spatial context. The code is +available at https://github.com/corne00/HiRes-Seg-CNN.
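A hedged sketch of the non-overlapping patch decomposition that DDU-Net builds on is shown below (a 4×4 grid for brevity; the paper's inter-patch communication network is not modeled here).

```python
# Non-overlapping patch split/merge sketch; grid size is an illustrative assumption.
import torch

def split_into_patches(img, grid=4):
    """img: (B, C, H, W) with H, W divisible by grid -> (B*grid*grid, C, H/grid, W/grid)."""
    b, c, h, w = img.shape
    ph, pw = h // grid, w // grid
    patches = img.unfold(2, ph, ph).unfold(3, pw, pw)   # (B, C, grid, grid, ph, pw)
    return patches.permute(0, 2, 3, 1, 4, 5).reshape(-1, c, ph, pw)

def merge_patches(patches, grid=4):
    n, c, ph, pw = patches.shape
    b = n // (grid * grid)
    x = patches.reshape(b, grid, grid, c, ph, pw).permute(0, 3, 1, 4, 2, 5)
    return x.reshape(b, c, grid * ph, grid * pw)

x = torch.randn(1, 3, 512, 512)
assert torch.allclose(merge_patches(split_into_patches(x)), x)  # round-trip check
```

Each patch can then be pushed through its own encoder copy on a separate device before any information exchange.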
+
+
+
+
+ + ☆ Outlier Detection in Large Radiological Datasets using UMAP MICCAI-2024 + + +
+ The success of machine learning algorithms heavily relies on the quality of +samples and the accuracy of their corresponding labels. However, building and +maintaining large, high-quality datasets is an enormous task. This is +especially true for biomedical data and for meta-sets that are compiled from +smaller ones, as variations in image quality, labeling, reports, and archiving +can lead to errors, inconsistencies, and repeated samples. Here, we show that +the uniform manifold approximation and projection (UMAP) algorithm can find +these anomalies essentially by forming independent clusters that are distinct +from the main (good) data but similar to other points with the same error type. +As a representative example, we apply UMAP to discover outliers in the publicly +available ChestX-ray14, CheXpert, and MURA datasets. While the results are +archival and retrospective and focus on radiological images, the graph-based +methods work for any data type and will prove equally beneficial for curation +at the time of dataset creation. + +
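The clustering-based outlier search described above can be approximated with off-the-shelf tools; the sketch below embeds precomputed image feature vectors with umap-learn and flags points in very small or noise clusters. The feature file, DBSCAN parameters, and the 1% cluster-size cutoff are assumptions, not the authors' settings.

```python
# Hedged sketch: feature file, clustering parameters and cutoffs are assumptions.
import numpy as np
import umap
from sklearn.cluster import DBSCAN

features = np.load("radiograph_features.npy")        # (n_images, d) placeholder

embedding = umap.UMAP(n_neighbors=15, min_dist=0.1,
                      random_state=42).fit_transform(features)
labels = DBSCAN(eps=0.5, min_samples=10).fit_predict(embedding)

# points labelled -1 (noise) or in clusters holding <1% of the data are suspects
sizes = {lab: int(np.sum(labels == lab)) for lab in set(labels) if lab != -1}
suspect = [i for i, lab in enumerate(labels)
           if lab == -1 or sizes[lab] < 0.01 * len(labels)]
print(f"{len(suspect)} candidate outliers flagged for manual review")
```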
+
+ comment: Accepted in MICCAI-2024 Workshop on Topology- and Graph-Informed + Imaging Informatics (TGI3) +
+
+
+
+
+ + ☆ Leveraging Adaptive Implicit Representation Mapping for Ultra + High-Resolution Image Segmentation + + +
+ Implicit representation mapping (IRM) can translate image features to any +continuous resolution, showcasing its potent capability for +ultra-high-resolution image segmentation refinement. Current IRM-based methods +for refining ultra-high-resolution image segmentation often rely on CNN-based +encoders to extract image features and apply a Shared Implicit Representation +Mapping Function (SIRMF) to convert pixel-wise features into segmented results. +Hence, these methods exhibit two crucial limitations. Firstly, the CNN-based +encoder may not effectively capture long-distance information, resulting in a +lack of global semantic information in the pixel-wise features. Secondly, SIRMF +is shared across all samples, which limits its ability to generalize and handle +diverse inputs. To address these limitations, we propose a novel approach that +leverages the newly proposed Adaptive Implicit Representation Mapping (AIRM) +for ultra-high-resolution Image Segmentation. Specifically, the proposed method +comprises two components: (1) the Affinity Empowered Encoder (AEE), a robust +feature extractor that leverages the benefits of the transformer architecture +and semantic affinity to model long-distance features effectively, and (2) the +Adaptive Implicit Representation Mapping Function (AIRMF), which adaptively +translates pixel-wise features without neglecting the global semantic +information, allowing for flexible and precise feature translation. We +evaluated our method on the commonly used ultra-high-resolution segmentation +refinement datasets, i.e., BIG and PASCAL VOC 2012. The extensive experiments +demonstrate that our method outperforms competitors by a large margin. The code +is provided in supplementary material. + +
+
+
+
+
+ + ☆ Lifelong Person Search + + +
+    Person search is the task of localizing a query person in gallery datasets of +scene images. Existing methods have mainly been developed to handle a single +target dataset only; in practical applications of person search, however, diverse +datasets are continuously encountered. In such cases, these methods suffer from +catastrophic forgetting of the knowledge learned from old datasets when trained on new +datasets. In this paper, we first introduce a novel problem of lifelong person +search (LPS) where the model is incrementally trained on the new datasets while +preserving the knowledge learned in the old datasets. We propose an end-to-end +LPS framework that facilitates knowledge distillation to enforce +consistency learning between the old and new models by utilizing the prototype +features of the foreground persons as well as the hard background proposals in +the old domains. Moreover, we also devise rehearsal-based instance matching +to further improve the discrimination ability in the old domains by additionally using +unlabeled person instances. Experimental results demonstrate that +the proposed method achieves significantly superior detection and re-identification +performance, preserving the knowledge learned in the old +domains, compared with the existing methods.
+
+          comment: 10 pages, 6 figures
+
+
+
+
+ + ♻ ☆ PerAct2: Benchmarking and Learning for Robotic Bimanual Manipulation + Tasks + + +
+    Bimanual manipulation is challenging due to the precise spatial and temporal +coordination required between two arms. While there exist several real-world +bimanual systems, there is a lack of simulated benchmarks with a large task +diversity for systematically studying bimanual capabilities across a wide range +of tabletop tasks. This paper addresses the gap by extending RLBench to +bimanual manipulation. We open-source our code and benchmark comprising 13 new +tasks with 23 unique task variations, each requiring a high degree of +coordination and adaptability. To kickstart the benchmark, we extended several +state-of-the-art methods to bimanual manipulation and also present a +language-conditioned behavioral cloning agent -- PerAct2, which enables the +learning and execution of bimanual 6-DoF manipulation tasks. Our novel network +architecture efficiently integrates language processing with action prediction, +allowing robots to understand and perform complex bimanual tasks in response to +user-specified goals. Project website with code is available at: +http://bimanual.github.io
+
+
+
+
+ + ♻ ☆ An Earth Rover dataset recorded at the ICRA@40 party + + +
+    The ICRA conference is celebrating its 40th anniversary in Rotterdam in +September 2024, with the Happy Birthday ICRA Party at the iconic +Holland America Line Cruise Terminal as its highlight. One month later the IROS conference will +take place, which will include the Earth Rover Challenge. In this challenge, +open-world autonomous navigation models are studied in truly open-world settings. + As part of the Earth Rover Challenge, several real-world navigation datasets have been recorded in +cities worldwide, such as Auckland, New Zealand, and Wuhan, China. The only +dataset recorded in the Netherlands is from the small village of Oudewater. The +proposal is to record a dataset with the robot used in the Earth Rover +Challenge in Rotterdam, in front of the Holland America Line Cruise Terminal, +before the festivities of the Happy Birthday ICRA Party start. + See: https://github.com/SlamMate/vSLAM-on-FrodoBots-2K
+
+          comment: 3 pages, accepted as a Late-Breaking extended abstract to the IEEE + Conference on Robotics and Automation
+
+
+
+
+ + ♻ ☆ iMatching: Imperative Correspondence Learning ECCV + + +
+ Learning feature correspondence is a foundational task in computer vision, +holding immense importance for downstream applications such as visual odometry +and 3D reconstruction. Despite recent progress in data-driven models, feature +correspondence learning is still limited by the lack of accurate per-pixel +correspondence labels. To overcome this difficulty, we introduce a new +self-supervised scheme, imperative learning (IL), for training feature +correspondence. It enables correspondence learning on arbitrary uninterrupted +videos without any camera pose or depth labels, heralding a new era for +self-supervised correspondence learning. Specifically, we formulated the +problem of correspondence learning as a bilevel optimization, which takes the +reprojection error from bundle adjustment as a supervisory signal for the +model. To avoid large memory and computation overhead, we leverage the +stationary point to effectively back-propagate the implicit gradients through +bundle adjustment. Through extensive experiments, we demonstrate superior +performance on tasks including feature matching and pose estimation, in which +we obtained an average of 30% accuracy gain over the state-of-the-art matching +models. This preprint corresponds to the Accepted Manuscript in European +Conference on Computer Vision (ECCV) 2024. + +
+
+ comment: This preprint corresponds to the Accepted Manuscript in European + Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ PIPsUS: Self-Supervised Point Tracking in Ultrasound MICCAI 2024 + + +
+ Finding point-level correspondences is a fundamental problem in ultrasound +(US), since it can enable US landmark tracking for intraoperative image +guidance in different surgeries, including head and neck. Most existing US +tracking methods, e.g., those based on optical flow or feature matching, were +initially designed for RGB images before being applied to US. Therefore domain +shift can impact their performance. Training could be supervised by +ground-truth correspondences, but these are expensive to acquire in US. To +solve these problems, we propose a self-supervised pixel-level tracking model +called PIPsUS. Our model can track an arbitrary number of points in one forward +pass and exploits temporal information by considering multiple, instead of just +consecutive, frames. We developed a new self-supervised training strategy that +utilizes a long-term point-tracking model trained for RGB images as a teacher +to guide the model to learn realistic motions and use data augmentation to +enforce tracking from US appearance. We evaluate our method on neck and oral US +and echocardiography, showing higher point tracking accuracy when compared with +fast normalized cross-correlation and tuned optical flow. Code will be +available once the paper is accepted. + +
+
+ comment: 10 pages, 3 figures, submitted to MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Posterior-Variance-Based Error Quantification for Inverse Problems in + Imaging + + +
+ In this work, a method for obtaining pixel-wise error bounds in Bayesian +regularization of inverse imaging problems is introduced. The proposed method +employs estimates of the posterior variance together with techniques from +conformal prediction in order to obtain coverage guarantees for the error +bounds, without making any assumption on the underlying data distribution. It +is generally applicable to Bayesian regularization approaches, independent, +e.g., of the concrete choice of the prior. Furthermore, the coverage guarantees +can also be obtained in case only approximate sampling from the posterior is +possible. With this in particular, the proposed framework is able to +incorporate any learned prior in a black-box manner. Guaranteed coverage +without assumptions on the underlying distributions is only achievable since +the magnitude of the error bounds is, in general, unknown in advance. +Nevertheless, experiments with multiple regularization approaches presented in +the paper confirm that in practice, the obtained error bounds are rather tight. +For realizing the numerical experiments, also a novel primal-dual Langevin +algorithm for sampling from non-smooth distributions is introduced in this +work. + +
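To make the conformal-calibration step concrete, here is a generic split-conformal sketch that scales a posterior standard deviation so the resulting pixel-wise bounds cover the true error on held-out calibration images; this is a textbook recipe under simplifying assumptions, not the authors' exact procedure.

```python
# Generic split-conformal calibration sketch; shapes and the image-wise
# max-score choice are assumptions.
import numpy as np

def calibrate_scale(abs_errors, post_std, alpha=0.1):
    """abs_errors, post_std: arrays of shape (n_cal, H, W) from a calibration set."""
    scores = (abs_errors / post_std).reshape(len(abs_errors), -1).max(axis=1)
    n = len(scores)
    level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
    return np.quantile(scores, level, method="higher")

# On a new image, the bound |x_true - x_mean| <= q * post_std then holds
# simultaneously over pixels with probability at least 1 - alpha:
# q = calibrate_scale(cal_abs_errors, cal_post_std, alpha=0.1)
```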
+
+
+
+
+ + ♻ ☆ SpaER: Learning Spatio-temporal Equivariant Representations for Fetal + Brain Motion Tracking MICCAI + + +
+ In this paper, we introduce SpaER, a pioneering method for fetal motion +tracking that leverages equivariant filters and self-attention mechanisms to +effectively learn spatio-temporal representations. Different from conventional +approaches that statically estimate fetal brain motions from pairs of images, +our method dynamically tracks the rigid movement patterns of the fetal head +across temporal and spatial dimensions. Specifically, we first develop an +equivariant neural network that efficiently learns rigid motion sequences +through low-dimensional spatial representations of images. Subsequently, we +learn spatio-temporal representations by incorporating time encoding and +self-attention neural network layers. This approach allows for the capture of +long-term dependencies of fetal brain motion and addresses alignment errors due +to contrast changes and severe motion artifacts. Our model also provides a +geometric deformation estimation that properly addresses image distortions +among all time frames. To the best of our knowledge, our approach is the first +to learn spatial-temporal representations via deep neural networks for fetal +motion tracking without data augmentation. We validated our model using real +fetal echo-planar images with simulated and real motions. Our method carries +significant potential value in accurately measuring, tracking, and correcting +fetal motion in fetal MRI sequences. + +
+
+ comment: 11 pages, 3 figures, Medical Image Computing and Computer Assisted + Interventions (MICCAI) Workshop on Perinatal Imaging, Placental and Preterm + Image analysis (PIPPI) 2024 +
+
+
+
+
+ + ♻ ☆ Pediatric Wrist Fracture Detection in X-rays via YOLOv10 Algorithm and + Dual Label Assignment System + + +
+    Wrist fractures are highly prevalent among children and can significantly +impact their daily activities, such as attending school, participating in +sports, and performing basic self-care tasks. If not treated properly, these +fractures can result in chronic pain, reduced wrist functionality, and other +long-term complications. Recently, advancements in object detection have shown +promise in enhancing fracture detection, with systems achieving accuracy +comparable to, or even surpassing, that of human radiologists. The YOLO series, +in particular, has demonstrated notable success in this domain. This study is +the first to provide a thorough evaluation of various YOLOv10 variants to +assess their performance in detecting pediatric wrist fractures using the +GRAZPEDWRI-DX dataset. It investigates how changes in model complexity, scaling +the architecture, and implementing a dual-label assignment strategy can enhance +detection performance. Experimental results indicate that our trained model +achieved a mean average precision (mAP@50-95) of 51.9%, surpassing the current +YOLOv9 benchmark of 43.3% on this dataset. This represents an improvement of +8.6%. The implementation code is publicly available at +https://github.com/ammarlodhi255/YOLOv10-Fracture-Detection
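For orientation, fine-tuning a YOLOv10 variant on a fracture-detection dataset with the ultralytics package looks roughly as follows; the dataset YAML name and hyperparameters are placeholders, and the dual-label assignment is part of the YOLOv10 head rather than something configured here.

```python
# Hedged sketch; "grazpedwri_dx.yaml" and the hyperparameters are placeholders.
from ultralytics import YOLO

model = YOLO("yolov10m.pt")              # pick a variant: n / s / m / b / l / x
model.train(data="grazpedwri_dx.yaml",   # images plus fracture-class labels
            epochs=100, imgsz=640, batch=16)
metrics = model.val()                    # reports mAP50 and mAP50-95
```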
+
+
+
+
+ + ♻ ☆ Explainable Light-Weight Deep Learning Pipeline for Improved Drought + Stress Identification + + +
+    Early identification of drought stress in crops is vital for implementing +effective mitigation measures and reducing yield loss. Non-invasive imaging +techniques hold immense potential by capturing subtle physiological changes in +plants under water deficit. Sensor-based imaging data serves as a rich source +of information for machine learning and deep learning algorithms, facilitating +further analysis aimed at identifying drought stress. While these approaches +yield favorable results, real-time field applications require algorithms +specifically designed for the complexities of natural agricultural conditions. +Our work proposes a novel deep learning framework for classifying drought +stress in potato crops captured by UAVs in natural settings. The novelty lies +in the synergistic combination of a pre-trained network with carefully designed +custom layers. This architecture leverages the feature extraction capabilities of +the pre-trained network while the custom layers enable targeted dimensionality +reduction and enhanced regularization, ultimately leading to improved +performance. A key innovation of our work involves the integration of +Gradient-Class Activation Mapping (Grad-CAM), an explainability technique. +Grad-CAM sheds light on the internal workings of the deep learning model, +typically referred to as a black box. By visualizing the focus areas of the +model within the images, Grad-CAM fosters interpretability and builds trust in +the decision-making process of the model. Our proposed framework achieves +superior performance, particularly with the DenseNet121 pre-trained network, +reaching a precision of 97% in identifying the stressed class with an overall +accuracy of 91%. Comparative analysis with existing state-of-the-art object +detection algorithms reveals the superiority of our approach, with significantly +higher precision and accuracy.
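Grad-CAM itself is a standard technique; the hedged PyTorch sketch below hooks torchvision's DenseNet121 to produce a class-activation heatmap. The layer choice, input preprocessing, and random input are assumptions, not the paper's pipeline.

```python
# Minimal Grad-CAM sketch; layer choice and preprocessing are assumptions.
import torch
import torch.nn.functional as F
from torchvision.models import densenet121

model = densenet121(weights=None).eval()
feats, grads = {}, {}
layer = model.features                   # last convolutional feature block

layer.register_forward_hook(lambda m, i, o: feats.update(a=o))
layer.register_full_backward_hook(lambda m, gi, go: grads.update(a=go[0]))

x = torch.randn(1, 3, 224, 224)          # stand-in for a preprocessed UAV crop
logits = model(x)
logits[0, logits.argmax()].backward()    # gradient of the top class score

w = grads["a"].mean(dim=(2, 3), keepdim=True)            # channel-wise weights
cam = F.relu((w * feats["a"]).sum(dim=1, keepdim=True))  # weighted activation map
cam = F.interpolate(cam, size=x.shape[-2:], mode="bilinear", align_corners=False)
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)  # heatmap in [0, 1]
```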
+
+ comment: 16 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Automatic classification of prostate MR series type using image content + and metadata + + +
+ With the wealth of medical image data, efficient curation is essential. +Assigning the sequence type to magnetic resonance images is necessary for +scientific studies and artificial intelligence-based analysis. However, +incomplete or missing metadata prevents effective automation. We therefore +propose a deep-learning method for classification of prostate cancer scanning +sequences based on a combination of image data and DICOM metadata. We +demonstrate superior results compared to metadata or image data alone, and make +our code publicly available at +https://github.com/deepakri201/DICOMScanClassification. + +
+
+
+
+
+ + ♻ ☆ Novel Hybrid Integrated Pix2Pix and WGAN Model with Gradient Penalty for + Binary Images Denoising + + +
+ This paper introduces a novel approach to image denoising that leverages the +advantages of Generative Adversarial Networks (GANs). Specifically, we propose +a model that combines elements of the Pix2Pix model and the Wasserstein GAN +(WGAN) with Gradient Penalty (WGAN-GP). This hybrid framework seeks to +capitalize on the denoising capabilities of conditional GANs, as demonstrated +in the Pix2Pix model, while mitigating the need for an exhaustive search for +optimal hyperparameters that could potentially ruin the stability of the +learning process. In the proposed method, the GAN's generator is employed to +produce denoised images, harnessing the power of a conditional GAN for noise +reduction. Simultaneously, the implementation of the Lipschitz continuity +constraint during updates, as featured in WGAN-GP, aids in reducing +susceptibility to mode collapse. This innovative design allows the proposed +model to benefit from the strong points of both Pix2Pix and WGAN-GP, generating +superior denoising results while ensuring training stability. Drawing on +previous work on image-to-image translation and GAN stabilization techniques, +the proposed research highlights the potential of GANs as a general-purpose +solution for denoising. The paper details the development and testing of this +model, showcasing its effectiveness through numerical experiments. The dataset +was created by adding synthetic noise to clean images. Numerical results based +on real-world dataset validation underscore the efficacy of this approach in +image-denoising tasks, exhibiting significant enhancements over traditional +techniques. Notably, the proposed model demonstrates strong generalization +capabilities, performing effectively even when trained with synthetic noise. + +
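+ To make the Lipschitz-constraint step concrete, the standard WGAN-GP gradient penalty can be sketched as follows in PyTorch; the penalty weight lambda_gp=10 and the 4D image tensors are conventional illustrative defaults, not details taken from this paper:
+
+     import torch
+
+     def gradient_penalty(critic, real, fake, lambda_gp=10.0):
+         # Random per-sample interpolation between clean and generated (denoised) images.
+         alpha = torch.rand(real.size(0), 1, 1, 1, device=real.device)
+         interpolates = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
+         scores = critic(interpolates)
+         grads = torch.autograd.grad(outputs=scores, inputs=interpolates,
+                                     grad_outputs=torch.ones_like(scores),
+                                     create_graph=True)[0]
+         # Penalize deviation of the gradient norm from 1 (soft Lipschitz constraint).
+         grad_norm = grads.flatten(start_dim=1).norm(2, dim=1)
+         return lambda_gp * ((grad_norm - 1.0) ** 2).mean()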
+
+ comment: Systems and Soft Computing +
+
+
+
+
+ + ♻ ☆ Noise Level Adaptive Diffusion Model for Robust Reconstruction of + Accelerated MRI + + +
+ In general, diffusion model-based MRI reconstruction methods incrementally +remove artificially added noise while imposing data consistency to reconstruct +the underlying images. However, real-world MRI acquisitions already contain +inherent noise due to thermal fluctuations. This phenomenon is particularly +notable when using ultra-fast, high-resolution imaging sequences for advanced +research, or using low-field systems favored by low- and middle-income +countries. These common scenarios can lead to sub-optimal performance or +complete failure of existing diffusion model-based reconstruction techniques. +Specifically, as the artificially added noise is gradually removed, the +inherent MRI noise becomes increasingly pronounced, making the actual noise +level inconsistent with the predefined denoising schedule and consequently +leading to inaccurate image reconstruction. To tackle this problem, we propose +a posterior sampling strategy with a novel NoIse Level Adaptive Data Consistency (Nila-DC) +operation. Extensive experiments are conducted on two public datasets and an +in-house clinical dataset with field strength ranging from 0.3T to 3T, showing +that our method surpasses the state-of-the-art MRI reconstruction methods, and +is highly robust against various noise levels. The code for Nila is available +at https://github.com/Solor-pikachu/Nila. +
+
+
+
+
+ + ♻ ☆ Textual Query-Driven Mask Transformer for Domain Generalized + Segmentation ECCV 2024 + + +
+ In this paper, we introduce a method to tackle Domain Generalized Semantic +Segmentation (DGSS) by utilizing domain-invariant semantic knowledge from text +embeddings of vision-language models. We employ the text embeddings as object +queries within a transformer-based segmentation framework (textual object +queries). These queries are regarded as a domain-invariant basis for pixel +grouping in DGSS. To leverage the power of textual object queries, we introduce +a novel framework named the textual query-driven mask transformer (tqdm). Our +tqdm aims to (1) generate textual object queries that maximally encode +domain-invariant semantics and (2) enhance the semantic clarity of dense visual +features. Additionally, we suggest three regularization losses to improve the +efficacy of tqdm by aligning between visual and textual features. By utilizing +our method, the model can comprehend inherent semantic information for classes +of interest, enabling it to generalize to extreme domains (e.g., sketch style). +Our tqdm achieves 68.9 mIoU on GTA5$\rightarrow$Cityscapes, outperforming the +prior state-of-the-art method by 2.5 mIoU. The project page is available at +https://byeonghyunpak.github.io/tqdm. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Iterative Ensemble Training with Anti-Gradient Control for Mitigating + Memorization in Diffusion Models ECCV 2024 + + +
+ Diffusion models, known for their tremendous ability to generate novel and +high-quality samples, have recently raised concerns due to their data +memorization behavior, which poses privacy risks. Recent approaches for memorization +mitigation either focused only on the text modality problem in cross-modal +generation tasks or utilized data augmentation strategies. In this paper, we +propose a novel training framework for diffusion models from the perspective of +visual modality, which is more generic and fundamental for mitigating +memorization. To facilitate forgetting of stored information in diffusion model +parameters, we propose an iterative ensemble training strategy by splitting the +data into multiple shards for training multiple models and intermittently +aggregating these model parameters. Moreover, practical analysis of losses +illustrates that the training loss for easily memorable images tends to be +noticeably lower. Thus, we propose an anti-gradient control method to exclude +samples with lower loss values from the current mini-batch to avoid +memorization. Extensive experiments and analysis on four datasets are conducted +to illustrate the effectiveness of our method, and results show that our method +successfully reduces memorization while even improving the performance +slightly. Moreover, to save computing cost, we successfully apply our +method to fine-tune well-trained diffusion models within a limited number of epochs, +demonstrating the applicability of our method. Code is available at +https://github.com/liuxiao-guan/IET_AGC. +
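+ A minimal PyTorch sketch of the anti-gradient control idea above, assuming a per-sample loss (a criterion with reduction='none'); the drop_ratio parameter and the rule of skipping the lowest-loss samples in each mini-batch are illustrative assumptions rather than the paper's exact formulation:
+
+     import torch
+
+     def anti_gradient_control_step(model, images, targets, criterion, optimizer, drop_ratio=0.1):
+         per_sample_loss = criterion(model(images), targets)   # criterion uses reduction='none'
+         k = max(1, int(drop_ratio * per_sample_loss.numel()))
+         # Exclude the k lowest-loss (most easily memorized) samples from this update.
+         threshold = per_sample_loss.kthvalue(k).values
+         keep = per_sample_loss > threshold
+         if keep.any():
+             optimizer.zero_grad()
+             per_sample_loss[keep].mean().backward()
+             optimizer.step()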
+
+ comment: To appear in ECCV 2024, 20 pages with 7 figures +
+
+
+
+
+ + ♻ ☆ U-Net-based Lung Thickness Map for Pixel-level Lung Volume Estimation of + Chest X-rays + + +
+ Purpose: We aimed to estimate the total lung volume (TLV) from real and +synthetic frontal X-ray radiographs on a pixel level using lung thickness maps +generated by a U-Net. + Methods: 5,959 thorax X-ray computed tomography (CT) scans were retrieved +from two publicly available datasets of the lung nodule analysis 2016 (n=656) +and the RSNA pulmonary embolism detection challenge 2020 (n=5,303). +Additionally, thorax CT scans from 72 subjects (33 healthy: 20 men, mean age +[range] = 62.4 [34, 80]; 39 suffering from chronic obstructive pulmonary +disease: 25 men, mean age [range] = 69.0 [47, 91]) were retrospectively +selected (10.2018-12.2019) from our in-house dataset such that for each +subject, a frontal chest X-ray radiograph no older than seven days was +available. All CT scans and their corresponding lung segmentation were forward +projected using a simulated X-ray spectrum to generate synthetic radiographs +and lung thickness maps, respectively. A U-Net model was trained and tested on +synthetic radiographs from the public datasets to predict lung thickness maps +and consequently estimate TLV. Model performance was further assessed by +evaluating the TLV estimations for the in-house synthetic and real radiograph +pairs using Pearson correlation coefficient (r) and significance testing. + Results: Strong correlations were measured between the predicted and +CT-derived ground truth TLV values for test data from synthetic +($n_{Public}$=1,191, r=0.987, P < 0.001; $n_{In-house}$=72, r=0.973, P < 0.001) +and real radiographs (n=72, r=0.908, P < 0.001). + Conclusion: TLV from U-Net-generated pixel-level lung thickness maps were +successfully estimated for synthetic and real radiographs. + +
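+ The pixel-level volume estimate described above reduces to integrating the predicted thickness map over the image plane. A minimal sketch, assuming the map is in millimetres and the pixel spacing is known (both assumptions for illustration):
+
+     import numpy as np
+
+     def total_lung_volume_ml(thickness_map_mm, pixel_spacing_mm=(1.0, 1.0)):
+         # Each pixel contributes (thickness x pixel area) to the volume.
+         pixel_area_mm2 = pixel_spacing_mm[0] * pixel_spacing_mm[1]
+         volume_mm3 = float(np.sum(thickness_map_mm)) * pixel_area_mm2
+         return volume_mm3 / 1000.0   # mm^3 -> millilitres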
+
+
+
+
+ + ♻ ☆ Investigating and Mitigating the Multimodal Hallucination Snowballing in + Large Vision-Language Models ACL 2024 + + +
+ Though advanced in understanding visual information with human languages, +Large Vision-Language Models (LVLMs) still suffer from multimodal +hallucinations. A natural concern is that during multimodal interaction, the +generated hallucinations could influence the LVLMs' subsequent generation. +Thus, we raise a question: When presented with a query relevant to the +previously generated hallucination, will LVLMs be misled and respond +incorrectly, even though the ground visual information exists? To answer this, +we propose a framework called MMHalSnowball to evaluate LVLMs' behaviors when +encountering generated hallucinations, where LVLMs are required to answer +specific visual questions within a curated hallucinatory conversation. +Crucially, our experiment shows that the performance of open-source LVLMs drops +by at least $31\%$, indicating that LVLMs are prone to accept the generated +hallucinations and make false claims that they would not have supported without +distractions. We term this phenomenon Multimodal Hallucination Snowballing. To +mitigate this, we further propose a training-free method called Residual Visual +Decoding, where we revise the output distribution of LVLMs with the one derived +from the residual visual input, providing models with direct access to the +visual information. Experiments show that our method can mitigate more than +$24\%$ of the snowballed multimodal hallucination while maintaining +capabilities. + +
+
+ comment: Accepted to ACL 2024 Main Conference. 21 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ RepGhost: A Hardware-Efficient Ghost Module via Re-parameterization + + +
+ Feature reuse has been a key technique in light-weight convolutional neural +network (CNN) architecture design. Current methods usually utilize a +concatenation operator to keep large channel numbers cheaply (thus large +network capacity) by reusing feature maps from other layers. Although +concatenation is parameters- and FLOPs-free, its computational cost on hardware +devices is non-negligible. To address this, this paper provides a new +perspective to realize feature reuse implicitly and more efficiently instead of +concatenation. A novel hardware-efficient RepGhost module is proposed for +implicit feature reuse via reparameterization, instead of using the concatenation +operator. Based on the RepGhost module, we develop our efficient RepGhost +bottleneck and RepGhostNet. Experiments on ImageNet and COCO benchmarks +demonstrate that our RepGhostNet is much more effective and efficient than +GhostNet and MobileNetV3 on mobile devices. Specifically, our RepGhostNet +surpasses GhostNet 0.5x by 2.5% Top-1 accuracy on the ImageNet dataset with fewer +parameters and comparable latency on an ARM-based mobile device. Code and model +weights are available at https://github.com/ChengpengChen/RepGhost. +
+
+ comment: tech report +
+
+
+
+
+ + ♻ ☆ DGInStyle: Domain-Generalizable Semantic Segmentation with Image + Diffusion Models and Stylized Semantic Control ECCV 2024 + + +
+ Large, pretrained latent diffusion models (LDMs) have demonstrated an +extraordinary ability to generate creative content, specialize to user data +through few-shot fine-tuning, and condition their output on other modalities, +such as semantic maps. However, are they usable as large-scale data generators, +e.g., to improve tasks in the perception stack, like semantic segmentation? We +investigate this question in the context of autonomous driving, and answer it +with a resounding "yes". We propose an efficient data generation pipeline +termed DGInStyle. First, we examine the problem of specializing a pretrained +LDM to semantically-controlled generation within a narrow domain. Second, we +propose a Style Swap technique to endow the rich generative prior with the +learned semantic control. Third, we design a Multi-resolution Latent Fusion +technique to overcome the bias of LDMs towards dominant objects. Using +DGInStyle, we generate a diverse dataset of street scenes, train a +domain-agnostic semantic segmentation model on it, and evaluate the model on +multiple popular autonomous driving datasets. Our approach consistently +increases the performance of several domain generalization methods compared to +the previous state-of-the-art methods. The source code and the generated +dataset are available at https://dginstyle.github.io. + +
+
+ comment: ECCV 2024, camera ready +
+
+
+
+
+ + ♻ ☆ XMeCap: Meme Caption Generation with Sub-Image Adaptability + + +
+ Humor, deeply rooted in societal meanings and cultural details, poses a +unique challenge for machines. While advances have been made in natural +language processing, real-world humor often thrives in a multi-modal context, +encapsulated distinctively by memes. This paper places particular emphasis on +the impact of multiple images on meme captioning. We then introduce the +\textsc{XMeCap} framework, a novel approach that adopts supervised fine-tuning +and reinforcement learning based on an innovative reward model, which factors +in both global and local similarities between visuals and text. Our results, +benchmarked against contemporary models, manifest a marked improvement in +caption generation for both single-image and multi-image memes, as well as +different meme categories. \textsc{XMeCap} achieves an average evaluation score +of 75.85 for single-image memes and 66.32 for multi-image memes, outperforming +the best baseline by 3.71\% and 4.82\%, respectively. This research not only +establishes a new frontier in meme-related studies but also underscores the +potential of machines in understanding and generating humor in a multi-modal +setting. +
+
+ comment: Accepted to MM 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Interpretability of Vertebrae Fracture Grading using + Human-interpretable Prototypes + + +
+ Vertebral fracture grading classifies the severity of vertebral fractures, +which is a challenging task in medical imaging and has recently attracted Deep +Learning (DL) models. Only a few works attempted to make such models +human-interpretable despite the need for transparency and trustworthiness in +critical use cases like DL-assisted medical diagnosis. Moreover, such models +either rely on post-hoc methods or additional annotations. In this work, we +propose a novel interpretable-by-design method, ProtoVerse, to find relevant +sub-parts of vertebral fractures (prototypes) that reliably explain the model's +decision in a human-understandable way. Specifically, we introduce a novel +diversity-promoting loss to mitigate prototype repetitions in small datasets +with intricate semantics. We have experimented with the VerSe'19 dataset and +outperformed the existing prototype-based method. Further, our model provides +superior interpretability against the post-hoc method. Importantly, expert +radiologists validated the visual interpretability of our results, showing +clinical applicability. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:015 +
+
+
+
+
+ + ♻ ☆ Neural Cellular Automata for Lightweight, Robust and Explainable + Classification of White Blood Cell Images MICCAI 2024 + + +
+ Diagnosis of hematological malignancies depends on accurate identification of +white blood cells in peripheral blood smears. Deep learning techniques are +emerging as a viable solution to scale and optimize this process by automatic +cell classification. However, these techniques face several challenges such as +limited generalizability, sensitivity to domain shifts, and lack of +explainability. Here, we introduce a novel approach for white blood cell +classification based on neural cellular automata (NCA). We test our approach on +three datasets of white blood cell images and show that we achieve competitive +performance compared to conventional methods. Our NCA-based method is +significantly smaller in terms of parameters and exhibits robustness to domain +shifts. Furthermore, the architecture is inherently explainable, providing +insights into the decision process for each classification, which helps to +understand and validate model predictions. Our results demonstrate that NCA can +be used for image classification, and that they address key challenges of +conventional methods, indicating a high potential for applicability in clinical +practice. + +
+
+ comment: Accepted for publication at the 27th International Conference on + Medical Image Computing and Computer Assisted Intervention - MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ 3D-GRES: Generalized 3D Referring Expression Segmentation ACM MM 2024 + + +
+ 3D Referring Expression Segmentation (3D-RES) is dedicated to segmenting a +specific instance within a 3D space based on a natural language description. +However, current approaches are limited to segmenting a single target, +restricting the versatility of the task. To overcome this limitation, we +introduce Generalized 3D Referring Expression Segmentation (3D-GRES), which +extends the capability to segment any number of instances based on natural +language instructions. In addressing this broader task, we propose the +Multi-Query Decoupled Interaction Network (MDIN), designed to break down +multi-object segmentation tasks into simpler, individual segmentations. MDIN +comprises two fundamental components: Text-driven Sparse Queries (TSQ) and +Multi-object Decoupling Optimization (MDO). TSQ generates sparse point cloud +features distributed over key targets as the initialization for queries. +Meanwhile, MDO is tasked with assigning each target in multi-object scenarios +to different queries while maintaining their semantic consistency. To adapt to +this new task, we build a new dataset, namely Multi3DRes. Our comprehensive +evaluations on this dataset demonstrate substantial enhancements over existing +models, thus charting a new path for intricate multi-object 3D scene +comprehension. The benchmark and code are available at +https://github.com/sosppxo/MDIN. + +
+
+ comment: Accepted by ACM MM 2024 (Oral), Code: https://github.com/sosppxo/MDIN +
+
+
+
+
+ + ♻ ☆ Transferring to Real-World Layouts: A Depth-aware Framework for Scene + Adaptation ACM MM 2024 + + +
+ Scene segmentation via unsupervised domain adaptation (UDA) enables the +transfer of knowledge acquired from source synthetic data to real-world target +data, which largely reduces the need for manual pixel-level annotations in the +target domain. To facilitate domain-invariant feature learning, existing +methods typically mix data from both the source domain and target domain by +simply copying and pasting the pixels. Such vanilla methods are usually +sub-optimal since they do not take into account how well the mixed layouts +correspond to real-world scenarios. Real-world scenarios have an inherent +layout. We observe that semantic categories, such as sidewalks, buildings, and +sky, display relatively consistent depth distributions, and could be clearly +distinguished in a depth map. Based on such observation, we propose a +depth-aware framework to explicitly leverage depth estimation to mix the +categories and facilitate the two complementary tasks, i.e., segmentation and +depth learning in an end-to-end manner. In particular, the framework contains a +Depth-guided Contextual Filter (DCF) for data augmentation and a cross-task +encoder for contextual learning. DCF simulates the real-world layouts, while +the cross-task encoder further adaptively fuses the complementary features +between the two tasks. Besides, it is worth noting that several public datasets do +not provide depth annotation. Therefore, we leverage an off-the-shelf depth +estimation network to generate pseudo depth. Extensive experiments show +that our proposed methods, even with pseudo depth, achieve competitive +performance on two widely-used benchmarks, i.e. 77.7 mIoU on GTA to Cityscapes +and 69.3 mIoU on Synthia to Cityscapes. +
+
+ comment: ACM MM 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Knowledge Mechanisms in Large Language Models: A Survey and Perspective + + +
+ Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial +for advancing towards trustworthy AGI. This paper reviews knowledge mechanism +analysis from a novel taxonomy including knowledge utilization and evolution. +Knowledge utilization delves into the mechanism of memorization, comprehension +and application, and creation. Knowledge evolution focuses on the dynamic +progression of knowledge within individual and group LLMs. Moreover, we discuss +what knowledge LLMs have learned, the reasons for the fragility of parametric +knowledge, and the potential dark knowledge (hypothesis) that will be +challenging to address. We hope this work can help understand knowledge in LLMs +and provide insights for future research. + +
+
+ comment: Ongoing work (v2); add Section 5: Application of Knowledge Mechanism; + revise Section 6 and 7; fix typos +
+
+
+
+
+ + ♻ ☆ AFGI: Towards Accurate and Fast-convergent Gradient Inversion Attack in + Federated Learning + + +
+ Federated learning (FL) empowers privacy preservation in model training by +only exposing users' model gradients. Yet, FL users are susceptible to gradient +inversion attacks (GIAs) which can reconstruct ground-truth training data such +as images based on model gradients. However, reconstructing high-resolution +images by existing GIAs faces two challenges: inferior accuracy and +slow convergence, especially when duplicating labels exist in the training +batch. To address these challenges, we present an Accurate and Fast-convergent +Gradient Inversion attack algorithm, called AFGI, with two components: a Label +Recovery Block (LRB), which can accurately restore duplicating labels of private +images based on exposed gradients, and a VME Regularization Term, which includes the +total variance of reconstructed images and the discrepancies, in terms of three-channel +means and edges respectively, between values derived from exposed gradients and those of the reconstructed +images. AFGI can be regarded as a white-box attack strategy +to reconstruct images by leveraging labels recovered by LRB. In particular, +AFGI is efficient, accurately reconstructing ground-truth images when users' +training batch size is up to 48. Our experimental results manifest that AFGI +can reduce time costs by 85% while achieving superb inversion quality on the +ImageNet dataset. Finally, our study unveils the shortcomings of FL in +privacy preservation, prompting the development of more advanced countermeasure +strategies. +
+
+
+
+
+ + ♻ ☆ Monocular Human-Object Reconstruction in the Wild + + +
+ Learning the prior knowledge of the 3D human-object spatial relation is +crucial for reconstructing human-object interaction from images and +understanding how humans interact with objects in 3D space. Previous works +learn this prior from datasets collected in controlled environments, but due to +the diversity of domains, they struggle to generalize to real-world scenarios. +To overcome this limitation, we present a 2D-supervised method that learns the +3D human-object spatial relation prior purely from 2D images in the wild. Our +method utilizes a flow-based neural network to learn the prior distribution of +the 2D human-object keypoint layout and viewports for each image in the +dataset. The effectiveness of the prior learned from 2D images is demonstrated +on the human-object reconstruction task by applying the prior to tune the +relative pose between the human and the object during the post-optimization +stage. To validate and benchmark our method on in-the-wild images, we collect +the WildHOI dataset from the YouTube website, which consists of various +interactions with 8 objects in real-world scenarios. We conduct the experiments +on the indoor BEHAVE dataset and the outdoor WildHOI dataset. The results show +that our method achieves almost comparable performance with fully 3D supervised +methods on the BEHAVE dataset, even if we have only utilized the 2D layout +information, and outperforms previous methods in terms of generality and +interaction diversity on in-the-wild images. + +
+
+ comment: Accepted by MM '24 +
+
+
+
+
+ + ♻ ☆ Scalable Group Choreography via Variational Phase Manifold Learning ECCV 2024 + + +
+ Generating group dance motion from the music is a challenging task with +several industrial applications. Although several methods have been proposed to +tackle this problem, most of them prioritize optimizing the fidelity in dancing +movement, constrained by predetermined dancer counts in datasets. This +limitation impedes adaptability to real-world applications. Our study addresses +the scalability problem in group choreography while preserving naturalness and +synchronization. In particular, we propose a phase-based variational generative +model for group dance generation on learning a generative manifold. Our method +achieves high-fidelity group dance motion and enables the generation with an +unlimited number of dancers while consuming only a minimal and constant amount +of memory. The intensive experiments on two public datasets show that our +proposed method outperforms recent state-of-the-art approaches by a large +margin and is scalable to a great number of dancers beyond the training data. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with + Spatial Relation Matching ECCV 2024 + + +
+ Navigating drones through natural language commands remains challenging due +to the dearth of accessible multi-modal datasets and the stringent precision +requirements for aligning visual and textual data. To address this pressing +need, we introduce GeoText-1652, a new natural language-guided geo-localization +benchmark. This dataset is systematically constructed through an interactive +human-computer process leveraging Large Language Model (LLM) driven annotation +techniques in conjunction with pre-trained vision models. GeoText-1652 extends +the established University-1652 image dataset with spatial-aware text +annotations, thereby establishing one-to-one correspondences between image, +text, and bounding box elements. We further introduce a new optimization +objective to leverage fine-grained spatial associations, called blending +spatial matching, for region-level spatial relation matching. Extensive +experiments reveal that our approach maintains a competitive recall rate +compared with other prevailing cross-modality methods. This underscores the +promising potential of our approach in elevating drone control and navigation +through the seamless integration of natural language commands in real-world +scenarios. +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ SiNGR: Brain Tumor Segmentation via Signed Normalized Geodesic Transform + Regression MICCAI 2024 + + +
+ One of the primary challenges in brain tumor segmentation arises from the +uncertainty of voxels close to tumor boundaries. However, the conventional +process of generating ground truth segmentation masks fails to treat such +uncertainties properly. Those "hard labels" with 0s and 1s conceptually +influenced the majority of prior studies on brain image segmentation. As a +result, tumor segmentation is often solved through voxel classification. In +this work, we instead view this problem as a voxel-level regression, where the +ground truth represents a certainty mapping from any pixel to the border of the +tumor. We propose a novel ground truth label transformation, which is based on +a signed geodesic transform, to capture the uncertainty in brain tumors' +vicinity. We combine this idea with a Focal-like regression L1-loss that +enables effective regression learning in high-dimensional output space by +appropriately weighting voxels according to their difficulty. We thoroughly +conduct an experimental evaluation to validate the components of our proposed +method, compare it to a diverse array of state-of-the-art segmentation models, +and show that it is architecture-agnostic. The code of our method is made +publicly available (\url{https://github.com/Oulu-IMEDS/SiNGR/}). + +
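+ One plausible reading of the "Focal-like regression L1-loss" named above is an L1 loss whose per-voxel weight grows with the voxel's current error, so that hard voxels dominate the gradient. The sketch below reflects only that reading; the exponent gamma and the normalization are assumptions, not the paper's definition:
+
+     import torch
+
+     def focal_like_l1(pred, target, gamma=1.0, eps=1e-8):
+         err = (pred - target).abs()
+         # Difficulty-proportional weight in [0, 1]; larger error -> larger weight.
+         weight = (err / (err.max() + eps)) ** gamma
+         return (weight * err).mean()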
+
+ comment: Accepted as a conference paper at MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Intracranial Hemorrhage Segmentation with YOLO and an + Uncertainty Rectified Segment Anything Model + + +
+ Intracranial hemorrhage (ICH) is a life-threatening condition that requires +rapid and accurate diagnosis to improve treatment outcomes and patient survival +rates. Recent advancements in supervised deep learning have greatly improved +the analysis of medical images, but often rely on extensive datasets with +high-quality annotations, which are costly, time-consuming, and require medical +expertise to prepare. To mitigate the need for large amounts of expert-prepared +segmentation data, we have developed a novel weakly supervised ICH segmentation +method that utilizes the YOLO object detection model and an +uncertainty-rectified Segment Anything Model (SAM). In addition, we have +proposed a novel point prompt generator for this model to further improve +segmentation results with YOLO-predicted bounding box prompts. Our approach +achieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along +with a mean Dice score of 0.629 for ICH segmentation, outperforming existing +weakly supervised and popular supervised (UNet and Swin-UNETR) approaches. +Overall, the proposed method provides a robust and accurate alternative to the +more commonly used supervised techniques for ICH quantification without +requiring refined segmentation ground truths during model training. + +
+
+ comment: Manuscript was accepted at SWITCH2024. 10 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing + + +
+ Executing contact-rich manipulation tasks necessitates the fusion of tactile +and visual feedback. However, the distinct nature of these modalities poses +significant challenges. In this paper, we introduce a system that leverages +visual and tactile sensory inputs to enable dexterous in-hand manipulation. +Specifically, we propose Robot Synesthesia, a novel point cloud-based tactile +representation inspired by human tactile-visual synesthesia. This approach +allows for the simultaneous and seamless integration of both sensory inputs, +offering richer spatial information and facilitating better reasoning about +robot actions. The method, trained in a simulated environment and then deployed +to a real robot, is applicable to various in-hand object rotation tasks. +Comprehensive ablations are performed on how the integration of vision and +touch can improve reinforcement learning and Sim2Real performance. Our project +page is available at https://yingyuan0414.github.io/visuotactile/ . + +
+
+ comment: Project page: https://yingyuan0414.github.io/visuotactile/ +
+
+
+
+
+ + ♻ ☆ Dynamic Neural Radiance Field From Defocused Monocular Video ECCV 2024 + + +
+ Dynamic Neural Radiance Field (NeRF) from monocular videos has recently been +explored for space-time novel view synthesis and achieved excellent results. +However, defocus blur caused by depth variation often occurs in video capture, +compromising the quality of dynamic reconstruction because the lack of sharp +details interferes with modeling temporal consistency between input views. To +tackle this issue, we propose D2RF, the first dynamic NeRF method designed to +restore sharp novel views from defocused monocular videos. We introduce layered +Depth-of-Field (DoF) volume rendering to model the defocus blur and reconstruct +a sharp NeRF supervised by defocused views. The blur model is inspired by the +connection between DoF rendering and volume rendering. The opacity in volume +rendering aligns with the layer visibility in DoF rendering. To execute the +blurring, we modify the layered blur kernel to the ray-based kernel and employ +an optimized sparse kernel to gather the input rays efficiently and render the +optimized rays with our layered DoF volume rendering. We synthesize a dataset +with defocused dynamic scenes for our task, and extensive experiments on our +dataset show that our method outperforms existing approaches in synthesizing +all-in-focus novel views from defocus blur while maintaining spatial-temporal +consistency in the scene. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Restore-RWKV: Efficient and Effective Medical Image Restoration with + RWKV + + +
+ Transformers have revolutionized medical image restoration, but the quadratic +complexity still poses limitations for their application to high-resolution +medical images. The recent advent of RWKV in the NLP field has attracted much +attention as it can process long sequences efficiently. To leverage its +advanced design, we propose Restore-RWKV, the first RWKV-based model for +medical image restoration. Since the original RWKV model is designed for 1D +sequences, we make two necessary modifications for modeling spatial relations +in 2D images. First, we present a recurrent WKV (Re-WKV) attention mechanism +that captures global dependencies with linear computational complexity. Re-WKV +incorporates bidirectional attention as a basis for a global receptive field and +recurrent attention to effectively model 2D dependencies from various scan +directions. Second, we develop an omnidirectional token shift (Omni-Shift) +layer that enhances local dependencies by shifting tokens from all directions +and across a wide context range. These adaptations make the proposed +Restore-RWKV an efficient and effective model for medical image restoration. +Extensive experiments demonstrate that Restore-RWKV achieves superior +performance across various medical image restoration tasks, including MRI image +super-resolution, CT image denoising, PET image synthesis, and all-in-one +medical image restoration. Code is available at: +\href{https://github.com/Yaziwel/Restore-RWKV.git}{https://github.com/Yaziwel/Restore-RWKV}. +
+
+ comment: This paper introduces the first RWKV-based model for image + restoration +
+
+
+
+
+ + ♻ ☆ MOD-UV: Learning Mobile Object Detectors from Unlabeled Videos ECCV 2024 + + +
+ Embodied agents must detect and localize objects of interest, e.g. traffic +participants for self-driving cars. Supervision in the form of bounding boxes +for this task is extremely expensive. As such, prior work has looked at +unsupervised instance detection and segmentation, but in the absence of +annotated boxes, it is unclear how pixels must be grouped into objects and +which objects are of interest. This results in over-/under-segmentation and +irrelevant objects. Inspired by human visual system and practical applications, +we posit that the key missing cue for unsupervised detection is motion: objects +of interest are typically mobile objects that frequently move and their motions +can specify separate instances. In this paper, we propose MOD-UV, a Mobile +Object Detector learned from Unlabeled Videos only. We begin with instance +pseudo-labels derived from motion segmentation, but introduce a novel training +paradigm to progressively discover small objects and static-but-mobile objects +that are missed by motion segmentation. As a result, though only learned from +unlabeled videos, MOD-UV can detect and segment mobile objects from a single +static image. Empirically, we achieve state-of-the-art performance in +unsupervised mobile object detection on Waymo Open, nuScenes, and KITTI +Datasets without using any external data or supervised models. Code is +available at https://github.com/YihongSun/MOD-UV. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Spectral-Spatial Mamba for Hyperspectral Image Classification + + +
+ Recently, deep learning models have achieved excellent performance in +hyperspectral image (HSI) classification. Among the many deep models, +Transformer has gradually attracted interest for its excellence in modeling the +long-range dependencies of spatial-spectral features in HSI. However, +Transformer has the problem of quadratic computational complexity due to the +self-attention mechanism, which is heavier than other models and thus has +limited adoption in HSI processing. Fortunately, the recently emerging state +space model-based Mamba shows great computational efficiency while achieving +the modeling power of Transformers. Therefore, in this paper, we make a +preliminary attempt to apply the Mamba to HSI classification, leading to the +proposed spectral-spatial Mamba (SS-Mamba). Specifically, the proposed SS-Mamba +mainly consists of spectral-spatial token generation module and several stacked +spectral-spatial Mamba blocks. Firstly, the token generation module converts +any given HSI cube to spatial and spectral tokens as sequences. And then these +tokens are sent to stacked spectral-spatial mamba blocks (SS-MB). Each SS-MB +block consists of two basic mamba blocks and a spectral-spatial feature +enhancement module. The spatial and spectral tokens are processed separately by +the two basic mamba blocks, respectively. Besides, the feature enhancement +module modulates spatial and spectral tokens using HSI sample's center region +information. In this way, the spectral and spatial tokens cooperate with each +other and achieve information fusion within each block. The experimental +results conducted on widely used HSI datasets reveal that the proposed model +achieves competitive results compared with the state-of-the-art methods. The +Mamba-based method opens a new window for HSI classification. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ DISORF: A Distributed Online 3D Reconstruction Framework for Mobile + Robots + + +
+ We present a framework, DISORF, to enable online 3D reconstruction and +visualization of scenes captured by resource-constrained mobile robots and edge +devices. To address the limited computing capabilities of edge devices and +potentially limited network availability, we design a framework that +efficiently distributes computation between the edge device and the remote +server. We leverage on-device SLAM systems to generate posed keyframes and +transmit them to remote servers that can perform high-quality 3D reconstruction +and visualization at runtime by leveraging recent advances in neural 3D +methods. We identify a key challenge with online training where naive image +sampling strategies can lead to significant degradation in rendering quality. +We propose a novel shifted exponential frame sampling method that addresses +this challenge for online training. We demonstrate the effectiveness of our +framework in enabling high-quality real-time reconstruction and visualization +of unknown scenes as they are captured and streamed from cameras in mobile +robots and edge devices. + +
+
+
+
+
+ + ♻ ☆ MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal + Dataset with One Trillion Tokens + + +
+ Multimodal interleaved datasets featuring free-form interleaved sequences of +images and text are crucial for training frontier large multimodal models +(LMMs). Despite the rapid progression of open-source LMMs, there remains a +pronounced scarcity of large-scale, diverse open-source multimodal interleaved +datasets. In response, we introduce MINT-1T, the most extensive and diverse +open-source Multimodal INTerleaved dataset to date. MINT-1T comprises one +trillion text tokens and 3.4 billion images, a 10x scale-up from existing +open-source datasets. Additionally, we include previously untapped sources such +as PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires +substantial engineering effort, sharing the data curation process and releasing +the dataset greatly benefits the community. Our experiments show that LMMs +trained on MINT-1T rival the performance of models trained on the previous +leading dataset, OBELICS. Our data and code will be released at +https://github.com/mlfoundations/MINT-1T. + +
+
+
+
+
+ + ♻ ☆ Saliency Guided Image Warping for Unsupervised Domain Adaptation + + +
+ Driving is challenging in conditions like night, rain, and snow. The lack of +good labeled datasets has hampered progress in scene understanding under such +conditions. Unsupervised domain adaptation (UDA) using large labeled clear-day +datasets is a promising research direction in such cases. Current UDA methods, +however, treat all image pixels uniformly, leading to over-reliance on the +dominant scene backgrounds (e.g., roads, sky, sidewalks) that appear +dramatically different across domains. As a result, they struggle to learn +effective features of smaller and often sparse foreground objects (e.g., +people, vehicles, signs). + In this work, we improve UDA training by using in-place image warping to +focus on salient object regions. Our insight is that while backgrounds vary +significantly across domains (e.g., snowy night vs. clear day), object +appearances vary to a lesser extent. Therefore, we design instance-level +saliency guidance to adaptively oversample object regions, which reduces +adverse effects from background context and enhances backbone feature learning. +We then unwarp the better learned features while adapting from source to +target. Our approach improves adaptation across geographies, lighting, and +weather conditions, and is agnostic to the task (segmentation, detection), +domain adaptation algorithm, saliency guidance, and underlying model +architecture. Result highlights include +6.1 mAP50 for BDD100K Clear +$\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\rightarrow$ Night, +3.0 +mAP50 for BDD100K Clear $\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes +$\rightarrow$ ACDC. Our method adds minimal training memory and incurs no +additional inference latency. Please see Appendix for more results and +analysis. + +
+
+
+
+
+ + ♻ ☆ Vid3D: Synthesis of Dynamic 3D Scenes using 2D Video Diffusion + + +
+ A recent frontier in computer vision has been the task of 3D video +generation, which consists of generating a time-varying 3D representation of a +scene. To generate dynamic 3D scenes, current methods explicitly model 3D +temporal dynamics by jointly optimizing for consistency across both time and +views of the scene. In this paper, we instead investigate whether it is +necessary to explicitly enforce multiview consistency over time, as current +approaches do, or if it is sufficient for a model to generate 3D +representations of each timestep independently. We hence propose a model, +Vid3D, that leverages 2D video diffusion to generate 3D videos by first +generating a 2D "seed" of the video's temporal dynamics and then independently +generating a 3D representation for each timestep in the seed video. We evaluate +Vid3D against two state-of-the-art 3D video generation methods and find that +Vid3D achieves comparable results despite not explicitly modeling 3D +temporal dynamics. We further ablate how the quality of Vid3D depends on the +number of views generated per frame. While we observe some degradation with +fewer views, performance degradation remains minor. Our results thus suggest +that 3D temporal knowledge may not be necessary to generate high-quality +dynamic 3D scenes, potentially enabling simpler generative algorithms for this +task. +
+
+ comment: 14 pages, 10 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Neural Point Cloud Diffusion for Disentangled 3D Shape and Appearance + Generation CVPR 2024 + + +
+ Controllable generation of 3D assets is important for many practical +applications like content creation in movies, games and engineering, as well as +in AR/VR. Recently, diffusion models have shown remarkable results in +generation quality of 3D objects. However, none of the existing models enable +disentangled generation to control the shape and appearance separately. For the +first time, we present a suitable representation for 3D diffusion models to +enable such disentanglement by introducing a hybrid point cloud and neural +radiance field approach. We model a diffusion process over point positions +jointly with a high-dimensional feature space for a local density and radiance +decoder. While the point positions represent the coarse shape of the object, +the point features allow modeling the geometry and appearance details. This +disentanglement enables us to sample both independently and therefore to +control both separately. Our approach sets a new state of the art in generation +compared to previous disentanglement-capable methods, with FID scores reduced by +30-90%, and is on par with other non-disentanglement-capable state-of-the-art +methods. +
+
+ comment: CVPR 2024. Project page: + https://neural-point-cloud-diffusion.github.io/ +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ MOSAIC: Multimodal Multistakeholder-aware Visual Art Recommendation + + +
+ Visual art (VA) recommendation is complex, as it has to consider the +interests of users (e.g. museum visitors) and other stakeholders (e.g. museum +curators). We study how to effectively account for key stakeholders in VA +recommendations while also considering user-centred measures such as novelty, +serendipity, and diversity. We propose MOSAIC, a novel multimodal +multistakeholder-aware approach using state-of-the-art CLIP and BLIP backbone +architectures and two joint optimisation objectives: popularity and +representative selection of paintings across different categories. We conducted +an offline evaluation using preferences elicited from 213 users followed by a +user study with 100 crowdworkers. We found a strong effect of popularity, which +was positively perceived by users, and a minimal effect of representativeness. +MOSAIC's impact extends beyond visitors, benefiting various art stakeholders. +Its user-centric approach has broader applicability, offering advancements for +content recommendation across domains that require considering multiple +stakeholders. + +
+
+
+
+
+ + ☆ Adaptive Retrieval-Augmented Generation for Conversational Systems + + +
+ Despite the success of integrating large language models into the development +of conversational systems, many studies have shown the effectiveness of +retrieving and augmenting external knowledge for informative responses. Hence, +many existing studies commonly assume that Retrieval Augmented +Generation (RAG) is always needed in a conversational system, without explicit control. This +raises a research question about such a necessity. In this study, we propose to +investigate the need for each turn of system response to be augmented with +external knowledge. In particular, by leveraging human judgements on the binary +choice of adaptive augmentation, we develop RAGate, a gating model, which +models conversation context and relevant inputs to predict if a conversational +system requires RAG for improved responses. We conduct extensive experiments on +devising and applying RAGate to conversational models and well-rounded analyses +of different conversational scenarios. Our experimental results and analysis +indicate the effective application of RAGate in RAG-based conversational +systems in identifying system responses that warrant RAG, yielding high-quality +responses and high generation confidence. This study also identifies the +correlation between the generation's confidence level and the relevance of the +augmented knowledge. +
+
+ comment: 12 pages, under review +
+
+
+
+
+ + ☆ Learning Effective Representations for Retrieval Using Self-Distillation + with Adaptive Relevance Margins + + +
+ Representation-based retrieval models, so-called biencoders, estimate the +relevance of a document to a query by calculating the similarity of their +respective embeddings. Current state-of-the-art biencoders are trained using an +expensive training regime involving knowledge distillation from a teacher model +and batch-sampling. Instead of relying on a teacher model, we contribute a +novel parameter-free loss function for self-supervision that exploits the +pre-trained language modeling capabilities of the encoder model as a training +signal, eliminating the need for batch sampling by performing implicit hard +negative mining. We investigate the capabilities of our proposed approach +through extensive ablation studies, demonstrating that self-distillation can +match the effectiveness of teacher distillation using only 13.5% of the data, +while offering a speedup in training time between 3x and 15x compared to +parametrized losses. Code and data are made openly available. +
+
+ comment: 9 Pages, 4 Tables, 6 Figures +
+
+
+
+
+ + ☆ Breaking the Hourglass Phenomenon of Residual Quantization: Enhancing + the Upper Bound of Generative Retrieval + + +
+ Generative retrieval (GR) has emerged as a transformative paradigm in search +and recommender systems, leveraging numeric-based identifier representations to +enhance efficiency and generalization. Notably, methods like TIGER employing +Residual Quantization-based Semantic Identifiers (RQ-SID), have shown +significant promise in e-commerce scenarios by effectively managing item IDs. +However, a critical issue termed the "\textbf{Hourglass}" phenomenon, occurs in +RQ-SID, where intermediate codebook tokens become overly concentrated, +hindering the full utilization of generative retrieval methods. This paper +analyses and addresses this problem by identifying data sparsity and +long-tailed distribution as the primary causes. Through comprehensive +experiments and detailed ablation studies, we analyze the impact of these +factors on codebook utilization and data distribution. Our findings reveal that +the "Hourglass" phenomenon substantially impacts the performance of RQ-SID in +generative retrieval. We propose effective solutions to mitigate this issue, +thereby significantly enhancing the effectiveness of generative retrieval in +real-world E-commerce applications. + +
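+ For readers unfamiliar with RQ-SID, the sketch below shows how residual quantization turns an item embedding into a multi-level semantic identifier; the codebook sizes and shapes are illustrative, not the configuration used in TIGER or in this paper. The "Hourglass" phenomenon corresponds to the intermediate levels of such IDs collapsing onto a few heavily reused codes:
+
+     import numpy as np
+
+     def residual_quantize(embedding, codebooks):
+         # codebooks: list of arrays, each with shape (num_codes, dim)
+         residual, semantic_id = embedding.copy(), []
+         for codebook in codebooks:
+             idx = int(np.argmin(np.linalg.norm(codebook - residual, axis=1)))
+             semantic_id.append(idx)              # one token per quantization level
+             residual = residual - codebook[idx]
+         return semantic_id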
+
+
+
+
+ + ☆ ABCDE: Application-Based Cluster Diff Evals + + +
+ This paper considers the problem of evaluating clusterings of very large +populations of items. Given two clusterings, namely a Baseline clustering and +an Experiment clustering, the tasks are twofold: 1) characterize their +differences, and 2) determine which clustering is better. ABCDE is a novel +evaluation technique for accomplishing that. It aims to be practical: it allows +items to have associated importance values that are application-specific, it is +frugal in its use of human judgements when determining which clustering is +better, and it can report metrics for arbitrary slices of items, thereby +facilitating understanding and debugging. The approach to measuring the delta +in clustering quality is novel: instead of trying to construct an expensive +ground truth up front and evaluating each clustering with respect to it, +where the ground truth must effectively pre-anticipate clustering changes, +ABCDE samples questions for judgement on the basis of the actual diffs between +the clusterings. ABCDE builds upon the pointwise metrics for clustering +evaluation, which make the ABCDE metrics intuitive and simple to understand. +The mathematical elegance of the pointwise metrics equips ABCDE with rigorous +yet practical ways to explore the clustering diffs and to estimate the quality +delta. +
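+ A simplified sketch of the sampling idea above, assuming each clustering is a dict from item to cluster id and that importance values are given; the importance-weighted sampling with replacement is an illustrative choice, not ABCDE's exact procedure:
+
+     import random
+
+     def sample_diff_questions(items, baseline, experiment, importance, n=20):
+         # Only items whose cluster assignment actually changed are candidates for judgement.
+         diff = [i for i in items if baseline[i] != experiment[i]]
+         if not diff:
+             return []
+         weights = [importance[i] for i in diff]
+         return random.choices(diff, weights=weights, k=min(n, len(diff)))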
+
+
+
+
+ + ☆ Personalized Multi-task Training for Recommender System + + +
+ In the vast landscape of internet information, recommender systems (RecSys) +have become essential for guiding users through a sea of choices aligned with +their preferences. These systems have applications in diverse domains, such as +news feeds, game suggestions, and shopping recommendations. Personalization is +a key technique in RecSys, where modern methods leverage representation +learning to encode user/item interactions into embeddings, forming the +foundation for personalized recommendations. However, integrating information +from multiple sources to enhance recommendation performance remains +challenging. This paper introduces a novel approach named PMTRec, the first +personalized multi-task learning algorithm to obtain comprehensive user/item +embeddings from various information sources. Addressing challenges specific to +personalized RecSys, we develop modules to handle personalized task weights, +diverse task orientations, and variations in gradient magnitudes across tasks. +PMTRec dynamically adjusts task weights based on gradient norms for each +user/item, employs a Task Focusing module to align gradient combinations with +the main recommendation task, and uses a Gradient Magnitude Balancing module to +ensure balanced training across tasks. Through extensive experiments on three +real-world datasets with different scales, we demonstrate that PMTRec +significantly outperforms existing multi-task learning methods, showcasing its +effectiveness in achieving enhanced recommendation accuracy by leveraging +multiple tasks simultaneously. Our contributions open new avenues for advancing +personalized multi-task training in recommender systems. + +
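+ A generic PyTorch sketch of gradient-magnitude balancing of the kind described above: each task receives a weight inversely proportional to its gradient norm on the shared parameters, so no auxiliary task dominates the update. This is a common balancing heuristic shown for illustration, not PMTRec's exact per-user rule:
+
+     import torch
+
+     def gradient_norm_task_weights(task_losses, shared_params, eps=1e-12):
+         norms = []
+         for loss in task_losses:
+             grads = torch.autograd.grad(loss, shared_params, retain_graph=True)
+             norms.append(torch.sqrt(sum((g ** 2).sum() for g in grads)) + eps)
+         inv = torch.stack([1.0 / n for n in norms])
+         return inv / inv.sum()   # normalized weights, one per task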
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ ProSpec RL: Plan Ahead, then Execute + + +
+ Imagining potential outcomes of actions before execution helps agents make +more informed decisions, a prospective thinking ability fundamental to human +cognition. However, mainstream model-free Reinforcement Learning (RL) methods +lack the ability to proactively envision future scenarios, plan, and guide +strategies. These methods typically rely on trial and error to adjust policy +functions, aiming to maximize cumulative rewards or long-term value, even if +such high-reward decisions place the environment in extremely dangerous states. +To address this, we propose the Prospective (ProSpec) RL method, which makes +higher-value, lower-risk optimal decisions by imagining future n-stream +trajectories. Specifically, ProSpec employs a dynamic model to predict future +states (termed "imagined states") based on the current state and a series of +sampled actions. Furthermore, we integrate the concept of Model Predictive +Control and introduce a cycle consistency constraint that allows the agent to +evaluate and select the optimal actions from these trajectories. Moreover, +ProSpec employs cycle consistency to mitigate two fundamental issues in RL: +augmenting state reversibility to avoid irreversible events (low risk) and +augmenting actions to generate numerous virtual trajectories, thereby improving +data efficiency. We validated the effectiveness of our method on the DMControl +benchmarks, where our approach achieved significant performance improvements. +Code will be open-sourced upon acceptance. + +
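+ The "plan ahead, then execute" loop can be pictured as a model-predictive sketch: roll candidate action sequences through a learned dynamics model, score the imagined trajectories, and execute only the first action of the best sequence. The dynamics and reward models below are placeholders and the greedy scoring is an assumption, not ProSpec's full method (which also adds cycle-consistency constraints):
+
+     import numpy as np
+
+     def prospective_action(dynamics_model, reward_model, state, candidate_action_seqs):
+         best_score, best_first_action = -np.inf, None
+         for actions in candidate_action_seqs:
+             s, score = state, 0.0
+             for a in actions:                 # imagine the n-step trajectory
+                 s = dynamics_model(s, a)
+                 score += reward_model(s, a)
+             if score > best_score:
+                 best_score, best_first_action = score, actions[0]
+         return best_first_action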
+
+
+
+
+ + ☆ Implementing Streaming algorithm and k-means clusters to RAG + + +
+ Retrieval-augmented generation (RAG) has achieved great success in +information retrieval to assist large models because it builds an external +knowledge database. However, it also has drawbacks: it consumes a lot of +memory because of the huge database, and when faced with massive streaming data, it +is unable to update the established index database in time. To save the memory +consumed in building the database while maintaining accuracy, we propose a +new approach combining a streaming algorithm and k-means clustering with RAG. Our +approach applies a streaming algorithm to update the index and reduce memory +consumption. We then use the k-means algorithm to cluster highly similar documents +together, which shortens the query time. We +conducted comparative experiments on four methods, and the results show that +RAG with the streaming algorithm and k-means clustering performs well in accuracy and +memory. For massive streaming data, we find that our method performs better than +traditional RAG. +
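+ A minimal sketch of the combination described above, using scikit-learn's MiniBatchKMeans as the streaming clusterer: the index is updated incrementally as document batches arrive, and a query is compared only against documents in its nearest cluster. The number of clusters and the dot-product scoring are illustrative choices, not the paper's exact setup:
+
+     import numpy as np
+     from sklearn.cluster import MiniBatchKMeans
+
+     class ClusteredStreamingIndex:
+         def __init__(self, n_clusters=8):
+             self.kmeans = MiniBatchKMeans(n_clusters=n_clusters)
+             self.docs, self.embs, self.labels = [], [], []
+
+         def add_batch(self, texts, embeddings):
+             self.kmeans.partial_fit(embeddings)              # streaming index update
+             self.docs.extend(texts)
+             self.embs.extend(list(np.asarray(embeddings)))
+             self.labels = list(self.kmeans.predict(np.asarray(self.embs)))
+
+         def retrieve(self, query_emb, top_k=3):
+             # Search only within the cluster closest to the query embedding.
+             cluster = int(self.kmeans.predict(query_emb[None])[0])
+             cand = [i for i, c in enumerate(self.labels) if c == cluster]
+             cand.sort(key=lambda i: -float(query_emb @ self.embs[i]))
+             return [self.docs[i] for i in cand[:top_k]]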
+
+
+
+
+ + ♻ ☆ Attribute-driven Disentangled Representation Learning for Multimodal + Recommendation + + +
+ Recommendation algorithms forecast user preferences by correlating user and +item representations derived from historical interaction patterns. In pursuit +of enhanced performance, many methods focus on learning robust and independent +representations by disentangling the intricate factors within interaction data +across various modalities in an unsupervised manner. However, such an approach +obfuscates the discernment of how specific factors (e.g., category or brand) +influence the outcomes, making it challenging to regulate their effects. In +response to this challenge, we introduce a novel method called Attribute-Driven +Disentangled Representation Learning (short for AD-DRL), which explicitly +incorporates attributes from different modalities into the disentangled +representation learning process. By assigning a specific attribute to each +factor in multimodal features, AD-DRL can disentangle the factors at both +attribute and attribute-value levels. To obtain robust and independent +representations for each factor associated with a specific attribute, we first +disentangle the representations of features both within and across different +modalities. Moreover, we further enhance the robustness of the representations +by fusing the multimodal features of the same factor. Empirical evaluations +conducted on three public real-world datasets substantiate the effectiveness of +AD-DRL, as well as its interpretability and controllability. + +
+
+ comment: ACM Multimedia 2024 Accepted +
+
+
+
+
+ + ♻ ☆ DCNv3: Towards Next Generation Deep Cross Network for CTR Prediction + + +
+ Deep & Cross Network and its derivative models have become an important +paradigm in click-through rate (CTR) prediction due to their effective balance +between computational cost and performance. However, these models face four +major limitations: (1) while most models claim to capture high-order feature +interactions, they often do so implicitly and non-interpretably through deep +neural networks (DNN), which limits the trustworthiness of the model's +predictions; (2) the performance of existing explicit feature interaction +methods is often weaker than that of implicit DNN, undermining their necessity; +(3) many models fail to adaptively filter noise while enhancing the order of +feature interactions; (4) the fusion methods of most models cannot provide +suitable supervision signals for their different interaction methods. + To address the identified limitations, this paper proposes the next +generation Deep Cross Network (DCNv3) and Shallow & Deep Cross Network +(SDCNv3). These models ensure interpretability in feature interaction modeling +while exponentially increasing the order of feature interactions to achieve +genuine Deep Crossing rather than just Deep & Cross. Additionally, we employ a +Self-Mask operation to filter noise and reduce the number of parameters in the +cross network by half. In the fusion layer, we use a simple yet effective loss +weight calculation method called Tri-BCE to provide appropriate supervision +signals. Comprehensive experiments on six datasets demonstrate the +effectiveness, efficiency, and interpretability of DCNv3 and SDCNv3. The code, +running logs, and detailed hyperparameter configurations are available at: +https://anonymous.4open.science/r/DCNv3-E352. + +
+
+
+
+
+ + ♻ ☆ Context-augmented Retrieval: A Novel Framework for Fast Information + Retrieval based Response Generation using Large Language Model + + +
+ Generating high-quality answers consistently by providing contextual +information embedded in the prompt passed to the Large Language Model (LLM) is +dependent on the quality of information retrieval. As the corpus of contextual +information grows, the answer/inference quality of Retrieval Augmented +Generation (RAG) based Question Answering (QA) systems declines. This work +addresses this problem by combining classical text classification with the Large +Language Model (LLM) to enable quick information retrieval from the vector +store and ensure the relevancy of retrieved information. To this end, this +work proposes a new approach, Context Augmented Retrieval (CAR), which +partitions the vector database by classifying information flowing into the +corpus in real time. CAR demonstrates good-quality answer +generation along with a significant reduction in information retrieval and answer +generation time. + 
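A hedged illustration of the partition-by-classification idea (the classifier, labels, and routing below are placeholders, not the paper's actual pipeline):

```python
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Toy training data for the partition classifier.
train_docs = ["refund policy for orders", "api error timeout",
              "invoice billing cycle", "rest api authentication"]
train_labels = ["billing", "engineering", "billing", "engineering"]

vec = TfidfVectorizer().fit(train_docs)
clf = MultinomialNB().fit(vec.transform(train_docs), train_labels)

partitions = defaultdict(list)          # partition label -> stored documents

def ingest(doc):
    # Documents are classified as they arrive and stored in one partition.
    partitions[clf.predict(vec.transform([doc]))[0]].append(doc)

def candidates(query):
    # Only the query's predicted partition is handed to the downstream retriever/LLM.
    return partitions[clf.predict(vec.transform([query]))[0]]

ingest("how do I reset my api key")
print(candidates("api key rotation"))
```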
+
+ comment: Because the dataset in which the model was trained upon wasn't + consistent across different sections so it was preferred to delete this + preprint +
+
+
+
+
+ + ♻ ☆ Neural Retrievers are Biased Towards LLM-Generated Content KDD 2024 + + +
+ Recently, the emergence of large language models (LLMs) has revolutionized +the paradigm of information retrieval (IR) applications, especially in web +search, by generating vast amounts of human-like texts on the Internet. As a +result, IR systems in the LLM era are facing a new challenge: the indexed +documents are now not only written by human beings but also automatically +generated by the LLMs. How these LLM-generated documents influence the IR +systems is a pressing and still unexplored question. In this work, we conduct a +quantitative evaluation of IR models in scenarios where both human-written and +LLM-generated texts are involved. Surprisingly, our findings indicate that +neural retrieval models tend to rank LLM-generated documents higher. We refer +to this category of biases in neural retrievers towards the LLM-generated +content as the \textbf{source bias}. Moreover, we discover that this bias is +not confined to the first-stage neural retrievers, but extends to the +second-stage neural re-rankers. Then, in-depth analyses from the perspective of +text compression indicate that LLM-generated texts exhibit more focused +semantics with less noise, making it easier for neural retrieval models to +semantic match. To mitigate the source bias, we also propose a plug-and-play +debiased constraint for the optimization objective, and experimental results +show its effectiveness. Finally, we discuss the potential severe concerns +stemming from the observed source bias and hope our findings can serve as a +critical wake-up call to the IR community and beyond. To facilitate future +explorations of IR in the LLM era, the constructed two new benchmarks are +available at https://github.com/KID-22/Source-Bias. + +
+
+ comment: KDD 2024 +
+
+
+
+
+ + ♻ ☆ Semantic-aware Representation Learning for Homography Estimation ACM MM 2024 + + +
+ Homography estimation is the task of determining the transformation from an +image pair. Our approach focuses on employing detector-free feature matching +methods to address this issue. Previous work has underscored the importance of +incorporating semantic information; however, an efficient way to utilize +semantic information is still lacking. Previous methods treat the +semantics as a pre-processing step, which makes the use of semantics overly +coarse-grained and lacking in adaptability when dealing with different tasks. In our +work, we seek another way to use the semantic information, namely a +semantic-aware feature representation learning framework. Based on this, we +propose SRMatcher, a new detector-free feature matching method, which +encourages the network to learn an integrated semantic feature +representation. Specifically, to capture precise and rich semantics, we leverage +the capabilities of recently popularized vision foundation models (VFMs) +trained on extensive datasets. Then, a cross-images Semantic-aware Fusion Block +(SFB) is proposed to integrate their fine-grained semantic features into the +feature representation space. In this way, by reducing errors stemming from +semantic inconsistencies in matching pairs, our proposed SRMatcher is able to +deliver more accurate and realistic outcomes. Extensive experiments show that +SRMatcher surpasses solid baselines and attains SOTA results on multiple +real-world datasets. Compared to the previous SOTA approach GeoFormer, +SRMatcher increases the area under the cumulative curve (AUC) by about 11% on +HPatches. Additionally, SRMatcher can serve as a plug-and-play framework +for other matching methods like LoFTR, yielding substantial precision +improvements. + 
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Ontologies for Models and Algorithms in Applied Mathematics and Related + Disciplines + + +
+ In applied mathematics and related disciplines, the +modeling-simulation-optimization workflow is a prominent scheme, with +mathematical models and numerical algorithms playing a crucial role. For these +types of mathematical research data, the Mathematical Research Data Initiative +has developed, merged and implemented ontologies and knowledge graphs. This +contributes to making mathematical research data FAIR by introducing semantic +technology and documenting the mathematical foundations accordingly. Using the +concrete example of microfracture analysis of porous media, it is shown how the +knowledge of the underlying mathematical model and the corresponding numerical +algorithms for its solution can be represented by the ontologies. + +
+
+
+
+
+ + ♻ ☆ Scalable Dynamic Embedding Size Search for Streaming Recommendation CIKM 2024 + + +
+ Recommender systems typically represent users and items by learning their +embeddings, which are usually set to uniform dimensions and dominate the model +parameters. However, real-world recommender systems often operate in streaming +recommendation scenarios, where the number of users and items continues to +grow, leading to substantial storage resource consumption for these embeddings. +Although a few methods attempt to mitigate this by employing embedding size +search strategies to assign different embedding dimensions in streaming +recommendations, they assume that the embedding size grows with the frequency +of users/items, which eventually still exceeds the predefined memory budget +over time. To address this issue, this paper proposes to learn Scalable +Lightweight Embeddings for streaming recommendation, called SCALL, which can +adaptively adjust the embedding sizes of users/items within a given memory +budget over time. Specifically, we propose to sample embedding sizes from a +probabilistic distribution, with the guarantee to meet any predefined memory +budget. By fixing the memory budget, the proposed embedding size sampling +strategy can increase and decrease the embedding sizes in accordance with the +frequency of the corresponding users or items. Furthermore, we develop a +reinforcement learning-based search paradigm that models each state with mean +pooling to keep the length of the state vectors fixed, invariant to the +changing number of users and items. As a result, the proposed method can +provide embedding sizes to unseen users and items. Comprehensive empirical +evaluations on two public datasets confirm the effectiveness of our +proposed method. + 
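A rough sketch of frequency-aware, budget-constrained size sampling in the spirit described above (the Dirichlet sampling and the clipping rule are illustrative assumptions; the paper's probabilistic model and RL search are not reproduced):

```python
import numpy as np

def sample_embedding_sizes(freqs, budget, min_dim=2, max_dim=64, seed=0):
    """Sample one embedding size per user/item so the total stays near `budget`.

    Higher-frequency users/items tend to receive larger sizes; clipping keeps
    every size within [min_dim, max_dim]. A production version would add a
    final adjustment pass so the clipped sizes never exceed the budget."""
    rng = np.random.default_rng(seed)
    shares = rng.dirichlet(np.asarray(freqs, dtype=float) + 1.0)
    return np.clip(np.round(shares * budget), min_dim, max_dim).astype(int)

freqs = [120, 40, 5, 300, 18]          # toy per-user/item interaction counts
print(sample_embedding_sizes(freqs, budget=150))
```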
+
+ comment: accepted to CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Tower Multi-Interest Recommendation with User Representation Repel + + +
+ In the era of information overload, the value of recommender systems has been +profoundly recognized in academia and industry alike. Multi-interest sequential +recommendation, in particular, is a subfield that has been receiving increasing +attention in recent years. By generating multiple user representations, +multi-interest learning models demonstrate greater expressiveness than +single-user representation models, both theoretically and empirically. Despite +major advancements in the field, three major issues continue to plague the +performance and adoptability of multi-interest learning methods: the difference +between training and deployment objectives, the inability to access item +information, and the difficulty of industrial adoption due to their single-tower +architecture. We address these challenges by proposing a novel multi-tower +multi-interest framework with user representation repel. Experimental results +across multiple large-scale industrial datasets prove the effectiveness and +generalizability of our proposed framework. + 
+
+ comment: Not accepted by conference +
+
+
+
+
+
+
+
+ + Machine Learning 112 + +
+
+
+ + ☆ Generalized Out-of-Distribution Detection and Beyond in Vision Language + Model Era: A Survey + + +
+ Detecting out-of-distribution (OOD) samples is crucial for ensuring the +safety of machine learning systems and has shaped the field of OOD detection. +Meanwhile, several other problems are closely related to OOD detection, +including anomaly detection (AD), novelty detection (ND), open set recognition +(OSR), and outlier detection (OD). To unify these problems, a generalized OOD +detection framework was proposed, taxonomically categorizing these five +problems. However, Vision Language Models (VLMs) such as CLIP have +significantly changed the paradigm and blurred the boundaries between these +fields, again confusing researchers. In this survey, we first present a +generalized OOD detection v2, encapsulating the evolution of AD, ND, OSR, OOD +detection, and OD in the VLM era. Our framework reveals that, with some field +inactivity and integration, the demanding challenges have become OOD detection +and AD. In addition, we also highlight the significant shift in the definition, +problem settings, and benchmarks; we thus feature a comprehensive review of the +methodology for OOD detection, including the discussion over other related +tasks to clarify their relationship to OOD detection. Finally, we explore the +advancements in the emerging Large Vision Language Model (LVLM) era, such as +GPT-4V. We conclude this survey with open challenges and future directions. + +
+
+ comment: survey paper. We welcome questions, issues, and paper requests via + https://github.com/AtsuMiyai/Awesome-OOD-VLM +
+
+
+
+
+ + ☆ Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress? + + +
+ As artificial intelligence systems grow more powerful, there has been +increasing interest in "AI safety" research to address emerging and future +risks. However, the field of AI safety remains poorly defined and +inconsistently measured, leading to confusion about how researchers can +contribute. This lack of clarity is compounded by the unclear relationship +between AI safety benchmarks and upstream general capabilities (e.g., general +knowledge and reasoning). To address these issues, we conduct a comprehensive +meta-analysis of AI safety benchmarks, empirically analyzing their correlation +with general capabilities across dozens of models and providing a survey of +existing directions in AI safety. Our findings reveal that many safety +benchmarks highly correlate with upstream model capabilities, potentially +enabling "safetywashing" -- where capability improvements are misrepresented as +safety advancements. Based on these findings, we propose an empirical +foundation for developing more meaningful safety metrics and define AI safety +in a machine learning research context as a set of clearly delineated research +goals that are empirically separable from generic capabilities advancements. In +doing so, we aim to provide a more rigorous framework for AI safety research, +advancing the science of safety evaluations and clarifying the path towards +measurable progress. + +
+
+
+
+
+ + ☆ Deep Learning for Options Trading: An End-To-End Approach + + +
+ We introduce a novel approach to options trading strategies using a highly +scalable and data-driven machine learning algorithm. In contrast to traditional +approaches that often require specifications of underlying market dynamics or +assumptions on an option pricing model, our models depart fundamentally from +the need for these prerequisites, directly learning non-trivial mappings from +market data to optimal trading signals. Backtesting on more than a decade of +option contracts for equities listed on the S&P 100, we demonstrate that deep +learning models trained according to our end-to-end approach exhibit +significant improvements in risk-adjusted performance over existing rules-based +trading strategies. We find that incorporating turnover regularization into the +models leads to further performance enhancements at prohibitively high levels +of transaction costs. + +
+
+
+
+
+ + ☆ Vision-Language Model Based Handwriting Verification + + +
+ Handwriting verification is a critical task in document forensics. Deep learning-based +approaches often face skepticism from forensic document examiners due to +their lack of explainability and reliance on extensive training data and +handcrafted features. This paper explores using Vision Language Models (VLMs), +such as OpenAI's GPT-4o and Google's PaliGemma, to address these challenges. By +leveraging their Visual Question Answering capabilities and 0-shot +Chain-of-Thought (CoT) reasoning, our goal is to provide clear, +human-understandable explanations for model decisions. Our experiments on the +CEDAR handwriting dataset demonstrate that VLMs offer enhanced +interpretability, reduce the need for large training datasets, and adapt better +to diverse handwriting styles. However, results show that the CNN-based +ResNet-18 architecture outperforms the 0-shot CoT prompt engineering approach +with GPT-4o (Accuracy: 70%) and supervised fine-tuned PaliGemma (Accuracy: +71%), achieving an accuracy of 84% on the CEDAR AND dataset. These findings +highlight the potential of VLMs in generating human-interpretable decisions +while underscoring the need for further advancements to match the performance +of specialized deep learning models. + 
+
+ comment: 4 Pages, 1 Figure, 1 Table, Accepted as Short paper at Irish Machine + Vision and Image Processing (IMVIP) Conference +
+
+
+
+
+ + ☆ Large Language Monkeys: Scaling Inference Compute with Repeated Sampling + + +
+ Scaling the amount of compute used to train language models has dramatically +improved their capabilities. However, when it comes to inference, we often +limit the amount of compute to only one attempt per problem. Here, we explore +inference compute as another axis for scaling by increasing the number of +generated samples. Across multiple tasks and models, we observe that coverage - +the fraction of problems solved by any attempt - scales with the number of +samples over four orders of magnitude. In domains like coding and formal +proofs, where all answers can be automatically verified, these increases in +coverage directly translate into improved performance. When we apply repeated +sampling to SWE-bench Lite, the fraction of issues solved with +DeepSeek-V2-Coder-Instruct increases from 15.9% with one sample to 56% with 250 +samples, outperforming the single-attempt state-of-the-art of 43% which uses +more capable frontier models. Moreover, using current API pricing, amplifying +the cheaper DeepSeek model with five samples is more cost-effective and solves +more issues than paying a premium for one sample from GPT-4o or Claude 3.5 +Sonnet. Interestingly, the relationship between coverage and the number of +samples is often log-linear and can be modelled with an exponentiated power +law, suggesting the existence of inference-time scaling laws. Finally, we find +that identifying correct samples out of many generations remains an important +direction for future research in domains without automatic verifiers. When +solving math word problems from GSM8K and MATH, coverage with Llama-3 models +grows to over 95% with 10,000 samples. However, common methods to pick correct +solutions from a sample collection, such as majority voting or reward models, +plateau beyond several hundred samples and fail to fully scale with the sample +budget. + +
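The coverage quantity discussed above can be illustrated with the standard unbiased pass@k estimator (a generic formulation, not code from the paper; the per-problem counts are toy numbers):

```python
from math import comb

def pass_at_k(n, c, k):
    """Probability that at least one of k samples drawn from n is correct,
    given c of the n samples are correct: 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Coverage at k = mean pass@k over problems.
correct_per_problem = [3, 0, 12, 1]    # correct samples out of n=250, per problem (toy)
n_samples = 250
for k in (1, 10, 100, 250):
    coverage = sum(pass_at_k(n_samples, c, k)
                   for c in correct_per_problem) / len(correct_per_problem)
    print(f"k={k:3d}  coverage={coverage:.3f}")
```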
+
+
+
+
+ + ☆ ShieldGemma: Generative AI Content Moderation Based on Gemma + + +
+ We present ShieldGemma, a comprehensive suite of LLM-based safety content +moderation models built upon Gemma2. These models provide robust, +state-of-the-art predictions of safety risks across key harm types (sexually +explicit, dangerous content, harassment, hate speech) in both user input and +LLM-generated output. By evaluating on both public and internal benchmarks, we +demonstrate superior performance compared to existing models, such as Llama +Guard (+10.8\% AU-PRC on public benchmarks) and WildCard (+4.3\%). +Additionally, we present a novel LLM-based data curation pipeline, adaptable to +a variety of safety-related tasks and beyond. We have shown strong +generalization performance for a model trained mainly on synthetic data. By +releasing ShieldGemma, we provide a valuable resource to the research +community, advancing LLM safety and enabling the creation of more effective +content moderation solutions for developers. + 
+
+
+
+
+ + ☆ MoMa: Efficient Early-Fusion Pre-training with Mixture of Modality-Aware + Experts + + +
+ We introduce MoMa, a novel modality-aware mixture-of-experts (MoE) +architecture designed for pre-training mixed-modal, early-fusion language +models. MoMa processes images and text in arbitrary sequences by dividing +expert modules into modality-specific groups. These groups exclusively process +designated tokens while employing learned routing within each group to maintain +semantically informed adaptivity. Our empirical results reveal substantial +pre-training efficiency gains through this modality-specific parameter +allocation. Under a 1-trillion-token training budget, the MoMa 1.4B model, +featuring 4 text experts and 4 image experts, achieves impressive FLOPs +savings: 3.7x overall, with 2.6x for text and 5.2x for image processing +compared to a compute-equivalent dense baseline, measured by pre-training loss. +This outperforms the standard expert-choice MoE with 8 mixed-modal experts, +which achieves 3x overall FLOPs savings (3x for text, 2.8x for image). +Combining MoMa with mixture-of-depths (MoD) further improves pre-training FLOPs +savings to 4.2x overall (text: 3.4x, image: 5.3x), although this combination +hurts performance in causal inference due to increased sensitivity to router +accuracy. These results demonstrate MoMa's potential to significantly advance +the efficiency of mixed-modal, early-fusion language model pre-training, paving +the way for more resource-efficient and capable multimodal AI systems. + +
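A toy sketch of modality-aware expert grouping (the router weights, sizes, and top-1 routing are illustrative stand-ins for the learned routing described above, not the paper's architecture):

```python
import numpy as np

rng = np.random.default_rng(0)
d = 8
# Separate expert groups per modality; each expert is a simple linear map here.
experts = {"text": [rng.normal(size=(d, d)) for _ in range(4)],
           "image": [rng.normal(size=(d, d)) for _ in range(4)]}
routers = {m: rng.normal(size=(len(ws), d)) for m, ws in experts.items()}

def modality_aware_moe(tokens, modalities):
    out = np.empty_like(tokens)
    for i, (tok, mod) in enumerate(zip(tokens, modalities)):
        scores = routers[mod] @ tok                  # score only this modality's experts
        expert = experts[mod][int(np.argmax(scores))]
        out[i] = expert @ tok                        # top-1 expert within the group
    return out

tokens = rng.normal(size=(6, d))
print(modality_aware_moe(tokens, ["text", "image", "text", "text", "image", "image"]).shape)
```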
+
+
+
+
+ + ☆ Diagnostic Runtime Monitoring with Martingales + + +
+ Machine learning systems deployed in safety-critical robotics settings must +be robust to distribution shifts. However, system designers must understand the +cause of a distribution shift in order to implement the appropriate +intervention or mitigation strategy and prevent system failure. In this paper, +we present a novel framework for diagnosing distribution shifts in a streaming +fashion by deploying multiple stochastic martingales simultaneously. We show +that knowledge of the underlying cause of a distribution shift can lead to +proper interventions over the lifecycle of a deployed system. Our experimental +framework can easily be adapted to different types of distribution shifts, +models, and datasets. We find that our method outperforms existing work on +diagnosing distribution shifts in terms of speed, accuracy, and flexibility, +and validate the efficiency of our model in both simulated and live hardware +settings. + +
+
+
+
+
+ + ☆ HGOE: Hybrid External and Internal Graph Outlier Exposure for Graph + Out-of-Distribution Detection + + +
+ With the progressive advancements in deep graph learning, out-of-distribution +(OOD) detection for graph data has emerged as a critical challenge. While the +efficacy of auxiliary datasets in enhancing OOD detection has been extensively +studied for image and text data, such approaches have not yet been explored for +graph data. Unlike Euclidean data, graph data exhibits greater diversity but +lower robustness to perturbations, complicating the integration of outliers. To +tackle these challenges, we propose the introduction of \textbf{H}ybrid +External and Internal \textbf{G}raph \textbf{O}utlier \textbf{E}xposure (HGOE) +to improve graph OOD detection performance. Our framework involves using +realistic external graph data from various domains and synthesizing internal +outliers within ID subgroups to address the poor robustness and presence of OOD +samples within the ID class. Furthermore, we develop a boundary-aware OE loss +that adaptively assigns weights to outliers, maximizing the use of high-quality +OOD samples while minimizing the impact of low-quality ones. Our proposed HGOE +framework is model-agnostic and designed to enhance the effectiveness of +existing graph OOD detection models. Experimental results demonstrate that our +HGOE framework can significantly improve the performance of existing OOD +detection models across all 8 real datasets. + +
+
+ comment: Proceedings of the 32nd ACM International Conference on Multimedia +
+
+
+
+
+ + ☆ Contrastive Factor Analysis + + +
+ Factor analysis, often regarded as a Bayesian variant of matrix +factorization, offers superior capabilities in capturing uncertainty, modeling +complex dependencies, and ensuring robustness. With the arrival of the deep learning era, +factor analysis has received less and less attention due to its +limited expressive ability. In contrast, contrastive learning has emerged +as a potent technique with demonstrated efficacy in unsupervised +representational learning. While the two methods are different paradigms, +recent theoretical analysis has revealed the mathematical equivalence between +contrastive learning and matrix factorization, suggesting the possibility of +combining factor analysis with contrastive learning. Motivated +by the interconnectedness of contrastive learning, matrix factorization, and +factor analysis, this paper introduces a novel Contrastive Factor Analysis +framework, aiming to leverage factor analysis's advantageous properties within +the realm of contrastive learning. To further leverage the interpretability +properties of non-negative factor analysis, which can learn disentangled +representations, contrastive factor analysis is extended to a non-negative +version. Finally, extensive experimental validation showcases the efficacy of +the proposed contrastive (non-negative) factor analysis methodology across +multiple key properties, including expressiveness, robustness, +interpretability, and accurate uncertainty estimation. + 
+
+
+
+
+ + ☆ A Federated Learning-Friendly Approach for Parameter-Efficient + Fine-Tuning of SAM in 3D Segmentation + + +
+ Adapting foundation models for medical image analysis requires finetuning +them on a considerable amount of data because of extreme distribution shifts +between natural (source) data used for pretraining and medical (target) data. +However, collecting task-specific medical data for such finetuning at a central +location raises many privacy concerns. Although Federated learning (FL) +provides an effective means for training on private decentralized data, +communication costs in federating large foundation models can quickly become a +significant bottleneck, impacting the solution's scalability. In this work, we +address this problem of efficient communication while ensuring effective +learning in FL by combining the strengths of Parameter-Efficient Fine-tuning +(PEFT) with FL. Specifically, we study plug-and-play Low-Rank Adapters (LoRA) +in a federated manner to adapt the Segment Anything Model (SAM) for 3D medical +image segmentation. Unlike prior works that utilize LoRA and finetune the +entire decoder, we critically analyze the contribution of each granular +component of SAM on finetuning performance. Thus, we identify specific layers +to be federated that are very efficient in terms of communication cost while +producing on-par accuracy. Our experiments show that retaining the parameters +of the SAM model (including most of the decoder) in their original state during +adaptation is beneficial because fine-tuning on small datasets tends to distort +the inherent capabilities of the underlying foundation model. On Fed-KiTS, our +approach decreases communication cost (~48x) compared to full fine-tuning while +increasing performance (~6% Dice score) in 3D segmentation tasks. Our approach +performs similar to SAMed while achieving ~2.8x reduction in communication and +parameters to be finetuned. We further validate our approach with experiments +on Fed-IXI and Prostate MRI datasets. + +
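A hedged sketch of the communication pattern described above: each client fine-tunes only low-rank adapter matrices locally and the server averages just those, leaving the frozen foundation-model weights untouched. The layer names, shapes, and local update are placeholders, not SAM's actual configuration:

```python
import numpy as np

def init_lora(d=256, r=4, seed=0):
    rng = np.random.default_rng(seed)
    # Standard LoRA initialisation: small random A, zero B.
    return {"A": rng.normal(scale=0.01, size=(r, d)), "B": np.zeros((d, r))}

def local_update(adapter, lr=1e-3, seed=1):
    # Placeholder for a client's local training steps on its private data.
    rng = np.random.default_rng(seed)
    return {k: v - lr * rng.normal(size=v.shape) for k, v in adapter.items()}

def fedavg(adapters):
    # Only the adapter tensors cross the network, which is what keeps the
    # communication cost far below exchanging full model weights.
    return {k: np.mean([a[k] for a in adapters], axis=0) for k in adapters[0]}

clients = [local_update(init_lora(), seed=s) for s in range(3)]
global_adapter = fedavg(clients)
print({k: v.shape for k, v in global_adapter.items()})
```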
+
+
+
+
+ + ☆ Leveraging Self-Supervised Learning for Fetal Cardiac Planes + Classification using Ultrasound Scan Videos MICCAI 2023 + + +
+ Self-supervised learning (SSL) methods are popular since they can address +situations with limited annotated data by directly utilising the underlying +data distribution. However, the adoption of such methods is not explored enough +in ultrasound (US) imaging, especially for fetal assessment. We investigate the +potential of dual-encoder SSL in utilizing unlabelled US video data to improve +the performance of challenging downstream Standard Fetal Cardiac Planes (SFCP) +classification using limited labelled 2D US images. We study 7 SSL approaches +based on reconstruction, contrastive loss, distillation, and information theory +and evaluate them extensively on a large private US dataset. Our observations +and findings are consolidated from more than 500 downstream training +experiments under different settings. Our primary observation shows that for +SSL training, the variance of the dataset is more crucial than its size because +it allows the model to learn generalisable representations, which improve the +performance of downstream tasks. Overall, the BarlowTwins method shows robust +performance, irrespective of the training settings and data variations, when +used as an initialisation for downstream tasks. Notably, full fine-tuning with +1% of labelled data outperforms ImageNet initialisation by 12% in F1-score and +outperforms other SSL initialisations by at least 4% in F1-score, thus making +it a promising candidate for transfer learning from US video to image data. + +
+
+ comment: Simplifying Medical Ultrasound: 4th International Workshop, ASMUS + 2023, Held in Conjunction with MICCAI 2023, Vancouver, BC, Canada, October 8, + 2023, Proceedings +
+
+
+
+
+ + ☆ Social Learning through Interactions with Other Agents: A Survey IJCAI 2024 + + +
+ Social learning plays an important role in the development of human +intelligence. As children, we imitate our parents' speech patterns until we are +able to produce sounds; we learn from them praising us and scolding us; and as +adults, we learn by working with others. In this work, we survey the degree to +which this paradigm -- social learning -- has been mirrored in machine +learning. In particular, since learning socially requires interacting with +others, we are interested in how embodied agents can and have utilised these +techniques. This is especially in light of the degree to which recent advances +in natural language processing (NLP) enable us to perform new forms of social +learning. We look at how behavioural cloning and next-token prediction mirror +human imitation, how learning from human feedback mirrors human education, and +how we can go further to enable fully communicative agents that learn from each +other. We find that while individual social learning techniques have been used +successfully, there has been little unifying work showing how to bring them +together into socially embodied agents. + +
+
+ comment: To be published in IJCAI 2024, available on http://www.ijcai.org +
+
+
+
+
+ + ☆ Universal Approximation Theory: Foundations for Parallelism in Neural + Networks + + +
+ Neural networks are increasingly evolving towards training large models with +big data, a method that has demonstrated superior performance across many +tasks. However, this approach introduces an urgent problem: current deep +learning models are predominantly serial, meaning that as the number of network +layers increases, so do the training and inference times. This is unacceptable +if deep learning is to continue advancing. Therefore, this paper proposes a +deep learning parallelization strategy based on the Universal Approximation +Theorem (UAT). From this foundation, we designed a parallel network called +Para-Former to test our theory. Unlike traditional serial models, the inference +time of Para-Former does not increase with the number of layers, significantly +accelerating the inference speed of multi-layer networks. Experimental results +validate the effectiveness of this network. + +
+
+
+
+
+ + ☆ Synth-Empathy: Towards High-Quality Synthetic Empathy Data + + +
+ In recent years, with the rapid advancements in large language models (LLMs), +achieving excellent empathetic response capabilities has become a crucial +prerequisite. Consequently, managing and understanding empathetic datasets have +gained increasing significance. However, empathetic data are typically +human-labeled, leading to insufficient datasets and wasted human labor. In this +work, we present Synth-Empathy, an LLM-based data generation and quality and +diversity selection pipeline that automatically generates high-quality +empathetic data while discarding low-quality data. With the data generated from +a low empathetic model, we are able to further improve empathetic response +performance and achieve state-of-the-art (SoTA) results across multiple +benchmarks. Moreover, our model achieves SoTA performance on various human +evaluation benchmarks, demonstrating its effectiveness and robustness in +real-world applications. Furthermore, we show the trade-off between data +quantity and quality, providing insights into empathetic data generation and +selection. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.01937 +
+
+
+
+
+ + ☆ An Explainable Vision Transformer with Transfer Learning Combined with + Support Vector Machine Based Efficient Drought Stress Identification + + +
+ Early detection of drought stress is critical for taking timely measures to +reduce crop loss before the drought impact becomes irreversible. The subtle +phenotypical and physiological changes in response to drought stress are +captured by non-invasive imaging techniques, and these imaging data serve as a +valuable resource for machine learning methods to identify drought stress. +While convolutional neural networks (CNNs) are in wide use, vision transformers +(ViTs) present a promising alternative in capturing long-range dependencies and +intricate spatial relationships, thereby enhancing the detection of subtle +indicators of drought stress. We propose an explainable deep learning pipeline +that leverages the power of ViTs for drought stress detection in potato crops +using aerial imagery. We applied two distinct approaches: a synergistic +combination of ViT and support vector machine (SVM), where the ViT extracts +intricate spatial features from aerial images and the SVM classifies the crops as +stressed or healthy, and an end-to-end approach using a dedicated classification +layer within the ViT to directly detect drought stress. Our key findings explain +the ViT model's decision-making process by visualizing attention maps. These +maps highlight the specific spatial features within the aerial images that the +ViT model focuses on as the drought stress signature. Our findings demonstrate +that the proposed methods not only achieve high accuracy in drought stress +identification but also shed light on the diverse subtle plant features +associated with drought stress. This offers a robust and interpretable solution +for drought stress monitoring, helping farmers make informed decisions for +improved crop management. + 
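An illustrative sketch of the ViT-feature + SVM stage of the pipeline (real ViT features extracted from aerial images are replaced here by a placeholder array; only the classifier stage is shown):

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

rng = np.random.default_rng(0)
vit_features = rng.normal(size=(200, 768))   # stand-in for ViT [CLS] embeddings per image
labels = rng.integers(0, 2, size=200)        # 0 = healthy, 1 = drought-stressed (toy labels)

X_tr, X_te, y_tr, y_te = train_test_split(vit_features, labels, random_state=0)
clf = SVC(kernel="rbf").fit(X_tr, y_tr)      # SVM on top of the extracted features
print("held-out accuracy:", clf.score(X_te, y_te))
```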
+
+ comment: 30 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ A State-of-the-Art Review of Computational Models for Analyzing + Longitudinal Wearable Sensor Data in Healthcare + + +
+ Wearable devices are increasingly used as tools for biomedical research, as +the continuous stream of behavioral and physiological data they collect can +provide insights about our health in everyday contexts. Long-term tracking, +defined on the timescale of months or years, can reveal patterns +and changes that serve as indicators of health changes. These insights can make medicine +and healthcare more predictive, preventive, personalized, and participative +(the 4P's). However, the challenges in modeling, understanding and processing +longitudinal data are a significant barrier to their adoption in research +studies and clinical settings. In this paper, we review and discuss three +models used to make sense of longitudinal data: routines, rhythms and stability +metrics. We present the challenges associated with the processing and analysis +of longitudinal wearable sensor data, with a special focus on how to handle the +different temporal dynamics at various granularities. We then discuss current +limitations and identify directions for future work. This review is essential +to the advancement of computational modeling and analysis of longitudinal +sensor data for pervasive healthcare. + 
+
+
+
+
+ + ☆ Beat this! Accurate beat tracking without DBN postprocessing + + +
+ We propose a system for tracking beats and downbeats with two objectives: +generality across a diverse music range, and high accuracy. We achieve +generality by training on multiple datasets -- including solo instrument +recordings, pieces with time signature changes, and classical music with high +tempo variations -- and by removing the commonly used Dynamic Bayesian Network +(DBN) postprocessing, which introduces constraints on the meter and tempo. For +high accuracy, among other improvements, we develop a loss function tolerant to +small time shifts of annotations, and an architecture alternating convolutions +with transformers either over frequency or time. Our system surpasses the +current state of the art in F1 score despite using no DBN. However, it can +still fail, especially for difficult and underrepresented genres, and performs +worse on continuity metrics, so we publish our model, code, and preprocessed +datasets, and invite others to beat this. + +
+
+ comment: Accepted at the 25th International Society for Music Information + Retrieval Conference (ISMIR), 2024 +
+
+
+
+
+ + ☆ Comgra: A Tool for Analyzing and Debugging Neural Networks + + +
+ Neural Networks are notoriously difficult to inspect. We introduce comgra, an +open source python library for use with PyTorch. Comgra extracts data about the +internal activations of a model and organizes it in a GUI (graphical user +interface). It can show both summary statistics and individual data points, +compare early and late stages of training, focus on individual samples of +interest, and visualize the flow of the gradient through the network. This +makes it possible to inspect the model's behavior from many different angles +and save time by rapidly testing different hypotheses without having to rerun +it. Comgra has applications for debugging, neural architecture design, and +mechanistic interpretability. We publish our library through Python Package +Index (PyPI) and provide code, documentation, and tutorials at +https://github.com/FlorianDietz/comgra. + +
+
+
+
+
+ + ☆ Spatial Transformer Network YOLO Model for Agricultural Object Detection + + +
+ Object detection plays a crucial role in the field of computer vision by +autonomously identifying and locating objects of interest. The You Only Look +Once (YOLO) model is an effective single-shot detector. However, YOLO faces +challenges in cluttered or partially occluded scenes and can struggle with +small, low-contrast objects. We propose a new method that integrates spatial +transformer networks (STNs) into YOLO to improve performance. The proposed +STN-YOLO aims to enhance the model's effectiveness by focusing on important +areas of the image and improving the spatial invariance of the model before the +detection process. Our proposed method improved object detection performance +both qualitatively and quantitatively. We explore the impact of different +localization networks within the STN module as well as the robustness of the +model across different spatial transformations. We apply the STN-YOLO on +benchmark datasets for Agricultural object detection as well as a new dataset +from a state-of-the-art plant phenotyping greenhouse facility. Our code and +dataset are publicly available. + +
+
+ comment: 7 pages, 5 figures, submitted for review +
+
+
+
+
+ + ☆ Lyapunov weights to convey the meaning of time in physics-informed + neural networks + + +
+ Time is not a dimension like the others. In Physics-Informed Neural Networks +(PINNs), several proposals have attempted to adapt the time sampling or time weighting +to take into account the specifics of this special dimension. However, these +proposals are not principled and require guidance to be used. We explain here +theoretically why the Lyapunov exponents give actionable insights and propose a +weighting scheme to automatically adapt to chaotic, periodic or stable +dynamics. We characterize theoretically the best weighting scheme under +computational constraints as a cumulative exponential integral of the local +Lyapunov exponent estimators and show that it performs well in practice under +the regimes mentioned above. + 
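A small sketch of the kind of time weighting the abstract describes, w(t) = exp(-∫_0^t λ(s) ds) built from local Lyapunov exponent estimates; the sign convention, the synthetic exponent values, and the normalisation are assumptions for illustration (in a PINN the estimates would come from the learned dynamics):

```python
import numpy as np

t = np.linspace(0.0, 5.0, 200)
lam = 0.8 * np.ones_like(t)            # stand-in local Lyapunov exponent estimates

# Trapezoidal cumulative integral of lambda(s) up to each collocation time.
cum = np.concatenate([[0.0], np.cumsum(0.5 * (lam[1:] + lam[:-1]) * np.diff(t))])

weights = np.exp(-cum)                 # chaotic regime: down-weight late times
weights /= weights.mean()              # keep the overall loss scale comparable
print(weights[:3], weights[-3:])
```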
+
+
+
+
+ + ☆ MART: MultiscAle Relational Transformer Networks for Multi-agent + Trajectory Prediction ECCV 2024 + + +
+ Multi-agent trajectory prediction is crucial to autonomous driving and +understanding the surrounding environment. Learning-based approaches for +multi-agent trajectory prediction, such as primarily relying on graph neural +networks, graph transformers, and hypergraph neural networks, have demonstrated +outstanding performance on real-world datasets in recent years. However, the +hypergraph transformer-based method for trajectory prediction is yet to be +explored. Therefore, we present a MultiscAle Relational Transformer (MART) +network for multi-agent trajectory prediction. MART is a hypergraph transformer +architecture to consider individual and group behaviors in transformer +machinery. The core module of MART is the encoder, which comprises a Pair-wise +Relational Transformer (PRT) and a Hyper Relational Transformer (HRT). The +encoder extends the capabilities of a relational transformer by introducing +HRT, which integrates hyperedge features into the transformer mechanism, +promoting attention weights to focus on group-wise relations. In addition, we +propose an Adaptive Group Estimator (AGE) designed to infer complex group +relations in real-world environments. Extensive experiments on three real-world +datasets (NBA, SDD, and ETH-UCY) demonstrate that our method achieves +state-of-the-art performance, enhancing ADE/FDE by 3.9%/11.8% on the NBA +dataset. Code is available at https://github.com/gist-ailab/MART. + +
+
+ comment: 19 pages, 12 figures, 7 tables, 8 pages of supplementary material. + Paper accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Extended Fiducial Inference: Toward an Automated Process of Statistical + Inference + + +
+ While fiducial inference was widely considered a big blunder by R.A. Fisher, +the goal he initially set --`inferring the uncertainty of model parameters on +the basis of observations' -- has been continually pursued by many +statisticians. To this end, we develop a new statistical inference method +called extended Fiducial inference (EFI). The new method achieves the goal of +fiducial inference by leveraging advanced statistical computing techniques +while remaining scalable for big data. EFI involves jointly imputing random +errors realized in observations using stochastic gradient Markov chain Monte +Carlo and estimating the inverse function using a sparse deep neural network +(DNN). The consistency of the sparse DNN estimator ensures that the uncertainty +embedded in observations is properly propagated to model parameters through the +estimated inverse function, thereby validating downstream statistical +inference. Compared to frequentist and Bayesian methods, EFI offers significant +advantages in parameter estimation and hypothesis testing. Specifically, EFI +provides higher fidelity in parameter estimation, especially when outliers are +present in the observations; and eliminates the need for theoretical reference +distributions in hypothesis testing, thereby automating the statistical +inference process. EFI also provides an innovative framework for +semi-supervised learning. + +
+
+
+
+
+ + ☆ Ironing the Graphs: Toward a Correct Geometric Analysis of Large-Scale + Graphs + + +
+ Graph embedding approaches attempt to project graphs into geometric entities, +i.e., manifolds. The idea is that the geometric properties of the projected +manifolds are helpful in the inference of graph properties. However, if the +embedding manifold is chosen incorrectly, this can lead to +incorrect geometric inference. In this paper, we argue that the classical +embedding techniques cannot lead to correct geometric interpretation as they +miss the curvature at each point of the manifold. We advocate that, for +correct geometric interpretation, the embedding of a graph should be done over +regular constant-curvature manifolds. To this end, we present an embedding +approach, the discrete Ricci flow graph embedding (dRfge), based on the discrete +Ricci flow that adapts the distance between nodes in a graph so that the graph +can be embedded onto a constant curvature manifold that is homogeneous and +isotropic, i.e., all directions are equivalent and distances comparable, +resulting in correct geometric interpretations. A major contribution of this +paper is that, for the first time, we prove the convergence of discrete Ricci +flow to a constant curvature and stable distance metrics over the edges. A +drawback of using the discrete Ricci flow is the high computational complexity +that has prevented its use in large-scale graph analysis. Another contribution of +this paper is a new algorithmic solution that makes it feasible to calculate +the Ricci flow for graphs of up to 50k nodes, and beyond. The intuitions behind +the discrete Ricci flow make it possible to obtain new insights into the +structure of large-scale graphs. We demonstrate this through a case study on +analyzing the internet connectivity structure between countries at the BGP +level. + 
+
+
+
+
+ + ☆ Higher order quantum reservoir computing for non-intrusive reduced-order + models + + +
+ Forecasting dynamical systems is of importance to numerous real-world +applications. When possible, dynamical systems forecasts are constructed based +on first-principles-based models such as through the use of differential +equations. When these equations are unknown, non-intrusive techniques must be +utilized to build predictive models from data alone. Machine learning (ML) +methods have recently been used for such tasks. Moreover, ML methods provide +the added advantage of significant reductions in time-to-solution for +predictions in contrast with first-principle based models. However, many +state-of-the-art ML-based methods for forecasting rely on neural networks, +which may be expensive to train and necessitate requirements for large amounts +of memory. In this work, we propose a quantum mechanics inspired ML modeling +strategy for learning nonlinear dynamical systems that provides data-driven +forecasts for complex dynamical systems with reduced training time and memory +costs. This approach, denoted the quantum reservoir computing technique (QRC), +is a hybrid quantum-classical framework employing an ensemble of interconnected +small quantum systems via classical linear feedback connections. By mapping the +dynamical state to a suitable quantum representation amenable to unitary +operations, QRC is able to predict complex nonlinear dynamical systems in a +stable and accurate manner. We demonstrate the efficacy of this framework +through benchmark forecasts of the NOAA Optimal Interpolation Sea Surface +Temperature dataset and compare the performance of QRC to other ML methods. + +
+
+
+
+
+ + ☆ Measuring What Matters: Intrinsic Distance Preservation as a Robust + Metric for Embedding Quality + + +
+ Unsupervised embeddings are fundamental to numerous machine learning +applications, yet their evaluation remains a challenging task. Traditional +assessment methods often rely on extrinsic variables, such as performance in +downstream tasks, which can introduce confounding factors and mask the true +quality of embeddings. This paper introduces the Intrinsic Distance +Preservation Evaluation (IDPE) method, a novel approach for assessing embedding +quality based on the preservation of Mahalanobis distances between data points +in the original and embedded spaces. We demonstrate the limitations of +extrinsic evaluation methods through a simple example, highlighting how they +can lead to misleading conclusions about embedding quality. IDPE addresses +these issues by providing a task-independent measure of how well embeddings +preserve the intrinsic structure of the original data. Our method leverages +efficient similarity search techniques to make it applicable to large-scale +datasets. We compare IDPE with established intrinsic metrics like +trustworthiness and continuity, as well as extrinsic metrics such as Average +Rank and Mean Reciprocal Rank. Our results show that IDPE offers a more +comprehensive and reliable assessment of embedding quality across various +scenarios. We evaluate PCA and t-SNE embeddings using IDPE, revealing insights +into their performance that are not captured by traditional metrics. This work +contributes to the field by providing a robust, efficient, and interpretable +method for embedding evaluation. IDPE's focus on intrinsic properties offers a +valuable tool for researchers and practitioners seeking to develop and assess +high-quality embeddings for diverse machine learning applications. + +
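A minimal sketch of the intrinsic-distance-preservation idea: compare pairwise Mahalanobis distances in the original space with pairwise distances in the embedded space via a rank correlation. The aggregation choice (Spearman correlation over sampled pairs) and the toy data are assumptions for illustration, not the paper's exact definition:

```python
import numpy as np
from scipy.stats import spearmanr

def idpe_score(X, Z, n_pairs=2000, seed=0):
    """Rank correlation between Mahalanobis distances in the original space X
    and Euclidean distances in the embedding Z, over randomly sampled pairs."""
    rng = np.random.default_rng(seed)
    VI = np.linalg.inv(np.cov(X, rowvar=False))       # inverse covariance for Mahalanobis
    idx = rng.choice(len(X), size=(n_pairs, 2))
    d_orig = [np.sqrt((X[i] - X[j]) @ VI @ (X[i] - X[j])) for i, j in idx]
    d_emb = [np.linalg.norm(Z[i] - Z[j]) for i, j in idx]
    corr, _ = spearmanr(d_orig, d_emb)
    return corr

X = np.random.default_rng(1).normal(size=(500, 10))
Z = X[:, :2]                                          # a crude "embedding" for demonstration
print(round(idpe_score(X, Z), 3))
```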
+
+
+
+
+ + ☆ Multi-agent reinforcement learning for the control of three-dimensional + Rayleigh-Bénard convection + + +
+ Deep reinforcement learning (DRL) has found application in numerous use-cases +pertaining to flow control. Multi-agent RL (MARL), a variant of DRL, has shown +to be more effective than single-agent RL in controlling flows exhibiting +locality and translational invariance. We present, for the first time, an +implementation of MARL-based control of three-dimensional Rayleigh-B\'enard +convection (RBC). Control is executed by modifying the temperature distribution +along the bottom wall divided into multiple control segments, each of which +acts as an independent agent. Two regimes of RBC are considered at Rayleigh +numbers $\mathrm{Ra}=500$ and $750$. Evaluation of the learned control policy +reveals a reduction in convection intensity by $23.5\%$ and $8.7\%$ at +$\mathrm{Ra}=500$ and $750$, respectively. The MARL controller converts +irregularly shaped convective patterns to regular straight rolls with lower +convection that resemble flow in a relatively more stable regime. We draw +comparisons with proportional control at both $\mathrm{Ra}$ and show that MARL +is able to outperform the proportional controller. The learned control strategy +is complex, featuring different non-linear segment-wise actuator delays and +actuation magnitudes. We also perform successful evaluations on a larger domain +than used for training, demonstrating that the invariant property of MARL +allows direct transfer of the learnt policy. + +
+
+ comment: Submitted to the special issue titled 'Machine Learning for Fluid + Dynamics' in the journal Flow, Turbulence and Combusion. 39 pages and 20 + figures +
+
+
+
+
+ + ☆ CXSimulator: A User Behavior Simulation using LLM Embeddings for + Web-Marketing Campaign Assessment CIKM '24 + + +
+ This paper presents the Customer Experience (CX) Simulator, a novel framework +designed to assess the effects of untested web-marketing campaigns through user +behavior simulations. The proposed framework leverages large language models +(LLMs) to represent various events in a user's behavioral history, such as +viewing an item, applying a coupon, or purchasing an item, as semantic +embedding vectors. We train a model to predict transitions between events from +their LLM embeddings, which can even generalize to unseen events by learning +from diverse training data. In web-marketing applications, we leverage this +transition prediction model to simulate how users might react differently when +new campaigns or products are presented to them. This allows us to eliminate +the need for costly online testing and enhance the marketers' abilities to +reveal insights. Our numerical evaluation and user study, utilizing BigQuery +Public Datasets from the Google Merchandise Store, demonstrate the +effectiveness of our framework. + +
+
+ comment: 5 pages, 2 figures, 1 table, the 33rd ACM International Conference on + Information and Knowledge Management (CIKM '24) +
+
+
+
+
+ + ☆ Black box meta-learning intrinsic rewards for sparse-reward environments + + +
+ Despite the successes and progress of deep reinforcement learning over the +last decade, several challenges remain that hinder its broader application. +Some fundamental aspects to improve include data efficiency, generalization +capability, and ability to learn in sparse-reward environments, which often +require human-designed dense rewards. Meta-learning has emerged as a promising +approach to address these issues by optimizing components of the learning +algorithm to meet desired characteristics. Additionally, a different line of +work has extensively studied the use of intrinsic rewards to enhance the +exploration capabilities of algorithms. This work investigates how +meta-learning can improve the training signal received by RL agents. The focus +is on meta-learning intrinsic rewards under a framework that doesn't rely on +the use of meta-gradients. We analyze and compare this approach to the use of +extrinsic rewards and a meta-learned advantage function. The developed +algorithms are evaluated on distributions of continuous control tasks with both +parametric and non-parametric variations, and with only sparse rewards +accessible for the evaluation tasks. + +
+
+ comment: This work is part of OP Bachelor's Degree Thesis +
+
+
+
+
+ + ☆ Probabilistic Scoring Lists for Interpretable Machine Learning + + +
+ A scoring system is a simple decision model that checks a set of features, +adds a certain number of points to a total score for each feature that is +satisfied, and finally makes a decision by comparing the total score to a +threshold. Scoring systems have a long history of active use in safety-critical +domains such as healthcare and justice, where they provide guidance for making +objective and accurate decisions. Given their genuine interpretability, the +idea of learning scoring systems from data is obviously appealing from the +perspective of explainable AI. In this paper, we propose a practically +motivated extension of scoring systems called probabilistic scoring lists +(PSL), as well as a method for learning PSLs from data. Instead of making a +deterministic decision, a PSL represents uncertainty in the form of probability +distributions, or, more generally, probability intervals. Moreover, in the +spirit of decision lists, a PSL evaluates features one by one and stops as soon +as a decision can be made with enough confidence. To evaluate our approach, we +conduct a case study in the medical domain. + +
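A toy sketch of a probabilistic scoring list: features are checked one at a time, each satisfied feature adds points to a running score, and evaluation stops early once the probability interval attached to the current score is decisive. The features, points, and intervals below are invented for illustration, not learned from data:

```python
# Each rule: (feature name, points if satisfied, {score: (prob_low, prob_high)}).
RULES = [
    ("chest_pain", 2, {0: (0.05, 0.15), 1: (0.20, 0.40), 2: (0.35, 0.60)}),
    ("age_over_60", 1, {0: (0.02, 0.10), 1: (0.10, 0.30), 2: (0.30, 0.50), 3: (0.60, 0.85)}),
]

def psl_decide(patient, threshold=0.5):
    score = 0
    for feature, points, intervals in RULES:
        score += points * int(patient.get(feature, False))
        lo, hi = intervals[score]
        if lo >= threshold:
            return "positive", (lo, hi)   # confidently above threshold: stop early
        if hi < threshold:
            return "negative", (lo, hi)   # confidently below threshold: stop early
    return "undecided", (lo, hi)          # all features checked, interval straddles threshold

print(psl_decide({"chest_pain": True, "age_over_60": False}))
```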
+
+
+
+
+ + ☆ Data Contamination Report from the 2024 CONDA Shared Task + + +
+ The 1st Workshop on Data Contamination (CONDA 2024) focuses on all relevant +aspects of data contamination in natural language processing, where data +contamination is understood as situations where evaluation data is included in +pre-training corpora used to train large-scale models, compromising evaluation +results. The workshop fostered a shared task to collect evidence on data +contamination in currently available datasets and models. The goal of the shared +task and associated database is to assist the community in understanding the +extent of the problem and to assist researchers in avoiding reporting +evaluation results on known contaminated resources. The shared task provides a +structured, centralized public database for the collection of contamination +evidence, open to contributions from the community via GitHub pull requests. +This first compilation paper is based on 566 reported entries over 91 +contaminated sources from a total of 23 contributors. The details of the +individual contamination events are available on the platform. The platform +continues to be online, open to contributions from the community. + 
+
+ comment: https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Database +
+
+
+
+
+ + ☆ Tabular Data Augmentation for Machine Learning: Progress and Prospects + of Embracing Generative AI + + +
+ Machine learning (ML) on tabular data is ubiquitous, yet obtaining abundant +high-quality tabular data for model training remains a significant obstacle. +Numerous works have focused on tabular data augmentation (TDA) to enhance the +original table with additional data, thereby improving downstream ML tasks. +Recently, there has been a growing interest in leveraging the capabilities of +generative AI for TDA. Therefore, we believe it is time to provide a +comprehensive review of the progress and future prospects of TDA, with a +particular emphasis on the trending generative AI. Specifically, we present an +architectural view of the TDA pipeline, comprising three main procedures: +pre-augmentation, augmentation, and post-augmentation. Pre-augmentation +encompasses preparation tasks that facilitate subsequent TDA, including error +handling, table annotation, table simplification, table representation, table +indexing, table navigation, schema matching, and entity matching. Augmentation +systematically analyzes current TDA methods, categorized into retrieval-based +methods, which retrieve external data, and generation-based methods, which +generate synthetic data. We further subdivide these methods based on the +granularity of the augmentation process at the row, column, cell, and table +levels. Post-augmentation focuses on the datasets, evaluation and optimization +aspects of TDA. We also summarize current trends and future directions for TDA, +highlighting promising opportunities in the era of generative AI. In addition, +the accompanying papers and related resources are continuously updated and +maintained in the GitHub repository at +https://github.com/SuDIS-ZJU/awesome-tabular-data-augmentation to reflect +ongoing advancements in the field. + +
+
+ comment: repository maintained at + https://github.com/SuDIS-ZJU/awesome-tabular-data-augmentation +
+
+
+
+
+ + ☆ FSSC: Federated Learning of Transformer Neural Networks for Semantic + Image Communication + + +
+ In this paper, we address the problem of image semantic communication in a +multi-user deployment scenario and propose a federated learning (FL) strategy +for a Swin Transformer-based semantic communication system (FSSC). Firstly, we +demonstrate that the adoption of a Swin Transformer for joint source-channel +coding (JSCC) effectively extracts semantic information in the communication +system. Next, the FL framework is introduced to collaboratively learn a global +model by aggregating local model parameters, rather than directly sharing +clients' data. This approach enhances user privacy protection and reduces the +workload on the server or mobile edge. Simulation evaluations indicate that our +method outperforms the typical JSCC algorithm and traditional separate-based +communication algorithms. Particularly after integrating local semantics, the +global aggregation model has further increased the Peak Signal-to-Noise Ratio +(PSNR) by more than 2dB, thoroughly proving the effectiveness of our algorithm. + +
+
+
+
+
+ + ☆ Root Cause Analysis Of Productivity Losses In Manufacturing Systems + Utilizing Ensemble Machine Learning + + +
+ In today's rapidly evolving landscape of automation and manufacturing +systems, the efficient resolution of productivity losses is paramount. This +study introduces a data-driven ensemble approach, utilizing the cyclic +multivariate time series data from binary sensors and signals from Programmable +Logic Controllers (PLCs) within these systems. The objective is to +automatically analyze productivity losses per cycle and pinpoint their root +causes by assigning the loss to a system element. The ensemble approach +introduced in this publication integrates various methods, including +information theory and machine learning behavior models, to provide a robust +analysis for each production cycle. To expedite the resolution of productivity +losses and ensure short response times, stream processing becomes a necessity. +Addressing this, the approach is implemented as data-stream analysis and can be +transferred to batch processing, seamlessly integrating into existing systems +without the need for extensive historical data analysis. This method has two +positive effects. Firstly, the result of the analysis ensures that the period +of lower productivity is reduced by identifying the likely root cause of the +productivity loss. Secondly, these results are more reliable due to the +ensemble approach and therefore avoid dependency on technical experts. The +approach is validated using a semi-automated welding manufacturing system, an +injection molding automation system, and a synthetically generated test PLC +dataset. The results demonstrate the method's efficacy in offering a +data-driven understanding of process behavior and mark an advancement in +autonomous manufacturing system analysis. + +
+
+
+
+
+ + ☆ Explainable and Controllable Motion Curve Guided Cardiac Ultrasound + Video Generation MICCAI + + +
+ Echocardiography video is a primary modality for diagnosing heart diseases, +but the limited data poses challenges for both clinical teaching and machine +learning training. Recently, video generative models have emerged as a +promising strategy to alleviate this issue. However, previous methods often +relied on holistic conditions during generation, hindering flexible movement +control over specific cardiac structures. In this context, we propose +an explainable and controllable method for echocardiography video generation, +taking an initial frame and a motion curve as guidance. Our contributions are +three-fold. First, we extract motion information from each heart substructure +to construct motion curves, enabling the diffusion model to synthesize +customized echocardiography videos by modifying these curves. Second, we +propose the structure-to-motion alignment module, which can map semantic +features onto motion curves across cardiac structures. Third, a +position-aware attention mechanism is designed to enhance video consistency +utilizing Gaussian masks with structural position information. Extensive +experiments on three echocardiography datasets show that our method outperforms +others regarding fidelity and consistency. The full code will be released at +https://github.com/mlmi-2024-72/ECM.
+
+
+ comment: Accepted by MICCAI MLMI 2024 +
+
+
+
+
+ + ☆ On the Problem of Text-To-Speech Model Selection for Synthetic Data + Generation in Automatic Speech Recognition + + +
+ The rapid development of neural text-to-speech (TTS) systems enabled its +usage in other areas of natural language processing such as automatic speech +recognition (ASR) or spoken language translation (SLT). Due to the large number +of different TTS architectures and their extensions, selecting which TTS +systems to use for synthetic data creation is not an easy task. We use the +comparison of five different TTS decoder architectures in the scope of +synthetic data generation to show the impact on CTC-based speech recognition +training. We compare the recognition results to computable metrics like NISQA +MOS and intelligibility, finding that there are no clear relations to the ASR +performance. We also observe that for data generation auto-regressive decoding +performs better than non-autoregressive decoding, and propose an approach to +quantify TTS generalization capabilities. + +
+
+ comment: Accepted at the SynData4GenAI 2024 workshop +
+
+
+
+
+ + ☆ Multi-agent Assessment with QoS Enhancement for HD Map Updates in a + Vehicular Network + + +
+ Reinforcement Learning (RL) algorithms have been used to address the +challenging problems in the offloading process of vehicular ad hoc networks +(VANET). More recently, they have been utilized to improve the dissemination of +high-definition (HD) Maps. Nevertheless, implementing solutions such as deep +Q-learning (DQN) and Actor-critic at the autonomous vehicle (AV) may lead to an +increase in the computational load, causing a heavy burden on the computational +devices and higher costs. Moreover, their implementation might raise +compatibility issues between technologies due to the required modifications to +the standards. Therefore, in this paper, we assess the scalability of an +application utilizing a Q-learning single-agent solution in a distributed +multi-agent environment. This application improves the network performance by +taking advantage of a smaller state and action space whilst using a +multi-agent approach. The proposed solution is extensively evaluated with +different test cases involving reward functions based on individual or +overall network performance, varying numbers of agents, and a comparison of +centralized and distributed learning. The experimental results demonstrate that +the time latencies of our proposed solution in the voice, video, HD Map, and +best-effort cases improve significantly, by 40.4%, 36%, 43%, and 12% +respectively, compared to the single-agent approach.
+
+
+
+
+
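+ <p>For reference, the single-agent building block the abstract starts from is the standard tabular Q-learning update; a toy sketch (with invented states, actions, and rewards, not the VANET simulation used in the paper) is:</p>
+ <pre>
+import random
+
+def q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.95):
+    # Standard Q-learning: move Q(s, a) toward the bootstrapped target.
+    best_next = max(Q[s_next].values())
+    Q[s][a] += alpha * (r + gamma * best_next - Q[s][a])
+
+actions = ["offload", "keep_local"]          # hypothetical HD-map actions
+Q = {s: {a: 0.0 for a in actions} for s in range(4)}
+s = 0
+for step in range(200):
+    # Epsilon-greedy action selection.
+    a = random.choice(actions) if random.random() < 0.1 else max(Q[s], key=Q[s].get)
+    r = 1.0 if a == "offload" else 0.2       # assumed toy reward
+    s_next = (s + 1) % 4                     # assumed toy transition
+    q_update(Q, s, a, r, s_next)
+    s = s_next
+print(Q)
+ </pre>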
+ + ☆ TinyChirp: Bird Song Recognition Using TinyML Models on Low-power + Wireless Acoustic Sensors + + +
+ Monitoring biodiversity at scale is challenging. Detecting and identifying +species in fine grained taxonomies requires highly accurate machine learning +(ML) methods. Training such models requires large high quality data sets. And +deploying these models to low power devices requires novel compression +techniques and model architectures. While species classification methods have +profited from novel data sets and advances in ML methods, in particular neural +networks, deploying these state of the art models to low power devices remains +difficult. Here we present a comprehensive empirical comparison of various +tinyML neural network architectures and compression techniques for species +classification. We focus on the example of bird song detection, more concretely +a data set curated for studying the corn bunting bird species. The data set is +released along with all code and experiments of this study. In our experiments +we compare predictive performance, memory and time complexity of classical +spectrogram based methods and recent approaches operating on raw audio signal. +Our results indicate that individual bird species can be robustly detected with +relatively simple architectures that can be readily deployed to low power +devices. + +
+
+
+
+
+ + ☆ MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented + Generation via Knowledge-enhanced Reranking and Noise-injected Training + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated remarkable +capabilities in processing and generating content across multiple data +modalities, including text, images, audio, and video. However, a significant +drawback of MLLMs is their reliance on static training data, leading to +outdated information and limited contextual awareness. This static nature +hampers their ability to provide accurate, up-to-date responses, particularly +in dynamic or rapidly evolving contexts. Integrating Multimodal +Retrieval-augmented Generation (Multimodal RAG) offers a promising solution, +but the system would inevitably encounter the multi-granularity noisy +correspondence (MNC) problem, which involves two types of noise: coarse-grained +(query-caption) and fine-grained (query-image). This noise hinders accurate +retrieval and generation. In this work, we propose \textbf{RagLLaVA}, a novel +framework with knowledge-enhanced reranking and noise-injected training, to +address these limitations. We instruction-tune the MLLM with a simple yet +effective instruction template to induce its ranking ability and serve it as a +reranker to precisely filter the top-k retrieved images. For generation, we +inject visual noise during training at the data and token levels to enhance the +generator's robustness. Extensive experiments are conducted on the subsets of +two datasets that require retrieving and reasoning over images to answer a +given query. Our results demonstrate the superiority of RagLLaVA in retrieving +accurately and generating robustly. Code and models are available at +https://github.com/IDEA-FinAI/RagLLaVA. + +
+
+
+
+
+ + ☆ Transient anisotropic kernel for probabilistic learning on manifolds + + +
+ PLoM (Probabilistic Learning on Manifolds) is a method introduced in 2016 for +handling small training datasets by projecting an It\^o equation from a +stochastic dissipative Hamiltonian dynamical system, acting as the MCMC +generator, for which the KDE-estimated probability measure with the training +dataset is the invariant measure. PLoM performs a projection on a reduced-order +vector basis related to the training dataset, using the diffusion maps (DMAPS) +basis constructed with a time-independent isotropic kernel. In this paper, we +propose a new ISDE projection vector basis built from a transient anisotropic +kernel, providing an alternative to the DMAPS basis to improve statistical +surrogates for stochastic manifolds with heterogeneous data. The construction +ensures that for times near the initial time, the DMAPS basis coincides with +the transient basis. For larger times, the differences between the two bases +are characterized by the angle of their spanned vector subspaces. The optimal +instant yielding the optimal transient basis is determined using an estimation +of mutual information from Information Theory, which is normalized by the +entropy estimation to account for the effects of the number of realizations +used in the estimations. Consequently, this new vector basis better represents +statistical dependencies in the learned probability measure for any dimension. +Three applications with varying levels of statistical complexity and data +heterogeneity validate the proposed theory, showing that the transient +anisotropic kernel improves the learned probability measure. + +
+
+ comment: 44 pages, 14 figures +
+
+
+
+
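+ <p>For context, a minimal version of the time-independent isotropic DMAPS basis that the proposed transient anisotropic kernel is compared against might look like the following sketch (the bandwidth and data are placeholders, not the paper's setup):</p>
+ <pre>
+import numpy as np
+
+def dmaps_basis(X, epsilon, n_vectors=5):
+    # Isotropic Gaussian kernel on pairwise squared distances.
+    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
+    K = np.exp(-d2 / (4.0 * epsilon))
+    P = K / K.sum(axis=1, keepdims=True)     # row-normalized transition matrix
+    vals, vecs = np.linalg.eig(P)
+    order = np.argsort(-vals.real)           # leading eigenvectors form the basis
+    return vals.real[order][:n_vectors], vecs.real[:, order][:, :n_vectors]
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 3))                # stand-in for a small training dataset
+eigvals, basis = dmaps_basis(X, epsilon=1.0)
+print(eigvals, basis.shape)
+ </pre>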
+ + ☆ Cost-Effective Hallucination Detection for LLMs + + +
+ Large language models (LLMs) can be prone to hallucinations - generating +unreliable outputs that are unfaithful to their inputs, external facts or +internally inconsistent. In this work, we address several challenges for +post-hoc hallucination detection in production settings. Our pipeline for +hallucination detection entails: first, producing a confidence score +representing the likelihood that a generated answer is a hallucination; second, +calibrating the score conditional on attributes of the inputs and candidate +response; finally, performing detection by thresholding the calibrated score. +We benchmark a variety of state-of-the-art scoring methods on different +datasets, encompassing question answering, fact checking, and summarization +tasks. We employ diverse LLMs to ensure a comprehensive assessment of +performance. We show that calibrating individual scoring methods is critical +for ensuring risk-aware downstream decision making. Based on findings that no +individual score performs best in all situations, we propose a multi-scoring +framework, which combines different scores and achieves top performance across +all datasets. We further introduce cost-effective multi-scoring, which can +match or even outperform more expensive detection methods, while significantly +reducing computational overhead. + +
+
+
+
+
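+ <p>The score-calibrate-threshold pipeline described above can be sketched as follows (illustrative only; the raw scores here are synthetic rather than produced by an LLM, and the choice of a logistic calibrator is an assumption):</p>
+ <pre>
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+rng = np.random.default_rng(0)
+# Stand-in raw confidence scores and hallucination labels (synthetic data).
+raw = rng.uniform(0.0, 1.0, 1000)
+labels = (raw + 0.3 * rng.normal(size=1000) > 0.6).astype(int)   # 1 = hallucination
+
+# Calibrate the raw score on a held-out split (Platt-style scaling as one option).
+calibrator = LogisticRegression().fit(raw[:800].reshape(-1, 1), labels[:800])
+calibrated = calibrator.predict_proba(raw[800:].reshape(-1, 1))[:, 1]
+
+# Threshold the calibrated probability to flag likely hallucinations.
+flagged = calibrated > 0.5
+print(f"flagged {flagged.sum()} of {flagged.size} answers as likely hallucinations")
+ </pre>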
+ + ☆ FTuner: A Fast Dynamic Shape Tensors Program Auto-Tuner for Deep + Learning Compilers + + +
+ Many artificial intelligence models process input data of different lengths +and resolutions, making the shape of the tensors dynamic. The performance of +these models depends on the shape of the tensors, which makes it difficult to +optimize the tensors before the model runs. There are two common solutions to +this problem. The first is to pad the input with useless data to match a +pre-optimized tensor library. The second is to use small basic tensors to +create a tensor that is closest in size to the input data and then tune it to +minimize padding. However, this second solution can be time-consuming. + This paper proposes a new technique for deep learning compilers called +FTuner. Instead of using a large design space or training a cost model, we use +an abstract computational unit called the uKernel to patch together small, +various-sized tensors to match the shape of the input tensor. We determine the +shape of the uKernel using an analytic hardware information model. Experiments +show that FTuner can achieve operator and end-to-end performance comparable to +vendor libraries, and achieves a 3\% speedup over an existing auto-tuner that +uses a model-training compiler, while reducing tuning time by two orders of +magnitude.
+
+
+ comment: 14 pages, 16 figures, 6 tables +
+
+
+
+
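+ <p>As a point of reference for the first baseline mentioned in the abstract (padding dynamic shapes up to a pre-optimized size), a toy sketch is shown below; the bucket sizes are assumed, and FTuner's uKernel stitching is not reproduced here:</p>
+ <pre>
+import numpy as np
+
+def pad_to_bucket(x, buckets=(64, 128, 256, 512)):
+    # Pad the dynamic first dimension up to the nearest pre-optimized bucket size.
+    target = next(b for b in buckets if b >= x.shape[0])
+    padded = np.zeros((target,) + x.shape[1:], dtype=x.dtype)
+    padded[: x.shape[0]] = x
+    return padded, target - x.shape[0]       # padded tensor and number of wasted rows
+
+x = np.ones((173, 768), dtype=np.float32)    # e.g. a 173-token sequence
+padded, waste = pad_to_bucket(x)
+print(padded.shape, "wasted rows:", waste)   # (256, 768) wasted rows: 83
+ </pre>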
+ + ☆ Deep Fréchet Regression + + +
+ Advancements in modern science have led to the increasing availability of +non-Euclidean data in metric spaces. This paper addresses the challenge of +modeling relationships between non-Euclidean responses and multivariate +Euclidean predictors. We propose a flexible regression model capable of +handling high-dimensional predictors without imposing parametric assumptions. +Two primary challenges are addressed: the curse of dimensionality in +nonparametric regression and the absence of linear structure in general metric +spaces. The former is tackled using deep neural networks, while for the latter +we demonstrate the feasibility of mapping the metric space where responses +reside to a low-dimensional Euclidean space using manifold learning. We +introduce a reverse mapping approach, employing local Fr\'echet regression, to +map the low-dimensional manifold representations back to objects in the +original metric space. We develop a theoretical framework, investigating the +convergence rate of deep neural networks under dependent sub-Gaussian noise +with bias. The convergence rate of the proposed regression model is then +obtained by expanding the scope of local Fr\'echet regression to accommodate +multivariate predictors in the presence of errors in predictors. Simulations +and case studies show that the proposed model outperforms existing methods for +non-Euclidean responses, focusing on the special cases of probability measures +and networks. + +
+
+ comment: 66 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ SmileyNet -- Towards the Prediction of the Lottery by Reading Tea Leaves + with AI + + +
+ We introduce SmileyNet, a novel neural network with psychic abilities. It is +inspired by the fact that a positive mood can lead to improved cognitive +capabilities, including on classification tasks. The network is hence presented +in a first phase with smileys, and an encouraging loss function is defined to +bias it into a good mood. SmileyNet is then used to forecast the flipping of a +coin based on an established method of Tasseology, namely by reading tea leaves. +Training and testing in this second phase are done with a high-fidelity +simulation based on real-world pixels sampled from a professional tea-reading +cup. SmileyNet achieves an amazing accuracy of 72% in correctly predicting the +flip of a coin. ResNet-34 and YOLOv5 achieve only 49% and 53%, respectively. It +is then shown how multiple SmileyNets can be combined to win the lottery.
+
+
+ comment: This is a satirical accumulation of misconceptions, mistakes, and + flawed reasoning I have encountered in recent times as a reviewer and + sometimes even as a reader of published papers. I hope it is entertaining and + useful in the context of the education of BSc, MSc, and PhD students in + Machine Learning, Artificial Intelligence, and Cognitive Science +
+
+
+
+
+ + ☆ Dynamic Gesture Recognition in Ultra-Range Distance for Effective + Human-Robot Interaction + + +
+ This paper presents a novel approach for ultra-range gesture recognition, +addressing Human-Robot Interaction (HRI) challenges over extended distances. By +leveraging human gestures in video data, we propose the Temporal-Spatiotemporal +Fusion Network (TSFN) model that surpasses the limitations of current methods, +enabling robots to understand gestures from long distances. With applications +in service robots, search and rescue operations, and drone-based interactions, +our approach enhances HRI in expansive environments. Experimental validation +demonstrates significant advancements in gesture recognition accuracy, +particularly in prolonged gesture sequences. + +
+
+
+
+
+ + ☆ Two Completely Parameter-Free Alternating Gradient Projection Algorithms + for Nonconvex-(strongly) Concave Minimax Problems + + +
+ Due to their importance in various emerging applications, efficient +algorithms for solving minimax problems have recently received increasing +attention. However, many existing algorithms require prior knowledge of the +problem parameters in order to achieve optimal iteration complexity. In this +paper, we propose a completely parameter-free alternating gradient projection +(PF-AGP) algorithm to solve smooth nonconvex-(strongly) concave minimax +problems using a backtracking strategy, which does not require prior knowledge +of parameters such as the Lipschitz constant $L$ or the strong concavity +constant $\mu$. The PF-AGP algorithm utilizes a parameter-free gradient +projection step to alternately update the outer and inner variables in each +iteration. We show that the total number of gradient calls of the PF-AGP +algorithm to obtain an $\varepsilon$-stationary point for nonconvex-strongly +concave minimax problems is upper bounded by $\mathcal{O}\left( +L\kappa^3\varepsilon^{-2} \right)$ where $\kappa$ is the condition number, +while the total number of gradient calls to obtain an $\varepsilon$-stationary +point for nonconvex-concave minimax problems is upper bounded by +$\mathcal{O}\left( L^4\varepsilon^{-4} \right)$. As far as we know, this is the +first completely parameter-free algorithm for solving nonconvex-strongly +concave minimax problems, and it is also the completely parameter-free +algorithm that achieves the best iteration complexity among single-loop methods +for solving nonconvex-concave minimax problems. Numerical results validate the +efficiency of the proposed PF-AGP algorithm.
+
+
+
+
+
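+ <p>A minimal sketch of one parameter-free building block, a projected gradient step whose step size comes from backtracking rather than a known Lipschitz constant, is shown below; the toy objective, box constraint, and acceptance test are illustrative assumptions, and this is not the PF-AGP algorithm itself:</p>
+ <pre>
+import numpy as np
+
+def backtracking_projected_step(x, grad_fn, f, project, eta0=1.0, shrink=0.5):
+    # Try a large step first and halve it until a sufficient-decrease test holds,
+    # so no Lipschitz constant needs to be known in advance.
+    g, eta = grad_fn(x), eta0
+    while True:
+        x_new = project(x - eta * g)
+        if f(x_new) <= f(x) - 0.5 / eta * np.linalg.norm(x_new - x) ** 2 or eta < 1e-8:
+            return x_new, eta
+        eta *= shrink
+
+f = lambda x: ((x - 2.0) ** 2).sum()         # toy smooth objective (assumed)
+grad = lambda x: 2.0 * (x - 2.0)
+project = lambda x: np.clip(x, -1.0, 1.0)    # projection onto a box constraint
+x = np.zeros(3)
+for _ in range(20):
+    x, eta = backtracking_projected_step(x, grad, f, project)
+print(x, eta)                                # ends at the box boundary [1. 1. 1.]
+ </pre>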
+ + ☆ Prompting Medical Large Vision-Language Models to Diagnose Pathologies + by Visual Question Answering + + +
+ Large Vision-Language Models (LVLMs) have achieved significant success in +recent years, and they have been extended to the medical domain. Although +demonstrating satisfactory performance on medical Visual Question Answering +(VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem, +which makes them fail to diagnose complex pathologies. Moreover, they readily +fail to learn minority pathologies due to imbalanced training data. We propose +two prompting strategies for MLVLMs that reduce hallucination and improve VQA +performance. In the first strategy, we provide a detailed explanation of the +queried pathology. In the second strategy, we fine-tune a cheap, weak learner +to achieve high performance on a specific metric, and textually provide its +judgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our +methods significantly improve the diagnostic F1 score, with the highest +increase being 0.27. We also demonstrate that our prompting strategies can be +extended to general LVLM domains. Based on POPE metrics, it effectively +suppresses the false negative predictions of existing LVLMs and improves Recall +by approximately 0.07. + +
+
+
+
+
+ + ☆ ProSpec RL: Plan Ahead, then Execute + + +
+ Imagining potential outcomes of actions before execution helps agents make +more informed decisions, a prospective thinking ability fundamental to human +cognition. However, mainstream model-free Reinforcement Learning (RL) methods +lack the ability to proactively envision future scenarios, plan, and guide +strategies. These methods typically rely on trial and error to adjust policy +functions, aiming to maximize cumulative rewards or long-term value, even if +such high-reward decisions place the environment in extremely dangerous states. +To address this, we propose the Prospective (ProSpec) RL method, which makes +higher-value, lower-risk optimal decisions by imagining future n-stream +trajectories. Specifically, ProSpec employs a dynamic model to predict future +states (termed "imagined states") based on the current state and a series of +sampled actions. Furthermore, we integrate the concept of Model Predictive +Control and introduce a cycle consistency constraint that allows the agent to +evaluate and select the optimal actions from these trajectories. Moreover, +ProSpec employs cycle consistency to mitigate two fundamental issues in RL: +augmenting state reversibility to avoid irreversible events (low risk) and +augmenting actions to generate numerous virtual trajectories, thereby improving +data efficiency. We validated the effectiveness of our method on the DMControl +benchmarks, where our approach achieved significant performance improvements. +Code will be open-sourced upon acceptance. + +
+
+
+
+
+ + ☆ Differentially Private Block-wise Gradient Shuffle for Deep Learning + + +
+ Traditional Differentially Private Stochastic Gradient Descent (DP-SGD) +introduces statistical noise on top of gradients drawn from a Gaussian +distribution to ensure privacy. This paper introduces the novel Differentially +Private Block-wise Gradient Shuffle (DP-BloGS) algorithm for deep learning. +BloGS builds off of existing private deep learning literature, but makes a +definitive shift by taking a probabilistic approach to gradient noise +introduction through shuffling modeled after information theoretic privacy +analyses. The theoretical results presented in this paper show that the +combination of shuffling, parameter-specific block size selection, batch layer +clipping, and gradient accumulation allows DP-BloGS to achieve training times +close to that of non-private training while maintaining similar privacy and +utility guarantees to DP-SGD. DP-BloGS is found to be significantly more +resistant to data extraction attempts than DP-SGD. The theoretical results are +validated by the experimental findings. + +
+
+ comment: 43 pages, 11 figures, 8 tables +
+
+
+
+
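+ <p>A toy sketch of the core shuffling idea (clip the gradient, split it into blocks, permute the block order) is shown below; the block size, clipping norm, and privacy accounting are omitted or assumed, so this is not the DP-BloGS algorithm itself:</p>
+ <pre>
+import numpy as np
+
+def blockwise_shuffle(grad, block_size, clip_norm, rng):
+    g = grad.ravel().copy()
+    norm = np.linalg.norm(g)
+    if norm > clip_norm:                     # clip the gradient norm first
+        g *= clip_norm / norm
+    n_blocks = -(-g.size // block_size)      # ceiling division
+    pad = n_blocks * block_size - g.size
+    blocks = np.pad(g, (0, pad)).reshape(n_blocks, block_size)
+    rng.shuffle(blocks)                      # permute block order as the noise source
+    return blocks.ravel()[: g.size].reshape(grad.shape)
+
+rng = np.random.default_rng(0)
+grad = rng.normal(size=(8, 4))
+print(blockwise_shuffle(grad, block_size=5, clip_norm=1.0, rng=rng))
+ </pre>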
+ + ☆ MIST: A Simple and Scalable End-To-End 3D Medical Imaging Segmentation + Framework + + +
+ Medical imaging segmentation is a highly active area of research, with deep +learning-based methods achieving state-of-the-art results in several +benchmarks. However, the lack of standardized tools for training, testing, and +evaluating new methods makes the comparison of methods difficult. To address +this, we introduce the Medical Imaging Segmentation Toolkit (MIST), a simple, +modular, and end-to-end medical imaging segmentation framework designed to +facilitate consistent training, testing, and evaluation of deep learning-based +medical imaging segmentation methods. MIST standardizes data analysis, +preprocessing, and evaluation pipelines, accommodating multiple architectures +and loss functions. This standardization ensures reproducible and fair +comparisons across different methods. We detail MIST's data format +requirements, pipelines, and auxiliary features and demonstrate its efficacy +using the BraTS Adult Glioma Post-Treatment Challenge dataset. Our results +highlight MIST's ability to produce accurate segmentation masks and its +scalability across multiple GPUs, showcasing its potential as a powerful tool +for future medical imaging research and development. + +
+
+ comment: Submitted to BraTS 2024 +
+
+
+
+
+ + ☆ Image-Based Deep Reinforcement Learning with Intrinsically Motivated + Stimuli: On the Execution of Complex Robotic Tasks + + +
+ Reinforcement Learning (RL) has been widely used to solve tasks where the +environment consistently provides a dense reward value. However, in real-world +scenarios, rewards can often be poorly defined or sparse. Auxiliary signals are +indispensable for discovering efficient exploration strategies and aiding the +learning process. In this work, inspired by intrinsic motivation theory, we +postulate that the intrinsic stimuli of novelty and surprise can assist in +improving exploration in complex, sparsely rewarded environments. We introduce +a novel sample-efficient method able to learn directly from pixels, an +image-based extension of TD3 with an autoencoder called \textit{NaSA-TD3}. The +experiments demonstrate that NaSA-TD3 is easy to train and an efficient method +for tackling complex continuous-control robotic tasks, both in simulated +environments and real-world settings. NaSA-TD3 outperforms existing +state-of-the-art RL image-based methods in terms of final performance without +requiring pre-trained models or human demonstrations. + +
+
+
+
+
+ + ☆ Big Cooperative Learning + + +
+ Cooperation plays a pivotal role in the evolution of human intelligence; +moreover, it also underlies the recent revolutionary advancement of artificial +intelligence (AI) that is driven by foundation models. Specifically, we reveal +that the training of foundation models can be interpreted as a form of big +cooperative learning (\textit{abbr.} big learning), where massive learning +individuals/tasks \emph{cooperate} to approach the unique essence of data from +diverse perspectives of data prediction, leveraging a universal model. The +presented big learning therefore unifies most training objectives of foundation +models within a consistent framework, where their underlying assumptions are +exposed simultaneously. We design tailored simulations to demonstrate the +principle of big learning, based on which we provide learning-perspective +justifications for the successes of foundation models, with interesting +side-products. Furthermore, we reveal that big learning is a new dimension for +upgrading conventional machine learning paradigms, valuable for endowing +reinvigorations to associated applications; as an illustrative example, we +propose the BigLearn-GAN, which is a novel adversarially-trained foundation +model with versatile data sampling capabilities. Code is available at +\texttt{https://github.com/YulaiCong/BigCooperativeLearning}. + +
+
+
+
+
+ + ☆ Diff-Cleanse: Identifying and Mitigating Backdoor Attacks in Diffusion + Models + + +
+ Diffusion models (DMs) represent one of the most advanced generative models +today, yet recent studies suggest that DMs are vulnerable to backdoor attacks. +Backdoor attacks establish hidden associations between particular input +patterns and model behaviors, compromising model integrity by triggering +undesirable actions with manipulated input data. This vulnerability poses +substantial risks, including reputational damage to model owners and the +dissemination of harmful content. To mitigate the threat of backdoor attacks, +there have been some investigations on backdoor detection and model repair. +However, previous work fails to purify the backdoored DMs created by +state-of-the-art attacks, rendering the field much underexplored. To bridge +this gap, we introduce \textbf{Diff-Cleanse}, a novel two-stage backdoor +defense framework specifically designed for DMs. The first stage employs an +innovative trigger inversion technique to detect the backdoor and reconstruct +the trigger, and the second stage utilizes a structural pruning method to +eliminate the backdoor. We evaluate our framework on hundreds of DMs attacked +by 3 existing backdoor attack methods. Extensive experiments demonstrate that +Diff-Cleanse achieves nearly 100\% detection accuracy and effectively mitigates +backdoor impacts, preserving the model's benign performance with minimal +compromise. Our code is available at https://github.com/shymuel/diff-cleanse.
+
+
+
+
+
+ + ☆ State-observation augmented diffusion model for nonlinear assimilation + + +
+ Data assimilation has become a crucial technique aiming to combine physical +models with observational data to estimate state variables. Traditional +assimilation algorithms often face challenges of high nonlinearity brought by +both the physical and observational models. In this work, we propose a novel +data-driven assimilation algorithm based on generative models to address such +concerns. Our State-Observation Augmented Diffusion (SOAD) model is designed to +handle nonlinear physical and observational models more effectively. The +marginal posterior associated with SOAD has been derived and then proved to +match the real posterior under mild assumptions, which shows theoretical +superiority over previous score-based assimilation works. Experimental results +also indicate that our SOAD model may offer improved accuracy over existing +data-driven methods. + +
+
+
+
+
+ + ☆ EUDA: An Efficient Unsupervised Domain Adaptation via Self-Supervised + Vision Transformer + + +
+ Unsupervised domain adaptation (UDA) aims to mitigate the domain shift issue, +where the distribution of training (source) data differs from that of testing +(target) data. Many models have been developed to tackle this problem, and +recently vision transformers (ViTs) have shown promising results. However, the +complexity and large number of trainable parameters of ViTs restrict their +deployment in practical applications. This underscores the need for an +efficient model that not only reduces trainable parameters but also allows for +adjustable complexity based on specific needs while delivering comparable +performance. To achieve this, in this paper we introduce an Efficient +Unsupervised Domain Adaptation (EUDA) framework. EUDA employs DINOv2, a +self-supervised ViT, as a feature extractor followed by a simplified +bottleneck of fully connected layers to refine features for enhanced domain +adaptation. Additionally, EUDA employs the synergistic domain alignment loss +(SDAL), which integrates cross-entropy (CE) and maximum mean discrepancy (MMD) +losses, to balance adaptation by minimizing classification errors in the source +domain while aligning the source and target domain distributions. The +experimental results indicate the effectiveness of EUDA in producing results +comparable to other state-of-the-art domain adaptation methods with +significantly fewer trainable parameters, between 42% and 99.7% fewer. This +showcases the ability to train the model in a resource-limited environment. The +code of the model is available at: https://github.com/A-Abedi/EUDA.
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
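+ <p>A rough sketch of a combined objective in the spirit of SDAL, namely source cross-entropy plus an RBF-kernel MMD term between source and target features, is given below; the loss weight and kernel bandwidth are assumptions, not values from the paper:</p>
+ <pre>
+import numpy as np
+
+def rbf_mmd2(x, y, sigma=1.0):
+    # Squared maximum mean discrepancy with an RBF kernel.
+    def k(a, b):
+        d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
+        return np.exp(-d2 / (2.0 * sigma ** 2))
+    return k(x, x).mean() + k(y, y).mean() - 2.0 * k(x, y).mean()
+
+def cross_entropy(probs, labels):
+    return -np.log(probs[np.arange(labels.size), labels] + 1e-12).mean()
+
+rng = np.random.default_rng(0)
+src_feat = rng.normal(0.0, 1.0, (32, 8))     # stand-in source features
+tgt_feat = rng.normal(0.5, 1.0, (32, 8))     # stand-in target features (shifted)
+src_probs = rng.dirichlet(np.ones(3), size=32)
+src_labels = rng.integers(0, 3, size=32)
+loss = cross_entropy(src_probs, src_labels) + 0.5 * rbf_mmd2(src_feat, tgt_feat)
+print("combined alignment loss:", loss)
+ </pre>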
+ + ☆ MSMA: Multi-agent Trajectory Prediction in Connected and Autonomous + Vehicle Environment with Multi-source Data Integration + + +
+ The prediction of surrounding vehicle trajectories is crucial for +collision-free path planning. In this study, we focus on a scenario where a +connected and autonomous vehicle (CAV) serves as the central agent, utilizing +both sensors and communication technologies to perceive its surrounding +traffic, consisting of autonomous vehicles (AVs), connected vehicles (CVs), and +human-driven vehicles (HDVs). Our trajectory prediction task is aimed at all +the detected surrounding vehicles. To effectively integrate the multi-source +data from both sensor and communication technologies, we propose a deep +learning framework called MSMA utilizing a cross-attention module for +multi-source data fusion. Vector map data is utilized to provide contextual +information. The trajectory dataset is collected in the CARLA simulator with +synthesized data errors introduced. Numerical experiments demonstrate that in a +mixed traffic flow scenario, the integration of data from different sources +enhances our understanding of the environment. This notably improves trajectory +prediction accuracy, particularly in situations with a high CV market +penetration rate. The code is available at: https://github.com/xichennn/MSMA.
+
+
+
+
+
+ + ☆ Who should I trust? A Visual Analytics Approach for Comparing Net Load + Forecasting Models + + +
+ Net load forecasting is crucial for energy planning and facilitating informed +decision-making regarding trade and load distributions. However, evaluating +forecasting models' performance against benchmark models remains challenging, +thereby impeding experts' trust in the model's performance. In this context, +there is a demand for technological interventions that allow scientists to +compare models across various timeframes and solar penetration levels. This +paper introduces a visual analytics-based application designed to compare the +performance of deep-learning-based net load forecasting models with other +models for probabilistic net load forecasting. This application employs +carefully selected visual analytic interventions, enabling users to discern +differences in model performance across different solar penetration levels, +dataset resolutions, and hours of the day over multiple months. We also present +observations made using our application through a case study, demonstrating the +effectiveness of visualizations in aiding scientists in making informed +decisions and enhancing trust in net load forecasting models. + +
+
+ comment: Accepted for publication in the proceedings of 2025 IEEE PES Grid + Edge Technologies Conference & Exposition (Grid Edge) +
+
+
+
+
+ + ☆ A Vectorization Method Induced By Maximal Margin Classification For + Persistent Diagrams + + +
+ Persistent homology is an effective method for extracting topological +information, represented as persistent diagrams, from spatial structure data. +Hence it is well-suited for the study of protein structures. Attempts to +incorporate persistent homology in machine learning methods for protein function +prediction have resulted in several techniques for vectorizing persistent +diagrams. However, current vectorization methods are excessively artificial and +cannot ensure the effective utilization of information or the rationality of +the methods. To address this problem, we propose a more geometrical +vectorization method of persistent diagrams based on maximal margin +classification for Banach spaces, and additionally propose a framework that +utilizes topological data analysis to identify proteins with specific +functions. We evaluated our vectorization method using a binary classification +task on proteins and compared it with the statistical methods that exhibit the +best performance among thirteen commonly used vectorization methods. The +experimental results indicate that our approach surpasses the statistical +methods in both robustness and precision.
+
+
+
+
+
+ + ☆ Decentralized and Uncoordinated Learning of Stable Matchings: A + Game-Theoretic Approach + + +
+ We consider the problem of learning stable matchings in a fully decentralized +and uncoordinated manner. In this problem, there are $n$ men and $n$ women, +each having preference over the other side. It is assumed that women know their +preferences over men, but men are not aware of their preferences over women, +and they only learn them if they propose and successfully get matched to women. +A matching is called stable if no man and woman prefer each other over their +current matches. When all the preferences are known a priori, the celebrated +Deferred-Acceptance algorithm proposed by Gale and Shapley provides a +decentralized and uncoordinated algorithm to obtain a stable matching. However, +when the preferences are unknown, developing such an algorithm faces major +challenges due to a lack of coordination. We achieve this goal by making a +connection between stable matchings and learning Nash equilibria (NE) in +noncooperative games. First, we provide a complete information game formulation +for the stable matching problem with known preferences such that its set of +pure NE coincides with the set of stable matchings, while its mixed NE can be +rounded in a decentralized manner to a stable matching. Relying on such a +game-theoretic formulation, we show that for hierarchical markets, adopting the +exponential weight (EXP) learning algorithm for the stable matching game +achieves logarithmic regret with polynomial dependence on the number of +players, thus answering a question posed in previous literature. Moreover, we +show that the same EXP learning algorithm converges locally and exponentially +fast to a stable matching in general matching markets. We complement this +result by introducing another decentralized and uncoordinated learning +algorithm that globally converges to a stable matching with arbitrarily high +probability, leveraging the weak acyclicity property of the stable matching +game. + +
+
+
+
+
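+ <p>For reference, the classical Deferred-Acceptance algorithm mentioned above (the known-preferences baseline) can be sketched as follows; the paper's harder setting, in which men must learn their preferences online, is not covered by this sketch:</p>
+ <pre>
+def deferred_acceptance(men_prefs, women_prefs):
+    free = list(men_prefs)                   # men who still need a match
+    next_choice = {m: 0 for m in men_prefs}  # index of the next woman to propose to
+    match = {}                               # woman -> currently held man
+    rank = {w: {m: i for i, m in enumerate(p)} for w, p in women_prefs.items()}
+    while free:
+        m = free.pop(0)
+        w = men_prefs[m][next_choice[m]]
+        next_choice[m] += 1
+        if w not in match:
+            match[w] = m
+        elif rank[w][m] < rank[w][match[w]]: # w prefers the new proposer
+            free.append(match[w])
+            match[w] = m
+        else:
+            free.append(m)
+    return {m: w for w, m in match.items()}
+
+men = {"m1": ["w1", "w2"], "m2": ["w1", "w2"]}
+women = {"w1": ["m2", "m1"], "w2": ["m1", "m2"]}
+print(deferred_acceptance(men, women))       # {'m2': 'w1', 'm1': 'w2'} is stable
+ </pre>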
+ + ☆ TrackSorter: A Transformer-based sorting algorithm for track finding in + High Energy Physics + + +
+ Track finding in particle data is a challenging pattern recognition problem +in High Energy Physics. It takes as inputs a point cloud of space points and +labels them so that space points created by the same particle have the same +label. The list of space points with the same label is a track candidate. We +argue that this pattern recognition problem can be formulated as a sorting +problem, of which the inputs are a list of space points sorted by their +distances away from the collision points and the outputs are the space points +sorted by their labels. In this paper, we propose the TrackSorter algorithm: a +Transformer-based algorithm for pattern recognition in particle data. +TrackSorter uses a simple tokenization scheme to convert space points into +discrete tokens. It then uses the tokenized space points as inputs and sorts +the input tokens into track candidates. TrackSorter is a novel end-to-end track +finding algorithm that leverages Transformer-based models to solve pattern +recognition problems. It is evaluated on the TrackML dataset and has good track +finding performance. + +
+
+ comment: 6 pages, 3 figures, to be included in Proceedings of the 22nd + International Workshop on Advanced Computing and Analysis Techniques in + Physics Research (ACAT 2024) +
+
+
+
+
+ + ☆ Robust Box Prompt based SAM for Medical Image Segmentation MICCAI + + +
+ The Segment Anything Model (SAM) can achieve satisfactory segmentation +performance under high-quality box prompts. However, SAM's robustness is +compromised by the decline in box quality, limiting its practicality in +clinical reality. In this study, we propose a novel Robust Box prompt based SAM +(\textbf{RoBox-SAM}) to ensure SAM's segmentation performance under prompts +with different qualities. Our contribution is three-fold. First, we propose a +prompt refinement module to implicitly perceive the potential targets, and +output the offsets to directly transform the low-quality box prompt into a +high-quality one. We then provide an online iterative strategy for further +prompt refinement. Second, we introduce a prompt enhancement module to +automatically generate point prompts to assist the box-promptable segmentation +effectively. Last, we build a self-information extractor to encode the prior +information from the input image. These features can optimize the image +embeddings and attention calculation, thus, the robustness of SAM can be +further enhanced. Extensive experiments on the large medical segmentation +dataset including 99,299 images, 5 modalities, and 25 organs/targets validated +the efficacy of our proposed RoBox-SAM. + +
+
+ comment: Accepted by MICCAI MLMI 2024 +
+
+
+
+
+ + ☆ FedBChain: A Blockchain-enabled Federated Learning Framework for + Improving DeepConvLSTM with Comparative Strategy Insights + + +
+ Recent research in the field of Human Activity Recognition has shown that an +improvement in prediction performance can be achieved by reducing the number of +LSTM layers. However, this kind of enhancement is only significant on +monolithic architectures, and when it runs on large-scale distributed training, +data security and privacy issues will be reconsidered, and its prediction +performance is unknown. In this paper, we introduce a novel framework: +FedBChain, which integrates the federated learning paradigm based on a modified +DeepConvLSTM architecture with a single LSTM layer. This framework performs +comparative tests of prediction performance on three different real-world +datasets based on three different hidden layer units (128, 256, and 512) +combined with five different federated learning strategies, respectively. The +results show that our architecture has significant improvements in Precision, +Recall and F1-score compared to the centralized training approach on all +datasets with all hidden layer units for all strategies: FedAvg strategy +improves on average by 4.54%, FedProx improves on average by 4.57%, +FedTrimmedAvg improves on average by 4.35%, Krum improves by 4.18% on average, +and FedAvgM improves by 4.46% on average. Based on our results, it can be seen +that FedBChain not only improves in performance, but also guarantees the +security and privacy of user data compared to centralized training methods +during the training process. The code for our experiments is publicly available +(https://github.com/Glen909/FedBChain). + +
+
+
+
+
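+ <p>For orientation, the FedAvg-style aggregation that the strategies above build on can be sketched as follows (the weights and client dataset sizes are toy values; the blockchain layer and the other aggregation strategies are not modeled here):</p>
+ <pre>
+import numpy as np
+
+def fedavg(client_weights, client_sizes):
+    # Weighted average of each parameter tensor, weighted by local dataset size.
+    total = float(sum(client_sizes))
+    return [
+        sum(w[i] * (n / total) for w, n in zip(client_weights, client_sizes))
+        for i in range(len(client_weights[0]))
+    ]
+
+rng = np.random.default_rng(0)
+clients = [[rng.normal(size=(4, 2)), rng.normal(size=2)] for _ in range(3)]
+sizes = [100, 50, 150]                       # assumed local dataset sizes
+global_model = fedavg(clients, sizes)
+print(global_model[0].shape, global_model[1])
+ </pre>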
+ + ☆ Enhanced Uncertainty Estimation in Ultrasound Image Segmentation with + MSU-Net MICCAI 2024 + + +
+ Efficient intravascular access in trauma and critical care significantly +impacts patient outcomes. However, the availability of skilled medical +personnel in austere environments is often limited. Autonomous robotic +ultrasound systems can aid in needle insertion for medication delivery and +support non-experts in such tasks. Despite advances in autonomous needle +insertion, inaccuracies in vessel segmentation predictions pose risks. +Understanding the uncertainty of predictive models in ultrasound imaging is +crucial for assessing their reliability. We introduce MSU-Net, a novel +multistage approach for training an ensemble of U-Nets to yield accurate +ultrasound image segmentation maps. We demonstrate substantial improvements, +18.1% over a single Monte Carlo U-Net, enhancing uncertainty evaluations, model +transparency, and trustworthiness. By highlighting areas of model certainty, +MSU-Net can guide safe needle insertions, empowering non-experts to accomplish +such tasks. + +
+
+ comment: Accepted for the 5th International Workshop of Advances in + Simplifying Medical UltraSound (ASMUS), held in conjunction with MICCAI 2024, + the 27th International Conference on Medical Image Computing and Computer + Assisted Intervention +
+
+
+
+
+ + ☆ DDU-Net: A Domain Decomposition-based CNN on Multiple GPUs + + +
+ The segmentation of ultra-high resolution images poses challenges such as +loss of spatial information or computational inefficiency. In this work, a +novel approach that combines encoder-decoder architectures with domain +decomposition strategies to address these challenges is proposed. Specifically, +a domain decomposition-based U-Net (DDU-Net) architecture is introduced, which +partitions input images into non-overlapping patches that can be processed +independently on separate devices. A communication network is added to +facilitate inter-patch information exchange to enhance the understanding of +spatial context. Experimental validation is performed on a synthetic dataset +that is designed to measure the effectiveness of the communication network. +Then, the performance is tested on the DeepGlobe land cover classification +dataset as a real-world benchmark data set. The results demonstrate that the +approach, which includes inter-patch communication for images divided into +$16\times16$ non-overlapping subimages, achieves a $2-3\,\%$ higher +intersection over union (IoU) score compared to the same network without +inter-patch communication. The performance of the network which includes +communication is equivalent to that of a baseline U-Net trained on the full +image, showing that our model provides an effective solution for segmenting +ultra-high-resolution images while preserving spatial context. The code is +available at https://github.com/corne00/HiRes-Seg-CNN. + +
+
+
+
+
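+ <p>The non-overlapping patch partition described above can be sketched as follows (the grid and image sizes are placeholders; the inter-patch communication network is not shown):</p>
+ <pre>
+import numpy as np
+
+def split_into_patches(image, grid=(4, 4)):
+    # Cut the image into a grid of equally sized, non-overlapping patches.
+    h, w = image.shape[:2]
+    ph, pw = h // grid[0], w // grid[1]
+    return [
+        image[i * ph:(i + 1) * ph, j * pw:(j + 1) * pw]
+        for i in range(grid[0]) for j in range(grid[1])
+    ]
+
+image = np.zeros((1024, 1024, 3), dtype=np.uint8)   # placeholder image
+patches = split_into_patches(image, grid=(4, 4))
+print(len(patches), patches[0].shape)               # 16 patches of shape (256, 256, 3)
+ </pre>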
+ + ☆ Tractable and Provably Efficient Distributional Reinforcement Learning + with General Value Function Approximation + + +
+ Distributional reinforcement learning improves performance by effectively +capturing environmental stochasticity, but a comprehensive theoretical +understanding of its effectiveness remains elusive. In this paper, we present a +regret analysis for distributional reinforcement learning with general value +function approximation in a finite episodic Markov decision process setting. We +first introduce a key notion of Bellman unbiasedness for a tractable and +exactly learnable update via statistical functional dynamic programming. Our +theoretical results show that approximating the infinite-dimensional return +distribution with a finite number of moment functionals is the only method to +learn the statistical information unbiasedly, including nonlinear statistical +functionals. Second, we propose a provably efficient algorithm, +$\texttt{SF-LSVI}$, achieving a regret bound of $\tilde{O}(d_E +H^{\frac{3}{2}}\sqrt{K})$ where $H$ is the horizon, $K$ is the number of +episodes, and $d_E$ is the eluder dimension of a function class. + +
+
+
+
+
+ + ♻ ☆ PerAct2: Benchmarking and Learning for Robotic Bimanual Manipulation + Tasks + + +
+ Bimanual manipulation is challenging due to precise spatial and temporal +coordination required between two arms. While there exist several real-world +bimanual systems, there is a lack of simulated benchmarks with a large task +diversity for systematically studying bimanual capabilities across a wide range +of tabletop tasks. This paper addresses the gap by extending RLBench to +bimanual manipulation. We open-source our code and benchmark comprising 13 new +tasks with 23 unique task variations, each requiring a high degree of +coordination and adaptability. To kickstart the benchmark, we extended several +state-of-the art methods to bimanual manipulation and also present a +language-conditioned behavioral cloning agent -- PerAct2, which enables the +learning and execution of bimanual 6-DoF manipulation tasks. Our novel network +architecture efficiently integrates language processing with action prediction, +allowing robots to understand and perform complex bimanual tasks in response to +user-specified goals. Project website with code is available at: +http://bimanual.github.io + +
+
+
+
+
+ + ♻ ☆ Occam Gradient Descent + + +
+ Deep learning neural network models must be large enough to adapt to their +problem domain, while small enough to avoid overfitting training data during +gradient descent. To balance these competing demands, overprovisioned deep +learning models such as transformers are trained for a single epoch on large +data sets, and hence inefficient with both computing resources and training +data. In response to these inefficiencies, we exploit learning theory to derive +Occam Gradient Descent, an algorithm that interleaves adaptive reduction of +model size to minimize generalization error, with gradient descent on model +weights to minimize fitting error. In contrast, traditional gradient descent +greedily minimizes fitting error without regard to generalization error. Our +algorithm simultaneously descends the space of weights and topological size of +any neural network without modification. With respect to loss, compute and +model size, our experiments show (a) on image classification benchmarks, linear +and convolutional neural networks trained with Occam Gradient Descent +outperform traditional gradient descent with or without post-train pruning; (b) +on a range of tabular data classification tasks, neural networks trained with +Occam Gradient Descent outperform traditional gradient descent, as well as +Random Forests; (c) on natural language transformers, Occam Gradient Descent +outperforms traditional gradient descent. + +
+
+
+
+
+ + ♻ ☆ MoFO: Momentum-Filtered Optimizer for Mitigating Forgetting in LLM + Fine-Tuning + + +
+ Recently, large language models (LLMs) have demonstrated remarkable +capabilities in a wide range of tasks. Typically, an LLM is pre-trained on +large corpora and subsequently fine-tuned on task-specific datasets. However, +during fine-tuning, LLMs may forget the knowledge acquired in the pre-training +stage, leading to a decline in general capabilities. To address this issue, we +propose a new fine-tuning algorithm termed Momentum-Filtered Optimizer (MoFO). +The key idea of MoFO is to iteratively select and update the model parameters +with the largest momentum magnitudes. Compared to full-parameter training, MoFO +achieves similar fine-tuning performance while keeping parameters closer to the +pre-trained model, thereby mitigating knowledge forgetting. Unlike most +existing methods for forgetting mitigation, MoFO combines the following two +advantages. First, MoFO does not require access to pre-training data. This +makes MoFO particularly suitable for fine-tuning scenarios where pre-training +data is unavailable, such as fine-tuning checkpoint-only open-source LLMs. +Second, MoFO does not alter the original loss function. This could avoid +impairing the model performance on the fine-tuning tasks. We validate MoFO +through rigorous convergence analysis and extensive experiments, demonstrating +its superiority over existing methods in mitigating forgetting and enhancing +fine-tuning performance. + +
+
+
+
+
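+ <p>A toy sketch of the momentum-filtering idea (update only the parameters with the largest momentum magnitudes) is given below; the update fraction, partitioning, and optimizer details are assumptions rather than MoFO's exact procedure:</p>
+ <pre>
+import numpy as np
+
+def momentum_filtered_step(params, grads, momentum, lr=0.01, beta=0.9, frac=0.1):
+    momentum = beta * momentum + (1.0 - beta) * grads
+    k = max(1, int(frac * params.size))
+    idx = np.argsort(np.abs(momentum).ravel())[-k:]   # largest-|momentum| entries
+    mask = np.zeros(params.size, dtype=bool)
+    mask[idx] = True
+    params = params - lr * (momentum.ravel() * mask).reshape(params.shape)
+    return params, momentum
+
+rng = np.random.default_rng(0)
+params, momentum = rng.normal(size=(8, 8)), np.zeros((8, 8))
+before = params.copy()
+params, momentum = momentum_filtered_step(params, rng.normal(size=(8, 8)), momentum)
+print(int((params != before).sum()), "of", params.size, "parameters updated")
+ </pre>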
+ + ♻ ☆ FedADMM-InSa: An Inexact and Self-Adaptive ADMM for Federated Learning + + +
+ Federated learning (FL) is a promising framework for learning from +distributed data while maintaining privacy. The development of efficient FL +algorithms encounters various challenges, including heterogeneous data and +systems, limited communication capacities, and constrained local computational +resources. Recently developed FedADMM methods show great resilience to both +data and system heterogeneity. However, they still suffer from performance +deterioration if the hyperparameters are not carefully tuned. To address this +issue, we propose an inexact and self-adaptive FedADMM algorithm, termed +FedADMM-InSa. First, we design an inexactness criterion for the clients' local +updates to eliminate the need for empirically setting the local training +accuracy. This inexactness criterion can be assessed by each client +independently based on its unique condition, thereby reducing the local +computational cost and mitigating the undesirable straggler effect. The +convergence of the resulting inexact ADMM is proved under the assumption of +strongly convex loss functions. Additionally, we present a self-adaptive scheme +that dynamically adjusts each client's penalty parameter, enhancing algorithm +robustness by mitigating the need for empirical penalty parameter choices for +each client. Extensive numerical experiments on both synthetic and real-world +datasets are conducted. As validated by these numerical tests, our proposed +algorithm can reduce the clients' local computational load significantly and +also accelerate the learning process compared to the vanilla FedADMM.
+
+
+
+
+
+ + ♻ ☆ Optimal Decision Tree and Adaptive Submodular Ranking with Noisy + Outcomes + + +
+ In pool-based active learning, the learner is given an unlabeled data set and +aims to efficiently learn the unknown hypothesis by querying the labels of the +data points. This can be formulated as the classical Optimal Decision Tree +(ODT) problem: Given a set of tests, a set of hypotheses, and an outcome for +each pair of test and hypothesis, our objective is to find a low-cost testing +procedure (i.e., decision tree) that identifies the true hypothesis. This +optimization problem has been extensively studied under the assumption that +each test generates a deterministic outcome. However, in numerous applications, +for example, clinical trials, the outcomes may be uncertain, which renders the +ideas from the deterministic setting invalid. In this work, we study a +fundamental variant of the ODT problem in which some test outcomes are noisy, +even in the more general case where the noise is persistent, i.e., repeating a +test gives the same noisy output. Our approximation algorithms provide +guarantees that are nearly best possible and hold for the general case of a +large number of noisy outcomes per test or per hypothesis where the performance +degrades continuously with this number. We numerically evaluated our algorithms +for identifying toxic chemicals and learning linear classifiers, and observed +that our algorithms have costs very close to the information-theoretic minimum. + +
+
+
+
+
+ + ♻ ☆ A Survey on Self-Supervised Graph Foundation Models: Knowledge-Based + Perspective + + +
+ Graph self-supervised learning (SSL) is now a go-to method for pre-training +graph foundation models (GFMs). There is a wide variety of knowledge patterns +embedded in the graph data, such as node properties and clusters, which are +crucial to learning generalized representations for GFMs. However, existing +surveys of GFMs have several shortcomings: they lack comprehensiveness +regarding the most recent progress, have unclear categorization of +self-supervised methods, and take a limited architecture-based perspective that +is restricted to only certain types of graph models. As the ultimate goal of +GFMs is to learn generalized graph knowledge, we provide a comprehensive survey +of self-supervised GFMs from a novel knowledge-based perspective. We propose a +knowledge-based taxonomy, which categorizes self-supervised graph models by the +specific graph knowledge utilized. Our taxonomy consists of microscopic (nodes, +links, etc.), mesoscopic (context, clusters, etc.), and macroscopic knowledge +(global structure, manifolds, etc.). It covers a total of 9 knowledge +categories and more than 25 pretext tasks for pre-training GFMs, as well as +various downstream task generalization strategies. Such a knowledge-based +taxonomy allows us to re-examine graph models based on new architectures more +clearly, such as graph language models, as well as provide more in-depth +insights for constructing GFMs. + +
+
+ comment: 21 pages, 7 figures; work in progress +
+
+
+
+
+ + ♻ ☆ Exact Fractional Inference via Re-Parametrization & Interpolation + between Tree-Re-Weighted- and Belief Propagation- Algorithms + + +
+ The computational complexity of inference -- required to compute the +partition function, $Z$, of an Ising model over a graph of $N$ "spins" -- is +most likely exponential in $N$. Efficient variational methods, such as Belief +Propagation (BP) and Tree Re-Weighted (TRW) algorithms, compute $Z$ +approximately by minimizing the respective (BP- or TRW-) free energy. We +generalize the variational scheme by building a $\lambda$-fractional +interpolation, $Z^{(\lambda)}$, where $\lambda=0$ and $\lambda=1$ correspond to +TRW- and BP-approximations, respectively. This fractional scheme -- coined +Fractional Belief Propagation (FBP) -- guarantees that in the attractive +(ferromagnetic) case $Z^{(TRW)} \geq Z^{(\lambda)} \geq Z^{(BP)}$, and there +exists a unique ("exact") $\lambda_*$ such that $Z=Z^{(\lambda_*)}$. +Generalizing the re-parametrization approach of +\citep{wainwright_tree-based_2002} and the loop series approach of +\citep{chertkov_loop_2006}, we show how to express $Z$ as a product, $\forall +\lambda:\ Z=Z^{(\lambda)}{\tilde Z}^{(\lambda)}$, where the multiplicative +correction, ${\tilde Z}^{(\lambda)}$, is an expectation over a node-independent +probability distribution built from node-wise fractional marginals. Our +theoretical analysis is complemented by extensive experiments with models from +Ising ensembles over planar and random graphs of medium- and large-sizes. The +empirical study yields a number of interesting observations, such as the +ability to estimate ${\tilde Z}^{(\lambda)}$ with $O(N^{2::4})$ fractional +samples and suppression of $\lambda_*$ fluctuations with an increase in $N$ for +instances from a particular random Ising ensemble. We also verify and discuss +the applicability of this approach to the problem of image de-noising. + +
+
+
+
+
+ + ♻ ☆ Is $F_1$ Score Suboptimal for Cybersecurity Models? Introducing + $C_{score}$, a Cost-Aware Alternative for Model Assessment + + +
+ The costs of errors related to machine learning classifiers, namely, false +positives and false negatives, are not equal and are application dependent. For +example, in cybersecurity applications, the cost of not detecting an attack is +very different from marking a benign activity as an attack. Various design +choices during machine learning model building, such as hyperparameter tuning +and model selection, allow a data scientist to trade off between these two +errors. However, most of the commonly used metrics to evaluate model quality, +such as $F_1$ score, which is defined in terms of model precision and recall, +treat both these errors equally, making it difficult for users to optimize for +the actual cost of these errors. In this paper, we propose a new cost-aware +metric, $C_{score}$, based on precision and recall, that can replace $F_1$ score +for model evaluation and selection. It includes a cost ratio that takes into +account the differing costs of handling false positives and false negatives. We +derive and characterize the new cost metric, and compare it to $F_1$ score. +Further, we use this metric for model thresholding for five cybersecurity +related datasets for multiple cost ratios. The results show an average cost +savings of 49%. + +
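The abstract does not give the closed form of $C_{score}$; the sketch below only illustrates the underlying idea of cost-aware thresholding, where a cost ratio between false negatives and false positives replaces the symmetric treatment used by $F_1$. The cost formula and names here are illustrative assumptions, not the paper's definition.

```python
import numpy as np

def expected_cost(y_true, scores, threshold, cost_ratio=5.0):
    """Total error cost at a threshold, assuming a false negative is
    `cost_ratio` times as costly as a false positive (illustrative only,
    not the paper's C_score)."""
    y_pred = (scores >= threshold).astype(int)
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    return fp + cost_ratio * fn

def pick_threshold(y_true, scores, cost_ratio=5.0):
    """Choose the score threshold that minimizes the expected cost,
    instead of the one that maximizes F1."""
    candidates = np.unique(scores)
    costs = [expected_cost(y_true, scores, t, cost_ratio) for t in candidates]
    return candidates[int(np.argmin(costs))]
```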
+
+
+
+
+ + ♻ ☆ Early detection of inflammatory arthritis to improve referrals using + multimodal machine learning from blood testing, semi-structured and + unstructured patient records + + +
+ Early detection of inflammatory arthritis (IA) is critical to efficient and +accurate hospital referral triage for timely treatment and preventing the +deterioration of the IA disease course, especially under limited healthcare +resources. The manual assessment process is the most common approach in +practice for the early detection of IA, but it is extremely labor-intensive and +inefficient. A large amount of clinical information needs to be assessed for +every referral from General Practice (GP) to the hospitals. Machine learning +shows great potential in automating repetitive assessment tasks and providing +decision support for the early detection of IA. However, most machine +learning-based methods for IA detection rely on blood testing results. But in +practice, blood testing data is not always available at the point of referrals, +so we need methods to leverage multimodal data such as semi-structured and +unstructured data for early detection of IA. In this research, we present +fusion and ensemble learning-based methods using multimodal data to assist +decision-making in the early detection of IA, and a conformal prediction-based +method to quantify the uncertainty of the prediction and detect any unreliable +predictions. To the best of our knowledge, our study is the first attempt to +utilize multimodal data to support the early detection of IA from GP referrals. + +
+
+ comment: We found some issues in data preprocessing, which will impact the + final result. Therefore we would like to withdraw the paper +
+
+
+
+
+ + ♻ ☆ PP-TIL: Personalized Planning for Autonomous Driving with Instance-based + Transfer Imitation Learning IROS 2024 + + +
+ Personalized motion planning holds significant importance within urban +automated driving, catering to the unique requirements of individual users. +Nevertheless, prior endeavors have frequently encountered difficulties in +simultaneously addressing two crucial aspects: personalized planning within +intricate urban settings and enhancing planning performance through data +utilization. The challenge arises from the expensive and limited nature of user +data, coupled with the scene state space tending towards infinity. These +factors contribute to overfitting and poor generalization problems during model +training. Hence, we propose an instance-based transfer imitation learning +approach. This method facilitates knowledge transfer from extensive expert +domain data to the user domain, presenting a fundamental resolution to these +issues. We first pre-train a model using large-scale expert data. +Subsequently, during the fine-tuning phase, we feed in batch data comprising +both expert and user data. Employing the inverse reinforcement learning +technique, we extract the style feature distribution from user demonstrations, +constructing the regularization term for the approximation of user style. In +our experiments, we conducted extensive evaluations of the proposed method. +Compared to the baseline methods, our approach mitigates the overfitting issue +caused by sparse user data. Furthermore, we discovered that integrating the +driving model with a differentiable nonlinear optimizer as a safety protection +layer for end-to-end personalized fine-tuning results in superior planning +performance. + +
+
+ comment: IROS 2024 Accepted +
+
+
+
+
+ + ♻ ☆ Figure it Out: Analyzing-based Jailbreak Attack on Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) has brought remarkable +generative capabilities across diverse tasks. However, despite the impressive +achievements, these models still have numerous security vulnerabilities, +particularly when faced with jailbreak attacks. Therefore, investigating +jailbreak attacks can uncover hidden weaknesses in LLMs and guide us in +developing more robust defense mechanisms to fortify their security. In this +paper, we further explore the boundary of jailbreak attacks on LLMs and propose +Analyzing-based Jailbreak (ABJ). This effective jailbreak attack method takes +advantage of LLMs' growing analysis and reasoning capabilities and reveals their +underlying vulnerabilities when facing analysis-based tasks. We conduct a +detailed evaluation of ABJ across various open-source and closed-source LLMs, +which achieves 94.8% Attack Success Rate (ASR) and 1.06 Attack Efficiency (AE) +on GPT-4-turbo-0409, demonstrating state-of-the-art attack effectiveness and +efficiency. Our research highlights the importance of prioritizing and +enhancing the safety of LLMs to mitigate the risks of misuse. The code is +publicly available at https://github.com/theshi-1128/ABJ-Attack. + +
+
+
+
+
+ + ♻ ☆ Practical aspects for the creation of an audio dataset from field + recordings with optimized labeling budget with AI-assisted strategy ICML 2024 + + +
+ Machine Listening focuses on developing technologies to extract relevant +information from audio signals. A critical aspect of these projects is the +acquisition and labeling of contextualized data, which is inherently complex +and requires specific resources and strategies. Despite the availability of +some audio datasets, many are unsuitable for commercial applications. The paper +emphasizes the importance of Active Learning (AL) using expert labelers over +crowdsourcing, which often lacks detailed insights into dataset structures. AL +is an iterative process combining human labelers and AI models to optimize the +labeling budget by intelligently selecting samples for human review. This +approach addresses the challenge of handling large, constantly growing datasets +that exceed available computational resources and memory. The paper presents a +comprehensive data-centric framework for Machine Listening projects, detailing +the configuration of recording nodes, database structure, and labeling budget +optimization in resource-constrained scenarios. Applied to an industrial port +in Valencia, Spain, the framework successfully labeled 6540 ten-second audio +samples over five months with a small team, demonstrating its effectiveness and +adaptability to various resource availability situations. + Acknowledgments: The participation of Javier Naranjo-Alcazar, Jordi Grau-Haro +and Pedro Zuccarello in this research was funded by the Valencian Institute for +Business Competitiveness (IVACE) and the FEDER funds by means of project +Soroll-IA2 (IMDEEA/2023/91). + +
+
+ comment: Submitted to ICML 2024 Workshop on Data-Centric Machine Learning + Research +
+
+
+
+
+ + ♻ ☆ Towards Generalizable Reinforcement Learning via Causality-Guided + Self-Adaptive Representations NeurIPS24 + + +
+ General intelligence requires quick adaptation across tasks. While existing +reinforcement learning (RL) methods have made progress in generalization, they +typically assume only distribution changes between source and target domains. +In this paper, we explore a wider range of scenarios where both the +distribution and environment spaces may change. For example, in Atari games, we +train agents to generalize to tasks with different levels of mode and +difficulty, where there could be new state or action variables that never +occurred in previous environments. To address this challenging setting, we +introduce a causality-guided self-adaptive representation-based approach, +called CSR, that equips the agent to generalize effectively and efficiently +across a sequence of tasks with evolving dynamics. Specifically, we employ +causal representation learning to characterize the latent causal variables and +world models within the RL system. Such compact causal representations uncover +the structural relationships among variables, enabling the agent to +autonomously determine whether changes in the environment stem from +distribution shifts or variations in space, and to precisely locate these +changes. We then devise a three-step strategy to fine-tune the model under +different scenarios accordingly. Empirical experiments show that CSR +efficiently adapts to the target domains with only a few samples and +outperforms state-of-the-art baselines on a wide range of scenarios, including +our simulated environments, Cartpole, and Atari games. + +
+
+ comment: This paper was submitted to NeurIPS24. According to the reviews, + there are some mistakes in the Theorems in this papers. Moreover, we will + choose some other environments for experiments, which means that it takes at + least months to update/rewrite the Experiment & Appendix Sections. So we need + to withdraw this paper for major revision +
+
+
+
+
+ + ♻ ☆ Analysis of Total Variation Minimization for Clustered Federated + Learning + + +
+ A key challenge in federated learning applications is the statistical +heterogeneity of local datasets. Clustered federated learning addresses this +challenge by identifying clusters of local datasets that are approximately +homogeneous. One recent approach to clustered federated learning is generalized +total variation minimization (GTVMin). This approach requires a similarity +graph which can be obtained by domain expertise or in a data-driven fashion via +graph learning techniques. Under a widely applicable clustering assumption, we +derive an upper bound on the deviation between GTVMin solutions and their +cluster-wise averages. This bound provides valuable insights into the +effectiveness and robustness of GTVMin in addressing statistical heterogeneity +within federated learning environments. + +
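For readers unfamiliar with GTVMin, a common way to write the objective is shown below; the notation is assumed here (some variants use a squared norm), so see the paper for the precise formulation:

$$\min_{\{w_i\}} \; \sum_{i \in \mathcal{V}} L_i(w_i) \;+\; \alpha \sum_{\{i,j\} \in \mathcal{E}} A_{ij}\,\bigl\| w_i - w_j \bigr\|,$$

where $L_i$ is the local loss at client $i$, $A_{ij}$ are the similarity-graph edge weights, and $\alpha$ controls how strongly parameters are pooled across similar clients.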
+
+
+
+
+ + ♻ ☆ Diversifying AI: Towards Creative Chess with AlphaZero + + +
+ In recent years, Artificial Intelligence (AI) systems have surpassed human +intelligence in a variety of computational tasks. However, AI systems, like +humans, make mistakes, have blind spots, hallucinate, and struggle to +generalize to new situations. This work explores whether AI can benefit from +creative decision-making mechanisms when pushed to the limits of its +computational rationality. In particular, we investigate whether a team of +diverse AI systems can outperform a single AI in challenging tasks by +generating more ideas as a group and then selecting the best ones. We study +this question in the game of chess, the so-called drosophila of AI. We build on +AlphaZero (AZ) and extend it to represent a league of agents via a +latent-conditioned architecture, which we call AZ_db. We train AZ_db to +generate a wider range of ideas using behavioral diversity techniques and +select the most promising ones with sub-additive planning. Our experiments +suggest that AZ_db plays chess in diverse ways, solves more puzzles as a group +and outperforms a more homogeneous team. Notably, AZ_db solves twice as many +challenging puzzles as AZ, including the challenging Penrose positions. When +playing chess from different openings, we notice that players in AZ_db +specialize in different openings, and that selecting a player for each opening +using sub-additive planning results in a 50 Elo improvement over AZ. Our +findings suggest that diversity bonuses emerge in teams of AI agents, just as +they do in teams of humans and that diversity is a valuable asset in solving +computationally hard problems. + +
+
+
+
+
+ + ♻ ☆ A comparison between black-, grey- and white-box modeling for the + bidirectional Raman amplifier optimization + + +
+ Designing and optimizing optical amplifiers to maximize system performance is +becoming increasingly important as optical communication systems strive to +increase throughput. Offline optimization of optical amplifiers relies on +models ranging from white-box models deeply rooted in physics to black-box +data-driven and physics-agnostic models. Here, we compare the capabilities of +white-, grey- and black-box models on the challenging test case of optimizing a +bidirectional distributed Raman amplifier to achieve a target +frequency-distance signal power profile. We show that any of the studied +methods can achieve similar frequency and distance flatness of between 1 and +3.6 dB (depending on the definition of flatness) over the C-band in an 80-km +span. Then, we discuss the models' applicability, advantages, and drawbacks +based on the target application scenario, in particular in terms of +flexibility, optimization speed, and access to training data. + +
+
+
+
+
+ + ♻ ☆ Position: An Inner Interpretability Framework for AI Inspired by Lessons + from Cognitive Neuroscience ICML 2024 + + +
+ Inner Interpretability is a promising emerging field tasked with uncovering +the inner mechanisms of AI systems, though how to develop these mechanistic +theories is still much debated. Moreover, recent critiques raise issues that +question its usefulness to advance the broader goals of AI. However, it has +been overlooked that these issues resemble those that have been grappled with +in another field: Cognitive Neuroscience. Here we draw the relevant connections +and highlight lessons that can be transferred productively between fields. +Based on these, we propose a general conceptual framework and give concrete +methodological strategies for building mechanistic explanations in AI inner +interpretability research. With this conceptual framework, Inner +Interpretability can fend off critiques and position itself on a productive +path to explain AI systems. + +
+
+ comment: Accepted at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Conditional Quantile Estimation for Uncertain Watch Time in Short-Video + Recommendation + + +
+ Accurately predicting watch time is crucial for optimizing recommendations +and user experience in short video platforms. However, existing methods that +estimate a single average watch time often fail to capture the inherent +uncertainty and diversity in user engagement patterns. In this paper, we +propose the Conditional Quantile Estimation (CQE) framework to model the entire +conditional distribution of watch time. Using quantile regression, CQE +characterizes the complex watch-time distribution for each user-video pair, +providing a flexible and comprehensive approach to understanding user behavior. +We further design multiple strategies to combine the quantile estimates, +adapting to different recommendation scenarios and user preferences. Extensive +offline experiments and online A/B tests demonstrate the superiority of CQE in +watch time prediction and user engagement modeling. In particular, the online +deployment of CQE in KuaiShow has led to significant improvements in key +evaluation metrics, including active days, active users, engagement duration, +and video view counts. These results highlight the practical impact of our +proposed approach in enhancing the user experience and overall performance of +the short video recommendation system. The code will be released after +publication. + +
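Quantile regression of the kind described in this abstract is typically trained with the pinball (quantile) loss; the sketch below shows that loss for a set of quantile levels. It is a generic illustration of conditional quantile estimation, not the paper's model.

```python
import torch

def pinball_loss(pred, target, quantiles):
    """Pinball loss for multi-quantile watch-time prediction.
    pred: (batch, num_quantiles), target: (batch,), quantiles: list of levels."""
    losses = []
    for i, q in enumerate(quantiles):
        err = target - pred[:, i]
        losses.append(torch.maximum(q * err, (q - 1) * err).mean())
    return sum(losses) / len(losses)

# Example: estimate the 10th, 50th, and 90th percentiles of watch time (seconds).
quantiles = [0.1, 0.5, 0.9]
pred = torch.randn(32, len(quantiles))
target = torch.rand(32) * 60.0
loss = pinball_loss(pred, target, quantiles)
```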
+
+ comment: 8 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Manifold learning in Wasserstein space + + +
+ This paper aims at building the theoretical foundations for manifold learning +algorithms in the space of absolutely continuous probability measures on a +compact and convex subset of $\mathbb{R}^d$, metrized with the Wasserstein-2 +distance $\mathrm{W}$. We begin by introducing a construction of submanifolds +$\Lambda$ of probability measures equipped with metric $\mathrm{W}_\Lambda$, +the geodesic restriction of $W$ to $\Lambda$. In contrast to other +constructions, these submanifolds are not necessarily flat, but still allow for +local linearizations in a similar fashion to Riemannian submanifolds of +$\mathbb{R}^d$. We then show how the latent manifold structure of +$(\Lambda,\mathrm{W}_{\Lambda})$ can be learned from samples +$\{\lambda_i\}_{i=1}^N$ of $\Lambda$ and pairwise extrinsic Wasserstein +distances $\mathrm{W}$ only. In particular, we show that the metric space +$(\Lambda,\mathrm{W}_{\Lambda})$ can be asymptotically recovered in the sense +of Gromov--Wasserstein from a graph with nodes $\{\lambda_i\}_{i=1}^N$ and edge +weights $W(\lambda_i,\lambda_j)$. In addition, we demonstrate how the tangent +space at a sample $\lambda$ can be asymptotically recovered via spectral +analysis of a suitable "covariance operator" using optimal transport maps from +$\lambda$ to sufficiently close and diverse samples $\{\lambda_i\}_{i=1}^N$. +The paper closes with some explicit constructions of submanifolds $\Lambda$ and +numerical examples on the recovery of tangent spaces through spectral analysis. + +
+
+
+
+
+ + ♻ ☆ Inverse Concave-Utility Reinforcement Learning is Inverse Game Theory + + +
+ We consider inverse reinforcement learning problems with concave utilities. +Concave Utility Reinforcement Learning (CURL) is a generalisation of the +standard RL objective, which employs a concave function of the state occupancy +measure, rather than a linear function. CURL has garnered recent attention for +its ability to represent instances of many important applications, including +standard RL as well as imitation learning, pure exploration, constrained MDPs, +offline RL, human-regularized RL, and others. Inverse reinforcement learning is +a powerful paradigm that focuses on recovering an unknown reward function that +can rationalize the observed behaviour of an agent. There have been recent +theoretical advances in inverse RL where the problem is formulated as +identifying the set of feasible reward functions. However, inverse RL for CURL +problems has not been considered previously. In this paper we show that most of +the standard IRL results do not apply to CURL in general, since CURL +invalidates the classical Bellman equations. This calls for a new theoretical +framework for the inverse CURL problem. Using a recent equivalence result +between CURL and Mean-field Games, we propose a new definition for the feasible +rewards for I-CURL by proving that this problem is equivalent to an inverse +game theory problem in a subclass of mean-field games. We present initial query +and sample complexity results for the I-CURL problem under assumptions such as +Lipschitz-continuity. Finally, we outline future directions and applications in +human--AI collaboration enabled by our results. + +
+
+
+
+
+ + ♻ ☆ Scalable Bayesian uncertainty quantification with data-driven priors for + radio interferometric imaging + + +
+ Next-generation radio interferometers like the Square Kilometer Array have +the potential to unlock scientific discoveries thanks to their unprecedented +angular resolution and sensitivity. One key to unlocking their potential +resides in handling the deluge and complexity of incoming data. This challenge +requires building radio interferometric imaging methods that can cope with the +massive data sizes and provide high-quality image reconstructions with +uncertainty quantification (UQ). This work proposes a method coined QuantifAI +to address UQ in radio-interferometric imaging with data-driven (learned) +priors for high-dimensional settings. Our model, rooted in the Bayesian +framework, uses a physically motivated model for the likelihood. The model +exploits a data-driven convex prior, which can encode complex information +learned implicitly from simulations and guarantee the log-concavity of the +posterior. We leverage probability concentration phenomena of high-dimensional +log-concave posteriors that let us obtain information about the posterior, +avoiding MCMC sampling techniques. We rely on convex optimisation methods to +compute the MAP estimation, which is known to be faster and better scale with +dimension than MCMC sampling strategies. Our method allows us to compute local +credible intervals, i.e., Bayesian error bars, and perform hypothesis testing +of structure on the reconstructed image. In addition, we propose a novel +blazing-fast method to compute pixel-wise uncertainties at different scales. We +demonstrate our method by reconstructing radio-interferometric images in a +simulated setting and carrying out fast and scalable UQ, which we validate with +MCMC sampling. Our method shows an improved image quality and more meaningful +uncertainties than the benchmark method based on a sparsity-promoting prior. +QuantifAI's source code: https://github.com/astro-informatics/QuantifAI. + +
+
+ comment: 30 pages, 14 figures, 10 tables, code available at + https://github.com/astro-informatics/QuantifAI +
+
+
+
+
+ + ♻ ☆ Empirical Capacity Model for Self-Attention Neural Networks + + +
+ Large pretrained self-attention neural networks, or transformers, have been +very successful in various tasks recently. The performance of a model on a +given task depends on its ability to memorize and generalize the training data. +Large transformer models, which may have billions of parameters, in theory have +a huge capacity to memorize content. However, the current algorithms for the +optimization fall short of the theoretical capacity, and the capacity is also +highly dependent on the content. In this paper, we focus on the memory capacity +of these models obtained using common training algorithms and synthetic +training data. Based on the results, we derive an empirical capacity model +(ECM) for a generic transformer. The ECM can be used to design task-specific +transformer models with an optimal number of parameters in cases where the +target memorization capability of the task can be defined. + +
+
+ comment: Submitted to BNAIC'24, 14 pages + refs +
+
+
+
+
+ + ♻ ☆ Understanding Prediction Discrepancies in Machine Learning Classifiers + + +
+ A multitude of classifiers can be trained on the same data to achieve similar +performances during test time, while having learned significantly different +classification patterns. This phenomenon, which we call prediction +discrepancies, is often associated with the blind selection of one model +instead of another with similar performances. When making a choice, the machine +learning practitioner has no understanding on the differences between models, +their limits, where they agree and where they don't. But his/her choice will +result in concrete consequences for instances to be classified in the +discrepancy zone, since the final decision will be based on the selected +classification pattern. Besides the arbitrary nature of the result, a bad +choice could have further negative consequences such as loss of opportunity or +lack of fairness. This paper proposes to address this question by analyzing the +prediction discrepancies in a pool of best-performing models trained on the +same data. A model-agnostic algorithm, DIG, is proposed to capture and explain +discrepancies locally, to enable the practitioner to make the best educated +decision when selecting a model by anticipating its potential undesired +consequences. All the code to reproduce the experiments is available. + +
+
+
+
+
+ + ♻ ☆ Enhancing and Assessing Instruction-Following with Fine-Grained + Instruction Variants + + +
+ The effective alignment of Large Language Models (LLMs) with precise +instructions is essential for their application in diverse real-world +scenarios. Current methods focus on enhancing the diversity and complexity of +training and evaluation samples, yet they fall short in accurately assessing +LLMs' ability to follow similar instruction variants. We introduce an effective +data augmentation technique that decomposes complex instructions into simpler +sub-components, modifies these, and reconstructs them into new variants, +thereby preserving the original instruction's context and complexity while +introducing variability, which is critical for training and evaluating LLMs' +instruction-following precision. We developed the DeMoRecon dataset using this +method to both fine-tune and evaluate LLMs. Our findings show that LLMs +fine-tuned with DeMoRecon gain a significant performance boost on both our +benchmark and commonly used instruction-following benchmarks. + +
+
+
+
+
+ + ♻ ☆ Synthetic Tabular Data Validation: A Divergence-Based Approach + + +
+ The ever-increasing use of generative models in various fields where tabular +data is used highlights the need for robust and standardized validation metrics +to assess the similarity between real and synthetic data. Current methods lack +a unified framework and rely on diverse and often inconclusive statistical +measures. Divergences, which quantify discrepancies between data distributions, +offer a promising avenue for validation. However, traditional approaches +calculate divergences independently for each feature due to the complexity of +joint distribution modeling. This paper addresses this challenge by proposing a +novel approach that uses divergence estimation to overcome the limitations of +marginal comparisons. Our core contribution lies in applying a divergence +estimator to build a validation metric considering the joint distribution of +real and synthetic data. We leverage a probabilistic classifier to approximate +the density ratio between datasets, allowing the capture of complex +relationships. We specifically calculate two divergences: the well-known +Kullback-Leibler (KL) divergence and the Jensen-Shannon (JS) divergence. KL +divergence offers an established use in the field, while JS divergence is +symmetric and bounded, providing a reliable metric. The efficacy of this +approach is demonstrated through a series of experiments with varying +distribution complexities. The initial phase involves comparing estimated +divergences with analytical solutions for simple distributions, setting a +benchmark for accuracy. Finally, we validate our method on a real-world dataset +and its corresponding synthetic counterpart, showcasing its effectiveness in +practical applications. This research offers a significant contribution with +applicability beyond tabular data and the potential to improve synthetic data +validation in various fields. + +
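The density-ratio trick mentioned in this abstract can be sketched as follows: train a probabilistic classifier to distinguish real from synthetic rows, then turn its predicted probabilities into a divergence estimate. Function and variable names are illustrative, and the paper's estimator may differ in detail; the KL case is shown here.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def kl_via_classifier(real, synthetic):
    """Estimate KL(real || synthetic) with a probabilistic classifier.
    Assuming balanced classes, the classifier's log-odds approximate the
    log density ratio log p_real / p_synth."""
    X = np.vstack([real, synthetic])
    y = np.concatenate([np.ones(len(real)), np.zeros(len(synthetic))])
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    p = np.clip(clf.predict_proba(real)[:, 1], 1e-6, 1 - 1e-6)
    log_ratio = np.log(p) - np.log(1.0 - p)   # log-odds on real samples
    return float(np.mean(log_ratio))          # Monte Carlo estimate of the KL
```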
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Explainable Time Series Anomaly Detection using Masked Latent Generative + Modeling + + +
+ We present a novel time series anomaly detection method that achieves +excellent detection accuracy while offering a superior level of explainability. +Our proposed method, TimeVQVAE-AD, leverages masked generative modeling adapted +from the cutting-edge time series generation method known as TimeVQVAE. The +prior model is trained on the discrete latent space of a time-frequency domain. +Notably, the dimensional semantics of the time-frequency domain are preserved +in the latent space, enabling us to compute anomaly scores across different +frequency bands, which provides a better insight into the detected anomalies. +Additionally, the generative nature of the prior model allows for sampling +likely normal states for detected anomalies, enhancing the explainability of +the detected anomalies through counterfactuals. Our experimental evaluation on +the UCR Time Series Anomaly archive demonstrates that TimeVQVAE-AD +significantly surpasses the existing methods in terms of detection accuracy and +explainability. We provide our implementation on GitHub: +https://github.com/ML4ITS/TimeVQVAE-AnomalyDetection. + +
+
+ comment: Published in Pattern Recognition +
+
+
+
+
+ + ♻ ☆ Impact of data for forecasting on performance of model predictive + control in buildings with smart energy storage + + +
+ Data is required to develop forecasting models for use in Model Predictive +Control (MPC) schemes in building energy systems. However, data is costly to +both collect and exploit. Determining cost optimal data usage strategies +requires understanding of the forecast accuracy and resulting MPC operational +performance it enables. This study investigates the performance of both simple +and state-of-the-art machine learning prediction models for MPC in +multi-building energy systems using a simulated case study with historic +building energy data. The impact on forecast accuracy of measures to improve +model data efficiency are quantified, specifically for: reuse of prediction +models, reduction of training data duration, reduction of model data features, +and online model training. A simple linear multi-layer perceptron model is +shown to provide equivalent forecast accuracy to state-of-the-art models, with +greater data efficiency and generalisability. The use of more than 2 years of +training data for load prediction models provided no significant improvement in +forecast accuracy. Forecast accuracy and data efficiency were improved +simultaneously by using change-point analysis to screen training data. Reused +models and those trained with 3 months of data had on average 10% higher error +than baseline, indicating that deploying MPC systems without prior data +collection may be economic. + +
+
+ comment: 36 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Knowledge Mechanisms in Large Language Models: A Survey and Perspective + + +
+ Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial +for advancing towards trustworthy AGI. This paper reviews knowledge mechanism +analysis from a novel taxonomy including knowledge utilization and evolution. +Knowledge utilization delves into the mechanism of memorization, comprehension +and application, and creation. Knowledge evolution focuses on the dynamic +progression of knowledge within individual and group LLMs. Moreover, we discuss +what knowledge LLMs have learned, the reasons for the fragility of parametric +knowledge, and the potential dark knowledge (hypothesis) that will be +challenging to address. We hope this work can help understand knowledge in LLMs +and provide insights for future research. + +
+
+ comment: Ongoing work (v2); add Section 5: Application of Knowledge Mechanism; + revise Section 6 and 7; fix typos +
+
+
+
+
+ + ♻ ☆ Analysis and Predictive Modeling of Solar Coronal Holes Using Computer + Vision and ARIMA-LSTM Networks SP + + +
+ In the era of space exploration, coronal holes on the sun play a significant +role due to their impact on satellites and aircraft through their open magnetic +fields and increased solar wind emissions. This study employs computer vision +techniques to detect coronal hole regions and estimate their sizes using +imagery from the Solar Dynamics Observatory (SDO). Additionally, we utilize a +hybrid time series prediction model, specifically a combination of Long +Short-Term Memory (LSTM) networks and ARIMA, to analyze trends in the area of +coronal holes and predict their areas across various solar regions over a span +of seven days. By examining time series data, we aim to identify patterns in +coronal hole behavior and understand their potential effects on space weather. + +
+
+ comment: Accepted to the first joint European Space Agency SPAICE Conference + 2024 +
+
+
+
+
+ + ♻ ☆ ZeroDDI: A Zero-Shot Drug-Drug Interaction Event Prediction Method with + Semantic Enhanced Learning and Dual-Modal Uniform Alignment IJCAI2024 + + +
+ Drug-drug interactions (DDIs) can result in various pharmacological changes, +which can be categorized into different classes known as DDI events (DDIEs). In +recent years, previously unobserved/unseen DDIEs have been emerging, posing a +new classification task when unseen classes have no labelled instances in the +training stage, which is formulated as a zero-shot DDIE prediction (ZS-DDIE) +task. However, existing computational methods are not directly applicable to +ZS-DDIE, which has two primary challenges: obtaining suitable DDIE +representations and handling the class imbalance issue. To overcome these +challenges, we propose a novel method named ZeroDDI for the ZS-DDIE task. +Specifically, we design a biological semantic enhanced DDIE representation +learning module, which emphasizes the key biological semantics and distills +discriminative molecular substructure-related semantics for DDIE representation +learning. Furthermore, we propose a dual-modal uniform alignment strategy to +distribute drug pair representations and DDIE semantic representations +uniformly in a unit sphere and align the matched ones, which can mitigate the +issue of class imbalance. Extensive experiments showed that ZeroDDI surpasses +the baselines and indicate that it is a promising tool for detecting unseen +DDIEs. Our code has been released in https://github.com/wzy-Sarah/ZeroDDI. + +
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ♻ ☆ A learning theory for quantum photonic processors and beyond + + +
+ We consider the tasks of learning quantum states, measurements and channels +generated by continuous-variable (CV) quantum circuits. This family of circuits +is suited to describe optical quantum technologies and in particular it +includes state-of-the-art photonic processors capable of showing quantum +advantage. We define classes of functions that map classical variables, encoded +into the CV circuit parameters, to outcome probabilities evaluated on those +circuits. We then establish efficient learnability guarantees for such classes, +by computing bounds on their pseudo-dimension or covering numbers, showing that +CV quantum circuits can be learned with a sample complexity that scales +polynomially with the circuit's size, i.e., the number of modes. Our results +show that CV circuits can be trained efficiently using a number of training +samples that, unlike their finite-dimensional counterpart, does not scale with +the circuit depth. + +
+
+ comment: 27+5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ QQQ: Quality Quattuor-Bit Quantization for Large Language Models + + +
+ Quantization is a proven effective method for compressing large language +models. Although popular techniques like W8A8 and W4A16 effectively maintain +model performance, they often fail to concurrently speed up the prefill and +decoding stages of inference. W4A8 is a promising strategy to accelerate both +of them, but it usually leads to a significant performance degradation. To +address these issues, we present QQQ, a Quality Quattuor-bit Quantization +method with 4-bit weights and 8-bit activations. QQQ employs adaptive smoothing +and Hessian-based compensation, significantly enhancing the performance of +quantized models without extensive training. Furthermore, we meticulously +engineer W4A8 GEMM kernels to increase inference speed. Our specialized +per-channel W4A8 GEMM and per-group W4A8 GEMM achieve impressive speed +increases of 3.67$\times$ and 3.29$\times$ over FP16 GEMM. Our extensive +experiments show that QQQ achieves performance on par with existing +state-of-the-art LLM quantization methods while significantly accelerating +inference, achieving speed boosts up to 2.24$\times$, 2.10$\times$, and +1.25$\times$ compared to FP16, W8A8, and W4A16, respectively. + +
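As background for the W4A8 setting, the sketch below shows plain symmetric per-channel 4-bit weight quantization; QQQ's adaptive smoothing, Hessian-based compensation, and custom GEMM kernels are not reproduced here, so this is only a generic baseline illustration.

```python
import torch

def quantize_per_channel_int4(weight):
    """Symmetric per-output-channel 4-bit quantization of a weight matrix
    (out_features x in_features). Returns int8 storage of values in [-8, 7]
    plus per-channel scales. Generic illustration, not QQQ itself."""
    max_abs = weight.abs().amax(dim=1, keepdim=True)   # per output channel
    scale = max_abs.clamp(min=1e-8) / 7.0              # map the range to [-7, 7]
    q = torch.clamp(torch.round(weight / scale), -8, 7).to(torch.int8)
    return q, scale

def dequantize(q, scale):
    """Recover an approximate float weight matrix for reference checks."""
    return q.to(torch.float32) * scale
```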
+
+
+
+
+ + ♻ ☆ Model Free Prediction with Uncertainty Assessment + + +
+ Deep nonparametric regression, characterized by the utilization of deep +neural networks to learn target functions, has emerged as a focus of research +attention in recent years. Despite considerable progress in understanding +convergence rates, the absence of asymptotic properties hinders rigorous +statistical inference. To address this gap, we propose a novel framework that +transforms the deep estimation paradigm into a platform conducive to +conditional mean estimation, leveraging the conditional diffusion model. +Theoretically, we develop an end-to-end convergence rate for the conditional +diffusion model and establish the asymptotic normality of the generated +samples. Consequently, we are equipped to construct confidence regions, +facilitating robust statistical inference. Furthermore, through numerical +experiments, we empirically validate the efficacy of our proposed methodology. + +
+
+
+
+
+ + ♻ ☆ FTF-ER: Feature-Topology Fusion-Based Experience Replay Method for + Continual Graph Learning + + +
+ Continual graph learning (CGL) is an important and challenging task that aims +to extend static GNNs to dynamic task flow scenarios. As one of the mainstream +CGL methods, the experience replay (ER) method receives widespread attention +due to its superior performance. However, existing ER methods focus on +identifying samples by feature significance or topological relevance, which +limits their utilization of comprehensive graph data. In addition, the +topology-based ER methods only consider local topological information and add +neighboring nodes to the buffer, which ignores the global topological +information and increases memory overhead. To bridge these gaps, we propose a +novel method called Feature-Topology Fusion-based Experience Replay (FTF-ER) to +effectively mitigate the catastrophic forgetting issue with enhanced +efficiency. Specifically, from an overall perspective to maximize the +utilization of the entire graph data, we propose a highly complementary +approach including both feature and global topological information, which can +significantly improve the effectiveness of the sampled nodes. Moreover, to +further utilize global topological information, we propose Hodge Potential +Score (HPS) as a novel module to calculate the topological importance of nodes. +HPS derives a global node ranking via Hodge decomposition on graphs, providing +more accurate global topological information compared to neighbor sampling. By +excluding neighbor sampling, HPS significantly reduces buffer storage costs for +acquiring topological information and simultaneously decreases training time. +Compared with state-of-the-art methods, FTF-ER achieves a significant +improvement of 3.6% in AA and 7.1% in AF on the OGB-Arxiv dataset, +demonstrating its superior performance in the class-incremental learning +setting. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing + + +
+ Executing contact-rich manipulation tasks necessitates the fusion of tactile +and visual feedback. However, the distinct nature of these modalities poses +significant challenges. In this paper, we introduce a system that leverages +visual and tactile sensory inputs to enable dexterous in-hand manipulation. +Specifically, we propose Robot Synesthesia, a novel point cloud-based tactile +representation inspired by human tactile-visual synesthesia. This approach +allows for the simultaneous and seamless integration of both sensory inputs, +offering richer spatial information and facilitating better reasoning about +robot actions. The method, trained in a simulated environment and then deployed +to a real robot, is applicable to various in-hand object rotation tasks. +Comprehensive ablations are performed on how the integration of vision and +touch can improve reinforcement learning and Sim2Real performance. Our project +page is available at https://yingyuan0414.github.io/visuotactile/ . + +
+
+ comment: Project page: https://yingyuan0414.github.io/visuotactile/ +
+
+
+
+
+ + ♻ ☆ FrameQuant: Flexible Low-Bit Quantization for Transformers + + +
+ Transformers are the backbone of powerful foundation models for many Vision +and Natural Language Processing tasks. But their compute and memory/storage +footprint is large, and so, serving such models is expensive often requiring +high-end hardware. To mitigate this difficulty, Post-Training Quantization +seeks to modify a pre-trained model and quantize it to eight bits or lower, +significantly boosting compute/memory/latency efficiency. Such models have been +successfully quantized to four bits with some performance loss. In this work, +we outline a simple scheme to quantize Transformer-based models to just two +bits (plus some overhead) with only a small drop in accuracy. Key to our +formulation is a concept borrowed from Harmonic analysis called Fusion Frames. +Our main finding is that the quantization must take place not in the original +weight space, but instead in the Fusion Frame representations. If quantization +is interpreted as the addition of noise, our casting of the problem allows +invoking an extensive body of known consistent recovery and noise robustness +guarantees. Further, if desired, de-noising filters are known in closed form. +We show empirically, via a variety of experiments, that (almost) two-bit +quantization for Transformer models promises sizable efficiency gains. The code +is available at https://github.com/vsingh-group/FrameQuant + +
+
+ comment: 25 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Time Series Imputation with Multivariate Radial Basis Function Neural + Network + + +
+ Researchers have been persistently working to address the issue of missing +values in time series data. Numerous models have been proposed, striving to +estimate the distribution of the data. The Radial Basis Functions Neural +Network (RBFNN) has recently exhibited exceptional performance in estimating +data distribution. In this paper, we propose a time series imputation model +based on RBFNN. Our imputation model learns local information from timestamps +to create a continuous function. Additionally, we incorporate time gaps to +facilitate learning information considering the missing terms of missing +values. We name this model the Missing Imputation Multivariate RBFNN +(MIM-RBFNN). However, MIM-RBFNN relies on a local information-based learning +approach, which presents difficulties in utilizing temporal information. +Therefore, we propose an extension called the Missing Value Imputation +Recurrent Neural Network with Continuous Function (MIRNN-CF) using the +continuous function generated by MIM-RBFNN. We evaluate the performance using +two real-world datasets with non-random missing and random missing patterns, +and conduct an ablation study comparing MIM-RBFNN and MIRNN-CF. + +
+
+
+
+
+ + ♻ ☆ Multi-Tower Multi-Interest Recommendation with User Representation Repel + + +
+ In the era of information overload, the value of recommender systems has been +profoundly recognized in academia and industry alike. Multi-interest sequential +recommendation, in particular, is a subfield that has been receiving increasing +attention in recent years. By generating multiple user representations, +multi-interest learning models demonstrate superior expressiveness compared to +single-user representation models, both theoretically and empirically. Despite +major advancements in the field, three major issues continue to plague the +performance and adoptability of multi-interest learning methods: the difference +between training and deployment objectives, the inability to access item +information, and the difficulty of industrial adoption due to their single-tower +architecture. We address these challenges by proposing a novel multi-tower +multi-interest framework with user representation repel. Experimental results +across multiple large-scale industrial datasets proved the effectiveness and +generalizability of our proposed framework. + +
+
+ comment: Not accepted by conference +
+
+
+
+
+ + ♻ ☆ The Hard-Constraint PINNs for Interface Optimal Control Problems + + +
+ We show that the physics-informed neural networks (PINNs), in combination +with some recently developed discontinuity capturing neural networks, can be +applied to solve optimal control problems subject to partial differential +equations (PDEs) with interfaces and some control constraints. The resulting +algorithm is mesh-free and scalable to different PDEs, and it ensures the +control constraints rigorously. Since the boundary and interface conditions, as +well as the PDEs, are all treated as soft constraints by lumping them into a +weighted loss function, it is necessary to learn them simultaneously and there +is no guarantee that the boundary and interface conditions can be satisfied +exactly. This immediately causes difficulties in tuning the weights in the +corresponding loss function and training the neural networks. To tackle these +difficulties and guarantee the numerical accuracy, we propose to impose the +boundary and interface conditions as hard constraints in PINNs by developing a +novel neural network architecture. The resulting hard-constraint PINNs approach +guarantees that both the boundary and interface conditions can be satisfied +exactly or with a high degree of accuracy, and they are decoupled from the +learning of the PDEs. Its efficiency is promisingly validated by some elliptic +and parabolic interface optimal control problems. + +
+
+
+
+
+ + ♻ ☆ MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal + Dataset with One Trillion Tokens + + +
+ Multimodal interleaved datasets featuring free-form interleaved sequences of +images and text are crucial for training frontier large multimodal models +(LMMs). Despite the rapid progression of open-source LMMs, there remains a +pronounced scarcity of large-scale, diverse open-source multimodal interleaved +datasets. In response, we introduce MINT-1T, the most extensive and diverse +open-source Multimodal INTerleaved dataset to date. MINT-1T comprises one +trillion text tokens and 3.4 billion images, a 10x scale-up from existing +open-source datasets. Additionally, we include previously untapped sources such +as PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires +substantial engineering effort, sharing the data curation process and releasing +the dataset greatly benefits the community. Our experiments show that LMMs +trained on MINT-1T rival the performance of models trained on the previous +leading dataset, OBELICS. Our data and code will be released at +https://github.com/mlfoundations/MINT-1T. + +
+
+
+
+
+ + ♻ ☆ Disentangled Condensation for Large-scale Graphs + + +
+ Graph condensation has emerged as an intriguing technique to save the +expensive training costs of Graph Neural Networks (GNNs) by substituting a +small condensed graph for the original graph. Despite the promising results +achieved, previous methods usually employ an entangled paradigm of redundant +parameters (nodes, edges, GNNs), which incurs complex joint optimization during +condensation. This paradigm has considerably impeded the scalability of graph +condensation, making it challenging to condense extremely large-scale graphs +and generate high-fidelity condensed graphs. Therefore, we propose to +disentangle the condensation process into a two-stage GNN-free paradigm, +independently condensing nodes and generating edges while eliminating the need +to optimize GNNs at the same time. The node condensation module avoids the +complexity of GNNs by focusing on node feature alignment with anchors of the +original graph, while the edge translation module constructs the edges of the +condensed nodes by transferring the original structure knowledge with +neighborhood anchors. This simple yet effective approach is at least 10 +times faster than state-of-the-art methods with comparable accuracy on +medium-scale graphs. Moreover, the proposed DisCo can successfully scale up to +the Ogbn-papers100M graph with flexible reduction rates. Extensive downstream +tasks and an ablation study on five common datasets further demonstrate the +effectiveness of the proposed DisCo framework. The source code will be made +publicly available. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Saliency Guided Image Warping for Unsupervised Domain Adaptation + + +
+ Driving is challenging in conditions like night, rain, and snow. The lack of +good labeled datasets has hampered progress in scene understanding under such +conditions. Unsupervised domain adaptation (UDA) using large labeled clear-day +datasets is a promising research direction in such cases. Current UDA methods, +however, treat all image pixels uniformly, leading to over-reliance on the +dominant scene backgrounds (e.g., roads, sky, sidewalks) that appear +dramatically different across domains. As a result, they struggle to learn +effective features of smaller and often sparse foreground objects (e.g., +people, vehicles, signs). + In this work, we improve UDA training by using in-place image warping to +focus on salient object regions. Our insight is that while backgrounds vary +significantly across domains (e.g., snowy night vs. clear day), object +appearances vary to a lesser extent. Therefore, we design instance-level +saliency guidance to adaptively oversample object regions, which reduces +adverse effects from background context and enhances backbone feature learning. +We then unwarp the better learned features while adapting from source to +target. Our approach improves adaptation across geographies, lighting, and +weather conditions, and is agnostic to the task (segmentation, detection), +domain adaptation algorithm, saliency guidance, and underlying model +architecture. Result highlights include +6.1 mAP50 for BDD100K Clear +$\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\rightarrow$ Night, +3.0 +mAP50 for BDD100K Clear $\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes +$\rightarrow$ ACDC. Our method adds minimal training memory and incurs no +additional inference latency. Please see Appendix for more results and +analysis. + +
+
+
+
+
+ + ♻ ☆ SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore ICLR 2024 + + +
+ The legality of training language models (LMs) on copyrighted or otherwise +restricted data is under intense debate. However, as we show, model performance +significantly degrades if trained only on low-risk text (e.g., out-of-copyright +books or government documents), due to its limited size and domain coverage. We +present SILO, a new language model that manages this risk-performance tradeoff +during inference. SILO is built by (1) training a parametric LM on Open License +Corpus (OLC), a new corpus we curate with 228B tokens of public domain and +permissively licensed text and (2) augmenting it with a more general and easily +modifiable nonparametric datastore (e.g., containing copyrighted books or news) +that is only queried during inference. The datastore allows use of high-risk +data without training on it, supports sentence-level data attribution, and +enables data producers to opt out from the model by removing content from the +store. These capabilities can foster compliance with data-use regulations such +as the fair use doctrine in the United States and the GDPR in the European +Union. Our experiments show that the parametric LM struggles on domains not +covered by OLC. However, access to the datastore greatly improves out of domain +performance, closing 90% of the performance gap with an LM trained on the Pile, +a more diverse corpus with mostly high-risk text. We also analyze which +nonparametric approach works best, where the remaining errors lie, and how +performance scales with datastore size. Our results suggest that it is possible +to build high quality language models while mitigating their legal risk. + +
+
+ comment: 29 pages; 7 figures. Published as a conference paper at ICLR 2024 + (spotlight). Code, models, and data available at + https://github.com/kernelmachine/silo-lm +
+
+
+
+
+ + ♻ ☆ A2SF: Accumulative Attention Scoring with Forgetting Factor for Token + Pruning in Transformer Decoder + + +
+ Recently, large language models (LLMs) based on transformers have been facing +memory bottleneck issues due to the KV cache, especially in long sequence +handling. Previous studies proposed KV cache compression techniques that +identify insignificant tokens based on Accumulative Attention Scores and remove +their items from the KV cache, noting that only a few tokens play an important +role in attention operations. However, we have observed that the existing +Accumulative Attention Score is not suitable for the transformer decoder +structure. In the decoder model, the number of times the Attention Score +accumulates varies depending on the order of token appearance due to the effect +of masking, causing an uneven comparison between tokens. To solve this, we +propose the Accumulative Attention Score with Forgetting Factor (A2SF) +technique, which introduces a Forgetting Factor in the Attention Score +accumulation process. A2SF applies a penalty to the past Attention Score +generated from old tokens by repeatedly multiplying the Attention Score by the +Forgetting Factor over time. Therefore, older tokens receive a larger penalty, +providing fairness among different ages of tokens. Through the fair comparison +among tokens, we can more effectively select important tokens. We have verified +the accuracy improvement through A2SF in the OPT and LLaMA models, and A2SF +improves the accuracy of LLaMA 2 by up to 7.8% and 5.1% in the 1-shot and +0-shot settings, respectively. + +
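The forgetting-factor idea described in this abstract amounts to replacing a plain running sum of attention scores with an exponentially decayed one. The sketch below shows the accumulation rule and top-k token retention under assumed names; it is an illustration of the general mechanism, not the paper's code.

```python
import torch

def decayed_attention_scores(attention_rows, forgetting_factor=0.9):
    """Accumulate per-token attention scores with a forgetting factor.
    attention_rows: iterable of 1D tensors, one row of attention weights per
    generation step (length grows over time). Returns decayed cumulative scores."""
    acc = None
    for row in attention_rows:
        if acc is None:
            acc = torch.zeros_like(row)
        elif row.numel() > acc.numel():                         # new tokens appended
            acc = torch.cat([acc, torch.zeros(row.numel() - acc.numel())])
        acc = forgetting_factor * acc + row                     # older scores decay more
    return acc

def tokens_to_keep(acc_scores, budget):
    """Indices of the `budget` tokens with the highest decayed scores."""
    return torch.topk(acc_scores, k=min(budget, acc_scores.numel())).indices
```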
+
+ comment: 11 pages(9 pages + reference 2 pages), 6 figures +
+
+
+
+
+ + ♻ ☆ An Adaptive Gradient Regularization Method + + +
+ The optimizer plays an important role in training neural networks efficiently
+and effectively, and updating weights based on their gradients is its central
+operation. It has been shown that normalization and standardization operations
+on weights and gradients, such as Weight Standardization (WS), weight
+normalization (WN), gradient normalization (GN), and gradient centralization
+(GC), can accelerate training and improve performance. In this work, we
+introduce a new optimization technique based on gradient magnitudes, named
+adaptive gradient regularization (AGR). AGR normalizes the gradient vector
+across all dimensions into a coefficient vector and subtracts the product of
+the gradient and this coefficient vector from the vanilla gradient; it can be
+viewed as an adaptive gradient clipping method. We show that AGR improves the
+Lipschitzness of the loss function, yielding a more stable training process and
+better generalization performance. AGR can be embedded into vanilla optimizers
+such as Adan and AdamW with only three lines of code. Our experiments on image
+generation, image classification, and language representation show that AGR
+improves training results.
+
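+ A minimal sketch of the per-tensor update described above, assuming the
+coefficient vector is the element-wise gradient magnitude normalized by its sum
+(the abstract does not spell out the exact normalization, so this choice and
+the function names are assumptions):
+
+import torch
+
+def agr_transform(grad: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
+    """Adaptive gradient regularization for one parameter tensor (sketch).
+
+    The coefficient vector is the normalized gradient magnitude; the
+    regularized gradient is g - c * g = (1 - c) * g, which softly damps
+    unusually large components, acting as adaptive gradient clipping.
+    """
+    coeff = grad.abs() / (grad.abs().sum() + eps)
+    return grad - coeff * grad
+
+def apply_agr_(model: torch.nn.Module) -> None:
+    """Apply AGR in place to all gradients, just before optimizer.step()."""
+    for p in model.parameters():
+        if p.grad is not None:
+            p.grad.copy_(agr_transform(p.grad))
+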
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ UPS: Efficiently Building Foundation Models for PDE Solving via + Cross-Modal Adaptation ICML 2024 + + +
+ We present Unified PDE Solvers (UPS), a data- and compute-efficient approach +to developing unified neural operators for diverse families of spatiotemporal +PDEs from various domains, dimensions, and resolutions. UPS embeds different +PDEs into a shared representation space and processes them using a +FNO-transformer architecture. Rather than training the network from scratch, +which is data-demanding and computationally expensive, we warm-start the +transformer from pretrained LLMs and perform explicit alignment to reduce the +modality gap while improving data and compute efficiency. The cross-modal UPS +achieves state-of-the-art results on a wide range of 1D and 2D PDE families +from PDEBench, outperforming existing unified models using 4 times less data +and 26 times less compute. Meanwhile, it is capable of few-shot transfer to +unseen PDE families and coefficients. + +
+
+ comment: ICML 2024 AI for Science Workshop (Spotlight) +
+
+
+
+
+ + ♻ ☆ CAT: Interpretable Concept-based Taylor Additive Models + + +
+ As an emerging interpretable technique, Generalized Additive Models (GAMs)
+adopt neural networks to individually learn non-linear functions for each
+feature, which are then combined through a linear model for final predictions.
+Although GAMs can explain deep neural networks (DNNs) at the feature level,
+they require large numbers of model parameters and are prone to overfitting,
+making them hard to train and scale. Additionally, in real-world datasets with
+many features, the interpretability of feature-based explanations diminishes
+for humans. To tackle these issues, recent research has shifted towards
+concept-based interpretable methods. These approaches try to integrate concept
+learning as an intermediate step before making predictions, explaining the
+predictions in terms of human-understandable concepts. However, these methods
+require domain experts to extensively label concepts with relevant names and
+their ground-truth values. In response, we propose CAT, a novel interpretable
+Concept-bAsed Taylor additive model that simplifies this process. CAT does not
+require domain experts to annotate concepts and their ground-truth values.
+Instead, it only requires users to categorize input features into broad groups,
+which can be easily accomplished through a quick metadata review. Specifically,
+CAT first embeds each group of input features into a one-dimensional high-level
+concept representation, and then feeds the concept representations into a new
+white-box Taylor Neural Network (TaylorNet). The TaylorNet aims to learn the
+non-linear relationship between the inputs and outputs using polynomials.
+Evaluation results across multiple benchmarks demonstrate that CAT can
+outperform or compete with the baselines while reducing the need for extensive
+model parameters. Importantly, it can explain model predictions through
+high-level concepts that humans can understand.
+
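+ A toy sketch of the concept-then-polynomial structure outlined above: each
+user-defined feature group is embedded into a single concept score, and the
+prediction is a learned polynomial over those scores. The grouping, polynomial
+degree, and plain power expansion are illustrative assumptions rather than the
+paper's exact TaylorNet formulation.
+
+import torch
+import torch.nn as nn
+
+class ConceptTaylorModel(nn.Module):
+    """Toy concept-based Taylor additive model (illustrative only)."""
+
+    def __init__(self, groups, degree=2):
+        super().__init__()
+        self.groups = groups  # list of feature-index lists, one per concept
+        self.concept_embed = nn.ModuleList(
+            nn.Linear(len(idx), 1) for idx in groups
+        )
+        self.degree = degree
+        self.poly = nn.Linear(len(groups) * degree, 1)  # weights per (concept, power)
+
+    def forward(self, x):
+        concepts = torch.cat(
+            [emb(x[:, idx]) for emb, idx in zip(self.concept_embed, self.groups)],
+            dim=1,
+        )                                            # (batch, n_concepts)
+        powers = torch.cat(
+            [concepts ** (k + 1) for k in range(self.degree)], dim=1
+        )                                            # (batch, n_concepts * degree)
+        return self.poly(powers)
+
+model = ConceptTaylorModel(groups=[[0, 1, 2], [3, 4]], degree=2)
+print(model(torch.randn(4, 5)).shape)  # torch.Size([4, 1])
+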
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Learning Video Context as Interleaved Multimodal Sequences ECCV 2024 + + +
+ Narrative videos, such as movies, pose significant challenges in video
+understanding due to their rich contexts (characters, dialogues, storylines)
+and diverse demands (identifying who is involved, their relationships, and the
+reasons behind events). In this paper, we introduce MovieSeq, a multimodal
+language model developed to address the wide range of challenges in
+understanding video contexts. Our core idea is to represent videos as
+interleaved multimodal sequences (including images, plots, videos, and
+subtitles), either by linking external knowledge databases or using offline
+models (such as Whisper for subtitles). Through instruction-tuning, this
+approach empowers the language model to interact with videos using interleaved
+multimodal instructions. For example, instead of solely relying on video as
+input, we jointly provide character photos alongside their names and dialogues,
+allowing the model to associate these elements and generate more comprehensive
+responses. To demonstrate its effectiveness, we validate MovieSeq's performance
+on six datasets (LVU, MAD, Movienet, CMD, TVC, MovieQA) across five settings
+(video classification, audio description, video-text retrieval, video
+captioning, and video question-answering). The code will be public at
+https://github.com/showlab/MovieSeq.
+
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Open-Vocabulary Audio-Visual Semantic Segmentation ACM MM 2024 + + +
+ Audio-visual semantic segmentation (AVSS) aims to segment and classify
+sounding objects in videos with acoustic cues. However, most approaches operate
+under a closed-set assumption and only identify pre-defined categories from
+training data, lacking the generalization ability to detect novel categories in
+practical applications. In this paper, we introduce a new task: open-vocabulary
+audio-visual semantic segmentation, extending the AVSS task to open-world
+scenarios beyond the annotated label space. This is a more challenging task
+that requires recognizing all categories, even those that have never been seen
+nor heard during training. Moreover, we propose the first open-vocabulary AVSS
+framework, OV-AVSS, which mainly consists of two parts: 1) a universal sound
+source localization module to perform audio-visual fusion and locate all
+potential sounding objects and 2) an open-vocabulary classification module to
+predict categories with the help of prior knowledge from large-scale
+pre-trained vision-language models. To properly evaluate open-vocabulary AVSS,
+we split zero-shot training and testing subsets based on the AVSBench-semantic
+benchmark, namely AVSBench-OV. Extensive experiments demonstrate the strong
+segmentation and zero-shot generalization ability of our model on all
+categories. On the AVSBench-OV dataset, OV-AVSS achieves 55.43% mIoU on base
+categories and 29.14% mIoU on novel categories, exceeding the state-of-the-art
+zero-shot method by 41.88%/20.61% and the open-vocabulary method by
+10.2%/11.6%. The code is available at https://github.com/ruohaoguo/ovavss.
+
+
+ comment: Accepted by ACM MM 2024 (Oral) +
+
+
+
+
+ + ☆ Can LLMs "Reason" in Music? An Evaluation of LLMs' Capability of Music + Understanding and Generation + + +
+ Symbolic Music, akin to language, can be encoded in discrete symbols. Recent +research has extended the application of large language models (LLMs) such as +GPT-4 and Llama2 to the symbolic music domain including understanding and +generation. Yet scant research explores the details of how these LLMs perform +on advanced music understanding and conditioned generation, especially from the +multi-step reasoning perspective, which is a critical aspect in the +conditioned, editable, and interactive human-computer co-creation process. This +study conducts a thorough investigation of LLMs' capability and limitations in +symbolic music processing. We identify that current LLMs exhibit poor +performance in song-level multi-step music reasoning, and typically fail to +leverage learned music knowledge when addressing complex musical tasks. An +analysis of LLMs' responses highlights distinctly their pros and cons. Our +findings suggest achieving advanced musical capability is not intrinsically +obtained by LLMs, and future research should focus more on bridging the gap +between music knowledge and reasoning, to improve the co-creation experience +for musicians. + +
+
+ comment: Accepted by ISMIR2024 +
+
+
+
+
+ + ☆ Design and Development of Laughter Recognition System Based on + Multimodal Fusion and Deep Learning + + +
+ This study aims to design and implement a laughter recognition system based
+on multimodal fusion and deep learning, leveraging image and audio processing
+technologies to achieve accurate laughter recognition and emotion analysis.
+First, the system loads video files and uses the OpenCV library to extract
+facial information while employing the Librosa library to process audio
+features such as MFCCs. Then, multimodal fusion techniques are used to
+integrate image and audio features, followed by training and prediction using
+deep learning models. Evaluation results indicate that the model achieved 80%
+accuracy, precision, and recall on the test dataset, with an F1 score of 80%,
+demonstrating robust performance and the ability to handle real-world data
+variability. This study not only verifies the effectiveness of multimodal
+fusion methods in laughter recognition but also highlights their potential
+applications in affective computing and human-computer interaction. Future work
+will focus on further optimizing feature extraction and model architecture to
+improve recognition accuracy and expand application scenarios, promoting the
+development of laughter recognition technology in fields such as mental health
+monitoring and educational activity evaluation.
+
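+ A minimal sketch of the feature-extraction stage described above, using
+OpenCV's bundled Haar face detector and librosa MFCCs. The specific detector,
+crop size, pooling, and concatenation-based fusion are assumptions, and the
+deep learning classifier itself is omitted.
+
+import cv2
+import librosa
+import numpy as np
+
+def extract_audio_features(wav_path, n_mfcc=13):
+    """Mean MFCC vector over the clip (librosa defaults for window/hop)."""
+    y, sr = librosa.load(wav_path, sr=None)
+    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
+    return mfcc.mean(axis=1)
+
+def extract_face_crops(video_path, max_frames=32):
+    """Detect faces per frame with a Haar cascade and return grayscale crops."""
+    cascade = cv2.CascadeClassifier(
+        cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
+    )
+    cap = cv2.VideoCapture(video_path)
+    crops = []
+    while cap.isOpened() and len(crops) < max_frames:
+        ok, frame = cap.read()
+        if not ok:
+            break
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        for (x, y, w, h) in cascade.detectMultiScale(gray, 1.3, 5):
+            crops.append(cv2.resize(gray[y:y + h, x:x + w], (64, 64)))
+    cap.release()
+    return np.stack(crops) if crops else np.empty((0, 64, 64))
+
+def fuse(video_path, wav_path):
+    """Naive late fusion: pooled face pixels concatenated with audio MFCCs."""
+    faces = extract_face_crops(video_path)
+    visual = faces.mean(axis=0).ravel() if len(faces) else np.zeros(64 * 64)
+    return np.concatenate([visual, extract_audio_features(wav_path)])
+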
+
+ comment: 7 pages,2 figures +
+
+
+
+
+ + ☆ ESIQA: Perceptual Quality Assessment of Vision-Pro-based Egocentric + Spatial Images + + +
+ With the development of eXtended Reality (XR), head-mounted shooting and
+display technology have experienced significant advancement and gained
+considerable attention. Egocentric spatial images and videos are emerging as a
+compelling form of stereoscopic XR content. Different from traditional 2D
+images, egocentric spatial images present challenges for perceptual quality
+assessment due to their special shooting and processing methods and their
+stereoscopic characteristics. However, corresponding image quality assessment
+(IQA) research for egocentric spatial images is still lacking. In this paper,
+we establish the Egocentric Spatial Images Quality Assessment Database
+(ESIQAD), which is, to the best of our knowledge, the first IQA database
+dedicated to egocentric spatial images. Our ESIQAD includes 500 egocentric
+spatial images, containing 400 images captured with the Apple Vision Pro and
+100 images generated via an iPhone's "Spatial Camera" app. The corresponding
+mean opinion scores (MOSs) are collected under three viewing modes, including
+2D display, 3D-window display, and 3D-immersive display. Furthermore, based on
+our database, we conduct a benchmark experiment and evaluate the performance of
+22 state-of-the-art IQA models under the three viewing modes. We hope this
+research can facilitate future IQA research on egocentric spatial images. The
+database is available at https://github.com/IntMeGroup/ESIQA.
+
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Attribute-driven Disentangled Representation Learning for Multimodal + Recommendation + + +
+ Recommendation algorithms forecast user preferences by correlating user and +item representations derived from historical interaction patterns. In pursuit +of enhanced performance, many methods focus on learning robust and independent +representations by disentangling the intricate factors within interaction data +across various modalities in an unsupervised manner. However, such an approach +obfuscates the discernment of how specific factors (e.g., category or brand) +influence the outcomes, making it challenging to regulate their effects. In +response to this challenge, we introduce a novel method called Attribute-Driven +Disentangled Representation Learning (short for AD-DRL), which explicitly +incorporates attributes from different modalities into the disentangled +representation learning process. By assigning a specific attribute to each +factor in multimodal features, AD-DRL can disentangle the factors at both +attribute and attribute-value levels. To obtain robust and independent +representations for each factor associated with a specific attribute, we first +disentangle the representations of features both within and across different +modalities. Moreover, we further enhance the robustness of the representations +by fusing the multimodal features of the same factor. Empirical evaluations +conducted on three public real-world datasets substantiate the effectiveness of +AD-DRL, as well as its interpretability and controllability. + +
+
+ comment: ACM Multimedia 2024 Accepted +
+
+
+
+
+ + ♻ ☆ Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with + Spatial Relation Matching ECCV 2024 + + +
+ Navigating drones through natural language commands remains challenging due
+to the dearth of accessible multi-modal datasets and the stringent precision
+requirements for aligning visual and textual data. To address this pressing
+need, we introduce GeoText-1652, a new natural-language-guided geo-localization
+benchmark. This dataset is systematically constructed through an interactive
+human-computer process leveraging Large Language Model (LLM) driven annotation
+techniques in conjunction with pre-trained vision models. GeoText-1652 extends
+the established University-1652 image dataset with spatial-aware text
+annotations, thereby establishing one-to-one correspondences between image,
+text, and bounding box elements. We further introduce a new optimization
+objective to leverage fine-grained spatial associations, called blending
+spatial matching, for region-level spatial relation matching. Extensive
+experiments reveal that our approach maintains a competitive recall rate
+compared with other prevailing cross-modality methods. This underscores the
+promising potential of our approach in elevating drone control and navigation
+through the seamless integration of natural language commands in real-world
+scenarios.
+
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 70 + +
+
+
+ + ☆ ThinK: Thinner Key Cache by Query-Driven Pruning + + +
+ Large Language Models (LLMs) have revolutionized the field of natural +language processing, achieving unprecedented performance across a variety of +applications by leveraging increased model sizes and sequence lengths. However, +the associated rise in computational and memory costs poses significant +challenges, particularly in managing long sequences due to the quadratic +complexity of the transformer attention mechanism. This paper focuses on the +long-context scenario, addressing the inefficiencies in KV cache memory +consumption during inference. Unlike existing approaches that optimize the +memory based on the sequence lengths, we uncover that the channel dimension of +the KV cache exhibits significant redundancy, characterized by unbalanced +magnitude distribution and low-rank structure in attention weights. Based on +these observations, we propose ThinK, a novel query-dependent KV cache pruning +method designed to minimize attention weight loss while selectively pruning the +least significant channels. Our approach not only maintains or enhances model +accuracy but also achieves a reduction in memory costs by over 20% compared +with vanilla KV cache eviction methods. Extensive evaluations on the LLaMA3 and +Mistral models across various long-sequence datasets confirm the efficacy of +ThinK, setting a new precedent for efficient LLM deployment without +compromising performance. We also outline the potential of extending our method +to value cache pruning, demonstrating ThinK's versatility and broad +applicability in reducing both memory and computational overheads. + +
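+ A rough sketch of query-driven channel pruning on the key cache, in the
+spirit of the approach described above. The scoring rule (absolute query-key
+channel contribution), the keep ratio, and the function name are assumptions
+for illustration, not ThinK's exact criterion.
+
+import torch
+
+def prune_key_channels(keys, queries, keep_ratio=0.6):
+    """Drop the least significant key-cache channels for one attention head.
+
+    keys:    (seq_len, head_dim) cached keys
+    queries: (num_queries, head_dim) recent query vectors
+    Channels are scored by |q_d| * |k_d| summed over queries and tokens, and
+    only the top-scoring fraction is kept.
+    """
+    contribution = torch.einsum("qd,sd->d", queries.abs(), keys.abs())
+    num_keep = max(1, int(keep_ratio * keys.shape[-1]))
+    kept = torch.topk(contribution, num_keep).indices.sort().values
+    return keys[:, kept], kept
+
+torch.manual_seed(0)
+pruned, idx = prune_key_channels(torch.randn(128, 64), torch.randn(4, 64))
+print(pruned.shape, idx.shape)  # torch.Size([128, 38]) torch.Size([38])
+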
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ☆ Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models + for Hateful Meme Detection + + +
+ Recent advances show that two-stream approaches have achieved outstanding +performance in hateful meme detection. However, hateful memes constantly evolve +as new memes emerge by fusing progressive cultural ideas, making existing +methods obsolete or ineffective. In this work, we explore the potential of +Large Multimodal Models (LMMs) for hateful meme detection. To this end, we +propose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE) +Prompting, by integrating the evolution attribute and in-context information of +memes. Specifically, Evolver simulates the evolving and expressing process of +memes and reasons through LMMs in a step-by-step manner. First, an evolutionary +pair mining module retrieves the top-k most similar memes in the external +curated meme set with the input meme. Second, an evolutionary information +extractor is designed to summarize the semantic regularities between the paired +memes for prompting. Finally, a contextual relevance amplifier enhances the +in-context hatefulness information to boost the search for evolutionary +processes. Extensive experiments on public FHM, MAMI, and HarM datasets show +that CoE prompting can be incorporated into existing LMMs to improve their +performance. More encouragingly, it can serve as an interpretive tool to +promote the understanding of the evolution of social memes. + +
+
+
+
+
+ + ☆ From Feature Importance to Natural Language Explanations Using LLMs with + RAG + + +
+ As machine learning becomes increasingly integral to autonomous +decision-making processes involving human interaction, the necessity of +comprehending the model's outputs through conversational means increases. Most +recently, foundation models are being explored for their potential as post hoc +explainers, providing a pathway to elucidate the decision-making mechanisms of +predictive models. In this work, we introduce traceable question-answering, +leveraging an external knowledge repository to inform the responses of Large +Language Models (LLMs) to user queries within a scene understanding task. This +knowledge repository comprises contextual details regarding the model's output, +containing high-level features, feature importance, and alternative +probabilities. We employ subtractive counterfactual reasoning to compute +feature importance, a method that entails analysing output variations resulting +from decomposing semantic features. Furthermore, to maintain a seamless +conversational flow, we integrate four key characteristics - social, causal, +selective, and contrastive - drawn from social science research on human +explanations into a single-shot prompt, guiding the response generation +process. Our evaluation demonstrates that explanations generated by the LLMs +encompassed these elements, indicating its potential to bridge the gap between +complex model outputs and natural language expressions. + +
+
+
+
+
+ + ☆ Enabling Contextual Soft Moderation on Social Media through Contrastive + Textual Deviation + + +
+ Automated soft moderation systems are unable to ascertain whether a post
+supports or refutes a false claim, resulting in a large number of contextual
+false positives. This limits their effectiveness, for example by undermining
+trust in health experts through warnings added to their posts, or by resorting
+to vague warnings instead of granular fact-checks, which desensitizes users. In
+this paper, we propose to incorporate stance detection into existing automated
+soft-moderation pipelines, with the goal of ruling out contextual false
+positives and providing more precise recommendations for social media content
+that should receive warnings. We develop a textual deviation task called
+Contrastive Textual Deviation (CTD) and show that it outperforms existing
+stance detection approaches when applied to soft moderation. We then integrate
+CTD into Lambretta, the state-of-the-art system for automated soft moderation,
+showing that our approach can reduce contextual false positives from 20% to
+2.1%, providing another important building block towards deploying reliable
+automated soft moderation tools on social media.
+
+
+
+
+
+ + ☆ Automated Review Generation Method Based on Large Language Models + + +
+ Literature research, vital for scientific advancement, is overwhelmed by the
+vast ocean of available information. To address this, we propose an automated
+review generation method based on Large Language Models (LLMs) to streamline
+literature processing and reduce cognitive load. In a case study on propane
+dehydrogenation (PDH) catalysts, our method swiftly generated comprehensive
+reviews from 343 articles, averaging seconds per article per LLM account.
+Extended analysis of 1041 articles provided deep insights into catalysts'
+composition, structure, and performance. Recognizing LLMs' hallucinations, we
+employed a multi-layered quality control strategy, ensuring our method's
+reliability and effective hallucination mitigation. Expert verification
+confirms the accuracy and citation integrity of the generated reviews,
+demonstrating that LLM hallucination risks are reduced to below 0.5% with over
+95% confidence. A released Windows application enables one-click review
+generation, aiding researchers in tracking advancements and recommending
+literature. This approach showcases LLMs' role in enhancing scientific research
+productivity and sets the stage for further exploration.
+
+
+ comment: 16 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Faithful and Plausible Natural Language Explanations for Image + Classification: A Pipeline Approach + + +
+ Existing explanation methods for image classification struggle to provide +faithful and plausible explanations. This paper addresses this issue by +proposing a post-hoc natural language explanation method that can be applied to +any CNN-based classifier without altering its training process or affecting +predictive performance. By analysing influential neurons and the corresponding +activation maps, the method generates a faithful description of the +classifier's decision process in the form of a structured meaning +representation, which is then converted into text by a language model. Through +this pipeline approach, the generated explanations are grounded in the neural +network architecture, providing accurate insight into the classification +process while remaining accessible to non-experts. Experimental results show +that the NLEs constructed by our method are significantly more plausible and +faithful. In particular, user interventions in the neural network structure +(masking of neurons) are three times more effective than the baselines. + +
+
+
+
+
+ + ☆ Effective Black Box Testing of Sentiment Analysis Classification + Networks + + +
+ Transformer-based neural networks have demonstrated remarkable performance in +natural language processing tasks such as sentiment analysis. Nevertheless, the +issue of ensuring the dependability of these complicated architectures through +comprehensive testing is still open. This paper presents a collection of +coverage criteria specifically designed to assess test suites created for +transformer-based sentiment analysis networks. Our approach utilizes input +space partitioning, a black-box method, by considering emotionally relevant +linguistic features such as verbs, adjectives, adverbs, and nouns. In order to +effectively produce test cases that encompass a wide range of emotional +elements, we utilize the k-projection coverage metric. This metric minimizes +the complexity of the problem by examining subsets of k features at the same +time, hence reducing dimensionality. Large language models are employed to +generate sentences that display specific combinations of emotional features. +The findings from experiments obtained from a sentiment analysis dataset +illustrate that our criteria and generated tests have led to an average +increase of 16\% in test coverage. In addition, there is a corresponding +average decrease of 6.5\% in model accuracy, showing the ability to identify +vulnerabilities. Our work provides a foundation for improving the dependability +of transformer-based sentiment analysis systems through comprehensive test +evaluation. + +
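+ The k-projection idea above can be made concrete with a small helper that
+measures how many k-wise combinations of feature values a test suite covers.
+The binary presence features and their names below are illustrative
+assumptions, not the paper's exact feature set.
+
+from itertools import combinations, product
+
+def k_projection_coverage(test_cases, feature_values, k=2):
+    """Average fraction of k-wise feature-value combinations covered.
+
+    test_cases:     list of dicts mapping feature name -> categorical value
+    feature_values: dict mapping feature name -> list of possible values
+    For every subset of k features, count which value tuples appear in at
+    least one test case, then average the coverage ratios over subsets.
+    """
+    features = sorted(feature_values)
+    ratios = []
+    for subset in combinations(features, k):
+        all_tuples = set(product(*(feature_values[f] for f in subset)))
+        seen = {tuple(tc[f] for f in subset) for tc in test_cases}
+        ratios.append(len(seen & all_tuples) / len(all_tuples))
+    return sum(ratios) / len(ratios)
+
+feats = {"verb": [0, 1], "adjective": [0, 1], "adverb": [0, 1], "noun": [0, 1]}
+suite = [
+    {"verb": 1, "adjective": 0, "adverb": 1, "noun": 1},
+    {"verb": 0, "adjective": 1, "adverb": 0, "noun": 1},
+]
+print(round(k_projection_coverage(suite, feats, k=2), 3))
+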
+
+ comment: This paper uses LaTeX with the IEEEtran.cls document class +
+
+
+
+
+ + ☆ SynthVLM: High-Efficiency and High-Quality Synthetic Data for Vision + Language Models + + +
+ Recently, with the rise of web images, managing and understanding large-scale +image datasets has become increasingly important. Vision Large Language Models +(VLLMs) have recently emerged due to their robust vision-understanding +capabilities. However, training these models requires vast amounts of data, +posing challenges to efficiency, effectiveness, data quality, and privacy. In +this paper, we introduce SynthVLM, a novel data synthesis pipeline for VLLMs. +Unlike existing methods that generate captions from images, SynthVLM employs +advanced diffusion models and high-quality captions to automatically generate +and select high-resolution images from captions, creating precisely aligned +image-text pairs. Leveraging these pairs, we achieve state-of-the-art (SoTA) +performance on various vision question answering tasks, maintaining high +alignment quality and preserving advanced language abilities. Moreover, +SynthVLM surpasses traditional GPT-4 Vision-based caption generation methods in +performance while significantly reducing computational overhead. Crucially, our +method's reliance on purely generated data ensures the preservation of privacy, +achieving SoTA performance with just 100k data points (only 18% of the official +dataset size). + +
+
+
+
+
+ + ☆ JaColBERTv2.5: Optimising Multi-Vector Retrievers to Create + State-of-the-Art Japanese Retrievers with Constrained Resources + + +
+ Neural Information Retrieval has advanced rapidly in high-resource languages, +but progress in lower-resource ones such as Japanese has been hindered by data +scarcity, among other challenges. Consequently, multilingual models have +dominated Japanese retrieval, despite their computational inefficiencies and +inability to capture linguistic nuances. While recent multi-vector monolingual +models like JaColBERT have narrowed this gap, they still lag behind +multilingual methods in large-scale evaluations. This work addresses the +suboptimal training methods of multi-vector retrievers in lower-resource +settings, focusing on Japanese. We systematically evaluate and improve key +aspects of the inference and training settings of JaColBERT, and more broadly, +multi-vector models. We further enhance performance through a novel checkpoint +merging step, showcasing it to be an effective way of combining the benefits of +fine-tuning with the generalization capabilities of the original checkpoint. +Building on our analysis, we introduce a novel training recipe, resulting in +the JaColBERTv2.5 model. JaColBERTv2.5, with only 110 million parameters and +trained in under 15 hours on 4 A100 GPUs, significantly outperforms all +existing methods across all common benchmarks, reaching an average score of +0.754, significantly above the previous best of 0.720. To support future +research, we make our final models, intermediate checkpoints and all data used +publicly available. + +
+
+
+
+
+ + ☆ Meltemi: The first open Large Language Model for Greek + + +
+ We describe the development and capabilities of Meltemi 7B, the first open +Large Language Model for the Greek language. Meltemi 7B has 7 billion +parameters and is trained on a 40 billion token Greek corpus. For the +development of Meltemi 7B, we adapt Mistral, by continuous pretraining on the +Greek Corpus. Meltemi 7B contains up-to-date information up to September 2023. +Furthermore, we have translated and curated a Greek instruction corpus, which +has been used for the instruction-tuning of a chat model, named Meltemi 7B +Instruct. Special care has been given to the alignment and the removal of toxic +content for the Meltemi 7B Instruct. The developed models are evaluated on a +broad set of collected evaluation corpora, and examples of prompts and +responses are presented. Both Meltemi 7B and Meltemi 7B Instruct are available +at https://huggingface.co/ilsp under the Apache 2.0 license. + +
+
+
+
+
+ + ☆ Adapting Safe-for-Work Classifier for Malaysian Language Text: Enhancing + Alignment in LLM-Ops Framework + + +
+ As large language models (LLMs) become increasingly integrated into +operational workflows (LLM-Ops), there is a pressing need for effective +guardrails to ensure safe and aligned interactions, including the ability to +detect potentially unsafe or inappropriate content across languages. However, +existing safe-for-work classifiers are primarily focused on English text. To +address this gap for the Malaysian language, we present a novel safe-for-work +text classifier tailored specifically for Malaysian language content. By +curating and annotating a first-of-its-kind dataset of Malaysian text spanning +multiple content categories, we trained a classification model capable of +identifying potentially unsafe material using state-of-the-art natural language +processing techniques. This work represents an important step in enabling safer +interactions and content filtering to mitigate potential risks and ensure +responsible deployment of LLMs. To maximize accessibility and promote further +research towards enhancing alignment in LLM-Ops for the Malaysian context, the +model is publicly released at +https://huggingface.co/malaysia-ai/malaysian-sfw-classifier. + +
+
+
+
+
+ + ☆ Industrial-Grade Smart Troubleshooting through Causal Technical Language + Processing: a Proof of Concept KDD 2024 + + +
+ This paper describes the development of a causal diagnosis approach for +troubleshooting an industrial environment on the basis of the technical +language expressed in Return on Experience records. The proposed method +leverages the vectorized linguistic knowledge contained in the distributed +representation of a Large Language Model, and the causal associations entailed +by the embedded failure modes and mechanisms of the industrial assets. The +paper presents the elementary but essential concepts of the solution, which is +conceived as a causality-aware retrieval augmented generation system, and +illustrates them experimentally on a real-world Predictive Maintenance setting. +Finally, it discusses avenues of improvement for the maturity of the utilized +causal technology to meet the robustness challenges of increasingly complex +scenarios in the industry. + +
+
+ comment: 2nd Workshop on Causal Inference and Machine Learning in Practice at + the KDD 2024 Conference. arXiv admin note: text overlap with arXiv:2407.11056 +
+
+
+
+
+ + ☆ CultureVo: The Serious Game of Utilizing Gen AI for Enhancing Cultural + Intelligence + + +
+ CultureVo, Inc. has developed the Integrated Culture Learning Suite (ICLS) to
+deliver foundational knowledge of world cultures through a combination of
+interactive lessons and gamified experiences. This paper explores how
+Generative AI, powered by open-source Large Language Models, is utilized within
+the ICLS to enhance cultural intelligence. The suite employs Generative AI
+techniques to automate the assessment of learner knowledge, analyze behavioral
+patterns, and manage interactions with non-player characters using real-time
+learner assessment. Additionally, ICLS provides contextual hints and recommends
+course content by assessing learner proficiency, while Generative AI
+facilitates the automated creation and validation of educational content.
+
+
+ comment: Fourth International Conference on AI-ML Systems, 8-11 October, 2024, + Louisiana, USA +
+
+
+
+
+ + ☆ Label-Guided Prompt for Multi-label Few-shot Aspect Category Detection + + +
+ Multi-label few-shot aspect category detection aims at identifying multiple
+aspect categories from sentences with a limited number of training instances.
+The representation of sentences and categories is a key issue in this task.
+Most current methods extract keywords to build the sentence and category
+representations. Sentences often contain many category-independent words, which
+leads to suboptimal performance of keyword-based methods. Instead of directly
+extracting keywords, we propose a label-guided prompt method to represent
+sentences and categories. Specifically, we design label-specific prompts to
+represent sentences by combining crucial contextual and semantic information.
+Further, the label is introduced into a prompt to obtain category descriptions
+by utilizing a large language model. These category descriptions capture the
+characteristics of the aspect categories, guiding the construction of
+discriminative category prototypes. Experimental results on two public datasets
+show that our method outperforms current state-of-the-art methods with a 3.86%
+- 4.75% improvement in the Macro-F1 score.
+
+
+
+
+
+ + ☆ ArabicNLU 2024: The First Arabic Natural Language Understanding Shared + Task + + +
+ This paper presents an overview of the Arabic Natural Language Understanding +(ArabicNLU 2024) shared task, focusing on two subtasks: Word Sense +Disambiguation (WSD) and Location Mention Disambiguation (LMD). The task aimed +to evaluate the ability of automated systems to resolve word ambiguity and +identify locations mentioned in Arabic text. We provided participants with +novel datasets, including a sense-annotated corpus for WSD, called SALMA with +approximately 34k annotated tokens, and the IDRISI-DA dataset with 3,893 +annotations and 763 unique location mentions. These are challenging tasks. Out +of the 38 registered teams, only three teams participated in the final +evaluation phase, with the highest accuracy being 77.8% for WSD and the highest +MRR@1 being 95.0% for LMD. The shared task not only facilitated the evaluation +and comparison of different techniques, but also provided valuable insights and +resources for the continued advancement of Arabic NLU technologies. + +
+
+ comment: In Proceedings of the Second Arabic Natural Language Processing + Conference (ArabicNLP 2024), Bangkok, Thailand. Association for Computational + Linguistics +
+
+
+
+
+ + ☆ Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks ECCV 2024 + + +
+ Recent vision-language foundation models, such as CLIP, have demonstrated +superior capabilities in learning representations that can be transferable +across diverse range of downstream tasks and domains. With the emergence of +such powerful models, it has become crucial to effectively leverage their +capabilities in tackling challenging vision tasks. On the other hand, only a +few works have focused on devising adversarial examples that transfer well to +both unknown domains and model architectures. In this paper, we propose a novel +transfer attack method called PDCL-Attack, which leverages the CLIP model to +enhance the transferability of adversarial perturbations generated by a +generative model-based attack framework. Specifically, we formulate an +effective prompt-driven feature guidance by harnessing the semantic +representation power of text, particularly from the ground-truth class labels +of input images. To the best of our knowledge, we are the first to introduce +prompt learning to enhance the transferable generative attacks. Extensive +experiments conducted across various cross-domain and cross-model settings +empirically validate our approach, demonstrating its superiority over +state-of-the-art methods. + +
+
+ comment: Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io +
+
+
+
+
+ + ☆ Prompting Encoder Models for Zero-Shot Classification: A Cross-Domain + Study in Italian + + +
+ Addressing the challenge of limited annotated data in specialized fields and +low-resource languages is crucial for the effective use of Language Models +(LMs). While most Large Language Models (LLMs) are trained on general-purpose +English corpora, there is a notable gap in models specifically tailored for +Italian, particularly for technical and bureaucratic jargon. This paper +explores the feasibility of employing smaller, domain-specific encoder LMs +alongside prompting techniques to enhance performance in these specialized +contexts. Our study concentrates on the Italian bureaucratic and legal +language, experimenting with both general-purpose and further pre-trained +encoder-only models. We evaluated the models on downstream tasks such as +document classification and entity typing and conducted intrinsic evaluations +using Pseudo-Log-Likelihood. The results indicate that while further +pre-trained models may show diminished robustness in general knowledge, they +exhibit superior adaptability for domain-specific tasks, even in a zero-shot +setting. Furthermore, the application of calibration techniques and in-domain +verbalizers significantly enhances the efficacy of encoder models. These +domain-specialized models prove to be particularly advantageous in scenarios +where in-domain resources or expertise are scarce. In conclusion, our findings +offer new insights into the use of Italian models in specialized contexts, +which may have a significant impact on both research and industrial +applications in the digital transformation era. + +
+
+ comment: Submitted to 'Language Resource and Evaluation' +
+
+
+
+
+ + ☆ Decoding Linguistic Representations of Human Brain + + +
+ Language, as an information medium created by advanced organisms, has always +been a concern of neuroscience regarding how it is represented in the brain. +Decoding linguistic representations in the evoked brain has shown +groundbreaking achievements, thanks to the rapid improvement of neuroimaging, +medical technology, life sciences and artificial intelligence. In this work, we +present a taxonomy of brain-to-language decoding of both textual and speech +formats. This work integrates two types of research: neuroscience focusing on +language understanding and deep learning-based brain decoding. Generating +discernible language information from brain activity could not only help those +with limited articulation, especially amyotrophic lateral sclerosis (ALS) +patients but also open up a new way for the next generation's brain-computer +interface (BCI). This article will help brain scientists and deep-learning +researchers to gain a bird's eye view of fine-grained language perception, and +thus facilitate their further investigation and research of neural process and +language decoding. + +
+
+
+
+
+ + ☆ Questionnaires for Everyone: Streamlining Cross-Cultural Questionnaire + Adaptation with GPT-Based Translation Quality Evaluation + + +
+ Adapting questionnaires to new languages is a resource-intensive process +often requiring the hiring of multiple independent translators, which limits +the ability of researchers to conduct cross-cultural research and effectively +creates inequalities in research and society. This work presents a prototype +tool that can expedite the questionnaire translation process. The tool +incorporates forward-backward translation using DeepL alongside GPT-4-generated +translation quality evaluations and improvement suggestions. We conducted two +online studies in which participants translated questionnaires from English to +either German (Study 1; n=10) or Portuguese (Study 2; n=20) using our +prototype. To evaluate the quality of the translations created using the tool, +evaluation scores between conventionally translated and tool-supported versions +were compared. Our results indicate that integrating LLM-generated translation +quality evaluations and suggestions for improvement can help users +independently attain results similar to those provided by conventional, +non-NLP-supported translation methods. This is the first step towards more +equitable questionnaire-based research, powered by AI. + +
+
+ comment: 19 pages, 13 figures +
+
+
+
+
+ + ☆ Harvesting Textual and Structured Data from the HAL Publication + Repository + + +
+ HAL (Hyper Articles en Ligne) is the French national publication repository, +used by most higher education and research organizations for their open science +policy. As a digital library, it is a rich repository of scholarly documents, +but its potential for advanced research has been underutilized. We present +HALvest, a unique dataset that bridges the gap between citation networks and +the full text of papers submitted on HAL. We craft our dataset by filtering HAL +for scholarly publications, resulting in approximately 700,000 documents, +spanning 34 languages across 13 identified domains, suitable for language model +training, and yielding approximately 16.5 billion tokens (with 8 billion in +French and 7 billion in English, the most represented languages). We transform +the metadata of each paper into a citation network, producing a directed +heterogeneous graph. This graph includes uniquely identified authors on HAL, as +well as all open submitted papers, and their citations. We provide a baseline +for authorship attribution using the dataset, implement a range of +state-of-the-art models in graph representation learning for link prediction, +and discuss the usefulness of our generated knowledge graph structure. + +
+
+
+
+
+ + ☆ Enhancing Agricultural Machinery Management through Advanced LLM + Integration + + +
+ The integration of artificial intelligence into agricultural practices, +specifically through Consultation on Intelligent Agricultural Machinery +Management (CIAMM), has the potential to revolutionize efficiency and +sustainability in farming. This paper introduces a novel approach that +leverages large language models (LLMs), particularly GPT-4, combined with +multi-round prompt engineering to enhance decision-making processes in +agricultural machinery management. We systematically developed and refined +prompts to guide the LLMs in generating precise and contextually relevant +outputs. Our approach was evaluated using a manually curated dataset from +various online sources, and performance was assessed with accuracy and GPT-4 +Scores. Comparative experiments were conducted using LLama-2-70B, ChatGPT, and +GPT-4 models, alongside baseline and state-of-the-art methods such as Chain of +Thought (CoT) and Thought of Thought (ThoT). The results demonstrate that our +method significantly outperforms these approaches, achieving higher accuracy +and relevance in generated responses. This paper highlights the potential of +advanced prompt engineering techniques in improving the robustness and +applicability of AI in agricultural contexts. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Pruning Large Language Models with Semi-Structural Adaptive Sparse + Training + + +
+ Transformer-based Large Language Models (LLMs) have demonstrated remarkable +success across various challenging tasks. However, the deployment of LLMs is +hindered by their substantial parameter count and memory consumption. Recently, +numerous studies have attempted to compress LLMs by pruning them using +training-free methods. However, these pruned models often experience +significant performance degradation on complex tasks. To address this issue, we +propose a novel training pipeline for semi-structured sparse models, named +Adaptive Sparse Trainer (AST). By distilling the knowledge stored in its dense +counterpart, we prevent the sparse model from overfitting and ensure a stable +training process. Moreover, AST allows the model to adaptively select better +lottery tickets (e.g., masks) during training. Additionally, we discovered that +adding extra well-initialized parameters can further enhance model performance +with only a small increase in memory footprint. Our method significantly +narrows the performance gap between dense and sparse models while maintaining +limited computational cost. Furthermore, when combined with existing +quantization methods, AST can compress language models by up to 16x compared to +dense FP32 precision models with minimal performance loss. AST outperforms +previous state-of-the-art methods by reducing the zero-shot accuracy gap +between dense and semi-structured sparse models to 1.12% across multiple +zero-shot tasks on Llama2-7B, using less than 0.4% of the pretraining tokens. + +
+
+
+
+
+ + ☆ Knesset-DictaBERT: A Hebrew Language Model for Parliamentary Proceedings + + +
+ We present Knesset-DictaBERT, a large Hebrew language model fine-tuned on the +Knesset Corpus, which comprises Israeli parliamentary proceedings. The model is +based on the DictaBERT architecture and demonstrates significant improvements +in understanding parliamentary language according to the MLM task. We provide a +detailed evaluation of the model's performance, showing improvements in +perplexity and accuracy over the baseline DictaBERT model. + +
+
+ comment: 3 pages, 1 table +
+
+
+
+
+ + ☆ Comparison of Large Language Models for Generating Contextually Relevant + Questions + + +
+ This study explores the effectiveness of Large Language Models (LLMs) for +Automatic Question Generation in educational settings. Three LLMs are compared +in their ability to create questions from university slide text without +fine-tuning. Questions were obtained in a two-step pipeline: first, answer +phrases were extracted from slides using Llama 2-Chat 13B; then, the three +models generated questions for each answer. To analyze whether the questions +would be suitable in educational applications for students, a survey was +conducted with 46 students who evaluated a total of 246 questions across five +metrics: clarity, relevance, difficulty, slide relation, and question-answer +alignment. Results indicate that GPT-3.5 and Llama 2-Chat 13B outperform Flan +T5 XXL by a small margin, particularly in terms of clarity and question-answer +alignment. GPT-3.5 especially excels at tailoring questions to match the input +answers. The contribution of this research is the analysis of the capacity of +LLMs for Automatic Question Generation in education. + +
+
+ comment: Published in Springer ECTEL 2024 conference proceedings +
+
+
+
+
+ + ☆ CLR-Fact: Evaluating the Complex Logical Reasoning Capability of Large + Language Models over Factual Knowledge + + +
+ While large language models (LLMs) have demonstrated impressive capabilities +across various natural language processing tasks by acquiring rich factual +knowledge from their broad training data, their ability to synthesize and +logically reason with this knowledge in complex ways remains underexplored. In +this work, we present a systematic evaluation of state-of-the-art LLMs' complex +logical reasoning abilities through a novel benchmark of automatically +generated complex reasoning questions over general domain and biomedical +knowledge graphs. Our extensive experiments, employing diverse in-context +learning techniques, reveal that LLMs excel at reasoning over general world +knowledge but face significant challenges with specialized domain-specific +knowledge. We find that prompting with explicit Chain-of-Thought demonstrations +can substantially improve LLM performance on complex logical reasoning tasks +with diverse logical operations. Interestingly, our controlled evaluations +uncover an asymmetry where LLMs display proficiency at set union operations, +but struggle considerably with set intersections - a key building block of +logical reasoning. To foster further work, we will publicly release our +evaluation benchmark and code. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Survey of Design Paradigms for Social Robots + + +
+ The demand for social robots in fields like healthcare, education, and +entertainment increases due to their emotional adaptation features. These +robots leverage multimodal communication, incorporating speech, facial +expressions, and gestures to enhance user engagement and emotional support. The +understanding of design paradigms of social robots is obstructed by the +complexity of the system and the necessity to tune it to a specific task. This +article provides a structured review of social robot design paradigms, +categorizing them into cognitive architectures, role design models, linguistic +models, communication flow, activity system models, and integrated design +models. By breaking down the articles on social robot design and application +based on these paradigms, we highlight the strengths and areas for improvement +in current approaches. We further propose our original integrated design model +that combines the most important aspects of the design of social robots. Our +approach shows the importance of integrating operational, communicational, and +emotional dimensions to create more adaptive and empathetic interactions +between robots and humans. + +
+
+
+
+
+ + ☆ Contrastive Feedback Mechanism for Simultaneous Speech Translation + + +
+ Recent advances in simultaneous speech translation (SST) focus on the +decision policies that enable the use of offline-trained ST models for +simultaneous inference. These decision policies not only control the +quality-latency trade-off in SST but also mitigate the impact of unstable +predictions on translation quality by delaying translation for more context or +discarding these predictions through stable hypothesis detection. However, +these policies often overlook the potential benefits of utilizing unstable +predictions. We introduce the contrastive feedback mechanism (CFM) for SST, a +novel method that leverages these unstable predictions as feedback to improve +translation quality. CFM guides the system to eliminate undesired model +behaviors from these predictions through a contrastive objective. The +experiments on 3 state-of-the-art decision policies across 8 languages in the +MuST-C v1.0 dataset show that CFM effectively improves the performance of SST. + +
+
+
+
+
+ + ☆ Machine Unlearning in Generative AI: A Survey + + +
+ Generative AI technologies have been deployed in many places, such as
+(multimodal) large language models and vision generative models. Their
+remarkable performance should be attributed to massive training data and
+emergent reasoning abilities. However, these models may memorize and generate
+sensitive, biased, or dangerous information originating from the training data,
+especially data obtained from web crawls. New machine unlearning (MU)
+techniques are being developed to reduce or eliminate undesirable knowledge and
+its effects from the models, because techniques designed for traditional
+classification tasks cannot be applied to Generative AI. We offer a
+comprehensive survey of MU in Generative AI, covering a new problem
+formulation, evaluation methods, and a structured discussion of the advantages
+and limitations of different kinds of MU techniques. The survey also presents
+several critical challenges and promising directions in MU research. A curated
+list of readings can be found at:
+https://github.com/franciscoliu/GenAI-MU-Reading.
+
+
+
+
+
+ + ☆ Prompt2DeModel: Declarative Neuro-Symbolic Modeling with Natural + Language + + +
+ This paper presents a conversational pipeline for crafting domain knowledge +for complex neuro-symbolic models through natural language prompts. It +leverages large language models to generate declarative programs in the +DomiKnowS framework. The programs in this framework express concepts and their +relationships as a graph in addition to logical constraints between them. The +graph, later, can be connected to trainable neural models according to those +specifications. Our proposed pipeline utilizes techniques like dynamic +in-context demonstration retrieval, model refinement based on feedback from a +symbolic parser, visualization, and user interaction to generate the tasks' +structure and formal knowledge representation. This approach empowers domain +experts, even those not well-versed in ML/AI, to formally declare their +knowledge to be incorporated in customized neural models in the DomiKnowS +framework. + +
+
+ comment: Accepted in NeSy 2024 Conference +
+
+
+
+
+ + ☆ A2SF: Accumulative Attention Scoring with Forgetting Factor for Token + Pruning in Transformer Decoder + + +
+ Recently, transformer-based large language models (LLMs) have faced memory
+bottlenecks caused by the KV cache, especially when handling long sequences.
+Previous research proposed KV cache compression techniques that identify
+insignificant tokens based on Accumulative Attention Scores and remove their
+entries from the KV cache, noting that only a few tokens play an important role
+in attention operations. However, we have observed that the existing
+Accumulative Attention Score is not suitable for the transformer decoder
+structure. In the decoder, the number of times the attention score accumulates
+varies with the order of token appearance because of masking, causing an uneven
+comparison between tokens. To solve this, we propose the Accumulative Attention
+Score with Forgetting Factor (A2SF) technique, which introduces a forgetting
+factor into the attention score accumulation process. A2SF penalizes past
+attention scores generated by old tokens by repeatedly multiplying them by the
+forgetting factor over time. Older tokens therefore receive a larger penalty,
+ensuring fairness among tokens of different ages. Through this fair comparison,
+we can select important tokens more effectively. We have verified the accuracy
+improvement of A2SF on the OPT and LLaMA models; A2SF improves the accuracy of
+LLaMA 2 by up to 7.8% and 5.1% in the 1-shot and 0-shot settings, respectively.
+
+
+ comment: 11 pages(9 pages + reference 2 pages), 6 figures +
+
+
+
+
+ + ☆ Adaptive Pre-training Data Detection for Large Language Models via + Surprising Tokens + + +
+ While large language models (LLMs) are extensively used, there are rising
+concerns regarding privacy, security, and copyright due to their opaque
+training data, which puts the problem of detecting pre-training data on the
+table. Current solutions leverage techniques explored in machine learning
+privacy, such as Membership Inference Attacks (MIAs), which heavily depend on
+LLMs' capability for verbatim memorization. However, this reliance presents
+challenges, especially given the vast amount of training data and the
+restricted number of effective training epochs. In this paper, we propose an
+adaptive pre-training data detection method which alleviates this reliance and
+effectively amplifies the identification. Our method adaptively locates
+\textit{surprising tokens} in the input. A token is surprising to an LLM if the
+prediction on the token is "certain but wrong", i.e., the probability
+distribution has low Shannon entropy while the probability of the ground-truth
+token is also low. By using the prediction probability of surprising tokens as
+the measure of surprise, the detection method rests on the simple hypothesis
+that seen data is less surprising to the model than unseen data. The method can
+be applied without any access to the pre-training data corpus or additional
+training such as reference models. Our approach exhibits a consistent
+enhancement compared to existing methods in diverse experiments conducted on
+various benchmarks and models, achieving a maximum improvement of 29.5\%. We
+also introduce a new benchmark, Dolma-Book, developed upon a novel framework
+which employs book data collected both before and after model training to
+provide further evaluation.
+
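+ A minimal sketch of flagging "surprising" tokens (certain but wrong) from
+next-token logits, following the description above. The entropy and probability
+thresholds and the final aggregation into a membership score are assumptions
+for illustration.
+
+import torch
+import torch.nn.functional as F
+
+def surprising_token_score(logits, target_ids, entropy_threshold=2.0,
+                           prob_threshold=0.1):
+    """Score a text by the model's confidence on its 'surprising' positions.
+
+    logits:     (seq_len, vocab_size) next-token logits from the LLM
+    target_ids: (seq_len,) the actual next tokens
+    A position is surprising when the predictive distribution has low entropy
+    but assigns low probability to the ground-truth token.
+    """
+    log_probs = F.log_softmax(logits, dim=-1)
+    probs = log_probs.exp()
+    entropy = -(probs * log_probs).sum(dim=-1)             # model certainty
+    target_prob = probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
+    surprising = (entropy < entropy_threshold) & (target_prob < prob_threshold)
+    if not surprising.any():
+        return torch.tensor(1.0)   # nothing surprising: consistent with seen data
+    return target_prob[surprising].mean()  # higher => more likely pre-training data
+
+torch.manual_seed(0)
+print(surprising_token_score(torch.randn(16, 100) * 3,
+                             torch.randint(0, 100, (16,))))
+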
+
+
+
+
+ + ☆ Advancing Vietnamese Visual Question Answering with Transformer and + Convolutional Integration + + +
+ Visual Question Answering (VQA) has recently emerged as a potential research +domain, captivating the interest of many in the field of artificial +intelligence and computer vision. Despite the prevalence of approaches in +English, there is a notable lack of systems specifically developed for certain +languages, particularly Vietnamese. This study aims to bridge this gap by +conducting comprehensive experiments on the Vietnamese Visual Question +Answering (ViVQA) dataset, demonstrating the effectiveness of our proposed +model. In response to community interest, we have developed a model that +enhances image representation capabilities, thereby improving overall +performance in the ViVQA system. Specifically, our model integrates the +Bootstrapping Language-Image Pre-training with frozen unimodal models (BLIP-2) +and the convolutional neural network EfficientNet to extract and process both +local and global features from images. This integration leverages the strengths +of transformer-based architectures for capturing comprehensive contextual +information and convolutional networks for detailed local features. By freezing +the parameters of these pre-trained models, we significantly reduce the +computational cost and training time, while maintaining high performance. This +approach significantly improves image representation and enhances the +performance of existing VQA systems. We then leverage a multi-modal fusion +module based on a general-purpose multi-modal foundation model (BEiT-3) to fuse +the information between visual and textual features. Our experimental findings +demonstrate that our model surpasses competing baselines, achieving promising +performance. This is particularly evident in its accuracy of $71.04\%$ on the +test set of the ViVQA dataset, marking a significant advancement in our +research area. The code is available at https://github.com/nngocson2002/ViVQA. + +
+
+ comment: Accepted at the journal of Computers & Electrical Engineering + (Received 8 March 2024, Revised 8 June 2024, Accepted 10 July 2024) +
+
+
+
+
+ + ☆ GenRec: Generative Personalized Sequential Recommendation + + +
+ Sequential recommendation is the task of capturing hidden user preferences from +historical user-item interaction data. Significant progress has been made in +this domain by leveraging classification-based learning methods. Inspired by +the recent paradigm of 'pretrain, prompt and predict' in NLP, we consider +sequential recommendation as a sequence-to-sequence generation task and propose +a novel model named Generative Recommendation (GenRec). Unlike +classification-based models that learn explicit user and item representations, GenRec utilizes +the sequence modeling capability of the Transformer and adopts the masked item +prediction objective to effectively learn the hidden bidirectional sequential +patterns. Different from existing generative sequential recommendation models, +GenRec does not rely on manually designed hard prompts. The input to GenRec is +a textual user-item sequence and the output is the top-ranked next items. Moreover, +GenRec is lightweight and requires only a few hours to train effectively in +low-resource settings, making it highly applicable to real-world scenarios and +helping to democratize large language models in the sequential recommendation +domain. Our extensive experiments demonstrate that GenRec generalizes well across +various public real-world datasets and achieves state-of-the-art results. Our +experiments also validate the effectiveness of the proposed masked item +prediction objective, which improves the model performance by a large margin. + +
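The abstract describes training on textual user-item sequences with a masked item prediction objective. A minimal sketch of that data-preparation step, assuming a seq2seq model with a sentinel mask token; the sentinel choice, 15% mask rate, and naming are illustrative, not the paper's settings.

```python
import random

MASK = "<extra_id_0>"   # e.g. a T5-style sentinel token; illustrative choice

def mask_item_sequence(items, mask_prob=0.15, seed=None):
    """Build a (source, target) pair for masked item prediction.

    `items` is one user's interaction history, in order. Randomly masked
    items become the prediction target, mirroring the bidirectional
    masked-item objective described in the abstract.
    """
    rng = random.Random(seed)
    source, target = [], []
    for item in items:
        if rng.random() < mask_prob:
            source.append(MASK)
            target.append(item)
        else:
            source.append(item)
    if not target:                       # always mask at least one item
        idx = rng.randrange(len(items))
        target.append(source[idx])
        source[idx] = MASK
    return " ".join(source), " ".join(target)

# Example: one user's history of item IDs
src, tgt = mask_item_sequence(["item_12", "item_7", "item_99", "item_3"], seed=0)
```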
+
+
+
+
+ + ☆ Decomposed Prompting to Answer Questions on a Course Discussion Board + + +
+ We propose and evaluate a question-answering system that uses decomposed +prompting to classify and answer student questions on a course discussion +board. Our system uses a large language model (LLM) to classify questions into +one of four types: conceptual, homework, logistics, and not answerable. This +enables us to employ a different strategy for answering questions that fall +under different types. Using a variant of GPT-3, we achieve $81\%$ +classification accuracy. We discuss our system's performance on answering +conceptual questions from a machine learning course and various failure modes. + +
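The system described above first classifies a question into one of four types and then applies a type-specific answering strategy. A minimal sketch of that classify-then-route step; the `llm(prompt)` callable stands in for any completion API, and the prompt wording and per-type strategies are illustrative, not the authors' prompts.

```python
# Decomposed prompting: classify the question type, then route to a strategy.
TYPES = ["conceptual", "homework", "logistics", "not answerable"]

CLASSIFY_PROMPT = (
    "Classify the following course discussion board question as one of: "
    "conceptual, homework, logistics, not answerable.\n"
    "Question: {question}\nType:"
)

def answer_question(question, llm, course_notes=""):
    qtype = llm(CLASSIFY_PROMPT.format(question=question)).strip().lower()
    if qtype not in TYPES:
        qtype = "not answerable"

    if qtype == "conceptual":
        prompt = f"Using these course notes:\n{course_notes}\nAnswer: {question}"
    elif qtype == "homework":
        prompt = f"Give a hint (not a full solution) for: {question}"
    elif qtype == "logistics":
        prompt = f"Answer this logistics question from the syllabus: {question}"
    else:
        return "This question is best answered by the course staff."
    return llm(prompt)
```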
+
+ comment: 6 pages. Published at International Conference on Artificial + Intelligence in Education 2023. Code repository: + https://github.com/brandonjaipersaud/piazza-qabot-gpt +
+
+
+
+
+ + ☆ Event-Arguments Extraction Corpus and Modeling using BERT for Arabic + + +
+ Event-argument extraction is a challenging task, particularly in Arabic due +to sparse linguistic resources. To fill this gap, we introduce the \hadath +corpus ($550$k tokens) as an extension of Wojood, enriched with event-argument +annotations. We used three types of event arguments: $agent$, $location$, and +$date$, which we annotated as relation types. Our inter-annotator agreement +evaluation resulted in $82.23\%$ $Kappa$ score and $87.2\%$ $F_1$-score. +Additionally, we propose a novel method for event relation extraction using +BERT, in which we treat the task as text entailment. This method achieves an +$F_1$-score of $94.01\%$. To further evaluate the generalization of our +proposed method, we collected and annotated another out-of-domain corpus (about +$80$k tokens) called \testNLI and used it as a second test set, on which our +approach achieved promising results ($83.59\%$ $F_1$-score). Last but not +least, we propose an end-to-end system for event-arguments extraction. This +system is implemented as part of SinaTools, and both corpora are publicly +available at {\small \url{https://sina.birzeit.edu/wojood}} + +
+
+
+
+
+ + ☆ Enhancing Semantic Similarity Understanding in Arabic NLP with Nested + Embedding Learning + + +
+ This work presents a novel framework for training Arabic nested embedding +models through Matryoshka Embedding Learning, leveraging multilingual, +Arabic-specific, and English-based models, to highlight the power of nested +embedding models in various Arabic NLP downstream tasks. Our innovative +contribution includes the translation of various sentence similarity datasets +into Arabic, enabling a comprehensive evaluation framework to compare these +models across different dimensions. We trained several nested embedding models +on the Arabic Natural Language Inference triplet dataset and assessed their +performance using multiple evaluation metrics, including Pearson and Spearman +correlations for cosine similarity, Manhattan distance, Euclidean distance, and +dot product similarity. The results demonstrate the superior performance of the +Arabic Matryoshka embedding models, particularly in capturing semantic nuances unique +to the Arabic language, significantly outperforming traditional models by up to +20-25\% across various similarity metrics. These results underscore the +effectiveness of language-specific training and highlight the potential of +Matryoshka models in enhancing semantic textual similarity tasks for Arabic +NLP. + +
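Matryoshka Embedding Learning supervises truncated prefixes of the embedding at several dimensionalities so that each prefix remains useful on its own. A minimal sketch of such a multi-dimension loss on (anchor, positive, negative) triplets; the dimension list, margin, and use of a cosine triplet loss are assumptions for illustration, not the paper's exact objective.

```python
import torch
import torch.nn.functional as F

def matryoshka_triplet_loss(anchor, positive, negative,
                            dims=(64, 128, 256, 512, 768), margin=0.2):
    """Average a cosine triplet loss over nested prefixes of the embedding.

    anchor/positive/negative: [batch, full_dim] sentence embeddings.
    Each prefix embedding[:, :d] is trained to separate positives from
    negatives, which is the core idea behind nested (Matryoshka) embeddings.
    """
    total = 0.0
    for d in dims:
        a = F.normalize(anchor[:, :d], dim=-1)
        p = F.normalize(positive[:, :d], dim=-1)
        n = F.normalize(negative[:, :d], dim=-1)
        pos_sim = (a * p).sum(-1)
        neg_sim = (a * n).sum(-1)
        total = total + F.relu(margin - pos_sim + neg_sim).mean()
    return total / len(dims)
```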
+
+
+
+
+ + ☆ Entropy, Thermodynamics and the Geometrization of the Language Model + + +
+ In this paper, we discuss how pure mathematics and theoretical physics can be +applied to the study of language models. Using set theory and analysis, we +formulate mathematically rigorous definitions of language models, and introduce +the concept of the moduli space of distributions for a language model. We +formulate a generalized distributional hypothesis using functional analysis and +topology. We define the entropy function associated with a language model and +show how it allows us to understand many interesting phenomena in languages. We +argue that the zero points of the entropy function and the points where the +entropy is close to 0 are the key obstacles for an LLM to approximate an +intelligent language model, which explains why good LLMs need billions of +parameters. Using the entropy function, we formulate a conjecture about AGI. + Then, we show how thermodynamics gives us an immediate interpretation of +language models. In particular, we define the concepts of partition +function, internal energy and free energy for a language model, which offer +insights into how language models work. Based on these results, we introduce a +general concept of the geometrization of language models and define what is +called the Boltzmann manifold; the current LLMs are special cases of +the Boltzmann manifold. + +
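For orientation, these are the standard statistical-mechanics quantities that the terms entropy, partition function, and free energy usually denote when applied to a next-token distribution over a vocabulary $V$ with logits $s$ and temperature $T$; the paper's own definitions may differ in detail.

```latex
% Standard definitions, stated for a conditional next-token distribution.
\begin{align*}
  H(x)          &= -\sum_{w \in V} p(w \mid x)\,\log p(w \mid x)   && \text{entropy of the next-token distribution} \\
  Z(x; T)       &= \sum_{w \in V} \exp\!\big(s(w, x)/T\big)        && \text{partition function over logits } s \\
  p_T(w \mid x) &= \frac{\exp\!\big(s(w, x)/T\big)}{Z(x; T)}       && \text{Boltzmann (softmax) distribution} \\
  F(x; T)       &= -T \log Z(x; T)                                 && \text{free energy}
\end{align*}
```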
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Accelerating Large Language Model Inference with Self-Supervised Early + Exits + + +
+ This paper presents a novel technique for accelerating inference in large, +pre-trained language models (LLMs) by introducing early exits during inference. +The computational demands of these models, used across a wide range of +applications, can be substantial. By capitalizing on the inherent variability +in token complexity, our approach enables selective acceleration of the +inference process. Specifically, we propose the integration of early exit +''heads'' atop existing transformer layers, which facilitate conditional +terminations based on a confidence metric. These heads are trained in a +self-supervised manner using the model's own predictions as training data, +thereby eliminating the need for additional annotated data. The confidence +metric, established using a calibration set, ensures a desired level of +accuracy while enabling early termination when confidence exceeds a +predetermined threshold. Notably, our method preserves the original accuracy +and reduces computational time on certain tasks, leveraging the existing +knowledge of pre-trained LLMs without requiring extensive retraining. This +lightweight, modular modification has the potential to greatly enhance the +practical usability of LLMs, particularly in applications like real-time +language processing in resource-constrained environments. + +
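The method above attaches exit "heads" to intermediate transformer layers and stops decoding a token as soon as a head's confidence clears a calibrated threshold. A minimal sketch of the inference-time decision, assuming per-layer hidden states and pre-trained exit heads are available; the interfaces and thresholds are illustrative assumptions.

```python
import torch

@torch.no_grad()
def next_token_with_early_exit(hidden_states, exit_heads, thresholds):
    """Pick the next token from the earliest sufficiently confident exit head.

    hidden_states: per-layer hidden vectors for the current position
                   (e.g. a list of [hidden_dim] tensors).
    exit_heads:    linear heads mapping hidden_dim -> vocab_size, one per layer.
    thresholds:    per-layer confidence thresholds from a calibration set.
    """
    token = None
    for h, head, tau in zip(hidden_states, exit_heads, thresholds):
        probs = torch.softmax(head(h), dim=-1)
        confidence, token = probs.max(dim=-1)
        if confidence.item() >= tau:      # confident enough: exit early
            return token.item()
    return token.item()                   # otherwise fall back to the last layer
```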
+
+
+
+
+ + ♻ ☆ Language-based Valence and Arousal Expressions between the United States + and China: a Cross-Cultural Examination + + +
+ Although affective expressions of individuals have been extensively studied +using social media, research has primarily focused on the Western context. +There are substantial differences among cultures that contribute to their +affective expressions. This paper examines the differences between Twitter (X) +in the United States and Sina Weibo posts in China on two primary dimensions of +affect - valence and arousal. We study the difference in the functional +relationship between arousal and valence (so-called V-shaped) among individuals +in the US and China and explore the associated content differences. +Furthermore, we correlate word usage and topics in both platforms to interpret +their differences. We observe that for Twitter users, the variation in +emotional intensity is less distinct between negative and positive emotions +compared to Weibo users, and there is a sharper escalation in arousal +corresponding with heightened emotions. From language features, we discover +that affective expressions are associated with personal life and feelings on +Twitter, while on Weibo such discussions are about socio-political topics in +the society. These results suggest a West-East difference in the V-shaped +relationship between valence and arousal of affective expressions on social +media influenced by content differences. Our findings have implications for +applications and theories related to cultural differences in affective +expressions. + +
+
+ comment: preview +
+
+
+
+
+ + ♻ ☆ Rethinking Radiology Report Generation via Causal Inspired + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as a vision-and-language +interaction in the biomedical field. Previous works inherited the ideology of +traditional language generation tasks, aiming to generate paragraphs with high +readability as reports. Despite significant progress, the independence between +diseases, a specific property of RRG, was neglected, leaving the models +confused by the co-occurrence of diseases brought on by the biased data +distribution, and thus generating inaccurate reports. In this paper, to rethink +this issue, we first model the causal effects between the variables from a +causal perspective, through which we prove that the co-occurrence relationships +between diseases under the biased distribution function as confounders, compromising +the accuracy through two backdoor paths, i.e. the Joint Vision Coupling and the +Conditional Sequential Coupling. Then, we propose a novel model-agnostic +counterfactual augmentation method that contains two strategies, i.e. the +Prototype-based Counterfactual Sample Synthesis (P-CSS) and the Magic-Cube-like +Counterfactual Report Reconstruction (Cube), to intervene on the backdoor paths, +thus enhancing the accuracy and generalization of RRG models. Experimental +results on the widely used MIMIC-CXR dataset demonstrate the effectiveness of +our proposed method. Additionally, generalization performance is evaluated on the +IU X-Ray dataset, which verifies that our method can effectively reduce the impact of +co-occurrences caused by different distributions on the results. + +
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ♻ ☆ Fast Multipole Attention: A Divide-and-Conquer Attention Mechanism for + Long Sequences + + +
+ Transformer-based models have achieved state-of-the-art performance in many +areas. However, the quadratic complexity of self-attention with respect to the +input length hinders the applicability of Transformer-based models to long +sequences. To address this, we present Fast Multipole Attention, a new +attention mechanism that uses a divide-and-conquer strategy to reduce the time +and memory complexity of attention for sequences of length $n$ from +$\mathcal{O}(n^2)$ to $\mathcal{O}(n \log n)$ or $O(n)$, while retaining a +global receptive field. The hierarchical approach groups queries, keys, and +values into $\mathcal{O}( \log n)$ levels of resolution, where groups at +greater distances are increasingly larger in size and the weights to compute +group quantities are learned. As such, the interaction between tokens far from +each other is considered in lower resolution in an efficient hierarchical +manner. The overall complexity of Fast Multipole Attention is $\mathcal{O}(n)$ +or $\mathcal{O}(n \log n)$, depending on whether the queries are down-sampled +or not. This multi-level divide-and-conquer strategy is inspired by fast +summation methods from $n$-body physics and the Fast Multipole Method. We +perform evaluation on autoregressive and bidirectional language modeling tasks +and compare our Fast Multipole Attention model with other efficient attention +variants on medium-size datasets. We find empirically that the Fast Multipole +Transformer performs much better than other efficient transformers in terms of +memory size and accuracy. The Fast Multipole Attention mechanism has the +potential to empower large language models with much greater sequence lengths, +taking the full context into account in an efficient, naturally hierarchical +manner during training and when generating long sequences. + +
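Much-simplified, the divide-and-conquer idea is that each query attends to nearby tokens at full resolution and to distant tokens only through group summaries. The toy two-level sketch below (local block at full resolution, all other blocks mean-pooled) illustrates why cost drops below O(n^2); the actual method uses O(log n) levels with learned downsampling weights, so this is an assumption-laden simplification, not the paper's algorithm.

```python
import torch
import torch.nn.functional as F

def two_level_attention(q, k, v, block=64):
    """Toy two-level version of hierarchical (multipole-style) attention.

    q, k, v: [batch, seq, dim] with seq divisible by `block`.
    Each query sees its own block exactly and every other block only via a
    mean-pooled key/value, giving roughly O(n * (block + n/block)) cost.
    """
    b, n, d = q.shape
    nb = n // block
    kb = k.view(b, nb, block, d)
    vb = v.view(b, nb, block, d)
    k_coarse = kb.mean(dim=2)                     # [b, nb, d] group summaries
    v_coarse = vb.mean(dim=2)

    out = torch.empty_like(q)
    for i in range(nb):
        qi = q[:, i * block:(i + 1) * block]      # [b, block, d]
        keep = torch.ones(nb, dtype=torch.bool, device=k.device)
        keep[i] = False                           # coarse summaries of other blocks
        ki = torch.cat([kb[:, i], k_coarse[:, keep]], dim=1)
        vi = torch.cat([vb[:, i], v_coarse[:, keep]], dim=1)
        attn = F.softmax(qi @ ki.transpose(-1, -2) / d ** 0.5, dim=-1)
        out[:, i * block:(i + 1) * block] = attn @ vi
    return out
```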
+
+
+
+
+ + ♻ ☆ Large Language Models Assume People are More Rational than We Really are + + +
+ In order for AI systems to communicate effectively with people, they must +understand how we make decisions. However, people's decisions are not always +rational, so the implicit internal models of human decision-making in Large +Language Models (LLMs) must account for this. Previous empirical evidence seems +to suggest that these implicit models are accurate -- LLMs offer believable +proxies of human behavior, acting how we expect humans would in everyday +interactions. However, by comparing LLM behavior and predictions to a large +dataset of human decisions, we find that this is actually not the case: when +both simulating and predicting people's choices, a suite of cutting-edge LLMs +(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more +rational than we really are. Specifically, these models deviate from human +behavior and align more closely with a classic model of rational choice -- +expected value theory. Interestingly, people also tend to assume that other +people are rational when interpreting their behavior. As a consequence, when we +compare the inferences that LLMs and people draw from the decisions of others +using another psychological dataset, we find that these inferences are highly +correlated. Thus, the implicit decision-making models of LLMs appear to be +aligned with the human expectation that other people will act rationally, +rather than with how people actually act. + +
+
+
+
+
+ + ♻ ☆ A Survey on Model Compression for Large Language Models ACL + + +
+ Large Language Models (LLMs) have transformed natural language processing +tasks successfully. Yet, their large size and high computational needs pose +challenges for practical use, especially in resource-limited settings. Model +compression has emerged as a key research area to address these challenges. +This paper presents a survey of model compression techniques for LLMs. We cover +methods like quantization, pruning, and knowledge distillation, highlighting +recent advancements. We also discuss benchmarking strategies and evaluation +metrics crucial for assessing compressed LLMs. This survey offers valuable +insights for researchers and practitioners, aiming to enhance efficiency and +real-world applicability of LLMs while laying a foundation for future +advancements. + +
+
+ comment: Accepted for publication in TACL; a pre-MIT Press publication version +
+
+
+
+
+ + ♻ ☆ A Role-specific Guided Large Language Model for Ophthalmic Consultation + Based on Stylistic Differentiation + + +
+ Ophthalmology consultations are crucial for diagnosing, treating, and +preventing eye diseases. However, the growing demand for consultations exceeds +the availability of ophthalmologists. By leveraging large pre-trained language +models, we can design effective dialogues for specific scenarios, aiding in +consultations. Traditional fine-tuning strategies for question-answering tasks +are impractical due to increasing model sizes, and they often ignore patient-doctor +role functions during consultations. In this paper, we propose EyeDoctor, an +ophthalmic medical questioning large language model that enhances accuracy +through doctor-patient role perception guidance and an augmented knowledge base +with external disease information. Experimental results show EyeDoctor achieves +higher question-answering precision in ophthalmology consultations. Notably, +EyeDoctor demonstrated a 7.25% improvement in Rouge-1 scores and a 10.16% +improvement in F1 scores on multi-round datasets compared to the second-best model, +ChatGPT, highlighting the importance of doctor-patient role differentiation and +dynamic knowledge base expansion for intelligent medical consultations. EyeDoctor +is also available as a free web-based service, and the source code is available +at https://github.com/sperfu/EyeDoc. + +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Structure as Prompt: Improving Small Language Models + Capabilities for Knowledge-based Causal Discovery ISWC'24 + + +
+ Causal discovery aims to estimate causal structures among variables based on +observational data. Large Language Models (LLMs) offer a fresh perspective to +tackle the causal discovery problem by reasoning on the metadata associated +with variables rather than their actual data values, an approach referred to as +knowledge-based causal discovery. In this paper, we investigate the +capabilities of Small Language Models (SLMs, defined as LLMs with fewer than 1 +billion parameters) with prompt-based learning for knowledge-based causal +discovery. Specifically, we present KG Structure as Prompt, a novel approach +for integrating structural information from a knowledge graph, such as common +neighbor nodes and metapaths, into prompt-based learning to enhance the +capabilities of SLMs. Experimental results on three types of biomedical and +open-domain datasets under few-shot settings demonstrate the effectiveness of +our approach, surpassing most baselines and even conventional fine-tuning +approaches trained on full datasets. Our findings further highlight the strong +capabilities of SLMs: in combination with knowledge graphs and prompt-based +learning, SLMs demonstrate the potential to surpass LLMs with larger number of +parameters. Our code and datasets are available on GitHub. + +
+
+ comment: accepted at ISWC'24 +
+
+
+
+
+ + ♻ ☆ DualTime: A Dual-Adapter Multimodal Language Model for Time Series + Representation + + +
+ The recent rapid development of language models (LMs) has attracted attention +in the field of time series, including multimodal time series modeling. +However, we note that current time series multimodal methods are biased, often +assigning a primary role to one modality while the other assumes a secondary +role. They overlook the mutual benefits and complementarity of different +modalities. For example, in seizure diagnosis, relying solely on textual +clinical reports makes it difficult to pinpoint the area and type of the +disease, while electroencephalograms (EEGs) alone cannot provide an accurate +diagnosis without considering the symptoms. In this study, to mine the +complementary information in multimodal time series data, we propose +DualTime, a Dual-adapter multimodal language model for Time series +representation implementing temporal-primary and textual-primary modeling +simultaneously. By injecting lightweight adaptation tokens, the LM pipeline +shared by dual adapters encourages embedding alignment and achieves efficient +fine-tuning. Empirically, our method outperforms state-of-the-art models in +both supervised and unsupervised settings, highlighting the complementary +benefits of different modalities. In addition, we conduct few-shot label +transfer experiments, which further verify the transferability and +expressiveness of our proposed DualTime. + +
+
+ comment: 15 pages, 12 figure, 5 tables +
+
+
+
+
+ + ♻ ☆ Papilusion at DAGPap24: Paper or Illusion? Detecting AI-generated + Scientific Papers ACL + 2024 + + +
+ This paper presents Papilusion, an AI-generated scientific text detector +developed within the DAGPap24 shared task on detecting automatically generated +scientific papers. We propose an ensemble-based approach and conduct ablation +studies to analyze the effect of the detector configurations on the +performance. Papilusion ranked 6th on the leaderboard, and we improved our +performance after the competition ended, achieving an F1-score of 99.46 (+9.63) +on the official test set. + +
+
+ comment: to appear in "The 4th Workshop on Scholarly Document Processing @ ACL + 2024" proceedings +
+
+
+
+
+ + ♻ ☆ EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively + Exploring Electronic Health Records ACL 2024 + + +
+ In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL +dataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to +address critical yet underexplored aspects in text-to-SQL parsing: +interactivity, compositionality, and efficiency. To the best of our knowledge, +EHR-SeqSQL is not only the largest but also the first medical text-to-SQL +dataset benchmark to include sequential and contextual questions. We provide a +data split and the new test set designed to assess compositional generalization +ability. Our experiments demonstrate the superiority of a multi-turn approach +over a single-turn approach in learning compositionality. Additionally, our +dataset integrates specially crafted tokens into SQL queries to improve +execution efficiency. With EHR-SeqSQL, we aim to bridge the gap between +practical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is +available at https://github.com/seonhee99/EHR-SeqSQL. + +
+
+ comment: ACL 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Large Language Models (LLMs) as Agents for Augmented Democracy + + +
+ We explore an augmented democracy system built on off-the-shelf LLMs +fine-tuned to augment data on citizens' preferences elicited over policies +extracted from the government programs of the two main candidates of Brazil's +2022 presidential election. We use a train-test cross-validation setup to +estimate the accuracy with which the LLMs predict both a subject's individual +political choices and the aggregate preferences of the full sample of +participants. At the individual level, we find that LLMs predict out-of-sample +preferences more accurately than a "bundle rule", which would assume that +citizens always vote for the proposals of the candidate aligned with their +self-reported political orientation. At the population level, we show that a +probabilistic sample augmented by an LLM provides a more accurate estimate of +the aggregate preferences of a population than the non-augmented probabilistic +sample alone. Together, these results indicate that policy preference data +augmented using LLMs can capture nuances that transcend party lines and +represent a promising avenue of research for data augmentation. + +
+
+ comment: 24 pages main manuscript with 4 figures. 13 pages of supplementary + material +
+
+
+
+
+ + ♻ ☆ Between Lines of Code: Unraveling the Distinct Patterns of Machine and + Human Programmers ICSE 2025 + + +
+ Large language models have catalyzed an unprecedented wave in code +generation. While achieving significant advances, they blur the distinctions +between machine- and human-authored source code, causing integrity and +authenticity issues of software artifacts. Previous methods such as DetectGPT +have proven effective in discerning machine-generated texts, but they do not +identify and harness the unique patterns of machine-generated code. Thus, its +applicability falters when applied to code. In this paper, we carefully study +the specific patterns that characterize machine- and human-authored code. +Through a rigorous analysis of code attributes such as lexical diversity, +conciseness, and naturalness, we expose unique patterns inherent to each +source. We particularly notice that the syntactic segmentation of code is a +critical factor in identifying its provenance. Based on our findings, we +propose DetectCodeGPT, a novel method for detecting machine-generated code, +which improves DetectGPT by capturing the distinct stylized patterns of code. +Diverging from conventional techniques that depend on external LLMs for +perturbations, DetectCodeGPT perturbs the code corpus by strategically +inserting spaces and newlines, ensuring both efficacy and efficiency. +Experiment results show that our approach significantly outperforms +state-of-the-art techniques in detecting machine-generated code. + +
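DetectCodeGPT, as described above, perturbs the candidate code by inserting whitespace rather than calling an external LLM, and then compares the scoring model's likelihood of the original against the perturbed copies, DetectGPT-style. A minimal sketch of both steps; the insertion rates, the number of perturbations, and the `log_likelihood` callable (any function returning an average token log-likelihood) are illustrative assumptions.

```python
import random

def perturb_code(code, space_rate=0.05, newline_rate=0.02, seed=None):
    """Insert random extra spaces and newlines into a code string."""
    rng = random.Random(seed)
    out = []
    for ch in code:
        out.append(ch)
        if ch == " " and rng.random() < space_rate:
            out.append(" " * rng.randint(1, 3))
        elif ch == "\n" and rng.random() < newline_rate:
            out.append("\n")
    return "".join(out)

def detection_score(code, log_likelihood, n_perturbations=20):
    """DetectGPT-style discrepancy: log p(original) - mean log p(perturbed).

    Machine-generated code tends to sit at a local likelihood peak of the
    scoring model, so a higher score suggests machine provenance.
    """
    perturbed = [perturb_code(code, seed=i) for i in range(n_perturbations)]
    return log_likelihood(code) - sum(map(log_likelihood, perturbed)) / len(perturbed)
```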
+
+ comment: Accepted by the 47th International Conference on Software Engineering + (ICSE 2025). Code available at https://github.com/YerbaPage/DetectCodeGPT +
+
+
+
+
+ + ♻ ☆ Key-Point-Driven Mathematical Reasoning Distillation of Large Language + Model + + +
+ Large Language Models (LLMs) have demonstrated exceptional proficiency in +mathematical reasoning tasks due to their extensive parameter counts and +training on vast datasets. Despite these capabilities, deploying LLMs is +hindered by their computational demands. Distilling LLM mathematical reasoning +into Smaller Language Models (SLMs) has emerged as a solution to this +challenge, although these smaller models often suffer from errors in +calculation and semantic understanding. Prior work has proposed +Program-of-Thought Distillation (PoTD) to avoid calculation error. To further +address semantic understanding errors, we propose Key-Point-Driven Mathematical +Reasoning Distillation (KPDD). KPDD enhances the reasoning performance of SLMs +by breaking down the problem-solving process into three stages: Core Question +Extraction, Problem-Solving Information Extraction, and Step-by-Step Solution. +This method is further divided into KPDD-CoT, which generates Chain-of-Thought +rationales, and KPDD-PoT, which creates Program-of-Thought rationales. The +experiment results show that KPDD-CoT significantly improves reasoning +abilities, while KPDD-PoT achieves state-of-the-art performance in mathematical +reasoning tasks. Our approach effectively mitigates misunderstanding errors, +advancing the deployment of efficient and capable SLMs. + +
+
+ comment: Fixes a description error in the experimental settings, i.e., the + teacher LLM is changed to deepseek-v2 from GPT-4 +
+
+
+
+
+ + ♻ ☆ CT-ADE: An Evaluation Benchmark for Adverse Drug Event Prediction from + Clinical Trial Results + + +
+ Adverse drug events (ADEs) significantly impact clinical research, causing +many clinical trial failures. ADE prediction is key for developing safer +medications and enhancing patient outcomes. To support this effort, we +introduce CT-ADE, a dataset for multilabel predictive modeling of ADEs in +monopharmacy treatments. CT-ADE integrates data from 2,497 unique drugs, +encompassing 168,984 drug-ADE pairs extracted from clinical trials, annotated +with patient and contextual information, and comprehensive ADE concepts +standardized across multiple levels of the MedDRA ontology. Preliminary +analyses with large language models (LLMs) achieved F1-scores up to 55.90%. +Models using patient and contextual information showed F1-score improvements of +21%-38% over models using only chemical structure data. Our results highlight +the importance of target population and treatment regimens in the predictive +modeling of ADEs, offering greater performance gains than LLM domain +specialization and scaling. CT-ADE provides an essential tool for researchers +aiming to leverage artificial intelligence and machine learning to enhance +patient safety and minimize the impact of ADEs on pharmaceutical research and +development. The dataset is publicly accessible at +https://github.com/ds4dh/CT-ADE. + +
+
+
+
+
+ + ♻ ☆ CollectiveSFT: Scaling Large Language Models for Chinese Medical + Benchmark with Collective Instructions in Healthcare + + +
+ The rapid progress in Large Language Models (LLMs) has prompted the creation +of numerous benchmarks to evaluate their capabilities. This study focuses on the +Comprehensive Medical Benchmark in Chinese (CMB), showcasing how dataset +diversity and distribution in supervised fine-tuning (SFT) may enhance LLM +performance. Remarkably, we successfully trained a smaller base model to achieve +scores comparable to larger models, indicating that a diverse and +well-distributed dataset can optimize performance regardless of model size. This +study suggests that even smaller models may reach high performance levels with +carefully curated and varied datasets. By integrating a wide range of +instructional content, our approach addresses potential issues such as data +quality inconsistencies. Our results imply that a broader spectrum of training +data may enhance a model's ability to generalize and perform effectively across +different medical scenarios, highlighting the importance of dataset quality and +diversity in fine-tuning processes. We open-source the model for future +research at https://github.com/CAS-SIAT-XinHai/CollectiveSFT + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ The Responsible Development of Automated Student Feedback with + Generative AI + + +
+ Contribution: This paper identifies four critical ethical considerations for +implementing generative AI tools to provide automated feedback to students. + Background: Providing rich feedback to students is essential for supporting +student learning. Recent advances in generative AI, particularly with large +language models (LLMs), provide the opportunity to deliver repeatable, scalable +and instant automatically generated feedback to students, making abundant a +previously scarce and expensive learning resource. Such an approach is feasible +from a technical perspective due to these recent advances in Artificial +Intelligence (AI) and Natural Language Processing (NLP); while the potential +upside is a strong motivator, doing so introduces a range of potential ethical +issues that must be considered as we apply these technologies. + Intended Outcomes: The goal of this work is to enable the use of AI systems +to automate mundane assessment and feedback tasks, without introducing a +"tyranny of the majority", where the needs of minorities in the long tail are +overlooked because they are difficult to automate. + Application Design: This paper applies an extant ethical framework used for +AI and machine learning to the specific challenge of providing automated +feedback to student engineers. The task is considered from both a development +and maintenance perspective, considering how automated feedback tools will +evolve and be used over time. + Findings: This paper identifies four key ethical considerations for the +implementation of automated feedback for students: Participation, Development, +Impact on Learning and Evolution over Time. + +
+
+ comment: Under review at IEEE ToE +
+
+
+
+
+ + ♻ ☆ GigaPevt: Multimodal Medical Assistant IJCAI 2024 + + +
+ Building an intelligent and efficient medical assistant is still a +challenging AI problem. The major limitation comes from the scarcity of data +modalities, which reduces comprehensive patient perception. This demo paper +presents GigaPevt, the first multimodal medical assistant that combines the +dialog capabilities of large language models with specialized medical models. +Such an approach shows immediate advantages in dialog quality and metric +performance, with a 1.18% accuracy improvement in the question-answering task. + +
+
+ comment: IJCAI 2024, 4 pages, 2 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ ScreenQA: Large-Scale Question-Answer Pairs over Mobile App Screenshots + + +
+ We present a new benchmark and dataset, ScreenQA, for screen content +understanding via question answering. The existing screen datasets are focused +either on structure and component-level understanding, or on a much +higher-level composite task such as navigation and task completion. We attempt +to bridge the gap between these two by annotating 86K question-answer pairs +over the RICO dataset in hope to benchmark the screen reading comprehension +capacity. This work is also the first to annotate answers for different +application scenarios, including both full sentences and short forms, as well +as supporting UI contents on screen and their bounding boxes. With the rich +annotation, we discuss and define the evaluation metrics of the benchmark, show +applications of the dataset, and provide a few baselines using closed and open +source models. + +
+
+
+
+
+ + ♻ ☆ Benchmarks as Microscopes: A Call for Model Metrology + + +
+ Modern language models (LMs) pose a new challenge in capability assessment. +Static benchmarks inevitably saturate without providing confidence in the +deployment tolerances of LM-based systems, but developers nonetheless claim +that their models have generalized traits such as reasoning or open-domain +language understanding based on these flawed metrics. The science and practice +of LMs requires a new approach to benchmarking which measures specific +capabilities with dynamic assessments. To be confident in our metrics, we need +a new discipline of model metrology -- one which focuses on how to generate +benchmarks that predict performance under deployment. Motivated by our +evaluation criteria, we outline how building a community of model metrology +practitioners -- one focused on building tools and studying how to measure +system capabilities -- is the best way to meet these needs and add clarity +to the AI discussion. + +
+
+ comment: Conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All + Tools + + +
+ We introduce ChatGLM, an evolving family of large language models that we +have been developing over time. This report primarily focuses on the GLM-4 +language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent +our most capable models that are trained with all the insights and lessons +gained from the preceding three generations of ChatGLM. To date, the GLM-4 +models are pre-trained on ten trillion tokens mostly in Chinese and +English, along with a small corpus from 24 languages, and aligned +primarily for Chinese and English usage. The high-quality alignment is achieved +via a multi-stage post-training process, which involves supervised fine-tuning +and learning from human feedback. Evaluations show that GLM-4 1) closely rivals +or outperforms GPT-4 in terms of general metrics such as MMLU, GSM8K, MATH, +BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following +as measured by IFEval, 3) matches GPT-4 Turbo (128K) and Claude 3 for long +context tasks, and 4) outperforms GPT-4 in Chinese alignments as measured by +AlignBench. The GLM-4 All Tools model is further aligned to understand user +intent and autonomously decide when and which tool(s) to use -- including web +browser, Python interpreter, text-to-image model, and user-defined functions -- +to effectively complete complex tasks. In practical applications, it matches +and even surpasses GPT-4 All Tools in tasks like accessing online information +via web browsing and solving math problems using the Python interpreter. Over the +course of this work, we have open-sourced a series of models, including ChatGLM-6B (three +generations), GLM-4-9B (128K, 1M), GLM-4V-9B, WebGLM, and CodeGeeX, attracting +over 10 million downloads on Hugging Face in the year 2023 alone. The open +models can be accessed through https://github.com/THUDM and +https://huggingface.co/THUDM. + +
+
+
+
+
+ + ♻ ☆ MMWorld: Towards Multi-discipline Multi-faceted World Model Evaluation + in Videos + + +
+ Multimodal Large Language Models (MLLMs) demonstrate the emerging +abilities of "world models" -- interpreting and reasoning about complex +real-world dynamics. To assess these abilities, we posit videos are the ideal +medium, as they encapsulate rich representations of real-world dynamics and +causalities. To this end, we introduce MMWorld, a new benchmark for +multi-discipline, multi-faceted multimodal video understanding. MMWorld +distinguishes itself from previous video understanding benchmarks with two +unique advantages: (1) multi-discipline, covering various disciplines that +often require domain expertise for comprehensive understanding; (2) +multi-faceted reasoning, including explanation, counterfactual thinking, future +prediction, etc. MMWorld consists of a human-annotated dataset to evaluate +MLLMs with questions about the whole videos and a synthetic dataset to analyze +MLLMs within a single modality of perception. Together, MMWorld encompasses +1,910 videos across seven broad disciplines and 69 subdisciplines, complete +with 6,627 question-answer pairs and associated captions. The evaluation +includes 2 proprietary and 10 open-source MLLMs, which struggle on MMWorld +(e.g., GPT-4V performs the best with only 52.3\% accuracy), showing large room +for improvement. + +
+
+
+
+
+ + ♻ ☆ C-RAG: Certified Generation Risks for Retrieval-Augmented Language + Models ICML 2024 + + +
+ Despite the impressive capabilities of large language models (LLMs) across +diverse applications, they still suffer from trustworthiness issues, such as +hallucinations and misalignments. Retrieval-augmented language models (RAG) +have been proposed to enhance the credibility of generations by grounding +external knowledge, but the theoretical understandings of their generation +risks remains unexplored. In this paper, we answer: 1) whether RAG can indeed +lead to low generation risks, 2) how to provide provable guarantees on the +generation risks of RAG and vanilla LLMs, and 3) what sufficient conditions +enable RAG models to reduce generation risks. We propose C-RAG, the first +framework to certify generation risks for RAG models. Specifically, we provide +conformal risk analysis for RAG models and certify an upper confidence bound of +generation risks, which we refer to as conformal generation risk. We also +provide theoretical guarantees on conformal generation risks for general +bounded risk functions under test distribution shifts. We prove that RAG +achieves a lower conformal generation risk than that of a single LLM when the +quality of the retrieval model and transformer is non-trivial. Our intensive +empirical results demonstrate the soundness and tightness of our conformal +generation risk guarantees across four widely-used NLP datasets on four +state-of-the-art retrieval models. + +
+
+ comment: Accepted to ICML 2024 +
+
+
+
+
+ + ♻ ☆ On the Limitations of Compute Thresholds as a Governance Strategy + + +
+ At face value, this essay is about understanding a fairly esoteric governance +tool called compute thresholds. However, in order to grapple with whether these +thresholds will achieve anything, we must first understand how they came to be. +To do so, we need to engage with a decades-old debate at the heart of computer +science progress, namely, is bigger always better? Does a certain inflection +point of compute result in changes to the risk profile of a model? Hence, this +essay may be of interest not only to policymakers and the wider public but also +to computer scientists interested in understanding the role of compute in +unlocking breakthroughs. This discussion is timely given the wide adoption of +compute thresholds in both the White House Executive Orders on AI Safety (EO) +and the EU AI Act to identify more risky systems. A key conclusion of this +essay is that compute thresholds, as currently implemented, are shortsighted +and likely to fail to mitigate risk. The relationship between compute and risk +is highly uncertain and rapidly changing. Relying upon compute thresholds +overestimates our ability to predict what abilities emerge at different scales. +This essay ends with recommendations for a better way forward. + +
+
+
+
+
+ + ♻ ☆ ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic ACL 2024 + + +
+ The focus of language model evaluation has transitioned towards reasoning and +knowledge-intensive tasks, driven by advancements in pretraining large models. +While state-of-the-art models are partially trained on large Arabic texts, +evaluating their performance in Arabic remains challenging due to the limited +availability of relevant datasets. To bridge this gap, we present +ArabicMMLU, the first multi-task language understanding benchmark for the +Arabic language, sourced from school exams across diverse educational levels in +different countries spanning North Africa, the Levant, and the Gulf regions. +Our data comprises 40 tasks and 14,575 multiple-choice questions in Modern +Standard Arabic (MSA) and is carefully constructed by collaborating with native +speakers in the region. Our comprehensive evaluations of 35 models reveal +substantial room for improvement, particularly among the best open-source +models. Notably, BLOOMZ, mT0, LLaMA2, and Falcon struggle to achieve a score of +50%, while even the top-performing Arabic-centric model only achieves a score +of 62.3%. + +
+
+ comment: Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Meta-Rewarding Language Models: Self-Improving Alignment with + LLM-as-a-Meta-Judge + + +
+ Large Language Models (LLMs) are rapidly surpassing human knowledge in many +domains. While improving these models traditionally relies on costly human +data, recent self-rewarding mechanisms (Yuan et al., 2024) have shown that LLMs +can improve by judging their own responses instead of relying on human +labelers. However, existing methods have primarily focused on improving model +responses rather than judgment capabilities, resulting in rapid saturation +during iterative training. To address this issue, we introduce a novel +Meta-Rewarding step to the self-improvement process, where the model judges its +own judgements and uses that feedback to refine its judgment skills. +Surprisingly, this unsupervised approach improves the model's ability to judge +{\em and} follow instructions, as demonstrated by a win rate improvement of +Llama-3-8B-Instruct from 22.9% to 39.4% on AlpacaEval 2, and 20.6% to 29.1% on +Arena-Hard. These results strongly suggest the potential for self-improving +models without human supervision. + +
+
+
+
+
+ + ♻ ☆ LLM in a flash: Efficient Large Language Model Inference with Limited + Memory ACL 2024 + + +
+ Large language models (LLMs) are central to modern natural language +processing, delivering exceptional performance in various tasks. However, their +substantial computational and memory requirements present challenges, +especially for devices with limited DRAM capacity. This paper tackles the +challenge of efficiently running LLMs that exceed the available DRAM capacity +by storing the model parameters in flash memory, but bringing them on demand to +DRAM. Our method involves constructing an inference cost model that takes into +account the characteristics of flash memory, guiding us to optimize in two +critical areas: reducing the volume of data transferred from flash and reading +data in larger, more contiguous chunks. Within this hardware-informed +framework, we introduce two principal techniques. First, "windowing" +strategically reduces data transfer by reusing previously activated neurons, +and second, "row-column bundling", tailored to the sequential data access +strengths of flash memory, increases the size of data chunks read from flash +memory. These methods collectively enable running models up to twice the size +of the available DRAM, with a 4-5x and 20-25x increase in inference speed +compared to naive loading approaches in CPU and GPU, respectively. Our +integration of sparsity awareness, context-adaptive loading, and a +hardware-oriented design paves the way for effective inference of LLMs on +devices with limited memory. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Designing Informative Metrics for Few-Shot Example Selection + + +
+ Pretrained language models (PLMs) have shown remarkable few-shot learning +capabilities when provided with properly formatted examples. However, selecting +the "best" examples remains an open challenge. We propose a complexity-based +prompt selection approach for sequence tagging tasks. This approach avoids the +training of a dedicated model for selection of examples, and instead uses +certain metrics to align the syntactico-semantic complexity of test sentences +and examples. We use both sentence- and word-level metrics to match the +complexity of examples to the (test) sentence being considered. Our results +demonstrate that our approach extracts greater performance from PLMs: it +achieves state-of-the-art performance on few-shot NER, with a 5% absolute +improvement in F1 score on the CoNLL2003 dataset for GPT-4. We also see large +gains of up to 28.85 points (F1/Acc.) in smaller models like GPT-j-6B. + +
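The approach matches the complexity of candidate demonstrations to the test sentence instead of training a selector. A minimal sketch using sentence length and mean word length as stand-in complexity metrics; the metrics actually used in the paper are richer, so treat this as an illustration of the selection mechanism only.

```python
def complexity(sentence):
    """Toy stand-ins for sentence-level complexity metrics."""
    words = sentence.split()
    return (len(words), sum(len(w) for w in words) / max(len(words), 1))

def select_examples(test_sentence, pool, k=5):
    """Pick the k pool examples whose complexity is closest to the test sentence."""
    tc = complexity(test_sentence)

    def distance(example):
        ec = complexity(example)
        return sum((a - b) ** 2 for a, b in zip(tc, ec))

    return sorted(pool, key=distance)[:k]

# Example: pick the single demonstration closest in complexity to the test input
few_shot = select_examples("Barack Obama visited Berlin in 2008 .",
                           pool=["Alice met Bob .", "The EU summit opened in Paris ."],
                           k=1)
```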
+
+
+
+
+ + ♻ ☆ Characterizing Learning Curves During Language Model Pre-Training: + Learning, Forgetting, and Stability ACL + + +
+ How do language models learn to make predictions during pre-training? To +study this, we extract learning curves from five autoregressive English +language model pre-training runs, for 1M unseen tokens in context. We observe +that the language models generate short repetitive phrases before learning to +generate longer and more coherent text. We also find that individual tokens +often exhibit sudden increases or decreases in loss that are surprisingly +consistent across pre-training runs. To better understand these fluctuations, +we quantify the final surprisal, within-run variability, age of acquisition, +forgettability, and cross-run variability of learning curves for individual +tokens in context. More frequent tokens reach lower final surprisals, exhibit +less variability within and across pre-training runs, are learned earlier, and +are less likely to be "forgotten" during pre-training. Higher n-gram +probabilities further accentuate these effects. Independent of the target +token, shorter and more frequent contexts correlate with marginally more stable +and quickly acquired predictions. Based on our results, we argue for the +existence of sequential learning dependencies between different model +capabilities, and we characterize language model learning as early n-gram +learning before gradual refinement of tail n-gram predictions. + +
+
+ comment: Accepted to TACL (pre-MIT Press version) +
+
+
+
+
+ + ♻ ☆ CompA: Addressing the Gap in Compositional Reasoning in Audio-Language + Models ICLR 2024 + + +
+ A fundamental characteristic of audio is its compositional nature. +Audio-language models (ALMs) trained using a contrastive approach (e.g., CLAP) +that learns a shared representation between audio and language modalities have +improved performance in many downstream applications, including zero-shot audio +classification, audio retrieval, etc. However, the ability of these models to +effectively perform compositional reasoning remains largely unexplored and +necessitates additional research. In this paper, we propose CompA, a collection +of two expert-annotated benchmarks with a majority of real-world audio samples, +to evaluate compositional reasoning in ALMs. Our proposed CompA-order evaluates +how well an ALM understands the order or occurrence of acoustic events in +audio, and CompA-attribute evaluates attribute-binding of acoustic events. An +instance from either benchmark consists of two audio-caption pairs, where both +audios have the same acoustic events but with different compositions. An ALM is +evaluated on how well it matches the right audio to the right caption. Using +this benchmark, we first show that current ALMs perform only marginally better +than random chance, thereby struggling with compositional reasoning. Next, we +propose CompA-CLAP, where we fine-tune CLAP using a novel learning method to +improve its compositional reasoning abilities. To train CompA-CLAP, we first +propose improvements to contrastive training with composition-aware hard +negatives, allowing for more focused training. Next, we propose a novel modular +contrastive loss that helps the model learn fine-grained compositional +understanding and overcomes the acute scarcity of openly available +compositional audios. CompA-CLAP significantly improves over all our baseline +models on the CompA benchmark, indicating its superior compositional reasoning +capabilities. + +
+
+ comment: ICLR 2024. Project Page: https://sreyan88.github.io/compa_iclr/ +
+
+
+
+
+ + ♻ ☆ Toward Automated Detection of Biased Social Signals from the Content of + Clinical Conversations + + +
+ Implicit bias can impede patient-provider interactions and lead to inequities +in care. Raising awareness is key to reducing such bias, but its manifestations +in the social dynamics of patient-provider communication are difficult to +detect. In this study, we used automated speech recognition (ASR) and natural +language processing (NLP) to identify social signals in patient-provider +interactions. We built an automated pipeline to predict social signals from +audio recordings of 782 primary care visits that achieved 90.1% average +accuracy across codes, and exhibited fairness in its predictions for white and +non-white patients. Applying this pipeline, we identified statistically +significant differences in provider communication behavior toward white versus +non-white patients. In particular, providers expressed more patient-centered +behaviors towards white patients including more warmth, engagement, and +attentiveness. Our study underscores the potential of automated tools in +identifying subtle communication signals that may be linked with bias and +impact healthcare quality and equity. + +
+
+ comment: Accepted by AMIA 2024 Annual Symposium +
+
+
+
+
+ + ♻ ☆ Offline Training of Language Model Agents with Functions as Learnable + Weights + + +
+ Researchers and practitioners have recently reframed powerful Large Language +Models (LLMs) as agents, enabling them to automate complex tasks largely via +the use of specialized functions. To facilitate the development of LLM agents, +we present a novel paradigm of training LLM agents without modifying the LLM +weights, which is particularly useful when the LLMs are difficult or +inaccessible for modifications. Inspired by how humans continuously forge tools +to adapt to real-world tasks, rather than change our biological structure to +fit a static set of tools, we propose to progressively forge agent's functions +to better solve the downstream tasks instead of modifying the LLM weights. By +treating the functions as learnable `agent parameters' and leveraging the +fundamental idea of model training in artificial intelligence, we develop +AgentOptimizer that employs the LLM to update agents' functions and devise an +agent training algorithm with two strategies, roll-back, and early-stop, to +streamline the training process. With extensive experiments, we showcase that +the agent training paradigm could significantly improve the performance of +representative LLM agents in various downstream tasks. We also study the +behavior of the agent training regarding aspects like the learning curve and +domain transferability. + +
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ RoseLoRA: Row and Column-wise Sparse Low-rank Adaptation of Pre-trained + Language Model for Knowledge Editing and Fine-tuning + + +
+ Pre-trained language models, trained on large-scale corpora, demonstrate +strong generalizability across various NLP tasks. Fine-tuning these models for +specific tasks typically involves updating all parameters, which is +resource-intensive. Parameter-efficient fine-tuning (PEFT) methods, such as the +popular LoRA family, introduce low-rank matrices to learn only a few parameters +efficiently. However, during inference, the product of these matrices updates +all pre-trained parameters, complicating tasks like knowledge editing that +require selective updates. We propose a novel PEFT method, which conducts +\textbf{r}ow and c\textbf{o}lumn-wise spar\textbf{se} +\textbf{lo}w-\textbf{r}ank \textbf{a}daptation (RoseLoRA), to address this +challenge. RoseLoRA identifies and updates only the most important parameters +for a specific task, maintaining efficiency while preserving other model +knowledge. By adding a sparsity constraint on the product of low-rank matrices +and converting it to row and column-wise sparsity, we ensure efficient and +precise model updates. Our theoretical analysis guarantees a lower bound on +the sparsity with respect to the matrix product. Extensive experiments on +five benchmarks across twenty datasets demonstrate that RoseLoRA outperforms +baselines in both general fine-tuning and knowledge editing tasks. + +
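The key idea above is to make the LoRA update B @ A sparse in whole rows and columns so that only selected pre-trained weights are effectively edited. A minimal magnitude-based sketch of that masking; the actual method learns importance via a trained sparsity constraint rather than the simple magnitude heuristic and keep ratios assumed here.

```python
import torch

def sparse_lora_update(A, B, row_keep=0.1, col_keep=0.1):
    """Magnitude-based sketch of row/column-wise sparse LoRA.

    A: [r, in_dim], B: [out_dim, r]; the dense update would be B @ A.
    Keeping only the highest-magnitude rows of B and columns of A makes the
    product sparse in whole rows/columns, so only those weights are edited.
    """
    row_scores = B.abs().sum(dim=1)                 # importance per output row
    col_scores = A.abs().sum(dim=0)                 # importance per input column
    r_keep = max(1, int(row_keep * B.shape[0]))
    c_keep = max(1, int(col_keep * A.shape[1]))

    row_mask = torch.zeros(B.shape[0], dtype=B.dtype)
    row_mask[row_scores.topk(r_keep).indices] = 1.0
    col_mask = torch.zeros(A.shape[1], dtype=A.dtype)
    col_mask[col_scores.topk(c_keep).indices] = 1.0

    return (B * row_mask[:, None]) @ (A * col_mask[None, :])

# Example with rank 8 adapters for a 768x768 weight matrix
delta_w = sparse_lora_update(torch.randn(8, 768), torch.randn(768, 8))
```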
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 141 + +
+
+
+ + ☆ Matting by Generation SIGGRAPH'24 + + +
+ This paper introduces an innovative approach for image matting that redefines +the traditional regression-based task as a generative modeling challenge. Our +method harnesses the capabilities of latent diffusion models, enriched with +extensive pre-trained knowledge, to regularize the matting process. We present +novel architectural innovations that empower our model to produce mattes with +superior resolution and detail. The proposed method is versatile and can +perform both guidance-free and guidance-based image matting, accommodating a +variety of additional cues. Our comprehensive evaluation across three benchmark +datasets demonstrates the superior performance of our approach, both +quantitatively and qualitatively. The results not only reflect our method's +robust effectiveness but also highlight its ability to generate visually +compelling mattes that approach photorealistic quality. The project page for +this paper is available at +https://lightchaserx.github.io/matting-by-generation/ + +
+
+ comment: SIGGRAPH'24, Project page: + https://lightchaserx.github.io/matting-by-generation/ +
+
+
+
+
+ + ☆ Add-SD: Rational Generation without Manual Reference + + +
+ Diffusion models have exhibited remarkable prowess in visual generalization. +Building on this success, we introduce an instruction-based object addition +pipeline, named Add-SD, which automatically inserts objects into realistic +scenes with rational sizes and positions. Different from layout-conditioned +methods, Add-SD is solely conditioned on simple text prompts rather than any +other human-costly references like bounding boxes. Our work contributes in +three aspects: proposing a dataset containing numerous instructed image pairs; +fine-tuning a diffusion model for rational generation; and generating synthetic +data to boost downstream tasks. The first aspect involves creating a +RemovalDataset consisting of original-edited image pairs with textual +instructions, where an object has been removed from the original image while +maintaining strong pixel consistency in the background. These data pairs are +then used for fine-tuning the Stable Diffusion (SD) model. Subsequently, the +pretrained Add-SD model allows for the insertion of expected objects into an +image with good rationale. Additionally, we generate synthetic instances for +downstream task datasets at scale, particularly for tail classes, to alleviate +the long-tailed problem. Downstream tasks benefit from the enriched dataset +with enhanced diversity and rationale. Experiments on LVIS val demonstrate that +Add-SD yields an improvement of 4.3 mAP on rare classes over the baseline. Code +and models are available at https://github.com/ylingfeng/Add-SD. + +
+
+
+
+
+ + ☆ CLEFT: Language-Image Contrastive Learning with Efficient Large Language + Model and Prompt Fine-Tuning MICCAI 2024 + + +
+ Recent advancements in Contrastive Language-Image Pre-training (CLIP) have +demonstrated notable success in self-supervised representation learning across +various tasks. However, the existing CLIP-like approaches often demand +extensive GPU resources and prolonged training times due to the considerable +size of the model and dataset, making them poor for medical applications, in +which large datasets are not always common. Meanwhile, the language model +prompts are mainly manually derived from labels tied to images, potentially +overlooking the richness of information within training samples. We introduce a +novel language-image Contrastive Learning method with an Efficient large +language model and prompt Fine-Tuning (CLEFT) that harnesses the strengths of +the extensive pre-trained language and visual models. Furthermore, we present +an efficient strategy for learning context-based prompts that mitigates the gap +between informative clinical diagnostic data and simple class labels. Our +method demonstrates state-of-the-art performance on multiple chest X-ray and +mammography datasets compared with various baselines. The proposed parameter +efficient framework can reduce the total trainable model size by 39% and reduce +the trainable language model to only 4% compared with the current BERT encoder. + +
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Evolver: Chain-of-Evolution Prompting to Boost Large Multimodal Models + for Hateful Meme Detection + + +
+ Recent advances show that two-stream approaches have achieved outstanding performance in hateful meme detection. However, hateful memes constantly evolve as new memes emerge by fusing progressive cultural ideas, making existing methods obsolete or ineffective. In this work, we explore the potential of Large Multimodal Models (LMMs) for hateful meme detection. To this end, we propose Evolver, which incorporates LMMs via Chain-of-Evolution (CoE) Prompting, integrating the evolution attribute and in-context information of memes. Specifically, Evolver simulates the evolving and expressing process of memes and reasons through LMMs in a step-by-step manner. First, an evolutionary pair mining module retrieves the top-k memes most similar to the input meme from an externally curated meme set. Second, an evolutionary information extractor is designed to summarize the semantic regularities between the paired memes for prompting. Finally, a contextual relevance amplifier enhances the in-context hatefulness information to boost the search for evolutionary processes. Extensive experiments on the public FHM, MAMI, and HarM datasets show that CoE prompting can be incorporated into existing LMMs to improve their performance. More encouragingly, it can serve as an interpretive tool to promote the understanding of the evolution of social memes.
+
+
+
+
+ + ☆ XHand: Real-time Expressive Hand Avatar + + +
+ Hand avatars play a pivotal role in a wide array of digital interfaces, +enhancing user immersion and facilitating natural interaction within virtual +environments. While previous studies have focused on photo-realistic hand +rendering, little attention has been paid to reconstruct the hand geometry with +fine details, which is essential to rendering quality. In the realms of +extended reality and gaming, on-the-fly rendering becomes imperative. To this +end, we introduce an expressive hand avatar, named XHand, that is designed to +comprehensively generate hand shape, appearance, and deformations in real-time. +To obtain fine-grained hand meshes, we make use of three feature embedding +modules to predict hand deformation displacements, albedo, and linear blending +skinning weights, respectively. To achieve photo-realistic hand rendering on +fine-grained meshes, our method employs a mesh-based neural renderer by +leveraging mesh topological consistency and latent codes from embedding +modules. During training, a part-aware Laplace smoothing strategy is proposed +by incorporating the distinct levels of regularization to effectively maintain +the necessary details and eliminate the undesired artifacts. The experimental +evaluations on InterHand2.6M and DeepHandMesh datasets demonstrate the efficacy +of XHand, which is able to recover high-fidelity geometry and texture for hand +animations across diverse poses in real-time. To reproduce our results, we will +make the full implementation publicly available at +https://github.com/agnJason/XHand. + +
+
+
+
+
+ + ☆ GABInsight: Exploring Gender-Activity Binding Bias in Vision-Language + Models + + +
+ Vision-language models (VLMs) are intensively used in many downstream tasks, +including those requiring assessments of individuals appearing in the images. +While VLMs perform well in simple single-person scenarios, in real-world +applications, we often face complex situations in which there are persons of +different genders doing different activities. We show that in such cases, VLMs +are biased towards identifying the individual with the expected gender +(according to ingrained gender stereotypes in the model or other forms of +sample selection bias) as the performer of the activity. We refer to this bias +in associating an activity with the gender of its actual performer in an image +or text as the Gender-Activity Binding (GAB) bias and analyze how this bias is +internalized in VLMs. To assess this bias, we have introduced the GAB dataset +with approximately 5500 AI-generated images that represent a variety of +activities, addressing the scarcity of real-world images for some scenarios. To +have extensive quality control, the generated images are evaluated for their +diversity, quality, and realism. We have tested 12 renowned pre-trained VLMs on +this dataset in the context of text-to-image and image-to-text retrieval to +measure the effect of this bias on their predictions. Additionally, we have +carried out supplementary experiments to quantify the bias in VLMs' text +encoders and to evaluate VLMs' capability to recognize activities. Our +experiments indicate that VLMs experience an average performance decline of +about 13.2% when confronted with gender-activity binding bias. + +
+
+
+
+
+ + ☆ From Feature Importance to Natural Language Explanations Using LLMs with + RAG + + +
+ As machine learning becomes increasingly integral to autonomous +decision-making processes involving human interaction, the necessity of +comprehending the model's outputs through conversational means increases. Most +recently, foundation models are being explored for their potential as post hoc +explainers, providing a pathway to elucidate the decision-making mechanisms of +predictive models. In this work, we introduce traceable question-answering, +leveraging an external knowledge repository to inform the responses of Large +Language Models (LLMs) to user queries within a scene understanding task. This +knowledge repository comprises contextual details regarding the model's output, +containing high-level features, feature importance, and alternative +probabilities. We employ subtractive counterfactual reasoning to compute +feature importance, a method that entails analysing output variations resulting +from decomposing semantic features. Furthermore, to maintain a seamless +conversational flow, we integrate four key characteristics - social, causal, +selective, and contrastive - drawn from social science research on human +explanations into a single-shot prompt, guiding the response generation +process. Our evaluation demonstrates that explanations generated by the LLMs +encompassed these elements, indicating its potential to bridge the gap between +complex model outputs and natural language expressions. + +
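+ As a rough illustration of the subtractive counterfactual idea described above, the sketch below scores each feature by how much the target-class probability drops when that feature is removed. The toy predictor, the zeroing-out notion of "removal", and all numbers are assumptions for illustration, not the paper's scene-understanding pipeline.

```python
# Toy sketch of subtractive counterfactual feature importance:
# importance(i) = confidence(full input) - confidence(input with feature i removed).
import numpy as np

def subtractive_importance(predict, x: np.ndarray, target: int) -> np.ndarray:
    """predict: (1, D) features -> class probabilities; returns per-feature drops."""
    base = predict(x[None])[0, target]
    drops = np.empty(x.shape[0])
    for i in range(x.shape[0]):
        x_cf = x.copy()
        x_cf[i] = 0.0                              # counterfactual: remove feature i
        drops[i] = base - predict(x_cf[None])[0, target]
    return drops

# Placeholder "model": a fixed softmax over linear scores, for illustration only.
W = np.array([[1.5, -0.5, 0.2], [0.1, 1.0, -1.2]])        # (classes, features)
def predict(X):
    z = X @ W.T
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

print(subtractive_importance(predict, np.array([0.8, 0.3, 0.5]), target=0))
```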
+
+
+
+
+ + ☆ PIXELMOD: Improving Soft Moderation of Visual Misleading Information on + Twitter + + +
+ Images are a powerful and immediate vehicle to carry misleading or outright +false messages, yet identifying image-based misinformation at scale poses +unique challenges. In this paper, we present PIXELMOD, a system that leverages +perceptual hashes, vector databases, and optical character recognition (OCR) to +efficiently identify images that are candidates to receive soft moderation +labels on Twitter. We show that PIXELMOD outperforms existing image similarity +approaches when applied to soft moderation, with negligible performance +overhead. We then test PIXELMOD on a dataset of tweets surrounding the 2020 US +Presidential Election, and find that it is able to identify visually misleading +images that are candidates for soft moderation with 0.99% false detection and +2.06% false negatives. + +
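+ A minimal sketch of the perceptual-hash lookup that underlies the candidate-selection step described above, using the Pillow and imagehash libraries; the Hamming threshold and the synthetic demo images are assumptions, and the paper's full system additionally relies on a vector database and OCR.

```python
# Flag images whose perceptual hash is close to an already-labeled misleading image.
from PIL import Image
import imagehash

HAMMING_THRESHOLD = 8  # assumed; the paper tunes its own matching setup

def build_index(labeled_images):
    """Perceptually hash every image that already carries a soft-moderation label."""
    return [imagehash.phash(img) for img in labeled_images]

def is_candidate(image, index, threshold=HAMMING_THRESHOLD):
    """True if the image is perceptually close to any already-labeled image."""
    h = imagehash.phash(image)
    return any(h - ref <= threshold for ref in index)

# Tiny self-contained demo with synthetic images (stand-ins for tweet images).
labeled = Image.linear_gradient("L").resize((256, 256))
index = build_index([labeled])
print(is_candidate(labeled.rotate(2), index))            # near-duplicate of a labeled image
print(is_candidate(Image.radial_gradient("L"), index))   # unrelated image
```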
+
+
+
+
+ + ☆ MMTrail: A Multimodal Trailer Video Dataset with Language and Music + Descriptions + + +
+ Massive multi-modality datasets play a significant role in facilitating the success of large video-language models. However, current video-language datasets primarily provide text descriptions for visual frames, treating audio as weakly related information. They usually overlook exploring the potential of the inherent audio-visual correlation, leading to monotonous annotation within each modality instead of comprehensive and precise descriptions. This omission makes multiple cross-modality studies difficult. To fill this gap, we present MMTrail, a large-scale multi-modality video-language dataset incorporating more than 20M trailer clips with visual captions, and 2M high-quality clips with multimodal captions. Trailers preview full-length video works and integrate context, visual frames, and background music. In particular, trailers have two main advantages: (1) the topics are diverse and the content characters are of various types, e.g., film, news, and gaming; (2) the corresponding background music is custom-designed, making it more coherent with the visual context. Upon these insights, we propose a systemic captioning framework, achieving various modality annotations for more than 27.1k hours of trailer videos. Here, to ensure the caption retains the music perspective while preserving the authority of the visual context, we leverage an advanced LLM to merge all annotations adaptively. In this fashion, our MMTrail dataset potentially paves the way for fine-grained large multimodal-language model training. In experiments, we provide evaluation metrics and benchmark results on our dataset, demonstrating the high quality of our annotation and its effectiveness for model training.
+
+ comment: 15 Pages. Dataset report +
+
+
+
+
+ + ☆ Learning Ordinality in Semantic Segmentation + + +
+ Semantic segmentation consists of predicting a semantic label for each image +pixel. Conventional deep learning models do not take advantage of ordinal +relations that might exist in the domain at hand. For example, it is known that +the pupil is inside the iris, and the lane markings are inside the road. Such +domain knowledge can be employed as constraints to make the model more robust. +The current literature on this topic has explored pixel-wise ordinal +segmentation methods, which treat each pixel as an independent observation and +promote ordinality in its representation. This paper proposes novel spatial +ordinal segmentation methods, which take advantage of the structured image +space by considering each pixel as an observation dependent on its neighborhood +context to also promote ordinal spatial consistency. When evaluated with five +biomedical datasets and multiple configurations of autonomous driving datasets, +ordinal methods resulted in more ordinally-consistent models, with substantial +improvements in ordinal metrics and some increase in the Dice coefficient. It +was also shown that the incorporation of ordinal consistency results in models +with better generalization abilities. + +
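+ For context, the sketch below shows the standard pixel-wise ordinal encoding that such methods build on: each of K ordered classes becomes K-1 cumulative binary targets P(y > k), trained with binary cross-entropy. Shapes, class names, and the loss form are illustrative assumptions; the paper's spatial variants add neighbourhood-level consistency on top of this.

```python
# Pixel-wise ordinal encoding for segmentation (the baseline the spatial methods extend).
import torch
import torch.nn.functional as F

def ordinal_targets(labels: torch.Tensor, num_classes: int) -> torch.Tensor:
    """labels: (B, H, W) ints in [0, K-1] -> (B, K-1, H, W) cumulative targets P(y > k)."""
    ks = torch.arange(1, num_classes, device=labels.device).view(1, -1, 1, 1)
    return (labels.unsqueeze(1) >= ks).float()

def ordinal_loss(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """logits: (B, K-1, H, W) raw scores for the cumulative probabilities."""
    targets = ordinal_targets(labels, logits.shape[1] + 1)
    return F.binary_cross_entropy_with_logits(logits, targets)

# Example with 4 ordered classes (e.g., background < road < lane < marking; hypothetical).
logits = torch.randn(2, 3, 64, 64, requires_grad=True)
labels = torch.randint(0, 4, (2, 64, 64))
print(ordinal_loss(logits, labels).item())
```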
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ dopanim: A Dataset of Doppelganger Animals with Noisy Annotations from + Multiple Humans NeurIPS 2024 + + +
+ Human annotators typically provide annotated data for training machine +learning models, such as neural networks. Yet, human annotations are subject to +noise, impairing generalization performances. Methodological research on +approaches counteracting noisy annotations requires corresponding datasets for +a meaningful empirical evaluation. Consequently, we introduce a novel benchmark +dataset, dopanim, consisting of about 15,750 animal images of 15 classes with +ground truth labels. For approximately 10,500 of these images, 20 humans +provided over 52,000 annotations with an accuracy of circa 67%. Its key +attributes include (1) the challenging task of classifying doppelganger +animals, (2) human-estimated likelihoods as annotations, and (3) annotator +metadata. We benchmark well-known multi-annotator learning approaches using +seven variants of this dataset and outline further evaluation use cases such as +learning beyond hard class labels and active learning. Our dataset and a +comprehensive codebase are publicly available to emulate the data collection +process and to reproduce all empirical results. + +
+
+ comment: Under review @ NeurIPS 2024 (Datasets and Benchmarks Track) +
+
+
+
+
+ + ☆ EAR: Edge-Aware Reconstruction of 3-D vertebrae structures from + bi-planar X-ray images + + +
+ X-ray images ease the diagnosis and treatment process due to their rapid +imaging speed and high resolution. However, due to the projection process of +X-ray imaging, much spatial information has been lost. To accurately provide +efficient spinal morphological and structural information, reconstructing the +3-D structures of the spine from the 2-D X-ray images is essential. It is +challenging for current reconstruction methods to preserve the edge information +and local shapes of the asymmetrical vertebrae structures. In this study, we +propose a new Edge-Aware Reconstruction network (EAR) to focus on the +performance improvement of the edge information and vertebrae shapes. In our +network, by using the auto-encoder architecture as the backbone, the edge +attention module and frequency enhancement module are proposed to strengthen +the perception of the edge reconstruction. Meanwhile, we also combine four loss +terms, including reconstruction loss, edge loss, frequency loss and projection +loss. The proposed method is evaluated using three publicly accessible datasets +and compared with four state-of-the-art models. The proposed method is superior +to other methods and achieves 25.32%, 15.32%, 86.44%, 80.13%, 23.7612 and +0.3014 with regard to MSE, MAE, Dice, SSIM, PSNR and frequency distance. Due to +the end-to-end and accurate reconstruction process, EAR can provide sufficient +3-D spatial information and precise preoperative surgical planning guidance. + +
+
+ comment: 13 pages, 11 figures, 3 tables +
+
+
+
+
+ + ☆ UniProcessor: A Text-induced Unified Low-level Image Processor + + +
+ Image processing, including image restoration, image enhancement, etc., involves generating a high-quality clean image from a degraded input. Deep learning-based methods have shown superior performance for various image processing tasks under single-task conditions. However, they require training separate models for different degradation types and levels, which limits their generalization ability and restricts their application in the real world. In this paper, we propose a text-induced unified image processor for low-level vision tasks, termed UniProcessor, which can effectively process various degradation types and levels and supports multimodal control. Specifically, our UniProcessor encodes degradation-specific information with the subject prompt and processes degradations with the manipulation prompt. These context control features are injected into the UniProcessor backbone via cross-attention to control the processing procedure. For automatic subject-prompt generation, we further build a vision-language model for general-purpose low-level degradation perception via instruction tuning techniques. Our UniProcessor covers 30 degradation types, and extensive experiments demonstrate that it can process these degradations well without additional training or tuning, outperforming other competing methods. Moreover, with the help of degradation-aware context control, our UniProcessor is the first to show the ability to individually handle a single distortion in an image with multiple degradations.
+
+
+
+
+ + ☆ SSPA: Split-and-Synthesize Prompting with Gated Alignments for + Multi-Label Image Recognition + + +
+ Multi-label image recognition is a fundamental task in computer vision. +Recently, Vision-Language Models (VLMs) have made notable advancements in this +area. However, previous methods fail to effectively leverage the rich knowledge +in language models and often incorporate label semantics into visual features +unidirectionally. To overcome these problems, we propose a Split-and-Synthesize +Prompting with Gated Alignments (SSPA) framework to amplify the potential of +VLMs. Specifically, we develop an in-context learning approach to associate the +inherent knowledge from LLMs. Then we propose a novel Split-and-Synthesize +Prompting (SSP) strategy to first model the generic knowledge and downstream +label semantics individually and then aggregate them carefully through the +quaternion network. Moreover, we present Gated Dual-Modal Alignments (GDMA) to +bidirectionally interact visual and linguistic modalities while eliminating +redundant cross-modal information, enabling more efficient region-level +alignments. Rather than making the final prediction by a sharp manner in +previous works, we propose a soft aggregator to jointly consider results from +all image regions. With the help of flexible prompting and gated alignments, +SSPA is generalizable to specific domains. Extensive experiments on nine +datasets from three domains (i.e., natural, pedestrian attributes and remote +sensing) demonstrate the state-of-the-art performance of SSPA. Further analyses +verify the effectiveness of SSP and the interpretability of GDMA. The code will +be made public. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ How to Choose a Reinforcement-Learning Algorithm + + +
+ The field of reinforcement learning offers a large variety of concepts and +methods to tackle sequential decision-making problems. This variety has become +so large that choosing an algorithm for a task at hand can be challenging. In +this work, we streamline the process of choosing reinforcement-learning +algorithms and action-distribution families. We provide a structured overview +of existing methods and their properties, as well as guidelines for when to +choose which methods. An interactive version of these guidelines is available +online at https://rl-picker.github.io/. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ Dynamic Scene Understanding through Object-Centric Voxelization and + Neural Rendering + + +
+ Learning object-centric representations from unsupervised videos is +challenging. Unlike most previous approaches that focus on decomposing 2D +images, we present a 3D generative model named DynaVol-S for dynamic scenes +that enables object-centric learning within a differentiable volume rendering +framework. The key idea is to perform object-centric voxelization to capture +the 3D nature of the scene, which infers per-object occupancy probabilities at +individual spatial locations. These voxel features evolve through a +canonical-space deformation function and are optimized in an inverse rendering +pipeline with a compositional NeRF. Additionally, our approach integrates 2D +semantic features to create 3D semantic grids, representing the scene through +multiple disentangled voxel grids. DynaVol-S significantly outperforms existing +models in both novel view synthesis and unsupervised decomposition tasks for +dynamic scenes. By jointly considering geometric structures and semantic +features, it effectively addresses challenging real-world scenarios involving +complex object interactions. Furthermore, once trained, the explicitly +meaningful voxel features enable additional capabilities that 2D scene +decomposition methods cannot achieve, such as novel scene generation through +editing geometric shapes or manipulating the motion trajectories of objects. + +
+
+
+
+
+ + ☆ What is YOLOv5: A deep look into the internal features of the popular + object detector + + +
+ This study presents a comprehensive analysis of the YOLOv5 object detection model, examining its architecture, training methodologies, and performance. Key components, including the Cross Stage Partial backbone and the Path Aggregation Network, are explored in detail. The paper reviews the model's performance across various metrics and hardware platforms. Additionally, the study discusses the transition from Darknet to PyTorch and its impact on model development. Overall, this research provides insights into YOLOv5's capabilities, its position within the broader landscape of object detectors, and why it is a popular choice for constrained edge deployment scenarios.
+
+
+
+
+ + ☆ Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian + Neural Networks + + +
+ The computational complexity of Bayesian learning is impeding its adoption in practical, large-scale tasks. Despite demonstrations of significant merits, such as improved robustness and resilience to unseen or out-of-distribution inputs over non-Bayesian counterparts, the practical use of Bayesian deep models has faded to near insignificance. In this study, we introduce an innovative framework to mitigate the computational burden of Bayesian neural networks (BNNs). Our approach follows the principle of Bayesian techniques based on deep ensembles, but significantly reduces their cost via multiple low-rank perturbations of parameters arising from a pre-trained neural network. Both the vanilla version of ensembles and more sophisticated schemes, such as Bayesian learning with Stein Variational Gradient Descent (SVGD), previously deemed impractical for large models, can be seamlessly implemented within the proposed framework, called Bayesian Low-Rank LeArning (Bella). In a nutshell, (i) Bella achieves a dramatic reduction in the number of trainable parameters required to approximate a Bayesian posterior; and (ii) it not only maintains, but in some instances surpasses, the performance of conventional Bayesian learning methods and non-Bayesian baselines. Our results on large-scale tasks such as ImageNet, CAMELYON17, DomainNet, VQA with CLIP, and LLaVA demonstrate the effectiveness and versatility of Bella in building highly scalable and practical Bayesian deep models for real-world applications.
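+ A minimal sketch of the low-rank perturbation idea summarized above: each ensemble member keeps the pre-trained weights frozen and learns only a rank-r update W + U V^T. The layer below is an illustrative PyTorch construction, not the authors' implementation.

```python
# One ensemble member = frozen pre-trained layer + trainable low-rank weight perturbation.
import torch
import torch.nn as nn

class LowRankPerturbedLinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():          # keep the pre-trained weights frozen
            p.requires_grad_(False)
        d_out, d_in = base.weight.shape
        self.U = nn.Parameter(torch.zeros(d_out, rank))
        self.V = nn.Parameter(torch.randn(d_in, rank) * 0.01)

    def forward(self, x):
        delta = self.U @ self.V.t()               # rank-r perturbation of the weight matrix
        return self.base(x) + x @ delta.t()

# Wrap a "pre-trained" layer; only O(rank * (d_in + d_out)) parameters are trainable.
member = LowRankPerturbedLinear(nn.Linear(512, 256), rank=4)
print(sum(p.numel() for p in member.parameters() if p.requires_grad))
print(member(torch.randn(8, 512)).shape)
```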
+
+ comment: 25 pages, 14 figures, 11 tables +
+
+
+
+
+ + ☆ S3PET: Semi-supervised Standard-dose PET Image Reconstruction via + Dose-aware Token Swap + + +
+ To acquire high-quality positron emission tomography (PET) images while reducing the radiation tracer dose, numerous efforts have been devoted to reconstructing standard-dose PET (SPET) images from low-dose PET (LPET). However, the success of current fully-supervised approaches relies on abundant paired LPET and SPET images, which are often unavailable in the clinic. Moreover, these methods often mix the dose-invariant content with dose-specific details tied to the dose level during reconstruction, resulting in distorted images. To alleviate these problems, in this paper, we propose a two-stage Semi-Supervised SPET reconstruction framework, namely S3PET, to accommodate training on abundant unpaired and limited paired SPET and LPET images. Our S3PET involves an unsupervised pre-training stage (Stage I) to extract representations from unpaired images, and a supervised dose-aware reconstruction stage (Stage II) to achieve LPET-to-SPET reconstruction by transferring dose-specific knowledge between paired images. Specifically, in Stage I, two independent dose-specific masked autoencoders (DsMAEs) are adopted to comprehensively understand the unpaired SPET and LPET images. Then, in Stage II, the pre-trained DsMAEs are further fine-tuned using paired images. To prevent distortions in both content and details, we introduce two elaborate modules, i.e., a dose knowledge decoupling module to disentangle the respective dose-specific and dose-invariant knowledge of LPET and SPET, and a dose-specific knowledge learning module to transfer the dose-specific information from SPET to LPET, thereby achieving high-quality SPET reconstruction from LPET images. Experiments on two datasets demonstrate that our S3PET achieves state-of-the-art performance quantitatively and qualitatively.
+
+
+
+
+ + ☆ Automatic Die Studies for Ancient Numismatics + + +
+ Die studies are fundamental to quantifying ancient monetary production, providing insights into the relationship between coinage, politics, and history. The process requires tedious manual work, which limits the size of the corpora that can be studied. Few works have attempted to automate this task, and none have been properly released and evaluated from a computer vision perspective. We propose a fully automatic approach that introduces several innovations compared to previous methods. First, we rely on fast and robust local descriptor matching whose parameters are set automatically. Second, the core of our proposal is a clustering-based approach that uses an intrinsic metric (one that does not need ground-truth labels) to determine its critical hyper-parameters. We validate the approach on two corpora of Greek coins, propose an automatic implementation and evaluation of previous baselines, and show that our approach significantly outperforms them.
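+ The sketch below illustrates the two ingredients named above with off-the-shelf tools: ORB descriptor matching turned into a pairwise distance, and agglomerative clustering whose number of clusters is picked by the silhouette score as an intrinsic criterion (scikit-learn >= 1.2). The distance definition, ratio threshold, and cluster range are assumptions, not the paper's exact recipe.

```python
# Cluster coin photographs by descriptor-matching similarity; pick k via silhouette.
import cv2
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def match_distance(des_a, des_b, ratio=0.75):
    """Distance = 1 / (1 + number of Lowe-ratio-test matches)."""
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING)
    knn = matcher.knnMatch(des_a, des_b, k=2)
    good = sum(1 for pair in knn
               if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance)
    return 1.0 / (1.0 + good)

def cluster_coins(image_paths, max_k=10):
    orb = cv2.ORB_create(nfeatures=1000)
    descs = [orb.detectAndCompute(cv2.imread(p, cv2.IMREAD_GRAYSCALE), None)[1]
             for p in image_paths]
    n = len(descs)
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            dist[i, j] = dist[j, i] = match_distance(descs[i], descs[j])
    best = None
    for k in range(2, min(max_k, n)):             # choose k with the best silhouette score
        labels = AgglomerativeClustering(n_clusters=k, metric="precomputed",
                                         linkage="average").fit_predict(dist)
        score = silhouette_score(dist, labels, metric="precomputed")
        if best is None or score > best[0]:
            best = (score, labels)
    return best[1]

# Usage (paths are placeholders): labels = cluster_coins(["coin_001.jpg", "coin_002.jpg", ...])
```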
+
+ comment: code: https://cea-list-lasti.github.io/projects/studies/studies.html +
+
+
+
+
+ + ☆ A Comparative Analysis of YOLOv5, YOLOv8, and YOLOv10 in Kitchen Safety + + +
+ Knife safety in the kitchen is essential for preventing accidents or injuries, with an emphasis on proper handling, maintenance, and storage methods. This research presents a comparative analysis of three YOLO models, YOLOv5, YOLOv8, and YOLOv10, for detecting the hazards involved in handling a knife, concentrating mainly on whether fingers are curled while holding items to be cut and whether hands are in contact only with the knife handle, avoiding the blade. Precision, recall, F-score, and the normalized confusion matrix are used to evaluate the performance of the models. The results indicate that YOLOv5 performed better than the other two models in identifying the hazard of hands contacting the blade, while YOLOv8 excelled in detecting the hazard related to finger curling while holding items. YOLOv5 and YOLOv8 performed almost identically in recognizing classes such as hand, knife, and vegetable, whereas YOLOv5, YOLOv8, and YOLOv10 all accurately identified the cutting board. This paper provides insights into the advantages and shortcomings of these models in real-world settings. Moreover, by detailing the optimization of YOLO architectures for safe knife handling, this study supports the development of more accurate and efficient safety surveillance systems.
+
+
+
+
+ + ☆ Mean of Means: A 10-dollar Solution for Human Localization with + Calibration-free and Unconstrained Camera Settings + + +
+ Accurate human localization is crucial for various applications, especially +in the Metaverse era. Existing high precision solutions rely on expensive, +tag-dependent hardware, while vision-based methods offer a cheaper, tag-free +alternative. However, current vision solutions based on stereo vision face +limitations due to rigid perspective transformation principles and error +propagation in multi-stage SVD solvers. These solutions also require multiple +high-resolution cameras with strict setup constraints. To address these +limitations, we propose a probabilistic approach that considers all points on +the human body as observations generated by a distribution centered around the +body's geometric center. This enables us to improve sampling significantly, +increasing the number of samples for each point of interest from hundreds to +billions. By modeling the relation between the means of the distributions of +world coordinates and pixel coordinates, leveraging the Central Limit Theorem, +we ensure normality and facilitate the learning process. Experimental results +demonstrate human localization accuracy of 95% within a 0.3m range and nearly +100% accuracy within a 0.5m range, achieved at a low cost of only 10 USD using +two web cameras with a resolution of 640x480 pixels. + +
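+ A toy sketch of the "mean of means" idea described above: every detected body pixel is treated as a sample around the body's geometric centre, the per-camera means are computed, and a direct regression maps the two cameras' mean pixel coordinates to a world position. The synthetic projections, noise levels, and regressor below are purely illustrative, not the paper's model.

```python
# Average detected body pixels per camera, then regress the means to world coordinates.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)

def mean_pixel(body_points: np.ndarray) -> np.ndarray:
    """body_points: (N, 2) pixel coordinates of detected body points -> (2,) mean."""
    return body_points.mean(axis=0)

# Synthetic training data: world positions and noisy per-camera mean pixel coordinates.
world = rng.uniform(0, 5, size=(200, 2))                       # (x, y) on the floor, metres
cam1 = world @ np.array([[90.0, 5.0], [3.0, 80.0]]) + 320 + rng.normal(0, 2, (200, 2))
cam2 = world @ np.array([[70.0, -4.0], [6.0, 95.0]]) + 240 + rng.normal(0, 2, (200, 2))
features = np.hstack([cam1, cam2])                             # (200, 4)

model = LinearRegression().fit(features, world)                # pixel means -> world position

# Inference: average all detected body pixels per camera, then predict.
body_cam1 = cam1[0] + rng.normal(0, 15, size=(50, 2))          # 50 noisy body-point detections
body_cam2 = cam2[0] + rng.normal(0, 15, size=(50, 2))
q = np.hstack([mean_pixel(body_cam1), mean_pixel(body_cam2)])
print(model.predict(q[None])[0], "vs ground truth", world[0])
```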
+
+
+
+
+ + ☆ A Comparative Study of Neural Surface Reconstruction for Scientific + Visualization + + +
+ This comparative study evaluates various neural surface reconstruction +methods, particularly focusing on their implications for scientific +visualization through reconstructing 3D surfaces via multi-view rendering +images. We categorize ten methods into neural radiance fields and neural +implicit surfaces, uncovering the benefits of leveraging distance functions +(i.e., SDFs and UDFs) to enhance the accuracy and smoothness of the +reconstructed surfaces. Our findings highlight the efficiency and quality of +NeuS2 for reconstructing closed surfaces and identify NeUDF as a promising +candidate for reconstructing open surfaces despite some limitations. By sharing +our benchmark dataset, we invite researchers to test the performance of their +methods, contributing to the advancement of surface reconstruction solutions +for scientific visualization. + +
+
+
+
+
+ + ☆ DeTurb: Atmospheric Turbulence Mitigation with Deformable 3D + Convolutions and 3D Swin Transformers + + +
+ Atmospheric turbulence in long-range imaging significantly degrades the +quality and fidelity of captured scenes due to random variations in both +spatial and temporal dimensions. These distortions present a formidable +challenge across various applications, from surveillance to astronomy, +necessitating robust mitigation strategies. While model-based approaches +achieve good results, they are very slow. Deep learning approaches show promise +in image and video restoration but have struggled to address these +spatiotemporal variant distortions effectively. This paper proposes a new +framework that combines geometric restoration with an enhancement module. +Random perturbations and geometric distortion are removed using a pyramid +architecture with deformable 3D convolutions, resulting in aligned frames. +These frames are then used to reconstruct a sharp, clear image via a +multi-scale architecture of 3D Swin Transformers. The proposed framework +demonstrates superior performance over the state of the art for both synthetic +and real atmospheric turbulence effects, with reasonable speed and model size. + +
+
+
+
+
+ + ☆ NIS-SLAM: Neural Implicit Semantic RGB-D SLAM for 3D Consistent Scene + Understanding + + +
+ In recent years, the paradigm of neural implicit representations has gained +substantial attention in the field of Simultaneous Localization and Mapping +(SLAM). However, a notable gap exists in the existing approaches when it comes +to scene understanding. In this paper, we introduce NIS-SLAM, an efficient +neural implicit semantic RGB-D SLAM system, that leverages a pre-trained 2D +segmentation network to learn consistent semantic representations. +Specifically, for high-fidelity surface reconstruction and spatial consistent +scene understanding, we combine high-frequency multi-resolution +tetrahedron-based features and low-frequency positional encoding as the +implicit scene representations. Besides, to address the inconsistency of 2D +segmentation results from multiple views, we propose a fusion strategy that +integrates the semantic probabilities from previous non-keyframes into +keyframes to achieve consistent semantic learning. Furthermore, we implement a +confidence-based pixel sampling and progressive optimization weight function +for robust camera tracking. Extensive experimental results on various datasets +show the better or more competitive performance of our system when compared to +other existing neural dense implicit RGB-D SLAM approaches. Finally, we also +show that our approach can be used in augmented reality applications. Project +page: +\href{https://zju3dv.github.io/nis_slam}{https://zju3dv.github.io/nis\_slam}. + +
+
+ comment: Accept by TVCG (ISMAR 2024 Journal Track) +
+
+
+
+
+ + ☆ Assessing Graphical Perception of Image Embedding Models using Channel + Effectiveness + + +
+ Recent advancements in vision models have greatly improved their ability to +handle complex chart understanding tasks, like chart captioning and question +answering. However, it remains challenging to assess how these models process +charts. Existing benchmarks only roughly evaluate model performance without +evaluating the underlying mechanisms, such as how models extract image +embeddings. This limits our understanding of the model's ability to perceive +fundamental graphical components. To address this, we introduce a novel +evaluation framework to assess the graphical perception of image embedding +models. For chart comprehension, we examine two main aspects of channel +effectiveness: accuracy and discriminability of various visual channels. +Channel accuracy is assessed through the linearity of embeddings, measuring how +well the perceived magnitude aligns with the size of the stimulus. +Discriminability is evaluated based on the distances between embeddings, +indicating their distinctness. Our experiments with the CLIP model show that it +perceives channel accuracy differently from humans and shows unique +discriminability in channels like length, tilt, and curvature. We aim to +develop this work into a broader benchmark for reliable visual encoders, +enhancing models for precise chart comprehension and human-like perception in +future applications. + +
+
+ comment: In Proceedings of the 2024 IEEE Visualization and Visual Analytics + (VIS) +
+
+
+
+
+ + ☆ DFE-IANet: A Method for Polyp Image Classification Based on Dual-domain + Feature Extraction and Interaction Attention + + +
+ Detecting and treating polyps in the gastrointestinal tract early is helpful in preventing colorectal cancer. However, there have been few studies to date on designing polyp image classification networks that balance efficiency and accuracy. This challenge is mainly attributed to the fact that polyps are similar to other pathologies and have complex features influenced by texture, color, and morphology. In this paper, we propose a novel network, DFE-IANet, based on both spectral transformation and feature interaction. First, to extract detailed and multi-scale features, the features are transformed by the multi-scale frequency-domain feature extraction (MSFD) block to capture texture details at the fine-grained level in the frequency domain. Second, the multi-scale interaction attention (MSIA) block is designed to enhance the network's capability of extracting critical features. This block introduces multi-scale features into self-attention, aiming to adaptively guide the network to concentrate on vital regions. Finally, with only 4M parameters, DFE-IANet outperforms recent and classical networks in terms of efficiency. Furthermore, DFE-IANet achieves state-of-the-art (SOTA) results on the challenging Kvasir dataset, with a remarkable Top-1 accuracy of 93.94%. This accuracy surpasses ViT by 8.94%, ResNet50 by 1.69%, and VMamba by 1.88%. Our code is publicly available at https://github.com/PURSUETHESUN/DFE-IANet.
+
+ comment: This paper has been accepted by 2024 International Conference on + Intelligent Computing.It can be accessed at http://poster-openaccess.com +
+
+
+
+
+ + ☆ Vulnerabilities in AI-generated Image Detection: The Challenge of + Adversarial Attacks + + +
+ Recent advancements in image synthesis, particularly with the advent of GAN and Diffusion models, have amplified public concerns regarding the dissemination of disinformation. To address such concerns, numerous AI-generated Image (AIGI) detectors have been proposed and have achieved promising performance in identifying fake images. However, a systematic understanding of the adversarial robustness of these AIGI detectors is still lacking. In this paper, we examine the vulnerability of state-of-the-art AIGI detectors to adversarial attack under white-box and black-box settings, which has rarely been investigated so far. For the task of AIGI detection, we propose a new attack containing two main parts. First, inspired by the obvious difference between real and fake images in the frequency domain, we add perturbations in the frequency domain to push the image away from its original frequency distribution. Second, we explore the full posterior distribution of the surrogate model to further narrow the gap between heterogeneous models, e.g., when transferring adversarial examples across CNNs and ViTs. This is achieved by introducing a novel post-train Bayesian strategy that turns a single surrogate into a Bayesian one, capable of simulating diverse victim models using one pre-trained surrogate, without the need for re-training. We name our method the frequency-based post-train Bayesian attack (FPBA). Through FPBA, we show that adversarial attacks are a real threat to AIGI detectors, because FPBA can deliver successful black-box attacks across models, generators, and defense methods, and can even evade cross-generator detection, which is a crucial real-world detection scenario.
+
+
+
+
+ + ☆ Federated Knowledge Recycling: Privacy-Preserving Synthetic Data Sharing + + +
+ Federated learning has emerged as a paradigm for collaborative learning, +enabling the development of robust models without the need to centralise +sensitive data. However, conventional federated learning techniques have +privacy and security vulnerabilities due to the exposure of models, parameters +or updates, which can be exploited as an attack surface. This paper presents +Federated Knowledge Recycling (FedKR), a cross-silo federated learning approach +that uses locally generated synthetic data to facilitate collaboration between +institutions. FedKR combines advanced data generation techniques with a dynamic +aggregation process to provide greater security against privacy attacks than +existing methods, significantly reducing the attack surface. Experimental +results on generic and medical datasets show that FedKR achieves competitive +performance, with an average improvement in accuracy of 4.24% compared to +training models from local data, demonstrating particular effectiveness in data +scarcity scenarios. + +
+
+
+
+
+ + ☆ WARM-3D: A Weakly-Supervised Sim2Real Domain Adaptation Framework for + Roadside Monocular 3D Object Detection + + +
+ Existing roadside perception systems are limited by the absence of publicly available, large-scale, high-quality 3D datasets. Exploring the use of cost-effective, extensive synthetic datasets offers a viable solution to this challenge and can enhance the performance of roadside monocular 3D detection. In this study, we introduce the TUMTraf Synthetic Dataset, offering a diverse and substantial collection of high-quality 3D data to augment scarce real-world datasets. Besides, we present WARM-3D, a concise yet effective framework to aid Sim2Real domain transfer for roadside monocular 3D detection. Our method leverages cheap synthetic datasets and 2D labels from an off-the-shelf 2D detector for weak supervision. We show that WARM-3D significantly enhances performance, achieving a +12.40% increase in mAP 3D over the baseline with only pseudo-2D supervision. With 2D ground truth as weak labels, WARM-3D even reaches performance close to the Oracle baseline. Moreover, WARM-3D improves the ability of 3D detectors to recognize unseen samples across various real-world environments, highlighting its potential for practical applications.
+
+
+
+
+ + ☆ SpotFormer: Multi-Scale Spatio-Temporal Transformer for Facial + Expression Spotting + + +
+ Facial expression spotting, identifying periods where facial expressions +occur in a video, is a significant yet challenging task in facial expression +analysis. The issues of irrelevant facial movements and the challenge of +detecting subtle motions in micro-expressions remain unresolved, hindering +accurate expression spotting. In this paper, we propose an efficient framework +for facial expression spotting. First, we propose a Sliding Window-based +Multi-Resolution Optical flow (SW-MRO) feature, which calculates +multi-resolution optical flow of the input image sequence within compact +sliding windows. The window length is tailored to perceive complete +micro-expressions and distinguish between general macro- and micro-expressions. +SW-MRO can effectively reveal subtle motions while avoiding severe head +movement problems. Second, we propose SpotFormer, a multi-scale spatio-temporal +Transformer that simultaneously encodes spatio-temporal relationships of the +SW-MRO features for accurate frame-level probability estimation. In SpotFormer, +our proposed Facial Local Graph Pooling (FLGP) and convolutional layers are +applied for multi-scale spatio-temporal feature extraction. We show the +validity of the architecture of SpotFormer by comparing it with several model +variants. Third, we introduce supervised contrastive learning into SpotFormer +to enhance the discriminability between different types of expressions. +Extensive experiments on SAMM-LV and CAS(ME)^2 show that our method outperforms +state-of-the-art models, particularly in micro-expression spotting. + +
+
+
+
+
+ + ☆ Highly Efficient No-reference 4K Video Quality Assessment with + Full-Pixel Covering Sampling and Training Strategy ACM MM 2024 + + +
+ Deep Video Quality Assessment (VQA) methods have shown impressive +high-performance capabilities. Notably, no-reference (NR) VQA methods play a +vital role in situations where obtaining reference videos is restricted or not +feasible. Nevertheless, as more streaming videos are being created in +ultra-high definition (e.g., 4K) to enrich viewers' experiences, the current +deep VQA methods face unacceptable computational costs. Furthermore, the +resizing, cropping, and local sampling techniques employed in these methods can +compromise the details and content of original 4K videos, thereby negatively +impacting quality assessment. In this paper, we propose a highly efficient and +novel NR 4K VQA technology. Specifically, first, a novel data sampling and +training strategy is proposed to tackle the problem of excessive resolution. +This strategy allows the VQA Swin Transformer-based model to effectively train +and make inferences using the full data of 4K videos on standard consumer-grade +GPUs without compromising content or details. Second, a weighting and scoring +scheme is developed to mimic the human subjective perception mode, which is +achieved by considering the distinct impact of each sub-region within a 4K +frame on the overall perception. Third, we incorporate the frequency domain +information of video frames to better capture the details that affect video +quality, consequently further improving the model's generalizability. To our +knowledge, this is the first technology for the NR 4K VQA task. Thorough +empirical studies demonstrate it not only significantly outperforms existing +methods on a specialized 4K VQA dataset but also achieves state-of-the-art +performance across multiple open-source NR video quality datasets. + +
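+ As a small illustration of a full-pixel covering sampling, the sketch below tiles a 4K frame into non-overlapping patches so that every pixel is used exactly once, instead of resizing or cropping; the patch size is an assumed value, not the paper's configuration.

```python
# Split a 4K frame into a grid of non-overlapping patches covering every pixel once.
import torch

def cover_patches(frame: torch.Tensor, patch: int = 240) -> torch.Tensor:
    """frame: (C, H, W) with H and W divisible by patch -> (N, C, patch, patch)."""
    C, H, W = frame.shape
    tiles = frame.unfold(1, patch, patch).unfold(2, patch, patch)   # (C, H/p, W/p, p, p)
    return tiles.permute(1, 2, 0, 3, 4).reshape(-1, C, patch, patch)

frame = torch.rand(3, 2160, 3840)                 # one 4K video frame
print(cover_patches(frame).shape)                 # (144, 3, 240, 240): all pixels covered
```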
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ SynthVLM: High-Efficiency and High-Quality Synthetic Data for Vision + Language Models + + +
+ Recently, with the rise of web images, managing and understanding large-scale +image datasets has become increasingly important. Vision Large Language Models +(VLLMs) have recently emerged due to their robust vision-understanding +capabilities. However, training these models requires vast amounts of data, +posing challenges to efficiency, effectiveness, data quality, and privacy. In +this paper, we introduce SynthVLM, a novel data synthesis pipeline for VLLMs. +Unlike existing methods that generate captions from images, SynthVLM employs +advanced diffusion models and high-quality captions to automatically generate +and select high-resolution images from captions, creating precisely aligned +image-text pairs. Leveraging these pairs, we achieve state-of-the-art (SoTA) +performance on various vision question answering tasks, maintaining high +alignment quality and preserving advanced language abilities. Moreover, +SynthVLM surpasses traditional GPT-4 Vision-based caption generation methods in +performance while significantly reducing computational overhead. Crucially, our +method's reliance on purely generated data ensures the preservation of privacy, +achieving SoTA performance with just 100k data points (only 18% of the official +dataset size). + +
+
+
+
+
+ + ☆ Re-localization acceleration with Medoid Silhouette Clustering + + +
+ Two crucial performance criteria for the deployment of visual localization are speed and accuracy. Current research on visual localization with neural networks is limited to examining methods for enhancing the accuracy of networks across various datasets. How to expedite the re-localization process within deep neural network architectures still needs further investigation. In this paper, we present a novel approach for accelerating visual re-localization in practice. A tree-like search strategy, built on keyframes extracted by a visual clustering algorithm, is designed to accelerate matching. Our method has been validated on two tasks across three public datasets, allowing for 50 to 90 percent time savings over the baseline without reducing localization accuracy.
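+ The sketch below illustrates the keyframe-based two-stage search described above: pick one medoid per descriptor cluster, match a query against the medoids first, and then only against the winning cluster's members. Descriptor dimensionality, the clustering stand-in, and the cluster count are assumptions for illustration.

```python
# Two-stage re-localization: match against cluster medoids, then inside the best cluster.
import numpy as np
from scipy.spatial.distance import cdist

def medoid_index(X: np.ndarray) -> int:
    """Index of the element minimising the summed distance to all others."""
    return int(cdist(X, X).sum(axis=1).argmin())

def build_index(descriptors: np.ndarray, labels: np.ndarray):
    """labels come from any clustering of the database descriptors."""
    clusters = {c: np.where(labels == c)[0] for c in np.unique(labels)}
    medoids = {c: idx[medoid_index(descriptors[idx])] for c, idx in clusters.items()}
    return clusters, medoids

def relocalize(query: np.ndarray, descriptors: np.ndarray, clusters, medoids) -> int:
    """Nearest medoid first, then nearest neighbour within that medoid's cluster."""
    med_ids = np.array(list(medoids.values()))
    best_c = list(medoids.keys())[cdist(query[None], descriptors[med_ids]).argmin()]
    members = clusters[best_c]
    return int(members[cdist(query[None], descriptors[members]).argmin()])

rng = np.random.default_rng(0)
db = rng.normal(size=(1000, 128))                  # database image descriptors
labels = rng.integers(0, 20, size=1000)            # stand-in for the clustering result
clusters, medoids = build_index(db, labels)
print(relocalize(db[42] + 0.01, db, clusters, medoids))   # index of the retrieved image
```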
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Scene-Specific Trajectory Sets: Maximizing Representation in Motion + Forecasting + + +
+ Representing diverse and plausible future trajectories of actors is crucial +for motion forecasting in autonomous driving. However, efficiently capturing +the true trajectory distribution with a compact set is challenging. In this +work, we propose a novel approach for generating scene-specific trajectory sets +that better represent the diversity and admissibility of future actor behavior. +Our method constructs multiple trajectory sets tailored to different scene +contexts, such as intersections and non-intersections, by leveraging map +information and actor dynamics. We introduce a deterministic goal sampling +algorithm that identifies relevant map regions and generates trajectories +conditioned on the scene layout. Furthermore, we empirically investigate +various sampling strategies and set sizes to optimize the trade-off between +coverage and diversity. Experiments on the Argoverse 2 dataset demonstrate that +our scene-specific sets achieve higher plausibility while maintaining diversity +compared to traditional single-set approaches. The proposed Recursive +In-Distribution Subsampling (RIDS) method effectively condenses the +representation space and outperforms metric-driven sampling in terms of +trajectory admissibility. Our work highlights the benefits of scene-aware +trajectory set generation for capturing the complex and heterogeneous nature of +actor behavior in real-world driving scenarios. + +
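+ As a simplified stand-in for the set-condensation step discussed above, the sketch below greedily selects trajectories so that the worst-covered candidate is always added next (a generic farthest-point coverage heuristic, not the paper's RIDS procedure); the data and set size are illustrative.

```python
# Condense a large candidate trajectory set into a small, well-covering subset.
import numpy as np

def condense(candidates: np.ndarray, set_size: int) -> np.ndarray:
    """candidates: (N, T, 2) trajectories -> indices of a representative subset."""
    flat = candidates.reshape(len(candidates), -1)
    chosen = [0]                                       # seed with an arbitrary trajectory
    d = np.linalg.norm(flat - flat[0], axis=1)         # distance of every candidate to the set
    while len(chosen) < set_size:
        nxt = int(d.argmax())                          # farthest-point (worst-covered) candidate
        chosen.append(nxt)
        d = np.minimum(d, np.linalg.norm(flat - flat[nxt], axis=1))
    return np.array(chosen)

rng = np.random.default_rng(0)
candidates = rng.normal(size=(500, 30, 2)).cumsum(axis=1)   # 500 random-walk trajectories
print(condense(candidates, set_size=32)[:10])
```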
+
+
+
+
+ + ☆ Autogenic Language Embedding for Coherent Point Tracking ACM MM 2024 + + +
+ Point tracking is a challenging task in computer vision, aiming to establish +point-wise correspondence across long video sequences. Recent advancements have +primarily focused on temporal modeling techniques to improve local feature +similarity, often overlooking the valuable semantic consistency inherent in +tracked points. In this paper, we introduce a novel approach leveraging +language embeddings to enhance the coherence of frame-wise visual features +related to the same object. Our proposed method, termed autogenic language +embedding for visual feature enhancement, strengthens point correspondence in +long-term sequences. Unlike existing visual-language schemes, our approach +learns text embeddings from visual features through a dedicated mapping +network, enabling seamless adaptation to various tracking tasks without +explicit text annotations. Additionally, we introduce a consistency decoder +that efficiently integrates text tokens into visual features with minimal +computational overhead. Through enhanced visual consistency, our approach +significantly improves tracking trajectories in lengthy videos with substantial +appearance variations. Extensive experiments on widely-used tracking benchmarks +demonstrate the superior performance of our method, showcasing notable +enhancements compared to trackers relying solely on visual cues. + +
+
+ comment: accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Neural Fields for Continuous Periodic Motion Estimation in 4D + Cardiovascular Imaging + + +
+ Time-resolved three-dimensional flow MRI (4D flow MRI) provides a unique +non-invasive solution to visualize and quantify hemodynamics in blood vessels +such as the aortic arch. However, most current analysis methods for arterial 4D +flow MRI use static artery walls because of the difficulty in obtaining a full +cycle segmentation. To overcome this limitation, we propose a neural +fields-based method that directly estimates continuous periodic wall +deformations throughout the cardiac cycle. For a 3D + time imaging dataset, we +optimize an implicit neural representation (INR) that represents a +time-dependent velocity vector field (VVF). An ODE solver is used to integrate +the VVF into a deformation vector field (DVF), that can deform images, +segmentation masks, or meshes over time, thereby visualizing and quantifying +local wall motion patterns. To properly reflect the periodic nature of 3D + +time cardiovascular data, we impose periodicity in two ways. First, by +periodically encoding the time input to the INR, and hence VVF. Second, by +regularizing the DVF. We demonstrate the effectiveness of this approach on +synthetic data with different periodic patterns, ECG-gated CT, and 4D flow MRI +data. The obtained method could be used to improve 4D flow MRI analysis. + +
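+ A compact sketch of the periodic construction described above: time is encoded with sin/cos harmonics of the cardiac period so the predicted velocity field repeats every cycle, and velocities are integrated into a deformation with forward Euler (the paper uses an ODE solver). Network width and the number of harmonics are illustrative choices.

```python
# Implicit neural representation of a periodic velocity field, integrated into a deformation.
import torch
import torch.nn as nn

class PeriodicVVF(nn.Module):
    def __init__(self, harmonics: int = 4, hidden: int = 128):
        super().__init__()
        self.harmonics = harmonics
        self.net = nn.Sequential(
            nn.Linear(3 + 2 * harmonics, hidden), nn.SiLU(),
            nn.Linear(hidden, hidden), nn.SiLU(),
            nn.Linear(hidden, 3),                 # 3-D velocity at (x, t)
        )

    def encode_time(self, t: torch.Tensor) -> torch.Tensor:
        k = torch.arange(1, self.harmonics + 1, device=t.device, dtype=t.dtype)
        ang = 2 * torch.pi * t[:, None] * k       # t is the cardiac phase in [0, 1)
        return torch.cat([torch.sin(ang), torch.cos(ang)], dim=-1)

    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        return self.net(torch.cat([x, self.encode_time(t)], dim=-1))

def deform(model: PeriodicVVF, x0: torch.Tensor, t1: float, steps: int = 20):
    """Integrate the velocity field from phase 0 to t1 with forward Euler."""
    x, dt = x0.clone(), t1 / steps
    for i in range(steps):
        t = torch.full((x.shape[0],), i * dt, dtype=x.dtype)
        x = x + dt * model(x, t)
    return x

points = torch.rand(1024, 3)                       # e.g. vessel-wall mesh vertices
print(deform(PeriodicVVF(), points, t1=0.5).shape)
```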
+
+ comment: 10 pages, 5 figures, STACOM 2024 +
+
+
+
+
+ + ☆ SceneTeller: Language-to-3D Scene Generation ECCV'24 + + +
+ Designing high-quality indoor 3D scenes is important in many practical applications, such as room planning or game development. Conventionally, this has been a time-consuming process which requires both artistic skill and familiarity with professional software, making it hardly accessible to lay users. However, recent advances in generative AI have established a solid foundation for democratizing 3D design. In this paper, we propose a pioneering approach for text-based 3D room design. Given a prompt in natural language describing the object placement in the room, our method produces a high-quality 3D scene corresponding to it. With an additional text prompt, users can change the appearance of the entire scene or of individual objects in it. Built using in-context learning, CAD model retrieval, and 3D-Gaussian-Splatting-based stylization, our turnkey pipeline produces state-of-the-art 3D scenes, while being easy to use even for novices. Our project page is available at https://sceneteller.github.io/.
+
+ comment: ECCV'24 camera-ready version +
+
+
+
+
+ + ☆ PIP: Prototypes-Injected Prompt for Federated Class Incremental Learning CIKM + + +
+ Federated Class Incremental Learning (FCIL) is a new direction in continual learning (CL) for addressing catastrophic forgetting and non-IID data distribution simultaneously. Existing FCIL methods call for high communication costs and exemplars from previous classes. We propose a novel rehearsal-free method for FCIL named prototypes-injected prompt (PIP) that involves three main ideas: (a) prototype injection on prompt learning, (b) prototype augmentation, and (c) weighted Gaussian aggregation on the server side. Our experimental results show that the proposed method outperforms the current state of the art (SOTA) with a significant improvement (up to 33%) on the CIFAR100, MiniImageNet, and TinyImageNet datasets. Our extensive analysis demonstrates the robustness of PIP across different task sizes and its advantage of requiring fewer participating local clients and fewer global rounds. For further study, the source code of PIP, the baselines, and the experimental logs are shared publicly at https://github.com/anwarmaxsum/PIP.
+
+ comment: Conference on Information and Knowledge Management (CIKM) 2024 + (Accepted) +
+
+
+
+
+ + ☆ Time Series Anomaly Detection with CNN for Environmental Sensors in + Healthcare-IoT + + +
+ This research develops a new method to detect anomalies in time series data using Convolutional Neural Networks (CNNs) in healthcare-IoT. The proposed method simulates a Distributed Denial of Service (DDoS) attack using the IoT network simulator Cooja, which emulates environmental sensors such as temperature and humidity. A CNN then detects anomalies in the resulting time series data, achieving 92% accuracy in identifying possible attacks.
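+ A minimal 1-D CNN of the kind described above, classifying a window of environmental-sensor readings (e.g. temperature and humidity) as normal or anomalous; window length, channel counts, and layer sizes are illustrative assumptions rather than the paper's configuration.

```python
# 1-D CNN classifier for fixed-length windows of multivariate sensor readings.
import torch
import torch.nn as nn

class SensorCNN(nn.Module):
    def __init__(self, in_channels: int = 2, window: int = 64):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(in_channels, 16, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool1d(2),
            nn.Conv1d(16, 32, kernel_size=5, padding=2), nn.ReLU(), nn.MaxPool1d(2),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(), nn.Linear(32 * (window // 4), 64), nn.ReLU(), nn.Linear(64, 2),
        )

    def forward(self, x):                          # x: (batch, channels, window)
        return self.classifier(self.features(x))

model = SensorCNN()
batch = torch.randn(8, 2, 64)                      # 8 windows of temperature + humidity
print(model(batch).shape)                          # (8, 2) normal-vs-attack logits
```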
+
+
+
+
+ + ☆ Boosting Audio Visual Question Answering via Key Semantic-Aware Cues ACM MM 2024 + + +
+ The Audio Visual Question Answering (AVQA) task aims to answer questions +related to various visual objects, sounds, and their interactions in videos. +Such naturally multimodal videos contain rich and complex dynamic audio-visual +components, with only a portion of them closely related to the given questions. +Hence, effectively perceiving audio-visual cues relevant to the given questions +is crucial for correctly answering them. In this paper, we propose a +Temporal-Spatial Perception Model (TSPM), which aims to empower the model to +perceive key visual and auditory cues related to the questions. Specifically, +considering the challenge of aligning non-declarative questions and visual +representations into the same semantic space using visual-language pretrained +models, we construct declarative sentence prompts derived from the question +template, to assist the temporal perception module in better identifying +critical segments relevant to the questions. Subsequently, a spatial perception +module is designed to merge visual tokens from selected segments to highlight +key latent targets, followed by cross-modal interaction with audio to perceive +potential sound-aware areas. Finally, the significant temporal-spatial cues +from these modules are integrated to answer the question. Extensive experiments +on multiple AVQA benchmarks demonstrate that our framework excels not only in +understanding audio-visual scenes but also in answering complex questions +effectively. Code is available at https://github.com/GeWu-Lab/TSPM. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ 3D-GRES: Generalized 3D Referring Expression Segmentation ACM MM 2024 + + +
+ 3D Referring Expression Segmentation (3D-RES) is dedicated to segmenting a +specific instance within a 3D space based on a natural language description. +However, current approaches are limited to segmenting a single target, +restricting the versatility of the task. To overcome this limitation, we +introduce Generalized 3D Referring Expression Segmentation (3D-GRES), which +extends the capability to segment any number of instances based on natural +language instructions. In addressing this broader task, we propose the +Multi-Query Decoupled Interaction Network (MDIN), designed to break down +multi-object segmentation tasks into simpler, individual segmentations. MDIN +comprises two fundamental components: Text-driven Sparse Queries (TSQ) and +Multi-object Decoupling Optimization (MDO). TSQ generates sparse point cloud +features distributed over key targets as the initialization for queries. +Meanwhile, MDO is tasked with assigning each target in multi-object scenarios +to different queries while maintaining their semantic consistency. To adapt to +this new task, we build a new dataset, namely Multi3DRes. Our comprehensive +evaluations on this dataset demonstrate substantial enhancements over existing +models, thus charting a new path for intricate multi-object 3D scene +comprehension. The benchmark and code are available at +https://github.com/sosppxo/3D-GRES. + +
+
+ comment: Accepted by ACM MM 2024 (Oral), Code: + https://github.com/sosppxo/3D-GRES +
+
+
+
+
+ + ☆ DocXPand-25k: a large and diverse benchmark dataset for identity + documents analysis + + +
+ Identity document (ID) image analysis has become essential for many online services, like bank account opening or insurance subscription. In recent years, much research has been conducted on subjects like document localization, text recognition, and fraud detection, to achieve a level of accuracy reliable enough to automate identity verification. However, there are only a few available datasets to benchmark ID analysis methods, mainly because of privacy restrictions, security requirements, and legal reasons.
+ In this paper, we present the DocXPand-25k dataset, which consists of 24,994 richly labeled ID images, generated using custom-made vectorial templates representing nine fictitious ID designs, including four identity card, two residence permit, and three passport designs. These synthetic IDs feature artificially generated personal information (names, dates, identifiers, faces, barcodes, ...), and present a rich diversity in their visual layouts and textual contents.
+ We collected about 5.8k diverse backgrounds from real-world photos, scans, and screenshots of IDs to guarantee background variety. The software we wrote to generate these images has been published (https://github.com/QuickSign/docxpand/) under the terms of the MIT license, and our dataset has been published (https://github.com/QuickSign/docxpand/releases/tag/v1.0.0) under the terms of the CC-BY-NC-SA 4.0 License.
+
+
+
+
+ + ☆ What makes for good morphology representations for spatial omics? + + +
+ Spatial omics has transformed our understanding of tissue architecture by +preserving spatial context of gene expression patterns. Simultaneously, +advances in imaging AI have enabled extraction of morphological features +describing the tissue. The intersection of spatial omics and imaging AI +presents opportunities for a more holistic understanding. In this review we +introduce a framework for categorizing spatial omics-morphology combination +methods, focusing on how morphological features can be translated or integrated +into spatial omics analyses. By translation we mean finding morphological +features that spatially correlate with gene expression patterns with the +purpose of predicting gene expression. Such features can be used to generate +super-resolution gene expression maps or infer genetic information from +clinical H&E-stained samples. By integration we mean finding morphological +features that spatially complement gene expression patterns with the purpose of +enriching information. Such features can be used to define spatial domains, +especially where gene expression has preceded morphological changes and where +morphology remains after gene expression. We discuss learning strategies and +directions for further development of the field. + +
+
+
+
+
+ + ☆ Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks ECCV 2024 + + +
+ Recent vision-language foundation models, such as CLIP, have demonstrated
+superior capabilities in learning representations that can be transferable
+across a diverse range of downstream tasks and domains. With the emergence of
+such powerful models, it has become crucial to effectively leverage their
+capabilities in tackling challenging vision tasks. On the other hand, only a
+few works have focused on devising adversarial examples that transfer well to
+both unknown domains and model architectures. In this paper, we propose a novel
+transfer attack method called PDCL-Attack, which leverages the CLIP model to
+enhance the transferability of adversarial perturbations generated by a
+generative model-based attack framework. Specifically, we formulate an
+effective prompt-driven feature guidance by harnessing the semantic
+representation power of text, particularly from the ground-truth class labels
+of input images. To the best of our knowledge, we are the first to introduce
+prompt learning to enhance transferable generative attacks. Extensive
+experiments conducted across various cross-domain and cross-model settings
+empirically validate our approach, demonstrating its superiority over
+state-of-the-art methods.
+
+
+ comment: Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io +
+
+
+
+
+ + ☆ FACL-Attack: Frequency-Aware Contrastive Learning for Transferable + Adversarial Attacks AAAI 2024 + + +
+ Deep neural networks are known to be vulnerable to security risks due to the
+inherent transferable nature of adversarial examples. Despite the success of
+recent generative model-based attacks demonstrating strong transferability, it
+still remains a challenge to design an efficient attack strategy in a
+real-world strict black-box setting, where both the target domain and model
+architectures are unknown. In this paper, we seek to explore a feature
+contrastive approach in the frequency domain to generate adversarial examples
+that are robust in both cross-domain and cross-model settings. With that goal
+in mind, we propose two modules that are only employed during the training
+phase: a Frequency-Aware Domain Randomization (FADR) module to randomize
+domain-variant low- and high-range frequency components and a
+Frequency-Augmented Contrastive Learning (FACL) module to effectively separate
+domain-invariant mid-frequency features of clean and perturbed images. We
+demonstrate strong transferability of our generated adversarial perturbations
+through extensive cross-domain and cross-model experiments, while keeping the
+inference-time complexity unchanged.
+
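+ The following sketch illustrates the kind of frequency-band decomposition such a
+module could build on: an image is split into low, mid, and high radial-frequency
+bands in the Fourier domain, and only the (domain-variant) low and high bands are
+randomized. The band cutoffs and jitter scale are illustrative assumptions, not the
+paper's settings.

```python
# Illustrative frequency-band split and randomization (assumption-laden sketch;
# the actual FADR module may differ in band definitions and perturbations).
import torch

def split_bands(img: torch.Tensor, low: float = 0.1, high: float = 0.4):
    # img: (C, H, W). Returns (low, mid, high) components via radial masks.
    C, H, W = img.shape
    fy = torch.fft.fftfreq(H).view(H, 1)
    fx = torch.fft.fftfreq(W).view(1, W)
    radius = torch.sqrt(fx ** 2 + fy ** 2)          # (H, W), cycles per pixel
    spec = torch.fft.fft2(img)
    masks = [radius < low, (radius >= low) & (radius < high), radius >= high]
    return [torch.fft.ifft2(spec * m.to(img.dtype)).real for m in masks]

def randomize_domain_variant_bands(img: torch.Tensor, scale: float = 0.2):
    low, mid, high = split_bands(img)
    # Jitter only the low/high (domain-variant) bands; keep the mid band intact.
    low = low * (1 + scale * (2 * torch.rand(1) - 1))
    high = high * (1 + scale * (2 * torch.rand(1) - 1))
    return low + mid + high

x = torch.rand(3, 64, 64)
x_aug = randomize_domain_variant_bands(x)
print(x_aug.shape)  # torch.Size([3, 64, 64])
```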
+
+ comment: Accepted to AAAI 2024, Project Page: https://FACL-Attack.github.io +
+
+
+
+
+ + ☆ Image Re-Identification: Where Self-supervision Meets Vision-Language + Learning + + +
+ Recently, large-scale vision-language pre-trained models like CLIP have shown +impressive performance in image re-identification (ReID). In this work, we +explore whether self-supervision can aid in the use of CLIP for image ReID +tasks. Specifically, we propose SVLL-ReID, the first attempt to integrate +self-supervision and pre-trained CLIP via two training stages to facilitate the +image ReID. We observe that: 1) incorporating language self-supervision in the +first training stage can make the learnable text prompts more distinguishable, +and 2) incorporating vision self-supervision in the second training stage can +make the image features learned by the image encoder more discriminative. These +observations imply that: 1) the text prompt learning in the first stage can +benefit from the language self-supervision, and 2) the image feature learning +in the second stage can benefit from the vision self-supervision. These +benefits jointly facilitate the performance gain of the proposed SVLL-ReID. By +conducting experiments on six image ReID benchmark datasets without any +concrete text labels, we find that the proposed SVLL-ReID achieves the overall +best performances compared with state-of-the-arts. Codes will be publicly +available at https://github.com/BinWangGzhu/SVLL-ReID. + +
+
+
+
+
+ + ☆ Generalizing AI-driven Assessment of Immunohistochemistry across + Immunostains and Cancer Types: A Universal Immunohistochemistry Analyzer + + +
+ Despite advancements in methodologies, immunohistochemistry (IHC) remains the +most utilized ancillary test for histopathologic and companion diagnostics in +targeted therapies. However, objective IHC assessment poses challenges. +Artificial intelligence (AI) has emerged as a potential solution, yet its +development requires extensive training for each cancer and IHC type, limiting +versatility. We developed a Universal IHC (UIHC) analyzer, an AI model for +interpreting IHC images regardless of tumor or IHC types, using training +datasets from various cancers stained for PD-L1 and/or HER2. This multi-cohort +trained model outperforms conventional single-cohort models in interpreting +unseen IHCs (Kappa score 0.578 vs. up to 0.509) and consistently shows superior +performance across different positive staining cutoff values. Qualitative +analysis reveals that UIHC effectively clusters patches based on expression +levels. The UIHC model also quantitatively assesses c-MET expression with MET +mutations, representing a significant advancement in AI application in the era +of personalized medicine and accumulating novel biomarkers. + +
+
+
+
+
+ + ☆ Effectively Leveraging CLIP for Generating Situational Summaries of + Images and Videos + + +
+ Situation recognition refers to the ability of an agent to identify and +understand various situations or contexts based on available information and +sensory inputs. It involves the cognitive process of interpreting data from the +environment to determine what is happening, what factors are involved, and what +actions caused those situations. This interpretation of situations is +formulated as a semantic role labeling problem in computer vision-based +situation recognition. Situations depicted in images and videos hold pivotal +information, essential for various applications like image and video +captioning, multimedia retrieval, autonomous systems and event monitoring. +However, existing methods often struggle with ambiguity and lack of context in +generating meaningful and accurate predictions. Leveraging multimodal models +such as CLIP, we propose ClipSitu, which sidesteps the need for full +fine-tuning and achieves state-of-the-art results in situation recognition and +localization tasks. ClipSitu harnesses CLIP-based image, verb, and role +embeddings to predict nouns fulfilling all the roles associated with a verb, +providing a comprehensive understanding of depicted scenarios. Through a +cross-attention Transformer, ClipSitu XTF enhances the connection between +semantic role queries and visual token representations, leading to superior +performance in situation recognition. We also propose a verb-wise role +prediction model with near-perfect accuracy to create an end-to-end framework +for producing situational summaries for out-of-domain images. We show that +situational summaries empower our ClipSitu models to produce structured +descriptions with reduced ambiguity compared to generic captions. Finally, we +extend ClipSitu to video situation recognition to showcase its versatility and +produce comparable performance to state-of-the-art methods. + +
+
+ comment: 38 pages, 12 figures. arXiv admin note: text overlap with + arXiv:2307.00586 +
+
+
+
+
+ + ☆ Spiking-DD: Neuromorphic Event Camera based Driver Distraction Detection + with Spiking Neural Network + + +
+ Event camera-based driver monitoring is emerging as a pivotal area of
+research, driven by its significant advantages such as rapid response, low
+latency, power efficiency, enhanced privacy, and prevention of undersampling.
+Effective detection of driver distraction is crucial in driver monitoring
+systems to enhance road safety and reduce accident rates. The integration of an
+optimized sensor such as an event camera with an optimized network is essential
+for maximizing these benefits. This paper introduces the innovative concept of
+sensing without seeing to detect driver distraction, leveraging computationally
+efficient spiking neural networks (SNNs). To the best of our knowledge, this
+study is the first to utilize event camera data with spiking neural networks
+for driver distraction detection. The proposed Spiking-DD network not only
+achieves state-of-the-art performance but also uses fewer parameters and
+provides greater accuracy than current event-based methodologies.
+
+
+ comment: Irish Machine Vision and Image Processing Conference (IMVIP) 2024 +
+
+
+
+
+ + ☆ SharkTrack: an accurate, generalisable software for streamlining shark + and ray underwater video analysis + + +
+ Elasmobranchs (sharks and rays) can be important components of marine +ecosystems but are experiencing global population declines. Effective +monitoring of these populations is essential to their protection. Baited Remote +Underwater Video Stations (BRUVS) have been a key tool for monitoring, but +require time-consuming manual analysis. To address these challenges, we +developed SharkTrack, an AI-enhanced BRUVS analysis software. SharkTrack uses +Convolutional Neural Networks and Multi-Object Tracking to detect and track +elasmobranchs and provides an annotation pipeline to manually classify +elasmobranch species and compute MaxN, the standard metric of relative +abundance. We tested SharkTrack on BRUVS footage from locations unseen by the +model during training. SharkTrack computed MaxN with 89% accuracy over 207 +hours of footage. The semi-automatic SharkTrack pipeline required two minutes +of manual classification per hour of video, a 97% reduction of manual BRUVS +analysis time compared to traditional methods, estimated conservatively at one +hour per hour of video. Furthermore, we demonstrate SharkTrack application +across diverse marine ecosystems and elasmobranch species, an advancement +compared to previous models, which were limited to specific species or +locations. SharkTrack applications extend beyond BRUVS analysis, facilitating +rapid annotation of unlabeled videos, aiding the development of further models +to classify elasmobranch species. We provide public access to the software and +an unprecedentedly diverse dataset, facilitating future research in an +important area of marine conservation. + +
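+ MaxN, the relative-abundance metric mentioned above, is simply the maximum number
+of individuals of a species detected in any single frame of a deployment. A minimal
+sketch, with illustrative field names rather than SharkTrack's actual output format:

```python
# MaxN: the maximum count of a species detected in any single frame of a
# BRUVS deployment. Minimal sketch over tracker/classifier output.
from collections import defaultdict

def compute_maxn(detections):
    # detections: iterable of dicts like {"frame": int, "species": str}
    per_frame = defaultdict(lambda: defaultdict(int))
    for det in detections:
        per_frame[det["frame"]][det["species"]] += 1
    maxn = defaultdict(int)
    for frame_counts in per_frame.values():
        for species, count in frame_counts.items():
            maxn[species] = max(maxn[species], count)
    return dict(maxn)

dets = [
    {"frame": 10, "species": "grey reef shark"},
    {"frame": 10, "species": "grey reef shark"},
    {"frame": 11, "species": "grey reef shark"},
    {"frame": 11, "species": "southern stingray"},
]
print(compute_maxn(dets))  # {'grey reef shark': 2, 'southern stingray': 1}
```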
+
+
+
+
+ + ☆ Knowledge Fused Recognition: Fusing Hierarchical Knowledge for Image + Recognition through Quantitative Relativity Modeling and Deep Metric Learning + + +
+ Image recognition is an essential baseline for deep metric learning.
+Hierarchical knowledge about image classes depicts inter-class similarities or
+dissimilarities. Effective fusion of hierarchical knowledge about image classes
+to enhance image recognition remains a challenging topic to advance. In this
+paper, we propose a novel deep metric learning based method to effectively fuse
+hierarchical prior knowledge about image classes and enhance image recognition
+performance in an end-to-end supervised regression manner. Existing deep
+metric learning methods incorporated into image classification mainly exploit
+qualitative relativity between image classes, i.e., whether sampled images are
+from the same class. A new triplet loss function term that exploits quantitative
+relativity and aligns distances in model latent space with those in knowledge
+space is also proposed and incorporated in the proposed dual-modality fusion
+method. Experimental results indicate that the proposed method improved image
+recognition performance and outperformed baseline and existing methods on the
+CIFAR-10, CIFAR-100, Mini-ImageNet, and ImageNet-1K datasets.
+
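+ The abstract does not give the exact form of the loss, but one plausible
+instantiation of such a "quantitative relativity" triplet term is sketched below:
+the gap between anchor-negative and anchor-positive distances in the latent space
+is regressed toward the corresponding gap in the class-hierarchy (knowledge) space.

```python
# One plausible form of a quantitative-relativity triplet term (not the
# authors' exact loss): for a triplet (anchor a, positive p, negative n),
# the latent-distance gap d(a,n) - d(a,p) is pushed toward the gap between
# the corresponding knowledge-space (class hierarchy) distances.
import torch
import torch.nn.functional as F

def quantitative_triplet_loss(za, zp, zn, k_ap, k_an):
    # za/zp/zn: (B, D) embeddings; k_ap/k_an: (B,) hierarchy distances
    d_ap = F.pairwise_distance(za, zp)
    d_an = F.pairwise_distance(za, zn)
    latent_gap = d_an - d_ap
    knowledge_gap = k_an - k_ap
    return F.mse_loss(latent_gap, knowledge_gap)

B, D = 8, 128
loss = quantitative_triplet_loss(torch.randn(B, D), torch.randn(B, D),
                                 torch.randn(B, D),
                                 torch.full((B,), 1.0), torch.full((B,), 3.0))
print(loss.item())
```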
+
+
+
+
+ + ☆ Benchmarking Histopathology Foundation Models for Ovarian Cancer + Bevacizumab Treatment Response Prediction from Whole Slide Images + + +
+ Bevacizumab is a widely studied targeted therapeutic drug used in conjunction +with standard chemotherapy for the treatment of recurrent ovarian cancer. While +its administration has shown to increase the progression-free survival (PFS) in +patients with advanced stage ovarian cancer, the lack of identifiable +biomarkers for predicting patient response has been a major roadblock in its +effective adoption towards personalized medicine. In this work, we leverage the +latest histopathology foundation models trained on large-scale whole slide +image (WSI) datasets to extract ovarian tumor tissue features for predicting +bevacizumab response from WSIs. Our extensive experiments across a combination +of different histopathology foundation models and multiple instance learning +(MIL) strategies demonstrate capability of these large models in predicting +bevacizumab response in ovarian cancer patients with the best models achieving +an AUC score of 0.86 and an accuracy score of 72.5%. Furthermore, our survival +models are able to stratify high- and low-risk cases with statistical +significance (p < 0.05) even among the patients with the aggressive subtype of +high-grade serous ovarian carcinoma. This work highlights the utility of +histopathology foundation models for the task of ovarian bevacizumab response +prediction from WSIs. The high-attention regions of the WSIs highlighted by +these models not only aid the model explainability but also serve as promising +imaging biomarkers for treatment prognosis. + +
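+ As context for the MIL strategies mentioned above, the sketch below shows a generic
+attention-based MIL pooling head over patch embeddings from a histopathology
+foundation model; the feature dimension, hidden size, and head architecture are
+illustrative choices, not the configuration used in the paper.

```python
# Generic attention-MIL pooling over patch embeddings extracted from a WSI by
# a histopathology foundation model (a common MIL formulation; the paper
# compares several strategies, so this is only a representative sketch).
import torch
import torch.nn as nn

class AttentionMIL(nn.Module):
    def __init__(self, feat_dim=1024, hidden=256, n_classes=2):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(feat_dim, hidden), nn.Tanh(),
                                  nn.Linear(hidden, 1))
        self.classifier = nn.Linear(feat_dim, n_classes)

    def forward(self, patch_feats):            # (N_patches, feat_dim)
        a = torch.softmax(self.attn(patch_feats), dim=0)   # (N, 1) attention
        slide_feat = (a * patch_feats).sum(dim=0)          # weighted pooling
        return self.classifier(slide_feat), a.squeeze(-1)  # logits, weights

model = AttentionMIL()
logits, weights = model(torch.randn(500, 1024))  # 500 patches from one slide
print(logits.shape, weights.shape)  # torch.Size([2]) torch.Size([500])
```

+ The per-patch attention weights returned by such a head are also what lets the
+high-attention regions of a slide be inspected as candidate imaging biomarkers.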
+
+
+
+
+ + ☆ EgoSonics: Generating Synchronized Audio for Silent Egocentric Videos + + +
+ We introduce EgoSonics, a method to generate semantically meaningful and +synchronized audio tracks conditioned on silent egocentric videos. Generating +audio for silent egocentric videos could open new applications in virtual +reality, assistive technologies, or for augmenting existing datasets. Existing +work has been limited to domains like speech, music, or impact sounds and +cannot easily capture the broad range of audio frequencies found in egocentric +videos. EgoSonics addresses these limitations by building on the strength of +latent diffusion models for conditioned audio synthesis. We first encode and +process audio and video data into a form that is suitable for generation. The +encoded data is used to train our model to generate audio tracks that capture +the semantics of the input video. Our proposed SyncroNet builds on top of +ControlNet to provide control signals that enables temporal synchronization to +the synthesized audio. Extensive evaluations show that our model outperforms +existing work in audio quality, and in our newly proposed synchronization +evaluation method. Furthermore, we demonstrate downstream applications of our +model in improving video summarization. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Image-based Detection of Segment Misalignment in Multi-mirror Satellites + using Transfer Learning + + +
+ In this paper, we introduce a system based on transfer learning for detecting
+segment misalignment in multi-mirror satellites, such as future CubeSat designs
+and the James Webb Space Telescope (JWST), using image-based methods. When a
+mirror segment becomes misaligned due to various environmental factors, such as
+space debris, the images can become distorted by a shifted copy of the scene,
+called a "ghost image". To detect whether segments are misaligned, we use
+pre-trained, large-scale image models trained on the Fast Fourier Transform
+(FFT) of patches of satellite images in grayscale. Multi-mirror designs can use
+an arbitrary number of mirrors; for our purposes, the tests were performed on
+simulated CubeSats with 4, 6, and 8 segments. The system is designed both to
+detect whether a satellite has a misaligned segment and to estimate how many
+segments are misaligned. The intensity of the ghost image is directly
+proportional to the number of segments misaligned. Models trained for intensity
+classification attempted to classify N-1 segments. Across eight classes, binary
+models were able to achieve a classification accuracy of 98.75%, and models for
+intensity classification were able to achieve an accuracy of 98.05%.
+
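+ A minimal sketch of the preprocessing described above: grayscale patches are mapped
+to log-magnitude FFT spectra before being fed to a pretrained classifier. The patch
+size and the toy "ghost" construction are illustrative assumptions only.

```python
# Sketch of FFT-based preprocessing for ghost-image (misalignment) detection:
# grayscale patches are mapped to log-magnitude spectra, which a pretrained
# image model then classifies. Patch size and normalization are assumptions.
import numpy as np

def fft_feature(patch: np.ndarray) -> np.ndarray:
    # patch: (H, W) grayscale. Returns a centered log-magnitude spectrum.
    spec = np.fft.fftshift(np.fft.fft2(patch))
    mag = np.log1p(np.abs(spec))
    return (mag - mag.min()) / (mag.max() - mag.min() + 1e-8)  # scale to [0, 1]

# A misaligned segment adds a shifted copy ("ghost"); in the Fourier domain
# this multiplies the spectrum by interference fringes whose strength grows
# with the ghost intensity, which is what the classifier can pick up on.
patch = np.random.rand(224, 224)
ghost = 0.3 * np.roll(patch, shift=12, axis=1)   # toy shifted copy
feat = fft_feature(patch + ghost)
print(feat.shape, feat.min(), feat.max())
```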
+
+
+
+
+ + ☆ Monocular Human-Object Reconstruction in the Wild + + +
+ Learning the prior knowledge of the 3D human-object spatial relation is +crucial for reconstructing human-object interaction from images and +understanding how humans interact with objects in 3D space. Previous works +learn this prior from datasets collected in controlled environments, but due to +the diversity of domains, they struggle to generalize to real-world scenarios. +To overcome this limitation, we present a 2D-supervised method that learns the +3D human-object spatial relation prior purely from 2D images in the wild. Our +method utilizes a flow-based neural network to learn the prior distribution of +the 2D human-object keypoint layout and viewports for each image in the +dataset. The effectiveness of the prior learned from 2D images is demonstrated +on the human-object reconstruction task by applying the prior to tune the +relative pose between the human and the object during the post-optimization +stage. To validate and benchmark our method on in-the-wild images, we collect +the WildHOI dataset from the YouTube website, which consists of various +interactions with 8 objects in real-world scenarios. We conduct the experiments +on the indoor BEHAVE dataset and the outdoor WildHOI dataset. The results show +that our method achieves almost comparable performance with fully 3D supervised +methods on the BEHAVE dataset, even if we have only utilized the 2D layout +information, and outperforms previous methods in terms of generality and +interaction diversity on in-the-wild images. + +
+
+ comment: Accepted by MM '24 +
+
+
+
+
+ + ☆ Pyramid Coder: Hierarchical Code Generator for Compositional Visual + Question Answering ICIP + + +
+ Visual question answering (VQA) is the task of providing accurate answers to +natural language questions based on visual input. Programmatic VQA (PVQA) +models have been gaining attention recently. These use large language models +(LLMs) to formulate executable programs that address questions requiring +complex visual reasoning. However, there are challenges in enabling LLMs to +comprehend the usage of image processing modules and generate relevant code. To +overcome these challenges, this paper introduces PyramidCoder, a novel +prompting framework for PVQA models. PyramidCoder consists of three +hierarchical levels, each serving a distinct purpose: query rephrasing, code +generation, and answer aggregation. Notably, PyramidCoder utilizes a single +frozen LLM and pre-defined prompts at each level, eliminating the need for +additional training and ensuring flexibility across various LLM architectures. +Compared to the state-of-the-art PVQA model, our approach improves accuracy by +at least 0.5% on the GQA dataset, 1.4% on the VQAv2 dataset, and 2.9% on the +NLVR2 dataset. + +
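+ A hedged sketch of the three-level prompting flow described above, using a single
+placeholder llm() call at every level; the prompts, the llm() and execute_program()
+interfaces, and the majority-vote aggregation are assumptions for illustration, not
+the released implementation.

```python
# Hedged sketch of a three-level prompting pipeline in the spirit of
# PyramidCoder: rephrase the query, generate a candidate program per
# rephrasing, execute the programs, then aggregate the answers. The `llm`
# and `execute_program` functions are placeholders, not real APIs.
from collections import Counter

def llm(prompt: str) -> str:
    raise NotImplementedError("plug in a single frozen LLM here")

def execute_program(code: str, image) -> str:
    raise NotImplementedError("run the generated VQA program on the image")

def pyramid_answer(question: str, image, n_rephrasings: int = 3) -> str:
    # Level 1: query rephrasing with a fixed prompt and a frozen LLM.
    rephrasings = [llm(f"Rephrase this visual question, variant {i}: {question}")
                   for i in range(n_rephrasings)]
    # Level 2: code generation for each rephrased query.
    programs = [llm(f"Write an image-analysis program answering: {q}")
                for q in rephrasings]
    # Level 3: answer aggregation, here by majority vote over executions.
    answers = [execute_program(p, image) for p in programs]
    return Counter(answers).most_common(1)[0][0]
```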
+
+ comment: Accepted to the IEEE International Conference on Image Processing + (IEEE ICIP) 2024 +
+
+
+
+
+ + ☆ StackFLOW: Monocular Human-Object Reconstruction by Stacked Normalizing + Flow with Offset IJCAI-23 + + +
+ Modeling and capturing the 3D spatial arrangement of the human and the object
+is the key to perceiving 3D human-object interaction from monocular images. In
+this work, we propose to use the Human-Object Offset between anchors which are
+densely sampled from the surfaces of the human mesh and object mesh to represent
+the human-object spatial relation. Compared with previous works which use contact
+maps or implicit distance fields to encode 3D human-object spatial relations, our
+method is a simple and efficient way to encode the highly detailed spatial
+correlation between the human and object. Based on this representation, we
+propose Stacked Normalizing Flow (StackFLOW) to infer the posterior
+distribution of human-object spatial relations from the image. During the
+optimization stage, we finetune the human body pose and object 6D pose by
+maximizing the likelihood of samples based on this posterior distribution and
+minimizing the 2D-3D correspondence reprojection loss. Extensive experimental
+results show that our method achieves impressive results on two challenging
+benchmarks, the BEHAVE and InterCap datasets.
+
+
+ comment: Accepted by IJCAI-23 +
+
+
+
+
+ + ☆ HandDAGT: A Denoising Adaptive Graph Transformer for 3D Hand Pose + Estimation ECCV + + +
+ The extraction of keypoint positions from input hand frames, known as 3D hand +pose estimation, is crucial for various human-computer interaction +applications. However, current approaches often struggle with the dynamic +nature of self-occlusion of hands and intra-occlusion with interacting objects. +To address this challenge, this paper proposes the Denoising Adaptive Graph +Transformer, HandDAGT, for hand pose estimation. The proposed HandDAGT +leverages a transformer structure to thoroughly explore effective geometric +features from input patches. Additionally, it incorporates a novel attention +mechanism to adaptively weigh the contribution of kinematic correspondence and +local geometric features for the estimation of specific keypoints. This +attribute enables the model to adaptively employ kinematic and local +information based on the occlusion situation, enhancing its robustness and +accuracy. Furthermore, we introduce a novel denoising training strategy aimed +at improving the model's robust performance in the face of occlusion +challenges. Experimental results show that the proposed model significantly +outperforms the existing methods on four challenging hand pose benchmark +datasets. Codes and pre-trained models are publicly available at +https://github.com/cwc1260/HandDAGT. + +
+
+ comment: Accepted as a conference paper to European Conference on Computer + Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ High-Resolution Spatial Transcriptomics from Histology Images using + HisToSGE + + +
+ Spatial transcriptomics (ST) is a groundbreaking genomic technology that +enables spatial localization analysis of gene expression within tissue +sections. However, it is significantly limited by high costs and sparse spatial +resolution. An alternative, more cost-effective strategy is to use deep +learning methods to predict high-density gene expression profiles from +histological images. However, existing methods struggle to capture rich image +features effectively or rely on low-dimensional positional coordinates, making +it difficult to accurately predict high-resolution gene expression profiles. To +address these limitations, we developed HisToSGE, a method that employs a +Pathology Image Large Model (PILM) to extract rich image features from +histological images and utilizes a feature learning module to robustly generate +high-resolution gene expression profiles. We evaluated HisToSGE on four ST +datasets, comparing its performance with five state-of-the-art baseline +methods. The results demonstrate that HisToSGE excels in generating +high-resolution gene expression profiles and performing downstream tasks such +as spatial domain identification. All code and public datasets used in this +paper are available at https://github.com/wenwenmin/HisToSGE and +https://zenodo.org/records/12792163. + +
+
+
+
+
+ + ☆ Markers Identification for Relative Pose Estimation of an Uncooperative + Target + + +
+ This paper introduces a novel method using chaser spacecraft image processing +and Convolutional Neural Networks (CNNs) to detect structural markers on the +European Space Agency's (ESA) Environmental Satellite (ENVISAT) for safe +de-orbiting. Advanced image pre-processing techniques, including noise addition +and blurring, are employed to improve marker detection accuracy and robustness. +Initial results show promising potential for autonomous space debris removal, +supporting proactive strategies for space sustainability. The effectiveness of +our approach suggests that our estimation method could significantly enhance +the safety and efficiency of debris removal operations by implementing more +robust and autonomous systems in actual space missions. + +
+
+ comment: 2024 AAS/AIAA Astrodynamics Specialist Conference +
+
+
+
+
+ + ☆ Interpreting and Mitigating Hallucination in MLLMs through Multi-agent + Debate + + +
+ MLLMs often generate outputs that are inconsistent with the visual content, a +challenge known as hallucination. Previous methods focus on determining whether +a generated output is hallucinated, without identifying which image region +leads to the hallucination or interpreting why such hallucinations occur. In +this paper, we argue that hallucination in MLLMs is partially due to a lack of +slow-thinking and divergent-thinking in these models. To address this, we +propose adopting a self-reflection scheme to promote slow-thinking. +Furthermore, we consider eliminating hallucination as a complex reasoning task +and propose a multi-agent debate approach to encourage divergent-thinking. +Consequently, our approach can not only mitigate hallucinations but also +interpret why they occur and detail the specifics of hallucination. In +addition, we propose to distinguish creativity from hallucination in the +context of MLLMs, and illustrate how to evaluate MLLMs' creativity capability. +Extensive experiments on various benchmarks demonstrate that our approach +exhibits generalized hallucinations-mitigating performance across several +MLLMs. + +
+
+
+
+
+ + ☆ Restoring Real-World Degraded Events Improves Deblurring Quality + + +
+ Due to its high speed and low latency, DVS is frequently employed in motion
+deblurring. Ideally, high-quality events would adeptly capture intricate motion
+information. However, real-world events are generally degraded, thereby
+introducing significant artifacts into the deblurred results. In response to
+this challenge, we model the degradation of events and propose RDNet to improve
+the quality of image deblurring. Specifically, we first analyze the mechanisms
+underlying degradation and simulate paired events accordingly. These paired
+events are then fed into the first stage of RDNet for training the
+restoration model. The events restored in this stage serve as a guide for the
+second-stage deblurring process. To better assess the deblurring performance of
+different methods on real-world degraded events, we present a new real-world
+dataset named DavisMCR. This dataset incorporates events with diverse
+degradation levels, collected by manipulating environmental brightness and
+target object contrast. Our experiments are conducted on synthetic datasets
+(GOPRO), real-world datasets (REBlur), and the proposed dataset (DavisMCR). The
+results demonstrate that RDNet outperforms classical event denoising methods in
+event restoration. Furthermore, RDNet exhibits better performance in deblurring
+tasks compared to state-of-the-art methods. DavisMCR is available at
+https://github.com/Yeeesir/DVS_RDNet.
+
+
+
+
+
+ + ☆ Enhancing Quantitative Image Synthesis through Pretraining and + Resolution Scaling for Bone Mineral Density Estimation from a Plain X-ray + Image MICCAI + + +
+ While most vision tasks are essentially visual in nature (for recognition), +some important tasks, especially in the medical field, also require +quantitative analysis (for quantification) using quantitative images. Unlike in +visual analysis, pixel values in quantitative images correspond to physical +metrics measured by specific devices (e.g., a depth image). However, recent +work has shown that it is sometimes possible to synthesize accurate +quantitative values from visual ones (e.g., depth from visual cues or defocus). +This research aims to improve quantitative image synthesis (QIS) by exploring +pretraining and image resolution scaling. We propose a benchmark for evaluating +pretraining performance using the task of QIS-based bone mineral density (BMD) +estimation from plain X-ray images, where the synthesized quantitative image is +used to derive BMD. Our results show that appropriate pretraining can improve +QIS performance, significantly raising the correlation of BMD estimation from +0.820 to 0.898, while others do not help or even hinder it. Scaling-up the +resolution can further boost the correlation up to 0.923, a significant +enhancement over conventional methods. Future work will include exploring more +pretraining strategies and validating them on other image synthesis tasks. + +
+
+ comment: SASHIMI, 2024 (MICCAI workshop). 13 pages, 3 figures +
+
+
+
+
+ + ☆ VITAL: Visual Teleoperation to Enhance Robot Learning through + Human-in-the-Loop Corrections + + +
+ Imitation Learning (IL) has emerged as a powerful approach in robotics, +allowing robots to acquire new skills by mimicking human actions. Despite its +potential, the data collection process for IL remains a significant challenge +due to the logistical difficulties and high costs associated with obtaining +high-quality demonstrations. To address these issues, we propose a low-cost +visual teleoperation system for bimanual manipulation tasks, called VITAL. Our +approach leverages affordable hardware and visual processing techniques to +collect demonstrations, which are then augmented to create extensive training +datasets for imitation learning. We enhance the generalizability and robustness +of the learned policies by utilizing both real and simulated environments and +human-in-the-loop corrections. We evaluated our method through several rounds +of experiments in simulated and real-robot settings, focusing on tasks of +varying complexity, including bottle collecting, stacking objects, and +hammering. Our experimental results validate the effectiveness of our approach +in learning robust robot policies from simulated data, significantly improved +by human-in-the-loop corrections and real-world data integration. Additionally, +we demonstrate the framework's capability to generalize to new tasks, such as +setting a drink tray, showcasing its adaptability and potential for handling a +wide range of real-world bimanual manipulation tasks. A video of the +experiments can be found at: https://youtu.be/YeVAMRqRe64?si=R179xDlEGc7nPu8i + +
+
+
+
+
+ + ☆ TMA-Grid: An open-source, zero-footprint web application for FAIR Tissue + MicroArray De-arraying + + +
+ Background:
+ Tissue Microarrays (TMAs) significantly increase analytical efficiency in
+histopathology and large-scale epidemiologic studies by allowing multiple
+tissue cores to be scanned on a single slide. The individual cores can be
+digitally extracted and then linked to metadata for analysis in a process known
+as de-arraying. However, TMAs often contain core misalignments and artifacts
+due to assembly errors, which can adversely affect the reliability of the
+extracted cores during the de-arraying process. Moreover, conventional
+approaches for TMA de-arraying rely on desktop solutions. Therefore, a robust
+yet flexible de-arraying method is crucial to account for these inaccuracies
+and ensure effective downstream analyses.
+ Results:
+ We developed TMA-Grid, an in-browser, zero-footprint, interactive web
+application for TMA de-arraying. This web application integrates a
+convolutional neural network for precise tissue segmentation and a grid
+estimation algorithm to match each identified core to its expected location.
+The application emphasizes interactivity, allowing users to easily adjust
+segmentation and gridding results. Operating entirely in the web-browser,
+TMA-Grid eliminates the need for downloads or installations and ensures data
+privacy. Adhering to FAIR principles (Findable, Accessible, Interoperable, and
+Reusable), the application and its components are designed for seamless
+integration into TMA research workflows.
+ Conclusions:
+ TMA-Grid provides a robust, user-friendly solution for TMA de-arraying on the
+web. As an open, freely accessible platform, it lays the foundation for
+collaborative analyses of TMAs and similar histopathology imaging data.
+Availability: Web application: https://episphere.github.io/tma-grid Code:
+https://github.com/episphere/tma-grid Tutorial: https://youtu.be/miajqyw4BVk
+
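+ The core-to-slot matching step mentioned in the Results can be illustrated as a
+minimum-cost assignment between detected core centroids and expected grid positions;
+TMA-Grid itself is a JavaScript web application and its gridding algorithm may
+differ, so the Python sketch below is purely illustrative.

```python
# Sketch of matching detected TMA core centroids to expected grid positions
# via minimum-cost assignment over pairwise distances (Hungarian algorithm).
# This only illustrates the matching idea, not TMA-Grid's actual algorithm.
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_cores_to_grid(detected, expected):
    # detected: (N, 2) centroids from segmentation; expected: (M, 2) grid slots
    cost = np.linalg.norm(detected[:, None, :] - expected[None, :, :], axis=-1)
    rows, cols = linear_sum_assignment(cost)      # optimal one-to-one matching
    return list(zip(rows.tolist(), cols.tolist()))

grid = np.array([[x, y] for y in range(3) for x in range(4)], dtype=float) * 100
jitter = np.random.default_rng(0).normal(scale=5, size=grid.shape)
print(match_cores_to_grid(grid + jitter, grid))  # each core maps to its own slot
```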
+
+ comment: NA +
+
+
+
+
+ + ☆ Advancing Vietnamese Visual Question Answering with Transformer and + Convolutional Integration + + +
+ Visual Question Answering (VQA) has recently emerged as a potential research +domain, captivating the interest of many in the field of artificial +intelligence and computer vision. Despite the prevalence of approaches in +English, there is a notable lack of systems specifically developed for certain +languages, particularly Vietnamese. This study aims to bridge this gap by +conducting comprehensive experiments on the Vietnamese Visual Question +Answering (ViVQA) dataset, demonstrating the effectiveness of our proposed +model. In response to community interest, we have developed a model that +enhances image representation capabilities, thereby improving overall +performance in the ViVQA system. Specifically, our model integrates the +Bootstrapping Language-Image Pre-training with frozen unimodal models (BLIP-2) +and the convolutional neural network EfficientNet to extract and process both +local and global features from images. This integration leverages the strengths +of transformer-based architectures for capturing comprehensive contextual +information and convolutional networks for detailed local features. By freezing +the parameters of these pre-trained models, we significantly reduce the +computational cost and training time, while maintaining high performance. This +approach significantly improves image representation and enhances the +performance of existing VQA systems. We then leverage a multi-modal fusion +module based on a general-purpose multi-modal foundation model (BEiT-3) to fuse +the information between visual and textual features. Our experimental findings +demonstrate that our model surpasses competing baselines, achieving promising +performance. This is particularly evident in its accuracy of $71.04\%$ on the +test set of the ViVQA dataset, marking a significant advancement in our +research area. The code is available at https://github.com/nngocson2002/ViVQA. + +
+
+ comment: Accepted at the journal of Computers & Electrical Engineering + (Received 8 March 2024, Revised 8 June 2024, Accepted 10 July 2024) +
+
+
+
+
+ + ☆ DeepBaR: Fault Backdoor Attack on Deep Neural Network Layers + + +
+ Machine Learning using neural networks has received prominent attention +recently because of its success in solving a wide variety of computational +tasks, in particular in the field of computer vision. However, several works +have drawn attention to potential security risks involved with the training and +implementation of such networks. In this work, we introduce DeepBaR, a novel +approach that implants backdoors on neural networks by faulting their behavior +at training, especially during fine-tuning. Our technique aims to generate +adversarial samples by optimizing a custom loss function that mimics the +implanted backdoors while adding an almost non-visible trigger in the image. We +attack three popular convolutional neural network architectures and show that +DeepBaR attacks have a success rate of up to 98.30\%. Furthermore, DeepBaR does +not significantly affect the accuracy of the attacked networks after deployment +when non-malicious inputs are given. Remarkably, DeepBaR allows attackers to +choose an input that looks similar to a given class, from a human perspective, +but that will be classified as belonging to an arbitrary target class. + +
+
+
+
+
+ + ☆ Distribution-Aware Replay for Continual MRI Segmentation + + +
+ Medical image distributions shift constantly due to changes in patient +population and discrepancies in image acquisition. These distribution changes +result in performance deterioration; deterioration that continual learning aims +to alleviate. However, only adaptation with data rehearsal strategies yields +practically desirable performance for medical image segmentation. Such +rehearsal violates patient privacy and, as most continual learning approaches, +overlooks unexpected changes from out-of-distribution instances. To transcend +both of these challenges, we introduce a distribution-aware replay strategy +that mitigates forgetting through auto-encoding of features, while +simultaneously leveraging the learned distribution of features to detect model +failure. We provide empirical corroboration on hippocampus and prostate MRI +segmentation. + +
+
+
+
+
+ + ☆ AI Safety in Practice: Enhancing Adversarial Robustness in Multimodal + Image Captioning KDD 2024 + + +
+ Multimodal machine learning models that combine visual and textual data are +increasingly being deployed in critical applications, raising significant +safety and security concerns due to their vulnerability to adversarial attacks. +This paper presents an effective strategy to enhance the robustness of +multimodal image captioning models against such attacks. By leveraging the Fast +Gradient Sign Method (FGSM) to generate adversarial examples and incorporating +adversarial training techniques, we demonstrate improved model robustness on +two benchmark datasets: Flickr8k and COCO. Our findings indicate that +selectively training only the text decoder of the multimodal architecture shows +performance comparable to full adversarial training while offering increased +computational efficiency. This targeted approach suggests a balance between +robustness and training costs, facilitating the ethical deployment of +multimodal AI systems across various domains. + +
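+ FGSM itself is a standard, well-defined perturbation: x_adv = x + eps * sign(grad_x L).
+The sketch below applies it to a captioning loss; the model and loss objects are
+placeholders, and the epsilon value is illustrative rather than the paper's setting.

```python
# Standard FGSM perturbation applied to an image-captioning loss (generic
# sketch; the model/loss objects are placeholders, not the paper's code).
import torch

def fgsm_example(model, images, captions, caption_loss, eps=2 / 255):
    images = images.clone().detach().requires_grad_(True)
    loss = caption_loss(model(images), captions)  # e.g. token cross-entropy
    loss.backward()
    # x_adv = x + eps * sign(grad_x L), clipped back to the valid pixel range.
    adv = images + eps * images.grad.sign()
    return adv.clamp(0.0, 1.0).detach()
```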
+
+ comment: Accepted into KDD 2024 workshop on Ethical AI +
+
+
+
+
+ + ☆ Embedding Space Selection for Detecting Memorization and Fingerprinting + in Generative Models + + +
+ In the rapidly evolving landscape of artificial intelligence, generative
+models such as Generative Adversarial Networks (GANs) and Diffusion Models have
+become cornerstone technologies, driving innovation in diverse fields from art
+creation to healthcare. Despite their potential, these models face the
+significant challenge of data memorization, which poses risks to privacy and
+the integrity of generated content. Among various metrics of memorization
+detection, our study delves into the memorization scores calculated from
+encoder layer embeddings, which involves measuring distances between samples in
+the embedding spaces. In particular, we find that the memorization scores
+calculated from layer embeddings of Vision Transformers (ViTs) show a notable
+trend: the later (deeper) the layer, the lower the measured memorization. It
+has been found that the memorization scores from the early layers' embeddings
+are more sensitive to low-level memorization (e.g. colors and simple patterns
+for an image), while those from the later layers are more sensitive to
+high-level memorization (e.g. semantic meaning of an image). We also observe
+that, for a specific model architecture, its degree of memorization on
+different levels of information is unique. It can be viewed as an inherent
+property of the architecture. Building upon this insight, we introduce a unique
+fingerprinting methodology. This method capitalizes on the unique distributions
+of the memorization score across different layers of ViTs, providing a novel
+approach to identifying models involved in generating deepfakes and malicious
+content. Our approach demonstrates a marked 30% enhancement in identification
+accuracy over existing baseline methods, offering a more effective tool for
+combating digital misinformation.
+
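+ One generic way to obtain such embedding-space memorization scores is sketched
+below: for each generated sample, take the negated distance to its nearest training
+sample in a given layer's embedding space, so near-copies score higher. The exact
+metric used in the paper may differ; this only illustrates the distance-based idea
+the abstract describes.

```python
# Generic per-layer memorization score: a smaller nearest-neighbour distance
# between a generated sample and the training set (in that layer's embedding
# space) indicates stronger memorization.
import numpy as np

def layer_memorization_scores(gen_embeds, train_embeds):
    # gen_embeds: (G, D), train_embeds: (N, D), both from one encoder layer
    d = np.linalg.norm(gen_embeds[:, None, :] - train_embeds[None, :, :], axis=-1)
    nearest = d.min(axis=1)          # (G,) nearest-neighbour distances
    return -nearest                  # higher score = more memorized

rng = np.random.default_rng(0)
train = rng.normal(size=(1000, 64))
gen = np.concatenate([train[:5] + 0.01 * rng.normal(size=(5, 64)),  # near-copies
                      rng.normal(size=(5, 64))])                    # novel samples
scores = layer_memorization_scores(gen, train)
print(scores[:5].mean(), scores[5:].mean())  # near-copies score higher
```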
+
+
+
+
+ + ☆ PLANesT-3D: A new annotated dataset for segmentation of 3D plant point + clouds + + +
+ Creation of new annotated public datasets is crucial in helping advances in +3D computer vision and machine learning meet their full potential for automatic +interpretation of 3D plant models. In this paper, we introduce PLANesT-3D; a +new annotated dataset of 3D color point clouds of plants. PLANesT-3D is +composed of 34 point cloud models representing 34 real plants from three +different plant species: \textit{Capsicum annuum}, \textit{Rosa kordana}, and +\textit{Ribes rubrum}. Both semantic labels in terms of "leaf" and "stem", and +organ instance labels were manually annotated for the full point clouds. As an +additional contribution, SP-LSCnet, a novel semantic segmentation method that +is a combination of unsupervised superpoint extraction and a 3D point-based +deep learning approach is introduced and evaluated on the new dataset. Two +existing deep neural network architectures, PointNet++ and RoseSegNet were also +tested on the point clouds of PLANesT-3D for semantic segmentation. + +
+
+
+
+
+ + ☆ Domain Shift Analysis in Chest Radiographs Classification in a Veterans + Healthcare Administration Population + + +
+ Objectives: This study aims to assess the impact of domain shift on chest +X-ray classification accuracy and to analyze the influence of ground truth +label quality and demographic factors such as age group, sex, and study year. +Materials and Methods: We used a DenseNet121 model pretrained MIMIC-CXR dataset +for deep learning-based multilabel classification using ground truth labels +from radiology reports extracted using the CheXpert and CheXbert Labeler. We +compared the performance of the 14 chest X-ray labels on the MIMIC-CXR and +Veterans Healthcare Administration chest X-ray dataset (VA-CXR). The VA-CXR +dataset comprises over 259k chest X-ray images spanning between the years 2010 +and 2022. Results: The validation of ground truth and the assessment of +multi-label classification performance across various NLP extraction tools +revealed that the VA-CXR dataset exhibited lower disagreement rates than the +MIMIC-CXR datasets. Additionally, there were notable differences in AUC scores +between models utilizing CheXpert and CheXbert. When evaluating multi-label +classification performance across different datasets, minimal domain shift was +observed in unseen datasets, except for the label "Enlarged Cardiomediastinum." +The study year's subgroup analyses exhibited the most significant variations in +multi-label classification model performance. These findings underscore the +importance of considering domain shifts in chest X-ray classification tasks, +particularly concerning study years. Conclusion: Our study reveals the +significant impact of domain shift and demographic factors on chest X-ray +classification, emphasizing the need for improved transfer learning and +equitable model development. Addressing these challenges is crucial for +advancing medical imaging and enhancing patient care. + +
+
+
+
+
+ + ☆ Adding Multi-modal Controls to Whole-body Human Motion Generation + + +
+ Whole-body multi-modal motion generation, controlled by text, speech, or +music, has numerous applications including video generation and character +animation. However, employing a unified model to accomplish various generation +tasks with different condition modalities presents two main challenges: motion +distribution drifts across different generation scenarios and the complex +optimization of mixed conditions with varying granularity. Furthermore, +inconsistent motion formats in existing datasets further hinder effective +multi-modal motion generation. In this paper, we propose ControlMM, a unified +framework to Control whole-body Multi-modal Motion generation in a +plug-and-play manner. To effectively learn and transfer motion knowledge across +different motion distributions, we propose ControlMM-Attn, for parallel +modeling of static and dynamic human topology graphs. To handle conditions with +varying granularity, ControlMM employs a coarse-to-fine training strategy, +including stage-1 text-to-motion pre-training for semantic generation and +stage-2 multi-modal control adaptation for conditions of varying low-level +granularity. To address existing benchmarks' varying motion format limitations, +we introduce ControlMM-Bench, the first publicly available multi-modal +whole-body human motion generation benchmark based on the unified whole-body +SMPL-X format. Extensive experiments show that ControlMM achieves +state-of-the-art performance across various standard motion generation tasks. +Our website is at https://yxbian23.github.io/ControlMM. + +
+
+
+
+
+ + ☆ Self-supervised Multi-future Occupancy Forecasting for Autonomous + Driving + + +
+ Environment prediction frameworks are critical for the safe navigation of +autonomous vehicles (AVs) in dynamic settings. LiDAR-generated occupancy grid +maps (L-OGMs) offer a robust bird's-eye view for the scene representation, +enabling self-supervised joint scene predictions while exhibiting resilience to +partial observability and perception detection failures. Prior approaches have +focused on deterministic L-OGM prediction architectures within the grid cell +space. While these methods have seen some success, they frequently produce +unrealistic predictions and fail to capture the stochastic nature of the +environment. Additionally, they do not effectively integrate additional sensor +modalities present in AVs. Our proposed framework performs stochastic L-OGM +prediction in the latent space of a generative architecture and allows for +conditioning on RGB cameras, maps, and planned trajectories. We decode +predictions using either a single-step decoder, which provides high-quality +predictions in real-time, or a diffusion-based batch decoder, which can further +refine the decoded frames to address temporal consistency issues and reduce +compression losses. Our experiments on the nuScenes and Waymo Open datasets +show that all variants of our approach qualitatively and quantitatively +outperform prior approaches. + +
+
+
+
+
+ + ☆ Taming the Frequency Factory of Sinusoidal Networks + + +
+ This work investigates the structure and representation capacity of +$sinusoidal$ MLPs, which have recently shown promising results in encoding +low-dimensional signals. This success can be attributed to its smoothness and +high representation capacity. The first allows the use of the network's +derivatives during training, enabling regularization. However, defining the +architecture and initializing its parameters to achieve a desired capacity +remains an empirical task. This work provides theoretical and experimental +results justifying the capacity property of sinusoidal MLPs and offers control +mechanisms for their initialization and training. + We approach this from a Fourier series perspective and link the training with +the model's spectrum. Our analysis is based on a $harmonic$ expansion of the +sinusoidal MLP, which says that the composition of sinusoidal layers produces a +large number of new frequencies expressed as integer linear combinations of the +input frequencies (weights of the input layer). We use this novel $identity$ to +initialize the input neurons which work as a sampling in the signal spectrum. +We also note that each hidden neuron produces the same frequencies with +amplitudes completely determined by the hidden weights. Finally, we give an +upper bound for these amplitudes, which results in a $bounding$ scheme for the +network's spectrum during training. + +
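+ The harmonic-expansion claim can be checked numerically: a two-layer composition
+y(x) = sin(a*sin(w*x)) contains energy only at odd integer multiples of the input
+frequency w (by the Jacobi-Anger expansion), which appears as discrete FFT peaks.
+The frequency and amplitude values below are arbitrary illustration choices.

```python
# Numerical illustration of the harmonic-expansion property: a two-layer
# sinusoidal network y(x) = sin(a * sin(w * x)) has spectral energy only at
# odd integer multiples of the input frequency w, visible as discrete peaks.
import numpy as np

w, a = 3.0, 1.5                        # input frequency and hidden amplitude
x = np.linspace(0, 2 * np.pi, 4096, endpoint=False)
y = np.sin(a * np.sin(w * x))

spectrum = np.abs(np.fft.rfft(y)) / len(x)
freqs = np.fft.rfftfreq(len(x), d=x[1] - x[0]) * 2 * np.pi  # angular frequency
peaks = freqs[spectrum > 1e-3]
print(np.round(peaks, 2))              # ~[3., 9., 15.] = odd multiples of w
```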
+
+
+
+
+ + ♻ ☆ Mixture of Nested Experts: Adaptive Processing of Visual Tokens + + +
+ The visual medium (images and videos) naturally contains a large amount of +information redundancy, thereby providing a great opportunity for leveraging +efficiency in processing. While Vision Transformer (ViT) based models scale +effectively to large data regimes, they fail to capitalize on this inherent +redundancy, leading to higher computational costs. Mixture of Experts (MoE) +networks demonstrate scalability while maintaining same inference-time costs, +but they come with a larger parameter footprint. We present Mixture of Nested +Experts (MoNE), which utilizes a nested structure for experts, wherein +individual experts fall on an increasing compute-accuracy curve. Given a +compute budget, MoNE learns to dynamically choose tokens in a priority order, +and thus redundant tokens are processed through cheaper nested experts. Using +this framework, we achieve equivalent performance as the baseline models, while +reducing inference time compute by over two-fold. We validate our approach on +standard image and video datasets - ImageNet-21K, Kinetics400, and +Something-Something-v2. We further highlight MoNE$'$s adaptability by +showcasing its ability to maintain strong performance across different +inference-time compute budgets on videos, using only a single trained model. + +
+
+
+
+
+ + ♻ ☆ Differentiable Voxelization and Mesh Morphing + + +
+ In this paper, we propose the differentiable voxelization of 3D meshes via +the winding number and solid angles. The proposed approach achieves fast, +flexible, and accurate voxelization of 3D meshes, admitting the computation of +gradients with respect to the input mesh and GPU acceleration. We further +demonstrate the application of the proposed voxelization in mesh morphing, +where the voxelized mesh is deformed by a neural network. The proposed method +is evaluated on the ShapeNet dataset and achieves state-of-the-art performance +in terms of both accuracy and efficiency. + +
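+ For context, the generalized winding number of a triangle mesh at a query point is
+the sum of the signed solid angles subtended by its triangles divided by 4*pi
+(Van Oosterom and Strackee). The minimal sketch below implements that occupancy test
+for a toy tetrahedron; the paper's batched, differentiable, GPU-accelerated
+formulation is not reproduced here.

```python
# Minimal winding-number occupancy test: for a query point q, sum the signed
# solid angles subtended by every triangle and divide by 4*pi; values near 1
# mean "inside", near 0 mean "outside".
import numpy as np

def solid_angle(tri, q):
    # Van Oosterom & Strackee formula for the signed solid angle of a triangle.
    a, b, c = (v - q for v in tri)
    la, lb, lc = (np.linalg.norm(v) for v in (a, b, c))
    numerator = np.dot(a, np.cross(b, c))
    denom = la * lb * lc + np.dot(a, b) * lc + np.dot(b, c) * la + np.dot(c, a) * lb
    return 2.0 * np.arctan2(numerator, denom)

def winding_number(triangles, q):
    return sum(solid_angle(t, q) for t in triangles) / (4.0 * np.pi)

# Toy watertight mesh: a tetrahedron with outward-oriented faces.
v = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
faces = [(0, 2, 1), (0, 1, 3), (0, 3, 2), (1, 2, 3)]
tris = [v[list(f)] for f in faces]
print(round(winding_number(tris, np.array([0.2, 0.2, 0.2])), 3))  # ~1.0 inside
print(round(winding_number(tris, np.array([2.0, 2.0, 2.0])), 3))  # ~0.0 outside
```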
+
+
+
+
+ + ♻ ☆ Surgical Text-to-Image Generation + + +
+ Acquiring surgical data for research and development is significantly +hindered by high annotation costs and practical and ethical constraints. +Utilizing synthetically generated images could offer a valuable alternative. In +this work, we explore adapting text-to-image generative models for the surgical +domain using the CholecT50 dataset, which provides surgical images annotated +with action triplets (instrument, verb, target). We investigate several +language models and find T5 to offer more distinct features for differentiating +surgical actions on triplet-based textual inputs, and showcasing stronger +alignment between long and triplet-based captions. To address challenges in +training text-to-image models solely on triplet-based captions without +additional inputs and supervisory signals, we discover that triplet text +embeddings are instrument-centric in the latent space. Leveraging this insight, +we design an instrument-based class balancing technique to counteract data +imbalance and skewness, improving training convergence. Extending Imagen, a +diffusion-based generative model, we develop Surgical Imagen to generate +photorealistic and activity-aligned surgical images from triplet-based textual +prompts. We assess the model on quality, alignment, reasoning, and knowledge, +achieving FID and CLIP scores of 3.7 and 26.8% respectively. Human expert +survey shows that participants were highly challenged by the realistic +characteristics of the generated samples, demonstrating Surgical Imagen's +effectiveness as a practical alternative to real data collection. + +
+
+ comment: 11 pages, 11 figures, 3 tables, project page at + https://camma-public.github.io/endogen/ +
+
+
+
+
+ + ♻ ☆ On the Exploitation of DCT-Traces in the Generative-AI Domain + + +
+ Deepfakes represent one of the toughest challenges in the world of +Cybersecurity and Digital Forensics, especially considering the high-quality +results obtained with recent generative AI-based solutions. Almost all +generative models leave unique traces in synthetic data that, if analyzed and +identified in detail, can be exploited to improve the generalization +limitations of existing deepfake detectors. In this paper we analyzed deepfake +images in the frequency domain generated by both GAN and Diffusion Model +engines, examining in detail the underlying statistical distribution of +Discrete Cosine Transform (DCT) coefficients. Recognizing that not all +coefficients contribute equally to image detection, we hypothesize the +existence of a unique ``discriminative fingerprint", embedded in specific +combinations of coefficients. To identify them, Machine Learning classifiers +were trained on various combinations of coefficients. In addition, the +Explainable AI (XAI) LIME algorithm was used to search for intrinsic +discriminative combinations of coefficients. Finally, we performed a robustness +test to analyze the persistence of traces by applying JPEG compression. The +experimental results reveal the existence of traces left by the generative +models that are more discriminative and persistent at JPEG attacks. Code and +dataset are available at https://github.com/opontorno/dcts_analysis_deepfakes. + +
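+ A sketch of the kind of block-wise DCT statistics such coefficient-combination
+classifiers can be trained on; the block size, the chosen coefficient subset, and
+the summary statistics are illustrative assumptions, not the paper's configuration.

```python
# Sketch of DCT-trace features: split a grayscale image into 8x8 blocks, take
# the 2D DCT of each block, and summarize selected coefficient positions by
# their mean and standard deviation across blocks.
import numpy as np
from scipy.fft import dctn

def dct_block_stats(img, block=8, coeffs=((0, 1), (1, 0), (1, 1))):
    H, W = img.shape
    blocks = [dctn(img[i:i + block, j:j + block], norm="ortho")
              for i in range(0, H - block + 1, block)
              for j in range(0, W - block + 1, block)]
    blocks = np.stack(blocks)                    # (n_blocks, block, block)
    feats = []
    for (u, v) in coeffs:
        vals = blocks[:, u, v]
        feats += [vals.mean(), vals.std()]
    return np.array(feats)  # feed to e.g. a gradient-boosted classifier

img = np.random.rand(64, 64)
print(dct_block_stats(img))
```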
+
+
+
+
+ + ♻ ☆ Rethinking Radiology Report Generation via Causal Inspired + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as a vision-and-language
+interaction task in the biomedical field. Previous works inherited the ideology
+of traditional language generation tasks, aiming to generate paragraphs with
+high readability as reports. Despite significant progress, the independence
+between diseases, a specific property of RRG, was neglected, leaving models
+confused by the co-occurrence of diseases brought on by the biased data
+distribution and thus generating inaccurate reports. In this paper, to rethink
+this issue, we first model the causal effects between the variables from a
+causal perspective, through which we prove that the co-occurrence relationships
+between diseases on the biased distribution function as confounders, degrading
+accuracy through two backdoor paths, i.e. the Joint Vision Coupling and the
+Conditional Sequential Coupling. Then, we propose a novel model-agnostic
+counterfactual augmentation method that contains two strategies, i.e. the
+Prototype-based Counterfactual Sample Synthesis (P-CSS) and the Magic-Cube-like
+Counterfactual Report Reconstruction (Cube), to intervene on the backdoor paths,
+thus enhancing the accuracy and generalization of RRG models. Experimental
+results on the widely used MIMIC-CXR dataset demonstrate the effectiveness of
+our proposed method. Additionally, generalization performance is evaluated on
+the IU X-Ray dataset, which verifies that our method can effectively reduce the
+impact of co-occurrences caused by different distributions on the results.
+
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ♻ ☆ vSHARP: variable Splitting Half-quadratic Admm algorithm for + Reconstruction of inverse-Problems + + +
+ Medical Imaging (MI) tasks, such as accelerated parallel Magnetic Resonance +Imaging (MRI), often involve reconstructing an image from noisy or incomplete +measurements. This amounts to solving ill-posed inverse problems, where a +satisfactory closed-form analytical solution is not available. Traditional +methods such as Compressed Sensing (CS) in MRI reconstruction can be +time-consuming or prone to obtaining low-fidelity images. Recently, a plethora +of Deep Learning (DL) approaches have demonstrated superior performance in +inverse-problem solving, surpassing conventional methods. In this study, we +propose vSHARP (variable Splitting Half-quadratic ADMM algorithm for +Reconstruction of inverse Problems), a novel DL-based method for solving +ill-posed inverse problems arising in MI. vSHARP utilizes the Half-Quadratic +Variable Splitting method and employs the Alternating Direction Method of +Multipliers (ADMM) to unroll the optimization process. For data consistency, +vSHARP unrolls a differentiable gradient descent process in the image domain, +while a DL-based denoiser, such as a U-Net architecture, is applied to enhance +image quality. vSHARP also employs a dilated-convolution DL-based model to +predict the Lagrange multipliers for the ADMM initialization. We evaluate +vSHARP on tasks of accelerated parallel MRI Reconstruction using two distinct +datasets and on accelerated parallel dynamic MRI Reconstruction using another +dataset. Our comparative analysis with state-of-the-art methods demonstrates +the superior performance of vSHARP in these applications. + +
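+ Schematically, one unrolled half-quadratic-splitting / ADMM iteration for
+min_x ||Ax - y||^2 + R(x) alternates a learned denoiser z-step, a gradient-descent
+data-consistency x-step, and a Lagrange-multiplier update. The sketch below is a
+generic version of that loop; the forward operator, denoiser, step sizes, and
+initialization are placeholders and are simpler than vSHARP's actual design.

```python
# Schematic unrolled ADMM / half-quadratic variable-splitting loop for an
# imaging inverse problem  min_x ||A x - y||^2 + R(x). The denoiser, forward
# operator A, and all step sizes are placeholders; vSHARP's architecture
# (U-Net denoiser, learned multiplier initializer) is more elaborate.
import torch

def unrolled_admm(y, A, A_T, denoiser, n_iters=8, rho=0.5, lr=0.1, n_grad=4):
    x = A_T(y)                       # crude initial reconstruction
    z = x.clone()
    u = torch.zeros_like(x)          # scaled Lagrange multipliers
    for _ in range(n_iters):
        z = denoiser(x + u)                           # z-step: learned prior
        for _ in range(n_grad):                       # x-step: data consistency
            grad = A_T(A(x) - y) + rho * (x - z + u)
            x = x - lr * grad
        u = u + x - z                                 # multiplier update
    return x

# Toy usage with an identity "forward operator" and a soft-shrinkage denoiser:
A = lambda x: x
denoiser = lambda x: torch.sign(x) * torch.clamp(x.abs() - 0.05, min=0.0)
y = torch.randn(1, 1, 32, 32)
print(unrolled_admm(y, A, A, denoiser).shape)
```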
+
+ comment: 22 pages, 9 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Ponymation: Learning 3D Animal Motions from Unlabeled Online Videos + + +
+ We introduce Ponymation, a new method for learning a generative model of +articulated 3D animal motions from raw, unlabeled online videos. Unlike +existing approaches for motion synthesis, our model does not require any pose +annotations or parametric shape models for training, and is learned purely from +a collection of raw video clips obtained from the Internet. We build upon a +recent work, MagicPony, which learns articulated 3D animal shapes purely from +single image collections, and extend it on two fronts. First, instead of +training on static images, we augment the framework with a video training +pipeline that incorporates temporal regularizations, achieving more accurate +and temporally consistent reconstructions. Second, we learn a generative model +of the underlying articulated 3D motion sequences via a spatio-temporal +transformer VAE, simply using 2D reconstruction losses without relying on any +explicit pose annotations. At inference time, given a single 2D image of a new +animal instance, our model reconstructs an articulated, textured 3D mesh, and +generates plausible 3D animations by sampling from the learned motion latent +space. + +
+
+ comment: Project page: https://keqiangsun.github.io/projects/ponymation. The + first two authors contributed equally to this work. The last two authors + contributed equally +
+
+
+
+
+ + ♻ ☆ SceneX:Procedural Controllable Large-scale Scene Generation via + Large-language Models + + +
+ Due to its great application potential, large-scale scene generation has +drawn extensive attention in academia and industry. Recent research employs +powerful generative models to create desired scenes and achieves promising +results. However, most of these methods represent the scene using 3D primitives +(e.g. point cloud or radiance field) incompatible with the industrial pipeline, +which leads to a substantial gap between academic research and industrial +deployment. Procedural Controllable Generation (PCG) is an efficient technique +for creating scalable and high-quality assets, but it is unfriendly for +ordinary users as it demands profound domain expertise. To address these +issues, we resort to using large language models (LLMs) to drive the +procedural modeling. In this paper, we introduce a large-scale scene generation +framework, SceneX, which can automatically produce high-quality procedural +models according to designers' textual descriptions. Specifically, the proposed +method comprises two components, PCGBench and PCGPlanner. The former +encompasses an extensive collection of accessible procedural assets and +thousands of hand-crafted API documents. The latter aims to generate executable +actions for Blender to produce controllable and precise 3D assets guided by the +user's instructions. Our SceneX can generate a city spanning 2.5 km x 2.5 +km with a delicate layout and geometric structures, drastically reducing the time +cost from several weeks for professional PCG engineers to just a few hours for +an ordinary user. Extensive experiments demonstrate the capability of our +method in controllable large-scale scene generation and editing, including +asset placement and season translation. + +
+
+
+
+
+ + ♻ ☆ SpaER: Learning Spatio-temporal Equivariant Representations for Fetal + Brain Motion Tracking + + +
+ In this paper, we introduce SpaER, a pioneering method for fetal motion +tracking that leverages equivariant filters and self-attention mechanisms to +effectively learn spatio-temporal representations. Different from conventional +approaches that statically estimate fetal brain motions from pairs of images, +our method dynamically tracks the rigid movement patterns of the fetal head +across temporal and spatial dimensions. Specifically, we first develop an +equivariant neural network that efficiently learns rigid motion sequences +through low-dimensional spatial representations of images. Subsequently, we +learn spatio-temporal representations by incorporating time encoding and +self-attention neural network layers. This approach allows for the capture of +long-term dependencies of fetal brain motion and addresses alignment errors due +to contrast changes and severe motion artifacts. Our model also provides a +geometric deformation estimation that properly addresses image distortions +among all time frames. To the best of our knowledge, our approach is the first +to learn spatial-temporal representations via deep neural networks for fetal +motion tracking without data augmentation. We validated our model using real +fetal echo-planar images with simulated and real motions. Our method carries +significant potential value in accurately measuring, tracking, and correcting +fetal motion in fetal MRI sequences. + +
+
+
+
+
+ + ♻ ☆ JSSL: Joint Supervised and Self-supervised Learning for MRI + Reconstruction + + +
+ Purpose: MRI represents an important diagnostic modality; however, its +inherently slow acquisition process poses challenges in obtaining fully-sampled +k-space data under motion. In the absence of fully-sampled acquisitions, +serving as ground truths, training deep learning algorithms in a supervised +manner to predict the underlying ground truth image becomes challenging. To +address this limitation, self-supervised methods have emerged as a viable +alternative, leveraging available subsampled k-space data to train deep neural +networks for MRI reconstruction. Nevertheless, these approaches often fall +short when compared to supervised methods. + Methods: We propose Joint Supervised and Self-supervised Learning (JSSL), a +novel training approach for deep learning-based MRI reconstruction algorithms +aimed at enhancing reconstruction quality in cases where target datasets +containing fully-sampled k-space measurements are unavailable. JSSL operates by +simultaneously training a model in a self-supervised learning setting, using +subsampled data from the target dataset(s), and in a supervised learning +manner, utilizing datasets with fully-sampled k-space data, referred to as +proxy datasets. We demonstrate JSSL's efficacy using subsampled prostate or +cardiac MRI data as the target datasets, with fully-sampled brain and knee, or +brain, knee and prostate k-space acquisitions, respectively, as proxy datasets. + Results: Our results showcase substantial improvements over conventional +self-supervised methods, validated using common image quality metrics. +Furthermore, we provide theoretical motivations for JSSL and establish +rule-of-thumb guidelines for training MRI reconstruction models. + Conclusion: JSSL effectively enhances MRI reconstruction quality in scenarios +where fully-sampled k-space data is not available, leveraging the strengths of +supervised learning by incorporating proxy datasets. + +
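For intuition only, the sketch below shows one way a joint supervised/self-supervised objective of this kind could be combined in a single training step; the L1 losses, the weighting `lam`, and the assumption that the target batch is pre-split into input and reference subsampled subsets are placeholder choices, not the paper's exact formulation.

```python
# Hypothetical joint step: supervised loss on a proxy batch plus self-supervised loss on a target batch.
import torch
import torch.nn.functional as F

def jssl_step(model, proxy_batch, target_batch, optimizer, lam: float = 1.0):
    x_proxy, y_proxy = proxy_batch            # proxy data with fully-sampled ground truth
    x_target, ref_target = target_batch       # target data split into disjoint subsampled subsets
    optimizer.zero_grad()
    sup_loss = F.l1_loss(model(x_proxy), y_proxy)        # supervised term (proxy datasets)
    self_loss = F.l1_loss(model(x_target), ref_target)   # self-supervised term (target dataset)
    loss = sup_loss + lam * self_loss
    loss.backward()
    optimizer.step()
    return float(loss)
```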
+
+ comment: pages, 14 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ From Majority to Minority: A Diffusion-based Augmentation for + Underrepresented Groups in Skin Lesion Analysis + + +
+ AI-based diagnoses have demonstrated dermatologist-level performance in +classifying skin cancer. However, such systems are prone to under-performing +when tested on data from minority groups that lack sufficient representation in +the training sets. Although data collection and annotation offer the best means +for promoting minority groups, these processes are costly and time-consuming. +Prior works have suggested that data from majority groups may serve as a +valuable information source to supplement the training of diagnosis tools for +minority groups. In this work, we propose an effective diffusion-based +augmentation framework that maximizes the use of rich information from majority +groups to benefit minority groups. Using groups with different skin types as a +case study, our results show that the proposed framework can generate synthetic +images that improve diagnostic results for the minority groups, even when there +is little or no reference data from these target groups. The practical value of +our work is evident in medical imaging analysis, where under-diagnosis persists +as a problem for certain groups due to insufficient representation. + +
+
+
+
+
+ + ♻ ☆ KI-PMF: Knowledge Integrated Plausible Motion Forecasting + + +
+ Accurately forecasting the motion of traffic actors is crucial for the +deployment of autonomous vehicles at a large scale. Current trajectory +forecasting approaches primarily concentrate on optimizing a loss function with +a specific metric, which can result in predictions that violate +physical laws or external constraints. Our objective is to incorporate +explicit knowledge priors that allow a network to forecast future trajectories +in compliance with both the kinematic constraints of a vehicle and the geometry +of the driving environment. To achieve this, we introduce a non-parametric +pruning layer and attention layers to integrate the defined knowledge priors. +Our proposed method is designed to ensure reachability guarantees for traffic +actors in both complex and dynamic situations. By conditioning the network to +follow physical laws, we can obtain accurate and safe predictions, essential +for maintaining autonomous vehicles' safety and efficiency in real-world +settings. In summary, this paper presents concepts that prevent off-road +predictions for safe and reliable motion forecasting by incorporating knowledge +priors into the training process. + +
+
+
+
+
+ + ♻ ☆ Structure Unbiased Adversarial Model for Medical Image Segmentation + + +
+ Generative models have been widely proposed in image recognition to generate +images whose distribution is similar to that of the real ones. Such methods +often introduce a discriminator network to differentiate the real data from +the generated ones; in style transfer, this discriminator is tasked with +differentiating style-transferred data from data contained in the target +dataset. However, in doing so, the network focuses on discrepancies in the +intensity distribution and may overlook structural differences between the +datasets. In this paper, we formulate a new image-to-image translation problem +to ensure that the structure of the generated images is similar to that in the +target dataset. We propose a simple yet powerful Structure-Unbiased +Adversarial (SUA) network which accounts for both intensity and structural +differences between the training and test sets when performing image +segmentation. It consists of a spatial transformation block followed by an +intensity distribution rendering module. The spatial transformation block is +proposed to reduce the structure gap between the two images, and also produces +an inverse deformation field to warp the final segmented image back. The +intensity distribution rendering module then renders the deformed structure to +an image with the target intensity distribution. Experimental results show that +the proposed SUA method has the capability to transfer both intensity +distribution and structural content between multiple datasets. + +
+
+ comment: Will revise the paper and resubmit +
+
+
+
+
+ + ♻ ☆ Look Around and Learn: Self-Training Object Detection by Exploration ECCV2024 + + +
+ When an object detector is deployed in a novel setting it often experiences a +drop in performance. This paper studies how an embodied agent can automatically +fine-tune a pre-existing object detector while exploring and acquiring images +in a new environment without relying on human intervention, i.e., a fully +self-supervised approach. In our setting, an agent initially learns to explore +the environment using a pre-trained off-the-shelf detector to locate objects +and associate pseudo-labels. By assuming that pseudo-labels for the same object +must be consistent across different views, we learn the exploration policy Look +Around to mine hard samples, and we devise a novel mechanism called +Disagreement Reconciliation for producing refined pseudo-labels from the +consensus among observations. We implement a unified benchmark of the current +state-of-the-art and compare our approach with pre-existing exploration +policies and perception mechanisms. Our method is shown to outperform existing +approaches, improving the object detector by 6.2% in a simulated scenario, a +3.59% advancement over other state-of-the-art methods, and by 9.97% in the real +robotic test without relying on ground-truth. Code for the proposed approach +and baselines are available at +https://iit-pavis.github.io/Look_Around_And_Learn/. + +
+
+ comment: Paper accepted at ECCV2024 +
+
+
+
+
+ + ♻ ☆ Semantic Image Synthesis via Class-Adaptive Cross-Attention + + +
+ In semantic image synthesis the state of the art is dominated by methods that +use customized variants of the SPatially-Adaptive DE-normalization (SPADE) +layers, which allow for good visual generation quality and editing versatility. +By design, such layers learn pixel-wise modulation parameters to de-normalize +the generator activations based on the semantic class each pixel belongs to. +Thus, they tend to overlook global image statistics, ultimately leading to +unconvincing local style editing and causing global inconsistencies such as +color or illumination distribution shifts. Also, SPADE layers require the +semantic segmentation mask for mapping styles in the generator, preventing +shape manipulations without manual intervention. In response, we designed a +novel architecture where cross-attention layers are used in place of SPADE for +learning shape-style correlations and so conditioning the image generation +process. Our model inherits the versatility of SPADE, at the same time +obtaining state-of-the-art generation quality, as well as improved global and +local style transfer. Code and models available at +https://github.com/TFonta/CA2SIS. + +
+
+ comment: Code and models available at https://github.com/TFonta/CA2SIS The + paper is under consideration at Computer Vision and Image Understanding +
+
+
+
+
+ + ♻ ☆ Synthetic Image Learning: Preserving Performance and Preventing + Membership Inference Attacks + + +
+ Generative artificial intelligence has transformed the generation of +synthetic data, providing innovative solutions to challenges like data scarcity +and privacy, which are particularly critical in fields such as medicine. +However, the effective use of this synthetic data to train high-performance +models remains a significant challenge. This paper addresses this issue by +introducing Knowledge Recycling (KR), a pipeline designed to optimise the +generation and use of synthetic data for training downstream classifiers. At +the heart of this pipeline is Generative Knowledge Distillation (GKD), the +proposed technique that significantly improves the quality and usefulness of +the information provided to classifiers through a synthetic dataset +regeneration and soft labelling mechanism. The KR pipeline has been tested on a +variety of datasets, with a focus on six highly heterogeneous medical image +datasets, ranging from retinal images to organ scans. The results show a +significant reduction in the performance gap between models trained on real and +synthetic data, with models based on synthetic data outperforming those trained +on real data in some cases. Furthermore, the resulting models show almost +complete immunity to Membership Inference Attacks, manifesting privacy +properties missing in models trained with conventional techniques. + +
+
+
+
+
+ + ♻ ☆ Local-peak scale-invariant feature transform for fast and random image + stitching + + +
+ Image stitching aims to construct a wide field of view with high spatial +resolution, which cannot be achieved in a single exposure. Typically, +conventional image stitching techniques, other than deep learning, require +complex computation and are thus computationally expensive, especially for stitching +large raw images. In this study, inspired by the multiscale feature of fluid +turbulence, we developed a fast feature point detection algorithm named +local-peak scale-invariant feature transform (LP-SIFT), based on the multiscale +local peaks and the scale-invariant feature transform method. By combining LP-SIFT +and RANSAC in image stitching, the stitching speed can be improved by orders of +magnitude compared with the original SIFT method. Nine large images (over 2600*1600 +pixels), arranged randomly without prior knowledge, can be stitched within +158.94 s. The algorithm is highly practical for applications requiring a wide +field of view in diverse application scenes, e.g., terrain mapping, biological +analysis, and even criminal investigation. + +
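For context, the conventional pipeline that LP-SIFT accelerates (keypoint detection, ratio-test matching, RANSAC homography estimation, warping) can be written in a few lines with OpenCV. The sketch below is that standard baseline, not the LP-SIFT detector itself, and the ratio and reprojection thresholds are typical defaults rather than values from the paper.

```python
# Conventional SIFT + RANSAC stitching baseline for a pair of images (OpenCV sketch).
import cv2
import numpy as np

def stitch_pair(img_a, img_b, ratio: float = 0.75):
    sift = cv2.SIFT_create()
    kps_a, des_a = sift.detectAndCompute(img_a, None)
    kps_b, des_b = sift.detectAndCompute(img_b, None)
    matches = cv2.BFMatcher().knnMatch(des_a, des_b, k=2)
    good = [m for m, n in matches if m.distance < ratio * n.distance]   # Lowe's ratio test
    src = np.float32([kps_a[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst = np.float32([kps_b[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
    H, _ = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)                # robust homography via RANSAC
    h, w = img_b.shape[:2]
    canvas = cv2.warpPerspective(img_a, H, (w * 2, h))                  # warp img_a into img_b's frame
    canvas[0:h, 0:w] = img_b                                            # paste the reference image
    return canvas
```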
+
+
+
+
+ + ♻ ☆ SALSA: Swift Adaptive Lightweight Self-Attention for Enhanced LiDAR + Place Recognition + + +
+ Large-scale LiDAR mappings and localization leverage place recognition +techniques to mitigate odometry drifts, ensuring accurate mapping. These +techniques utilize scene representations from LiDAR point clouds to identify +previously visited sites within a database. Local descriptors, assigned to each +point within a point cloud, are aggregated to form a scene representation for +the point cloud. These descriptors are also used to re-rank the retrieved point +clouds based on geometric fitness scores. We propose SALSA, a novel, +lightweight, and efficient framework for LiDAR place recognition. It consists +of a Sphereformer backbone that uses radial window attention to enable +information aggregation for sparse distant points, an adaptive self-attention +layer to pool local descriptors into tokens, and a multi-layer-perceptron Mixer +layer for aggregating the tokens to generate a scene descriptor. The proposed +framework outperforms existing methods on various LiDAR place recognition +datasets in terms of both retrieval and metric localization while operating in +real-time. + +
+
+
+
+
+ + ♻ ☆ The Sky's the Limit: Re-lightable Outdoor Scenes via a Sky-pixel + Constrained Illumination Prior and Outside-In Visibility ECCV 2024 + + +
+ Inverse rendering of outdoor scenes from unconstrained image collections is a +challenging task, particularly due to illumination/albedo ambiguities and occlusion of +the illumination environment (shadowing) caused by geometry. However, there are +many cues in an image that can aid in the disentanglement of geometry, albedo +and shadows. Whilst the sky is frequently masked out in state-of-the-art methods, +we exploit the fact that any sky pixel provides a direct observation of distant +lighting in the corresponding direction and, via a neural illumination prior, a +statistical cue to derive the remaining illumination environment. The +incorporation of our illumination prior is enabled by a novel `outside-in' +method for computing differentiable sky visibility based on a neural +directional distance function. This is highly efficient and can be trained in +parallel with the neural scene representation, allowing gradients from the +appearance loss to flow from shadows to influence the estimation of +illumination and geometry. Our method estimates high-quality albedo, geometry, +illumination and sky visibility, achieving state-of-the-art results on the +NeRF-OSR relighting benchmark. Our code and models can be found at +https://github.com/JADGardner/neusky + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ latentSplat: Autoencoding Variational Gaussians for Fast Generalizable + 3D Reconstruction + + +
+ We present latentSplat, a method to predict semantic Gaussians in a 3D latent +space that can be splatted and decoded by a light-weight generative 2D +architecture. Existing methods for generalizable 3D reconstruction either do +not scale to large scenes and resolutions, or are limited to interpolation of +close input views. latentSplat combines the strengths of regression-based and +generative approaches while being trained purely on readily available real +video data. At the core of our method are variational 3D Gaussians, a +representation that efficiently encodes varying uncertainty within a latent +space consisting of 3D feature Gaussians. From these Gaussians, specific +instances can be sampled and rendered via efficient splatting and a fast, +generative decoder. We show that latentSplat outperforms previous works in +reconstruction quality and generalization, while being fast and scalable to +high-resolution data. + +
+
+ comment: Project website: https://geometric-rl.mpi-inf.mpg.de/latentsplat/ +
+
+
+
+
+ + ♻ ☆ Efficient Face Super-Resolution via Wavelet-based Feature Enhancement + Network + + +
+ Face super-resolution aims to reconstruct a high-resolution face image from a +low-resolution face image. Previous methods typically employ an encoder-decoder +structure to extract facial structural features, where the direct downsampling +inevitably introduces distortions, especially to high-frequency features such +as edges. To address this issue, we propose a wavelet-based feature enhancement +network, which mitigates feature distortion by losslessly decomposing the input +feature into high and low-frequency components using the wavelet transform and +processing them separately. To improve the efficiency of facial feature +extraction, a full domain Transformer is further proposed to enhance local, +regional, and global facial features. Such designs allow our method to perform +better without stacking many modules as previous methods did. Experiments show +that our method effectively balances performance, model size, and speed. Code +link: https://github.com/PRIS-CV/WFEN. + +
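The lossless split into low- and high-frequency components described above is, at its core, a discrete wavelet transform. The short sketch below (using PyWavelets with a Haar wavelet, an assumption) shows the split-process-invert pattern on a single-channel feature map, with identity placeholders where the paper's learned branches would go.

```python
# Minimal sketch of a lossless high/low-frequency split via a 2D wavelet transform (PyWavelets).
import numpy as np
import pywt

def wavelet_split_process(feature_map: np.ndarray) -> np.ndarray:
    """Split a 2D feature map into frequency bands, process them separately, then invert."""
    low, (lh, hl, hh) = pywt.dwt2(feature_map, "haar")
    low = low * 1.0                                       # placeholder for the low-frequency branch
    lh, hl, hh = (band * 1.0 for band in (lh, hl, hh))    # placeholder for the high-frequency branch
    return pywt.idwt2((low, (lh, hl, hh)), "haar")        # exact reconstruction if bands are unchanged
```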
+
+
+
+
+ + ♻ ☆ Improving Zero-shot Generalization of Learned Prompts via Unsupervised + Knowledge Distillation ECCV24 + + +
+ Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization +to unseen tasks, but fall short of the performance of supervised methods in +generalizing to downstream tasks with limited data. Prompt learning is emerging +as a parameter-efficient method for adapting VLMs, but state-of-the-art +approaches require annotated samples. In this paper we propose a novel approach +to prompt learning based on unsupervised knowledge distillation from more +powerful models. Our approach, which we call Knowledge Distillation Prompt +Learning (KDPL), can be integrated into existing prompt learning techniques and +eliminates the need for labeled examples during adaptation. Our experiments on +more than ten standard benchmark datasets demonstrate that KDPL is very +effective at improving generalization of learned prompts for zero-shot domain +generalization, zero-shot cross-dataset generalization, and zero-shot +base-to-novel class generalization problems. KDPL requires no ground-truth +labels for adaptation, and moreover we show that even in the absence of any +knowledge of training class names it can be used to effectively transfer +knowledge. The code is publicly available at https://github.com/miccunifi/KDPL. + +
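At the loss level, unsupervised distillation of this kind typically amounts to matching the student's prompt-conditioned class distribution to a stronger teacher's distribution on unlabeled images. The snippet below is such a generic distillation loss, not KDPL's exact objective, and the temperature is an assumed hyperparameter.

```python
# Generic soft-label distillation loss for prompt learning on unlabeled data (not KDPL's exact form).
import torch
import torch.nn.functional as F

def distillation_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor, tau: float = 2.0):
    """KL divergence between temperature-softened teacher and student class distributions."""
    teacher = F.softmax(teacher_logits / tau, dim=-1)
    student = F.log_softmax(student_logits / tau, dim=-1)
    return F.kl_div(student, teacher, reduction="batchmean") * tau * tau
```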
+
+ comment: Accepted for publication at ECCV24 +
+
+
+
+
+ + ♻ ☆ Adaptive Bounding Box Uncertainties via Two-Step Conformal Prediction ECCV + + +
+ Quantifying a model's predictive uncertainty is essential for safety-critical +applications such as autonomous driving. We consider quantifying such +uncertainty for multi-object detection. In particular, we leverage conformal +prediction to obtain uncertainty intervals with guaranteed coverage for object +bounding boxes. One challenge in doing so is that bounding box predictions are +conditioned on the object's class label. Thus, we develop a novel two-step +conformal approach that propagates uncertainty in predicted class labels into +the uncertainty intervals of bounding boxes. This broadens the validity of our +conformal coverage guarantees to include incorrectly classified objects, thus +offering more actionable safety assurances. Moreover, we investigate novel +ensemble and quantile regression formulations to ensure the bounding box +intervals are adaptive to object size, leading to a more balanced coverage. +Validating our two-step approach on real-world datasets for 2D bounding box +localization, we find that desired coverage levels are satisfied with +practically tight predictive uncertainty intervals. + +
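The core ingredient of any conformal procedure is a finite-sample-corrected quantile of nonconformity scores computed on a calibration set. The sketch below shows only that ingredient for a single box coordinate, with random stand-in residuals; the class-label propagation and size-adaptive mechanisms that are the paper's actual contribution are omitted.

```python
# Minimal conformal interval for one box coordinate; calibration residuals are random stand-ins.
import numpy as np

def conformal_quantile(scores: np.ndarray, alpha: float = 0.1) -> float:
    """Finite-sample corrected (1 - alpha) empirical quantile of nonconformity scores."""
    n = len(scores)
    k = int(np.ceil((n + 1) * (1 - alpha))) - 1   # index of the corrected quantile
    return float(np.sort(scores)[min(k, n - 1)])

cal_residuals = np.abs(np.random.randn(500))       # stand-in for |predicted edge - true edge|
q = conformal_quantile(cal_residuals, alpha=0.1)
pred_edge = 120.0
interval = (pred_edge - q, pred_edge + q)          # covers the true edge with >= 90% probability
```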
+
+ comment: European Conference on Computer Vision (ECCV) 2024; 37 pages, 14 + figures, 6 tables (incl. appendix) +
+
+
+
+
+ + ♻ ☆ ALERT-Transformer: Bridging Asynchronous and Synchronous Machine + Learning for Real-Time Event-based Spatio-Temporal Data ICML 2024 + + +
+ We seek to enable classic processing of continuous ultra-sparse +spatiotemporal data generated by event-based sensors with dense machine +learning models. We propose a novel hybrid pipeline composed of asynchronous +sensing and synchronous processing that combines several ideas: (1) an +embedding based on PointNet models -- the ALERT module -- that can continuously +integrate new and dismiss old events thanks to a leakage mechanism, (2) a +flexible readout of the embedded data that allows to feed any downstream model +with always up-to-date features at any sampling rate, (3) exploiting the input +sparsity in a patch-based approach inspired by Vision Transformer to optimize +the efficiency of the method. These embeddings are then processed by a +transformer model trained for object and gesture recognition. Using this +approach, we achieve performances at the state-of-the-art with a lower latency +than competitors. We also demonstrate that our asynchronous model can operate +at any desired sampling rate. + +
+
+ comment: Originally published in the Proceedings of Machine Learning Research + ICML 2024 +
+
+
+
+
+ + ♻ ☆ Learning with Alignments: Tackling the Inter- and Intra-domain Shifts + for Cross-multidomain Facial Expression Recognition ACM MM 2024 + + +
+ Facial Expression Recognition (FER) holds significant importance in +human-computer interactions. Existing cross-domain FER methods often transfer +knowledge solely from a single labeled source domain to an unlabeled target +domain, neglecting the comprehensive information across multiple sources. +Nevertheless, cross-multidomain FER (CMFER) is very challenging for (i) the +inherent inter-domain shifts across multiple domains and (ii) the intra-domain +shifts stemming from the ambiguous expressions and low inter-class +distinctions. In this paper, we propose a novel Learning with Alignments CMFER +framework, named LA-CMFER, to handle both inter- and intra-domain shifts. +Specifically, LA-CMFER is constructed with a global branch and a local branch +to extract features from the full images and local subtle expressions, +respectively. Based on this, LA-CMFER presents a dual-level inter-domain +alignment method to force the model to prioritize hard-to-align samples in +knowledge transfer at a sample level while gradually generating a +well-clustered feature space with the guidance of class attributes at a cluster +level, thus narrowing the inter-domain shifts. To address the intra-domain +shifts, LA-CMFER introduces a multi-view intra-domain alignment method with a +multi-view clustering consistency constraint where a prediction similarity +matrix is built to pursue consistency between the global and local views, thus +refining pseudo labels and eliminating latent noise. Extensive experiments on +six benchmark datasets have validated the superiority of our LA-CMFER. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ SOWA: Adapting Hierarchical Frozen Window Self-Attention to + Visual-Language Models for Better Anomaly Detection + + +
+ Visual anomaly detection is critical in industrial manufacturing, but +traditional methods often rely on extensive normal datasets and custom models, +limiting scalability. Recent advancements in large-scale visual-language models +have significantly improved zero/few-shot anomaly detection. However, these +approaches may not fully utilize hierarchical features, potentially missing +nuanced details. We introduce a window self-attention mechanism based on the +CLIP model, combined with learnable prompts to process multi-level features +within a Soldier-Offier Window self-Attention (SOWA) framework. Our method has +been tested on five benchmark datasets, demonstrating superior performance by +leading in 18 out of 20 metrics compared to existing state-of-the-art +techniques. + +
+
+ comment: 8 pages, 9 figures, conference +
+
+
+
+
+ + ♻ ☆ WidthFormer: Toward Efficient Transformer-based BEV View Transformation IROS 2024 + + +
+ We present WidthFormer, a novel transformer-based module to compute +Bird's-Eye-View (BEV) representations from multi-view cameras for real-time +autonomous-driving applications. WidthFormer is computationally efficient, +robust and does not require any special engineering effort to deploy. We first +introduce a novel 3D positional encoding mechanism capable of accurately +encapsulating 3D geometric information, which enables our model to compute +high-quality BEV representations with only a single transformer decoder layer. +This mechanism is also beneficial for existing sparse 3D object detectors. +Inspired by the recently proposed works, we further improve our model's +efficiency by vertically compressing the image features when serving as +attention keys and values, and then we develop two modules to compensate for +potential information loss due to feature compression. Experimental evaluation +on the widely-used nuScenes 3D object detection benchmark demonstrates that our +method outperforms previous approaches across different 3D detection +architectures. More importantly, our model is highly efficient. For example, +when using $256\times 704$ input images, it achieves 1.5 ms and 2.8 ms latency +on NVIDIA 3090 GPU and Horizon Journey-5 computation solutions. Furthermore, +WidthFormer also exhibits strong robustness to different degrees of camera +perturbations. Our study offers valuable insights into the deployment of BEV +transformation methods in real-world, complex road environments. Code is +available at https://github.com/ChenhongyiYang/WidthFormer . + +
+
+ comment: IROS 2024 Oral Presentation +
+
+
+
+
+ + ♻ ☆ NeRF-Supervised Feature Point Detection and Description + + +
+ Feature point detection and description is the backbone for various computer +vision applications, such as Structure-from-Motion, visual SLAM, and visual +place recognition. While learning-based methods have surpassed traditional +handcrafted techniques, their training often relies on simplistic +homography-based simulations of multi-view perspectives, limiting model +generalisability. This paper presents a novel approach leveraging Neural +Radiance Fields (NeRFs) to generate a diverse and realistic dataset consisting +of indoor and outdoor scenes. Our proposed methodology adapts state-of-the-art +feature detectors and descriptors for training on multi-view NeRF-synthesised +data, with supervision achieved through perspective projective geometry. +Experiments demonstrate that the proposed methodology achieves competitive or +superior performance on standard benchmarks for relative pose estimation, point +cloud registration, and homography estimation while requiring significantly +less training data and time compared to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Zero123-6D: Zero-shot Novel View Synthesis for RGB Category-level 6D + Pose Estimation + + +
+ Estimating the pose of objects through vision is essential to make robotic +platforms interact with the environment. Yet, it presents many challenges, +often related to the lack of flexibility and generalizability of +state-of-the-art solutions. Diffusion models are a cutting-edge neural +architecture transforming 2D and 3D computer vision, outlining remarkable +performances in zero-shot novel-view synthesis. Such a use case is particularly +intriguing for reconstructing 3D objects. However, localizing objects in +unstructured environments is rather unexplored. To this end, this work presents +Zero123-6D, the first work to demonstrate the utility of Diffusion Model-based +novel-view-synthesizers in enhancing RGB 6D pose estimation at category-level, +by integrating them with feature extraction techniques. Novel View Synthesis +allows to obtain a coarse pose that is refined through an online optimization +method introduced in this work to deal with intra-category geometric +differences. In such a way, the outlined method shows reduction in data +requirements, removal of the necessity of depth information in zero-shot +category-level 6D pose estimation task, and increased performance, +quantitatively demonstrated through experiments on the CO3D dataset. + +
+
+ comment: 6 pages, 2 reference pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Can we Constrain Concept Bottleneck Models to Learn Semantically + Meaningful Input Features? + + +
+ Concept Bottleneck Models (CBMs) are regarded as inherently interpretable +because they first predict a set of human-defined concepts which are used to +predict a task label. For inherent interpretability to be fully realised, and +ensure trust in a model's output, it's desirable for concept predictions to use +semantically meaningful input features. For instance, in an image, pixels +representing a broken bone should contribute to predicting a fracture. However, +current literature suggests that concept predictions often rely on irrelevant +input features. We hypothesise that this occurs when dataset labels include +inaccurate concept annotations, or the relationship between input features and +concepts is unclear. In general, the effect of dataset labelling on concept +representations remains an understudied area. In this paper, we demonstrate +that CBMs can learn to map concepts to semantically meaningful input features, +by utilising datasets with a clear link between the input features and the +desired concept predictions. This is achieved, for instance, by ensuring +multiple concepts do not always co-occur and, therefore provide a clear +training signal for the CBM to distinguish the relevant input features for each +concept. We validate our hypothesis on both synthetic and real-world image +datasets, and demonstrate under the correct conditions, CBMs can learn to +attribute semantically meaningful input features to the correct concept +predictions. + +
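For readers unfamiliar with the architecture, a concept bottleneck model is simply an input-to-concept predictor chained with a concept-to-label predictor, trained with both concept and label supervision. The toy PyTorch sketch below illustrates that structure; the linear heads and loss weighting are assumptions, not the paper's setup.

```python
# Toy concept bottleneck model: inputs -> predicted concepts -> task label.
import torch
import torch.nn as nn

class ConceptBottleneckModel(nn.Module):
    def __init__(self, in_dim: int, n_concepts: int, n_classes: int):
        super().__init__()
        self.concept_head = nn.Linear(in_dim, n_concepts)    # predicts human-defined concepts
        self.label_head = nn.Linear(n_concepts, n_classes)   # predicts the label from concepts only

    def forward(self, x):
        concepts = torch.sigmoid(self.concept_head(x))
        return concepts, self.label_head(concepts)

def cbm_loss(concepts, logits, concept_targets, labels, lam: float = 1.0):
    """concept_targets: float tensor in {0, 1}; labels: class indices."""
    return (nn.functional.binary_cross_entropy(concepts, concept_targets)
            + lam * nn.functional.cross_entropy(logits, labels))
```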
+
+ comment: Main paper: 8 pages, 9 figures, Appendix: 14 pages, 21 figures. This + paper is a preprint +
+
+
+
+
+ + ♻ ☆ Eyes Wide Unshut: Unsupervised Mistake Detection in Egocentric + Procedural Video by Detecting Unpredictable Gaze + + +
+ In this paper, we address the challenge of unsupervised mistake detection in +egocentric procedural video through the analysis of gaze signals. Traditional +supervised mistake detection methods rely on manually labeled mistakes, and +hence suffer from domain-dependence and scalability issues. We introduce an +unsupervised method for detecting mistakes in videos of human activities, +overcoming the challenges of domain-specific requirements and the need for +annotated data. We postulate that, when a subject is making a mistake in the +execution of a procedure, their attention patterns will deviate from normality. +We hence propose to detect mistakes by comparing gaze trajectories predicted +from input video with ground truth gaze signals collected through a gaze +tracker. Since predicting gaze in video is characterized by high uncertainty, +we propose a novel \textit{gaze completion task}, which aims to predict gaze +from visual observations and partial gaze trajectories. We further contribute a +\textit{gaze completion approach} based on a Gaze-Frame Correlation module to +explicitly model the correlation between gaze information and each local visual +token. Inconsistencies between the predicted and observed gaze trajectories act +as an indicator for identifying mistakes. Experiments on the EPIC-Tent, +HoloAssist and IndustReal datasets showcase the effectiveness of the proposed +approach as compared to unsupervised and one-class techniques. Our method is +ranked first on the HoloAssist Mistake Detection challenge. + +
+
+
+
+
+ + ♻ ☆ AFGI: Towards Accurate and Fast-convergent Gradient Inversion Attack in + Federated Learning + + +
+ Federated learning (FL) enables privacy preservation in model training by +only exposing users' model gradients. Yet, FL users are susceptible to gradient +inversion attacks (GIAs) which can reconstruct ground-truth training data such +as images based on model gradients. However, reconstructing high-resolution +images by existing GIAs faces two challenges: inferior accuracy and +slow convergence, especially when duplicate labels exist in the training +batch. To address these challenges, we present an Accurate and Fast-convergent +Gradient Inversion attack algorithm, called AFGI, with two components: a Label +Recovery Block (LRB), which can accurately restore duplicate labels of private +images based on exposed gradients, and a VME Regularization Term, which includes the +total variance of reconstructed images as well as the discrepancies in three-channel +means and edges between values derived from exposed gradients and from reconstructed +images. AFGI can be regarded as a white-box attack strategy +to reconstruct images by leveraging the labels recovered by LRB. In particular, +AFGI is efficient, accurately reconstructing ground-truth images even when the users' +training batch size is as large as 48. Our experimental results show that AFGI +can reduce time costs by 85% while achieving superb inversion quality on the +ImageNet dataset. Finally, our study unveils the shortcomings of FL in +privacy preservation, prompting the development of more advanced countermeasure +strategies. + +
+
+
+
+
+ + ♻ ☆ Vision-Based Power Line Cables and Pylons Detection for Low Flying + Aircraft + + +
+ Power lines are dangerous for low-flying aircraft, especially in +low-visibility conditions. Thus, a vision-based system able to analyze the +aircraft's surroundings and to provide the pilots with a "second pair of eyes" +can contribute to enhancing their safety. To this end, we have developed a deep +learning approach to jointly detect power line cables and pylons from images +captured at distances of several hundred meters by aircraft-mounted cameras. In +doing so, we have combined a modern convolutional architecture with transfer +learning and a loss function adapted to curvilinear structure delineation. We +use a single network for both detection tasks and demonstrated its performance +on two benchmarking datasets. We have integrated it within an onboard system +and run it in flight, and have demonstrated with our experiments that it +outperforms the prior distant cable detection method on both datasets, while +also successfully detecting pylons, given their annotations are available for +the data. + +
+
+ comment: Added several declarations at the end of the publication +
+
+
+
+
+ + ♻ ☆ Classification of freshwater snails of the genus Radomaniola with + multimodal triplet networks ICML 2024 + + +
+ In this paper, we present our first proposal of a machine learning system for +the classification of freshwater snails of the genus Radomaniola. We elaborate +on the specific challenges encountered during system design, and how we tackled +them; namely a small, very imbalanced dataset with a high number of classes and +high visual similarity between classes. We then show how we employed triplet +networks and the multiple input modalities of images, measurements, and genetic +information to overcome these challenges and reach a performance comparable to +that of a trained domain expert. + +
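As a minimal illustration of the multimodal triplet setup mentioned above, the sketch below concatenates per-modality embeddings and applies a standard triplet margin loss; the encoder sizes and the random tensors are purely illustrative and do not reflect the snail dataset.

```python
# Minimal multimodal triplet setup: concatenate per-modality embeddings, apply a triplet margin loss.
import torch
import torch.nn as nn

image_enc = nn.Linear(512, 64)   # stand-ins for the image / measurement / genetic encoders
meas_enc = nn.Linear(16, 16)
gene_enc = nn.Linear(32, 16)

def embed(img, meas, gene):
    """Joint embedding built from the three modalities."""
    return torch.cat([image_enc(img), meas_enc(meas), gene_enc(gene)], dim=-1)

triplet = nn.TripletMarginLoss(margin=1.0)
anchor = embed(torch.randn(8, 512), torch.randn(8, 16), torch.randn(8, 32))
positive = embed(torch.randn(8, 512), torch.randn(8, 16), torch.randn(8, 32))
negative = embed(torch.randn(8, 512), torch.randn(8, 16), torch.randn(8, 32))
loss = triplet(anchor, positive, negative)
```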
+
+ comment: Spotlight at ICML 2024 AI for Science workshop +
+
+
+
+
+ + ♻ ☆ Exploit the Leak: Understanding Risks in Biometric Matchers + + +
+ In a biometric authentication or identification system, the matcher compares +a stored and a fresh template to determine whether there is a match. This +assessment is based on both a similarity score and a predefined threshold. For +better compliance with privacy legislation, the matcher can be built upon a +privacy-preserving distance. Beyond the binary output (`yes' or `no'), most +schemes may perform more precise computations, e.g., the value of the distance. +Such precise information is prone to leakage even when not returned by the +system. This can occur due to a malware infection or the use of a weakly +privacy-preserving distance, exemplified by side channel attacks or partially +obfuscated designs. This paper provides an analysis of information leakage +during distance evaluation. We provide a catalog of information leakage +scenarios with their impacts on data privacy. Each scenario gives rise to +unique attacks with impacts quantified in terms of computational costs, thereby +providing a better understanding of the security level. + +
+
+ comment: Minor corrections +
+
+
+
+
+ + ♻ ☆ Noise-Tolerant Few-Shot Unsupervised Adapter for Vision-Language Models BMVC 2024 + + +
+ Recent advances in large-scale vision-language models have achieved +impressive performance in various zero-shot image classification tasks. While +prior studies have demonstrated significant improvements by introducing +few-shot labelled target samples, they still require labelling of target +samples, which greatly degrades their scalability and generalizability while +handling various visual recognition tasks. We design NtUA, a Noise-tolerant +Unsupervised Adapter that allows the learning of effective target models with +few unlabelled target samples. NtUA works as a key-value cache that formulates +visual features and predicted pseudo-labels of the few unlabelled target +samples as key-value pairs. It consists of two complementary designs. The first +is adaptive cache formation that combats pseudo-label noises by weighting the +key-value pairs according to their prediction confidence. The second is +knowledge-guided cache refinement, which refines pair values (i.e., +pseudo-labels) and cache weights by leveraging knowledge distillation from +large-scale vision language models. Extensive experiments show that NtUA +achieves superior performance consistently across multiple widely adopted +benchmarks. + +
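Key-value cache adapters of this family typically store normalised features as keys and confidence-weighted pseudo-label one-hots as values, then turn query-key similarity into class logits. The NumPy sketch below shows that mechanism generically; the temperature `beta` and the weighting scheme are assumptions, and the knowledge-guided refinement step is omitted.

```python
# Generic confidence-weighted key-value cache for unlabelled adaptation (not NtUA's exact design).
import numpy as np

def build_cache(features: np.ndarray, pseudo_probs: np.ndarray):
    """Keys: L2-normalised features; values: pseudo-label one-hots weighted by confidence."""
    keys = features / np.linalg.norm(features, axis=1, keepdims=True)
    conf = pseudo_probs.max(axis=1, keepdims=True)                       # confidence per sample
    values = conf * np.eye(pseudo_probs.shape[1])[pseudo_probs.argmax(axis=1)]
    return keys, values

def cache_logits(query: np.ndarray, keys: np.ndarray, values: np.ndarray, beta: float = 5.0):
    q = query / np.linalg.norm(query, axis=1, keepdims=True)
    affinity = np.exp(-beta * (1.0 - q @ keys.T))                        # query-key similarity
    return affinity @ values                                             # cache-based class logits
```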
+
+ comment: Accepted at BMVC 2024 +
+
+
+
+
+ + ♻ ☆ STUPD: A Synthetic Dataset for Spatial and Temporal Relation Reasoning + + +
+ Understanding relations between objects is crucial for understanding the +semantics of a visual scene. It is also an essential step in order to bridge +visual and language models. However, current state-of-the-art computer vision +models still lack the ability to perform spatial reasoning well. Existing +datasets mostly cover a relatively small number of spatial relations, all of +which are static relations that do not intrinsically involve motion. In this +paper, we propose the Spatial and Temporal Understanding of Prepositions +Dataset (STUPD) -- a large-scale video dataset for understanding static and +dynamic spatial relationships derived from prepositions of the English +language. The dataset contains 150K visual depictions (videos and images), +consisting of 30 distinct spatial prepositional senses, in the form of object +interaction simulations generated synthetically using Unity3D. In addition to +spatial relations, we also propose 50K visual depictions across 10 temporal +relations, consisting of videos depicting event/time-point interactions. To our +knowledge, no dataset exists that represents temporal relations through visual +settings. In this dataset, we also provide 3D information about object +interactions such as frame-wise coordinates, and descriptions of the objects +used. The goal of this synthetic dataset is to help models perform better in +visual relationship detection in real-world settings. We demonstrate an +increase in the performance of various models over 2 real-world datasets +(ImageNet-VidVRD and Spatial Senses) when pretrained on the STUPD dataset, in +comparison to other pretraining datasets. + +
+
+
+
+
+ + ♻ ☆ Controllable Face Synthesis with Semantic Latent Diffusion Models + + +
+ Semantic Image Synthesis (SIS) is among the most popular and effective +techniques in the field of face generation and editing, thanks to its good +generation quality and the versatility it brings along. Recent works attempted +to go beyond the standard GAN-based framework, and started to explore Diffusion +Models (DMs) for this task as these stand out with respect to GANs in terms of +both quality and diversity. On the other hand, DMs lack fine-grained +controllability and reproducibility. To address that, in this paper we propose +a SIS framework based on a novel Latent Diffusion Model architecture for human +face generation and editing that is both able to reproduce and manipulate a +real reference image and generate diversity-driven results. The proposed system +utilizes both SPADE normalization and cross-attention layers to merge shape and +style information and, by doing so, allows for a precise control over each of +the semantic parts of the human face. This was not possible with previous +methods in the state of the art. Finally, we performed an extensive set of +experiments to prove that our model surpasses the current state of the art, both +qualitatively and quantitatively. + +
+
+
+
+
+ + ♻ ☆ SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed + Semi-Supervised Learning ICML2024 + + +
+ Recent advancements in semi-supervised learning have focused on a more +realistic yet challenging task: addressing imbalances in labeled data while the +class distribution of unlabeled data remains both unknown and potentially +mismatched. Current approaches in this sphere often presuppose rigid +assumptions regarding the class distribution of unlabeled data, thereby +limiting the adaptability of models to only certain distribution ranges. In +this study, we propose a novel approach, introducing a highly adaptable +framework, designated as SimPro, which does not rely on any predefined +assumptions about the distribution of unlabeled data. Our framework, grounded +in a probabilistic model, innovatively refines the expectation-maximization +(EM) algorithm by explicitly decoupling the modeling of conditional and +marginal class distributions. This separation facilitates a closed-form +solution for class distribution estimation during the maximization phase, +leading to the formulation of a Bayes classifier. The Bayes classifier, in +turn, enhances the quality of pseudo-labels in the expectation phase. +Remarkably, the SimPro framework not only comes with theoretical guarantees but +also is straightforward to implement. Moreover, we introduce two novel class +distributions broadening the scope of the evaluation. Our method showcases +consistent state-of-the-art performance across diverse benchmarks and data +distribution scenarios. Our code is available at +https://github.com/LeapLabTHU/SimPro. + +
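The decoupled estimation idea can be caricatured in a few lines: estimate the unknown marginal class prior from the model's predictions on unlabeled data (a closed-form maximization step), then reweight predictions by that prior to form Bayes-adjusted pseudo-labels (expectation step). The sketch below is this caricature, not SimPro's actual derivation.

```python
# Schematic EM-style update on unlabeled data; a simplification of the decoupled estimation idea.
import numpy as np

def maximization_prior(pseudo_probs: np.ndarray) -> np.ndarray:
    """Closed-form estimate of the marginal class distribution on unlabeled data."""
    return pseudo_probs.mean(axis=0)

def expectation_pseudo_labels(cond_probs: np.ndarray, prior: np.ndarray) -> np.ndarray:
    """Bayes classifier: reweight class-conditional predictions by the estimated prior."""
    joint = cond_probs * prior[None, :]
    return joint / joint.sum(axis=1, keepdims=True)

probs = np.random.dirichlet(np.ones(10), size=256)   # stand-in for predictions on unlabeled data
prior = maximization_prior(probs)
pseudo = expectation_pseudo_labels(probs, prior)
```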
+
+ comment: ICML2024 camera-ready version +
+
+
+
+
+ + ♻ ☆ Gaze-directed Vision GNN for Mitigating Shortcut Learning in Medical + Image + + +
+ Deep neural networks have demonstrated remarkable performance in medical +image analysis. However, its susceptibility to spurious correlations due to +shortcut learning raises concerns about network interpretability and +reliability. Furthermore, shortcut learning is exacerbated in medical contexts +where disease indicators are often subtle and sparse. In this paper, we propose +a novel gaze-directed Vision GNN (called GD-ViG) to leverage the visual +patterns of radiologists from gaze as expert knowledge, directing the network +toward disease-relevant regions, and thereby mitigating shortcut learning. +GD-ViG consists of a gaze map generator (GMG) and a gaze-directed classifier +(GDC). Combining the global modelling ability of GNNs with the locality of +CNNs, GMG generates the gaze map based on radiologists' visual patterns. +Notably, it eliminates the need for real gaze data during inference, enhancing +the network's practical applicability. Utilizing gaze as the expert knowledge, +the GDC directs the construction of graph structures by incorporating both +feature distances and gaze distances, enabling the network to focus on +disease-relevant foregrounds. Thereby avoiding shortcut learning and improving +the network's interpretability. The experiments on two public medical image +datasets demonstrate that GD-ViG outperforms the state-of-the-art methods, and +effectively mitigates shortcut learning. Our code is available at +https://github.com/SX-SS/GD-ViG. + +
+
+
+
+
+ + ♻ ☆ RDA-INR: Riemannian Diffeomorphic Autoencoding via Implicit Neural + Representations + + +
+ Diffeomorphic registration frameworks such as Large Deformation Diffeomorphic +Metric Mapping (LDDMM) are used in computer graphics and the medical domain for +atlas building, statistical latent modeling, and pairwise and groupwise +registration. In recent years, researchers have developed neural network-based +approaches regarding diffeomorphic registration to improve the accuracy and +computational efficiency of traditional methods. In this work, we focus on a +limitation of neural network-based atlas building and statistical latent +modeling methods, namely that they either are (i) resolution dependent or (ii) +disregard any data- or problem-specific geometry needed for proper +mean-variance analysis. In particular, we overcome this limitation by designing +a novel encoder based on resolution-independent implicit neural +representations. The encoder achieves resolution invariance for LDDMM-based +statistical latent modeling. Additionally, the encoder adds LDDMM Riemannian +geometry to resolution-independent deep learning models for statistical latent +modeling. We investigate how the Riemannian geometry improves latent modeling +and is required for a proper mean-variance analysis. To highlight the benefit +of resolution independence for LDDMM-based data variability modeling, we show +that our approach outperforms current neural network-based LDDMM latent code +models. Our work paves the way for more research into how Riemannian geometry, +shape respectively image analysis, and deep learning can be combined. + +
+
+ comment: 41 pages, 27 figures (including subfigures), revised version, to be + published in SIAM Journal on Imaging Sciences +
+
+
+
+
+ + ♻ ☆ Learning Signed Hyper Surfaces for Oriented Point Cloud Normal + Estimation CVPR 2023 + + +
+ We propose a novel method called SHS-Net for oriented normal estimation of +point clouds by learning signed hyper surfaces, which can accurately predict +normals with global consistent orientation from various point clouds. Almost +all existing methods estimate oriented normals through a two-stage pipeline, +i.e., unoriented normal estimation and normal orientation, and each step is +implemented by a separate algorithm. However, previous methods are sensitive to +parameter settings, resulting in poor results from point clouds with noise, +density variations and complex geometries. In this work, we introduce signed +hyper surfaces (SHS), which are parameterized by multi-layer perceptron (MLP) +layers, to learn to estimate oriented normals from point clouds in an +end-to-end manner. The signed hyper surfaces are implicitly learned in a +high-dimensional feature space where the local and global information is +aggregated. Specifically, we introduce a patch encoding module and a shape +encoding module to encode a 3D point cloud into a local latent code and a +global latent code, respectively. Then, an attention-weighted normal prediction +module is proposed as a decoder, which takes the local and global latent codes +as input to predict oriented normals. Experimental results show that our +SHS-Net outperforms the state-of-the-art methods in both unoriented and +oriented normal estimation on the widely used benchmarks. + +
+
+ comment: Accepted by TPAMI 2024 (extension) and CVPR 2023. Project page: + https://leoqli.github.io/SHS-Net/. Code: https://github.com/LeoQLi/SHS-Net +
+
+
+
+
+ + ♻ ☆ X-Dreamer: Creating High-quality 3D Content by Bridging the Domain Gap + Between Text-to-2D and Text-to-3D Generation + + +
+ In recent times, automatic text-to-3D content creation has made significant +progress, driven by the development of pretrained 2D diffusion models. Existing +text-to-3D methods typically optimize the 3D representation to ensure that the +rendered image aligns well with the given text, as evaluated by the pretrained +2D diffusion model. Nevertheless, a substantial domain gap exists between 2D +images and 3D assets, primarily attributed to variations in camera-related +attributes and the exclusive presence of foreground objects. Consequently, +employing 2D diffusion models directly for optimizing 3D representations may +lead to suboptimal outcomes. To address this issue, we present X-Dreamer, a +novel approach for high-quality text-to-3D content creation that effectively +bridges the gap between text-to-2D and text-to-3D synthesis. The key components +of X-Dreamer are two innovative designs: Camera-Guided Low-Rank Adaptation +(CG-LoRA) and Attention-Mask Alignment (AMA) Loss. CG-LoRA dynamically +incorporates camera information into the pretrained diffusion models by +employing camera-dependent generation for trainable parameters. This +integration enhances the alignment between the generated 3D assets and the +camera's perspective. AMA loss guides the attention map of the pretrained +diffusion model using the binary mask of the 3D object, prioritizing the +creation of the foreground object. This module ensures that the model focuses +on generating accurate and detailed foreground objects. Extensive evaluations +demonstrate the effectiveness of our proposed method compared to existing +text-to-3D approaches. Our project webpage: +https://xmu-xiaoma666.github.io/Projects/X-Dreamer/ . + +
+
+ comment: ToMM24 +
+
+
+
+
+ + ♻ ☆ GenNBV: Generalizable Next-Best-View Policy for Active 3D Reconstruction CVPR 2024 + + +
+ While recent advances in neural radiance field enable realistic digitization +for large-scale scenes, the image-capturing process is still time-consuming and +labor-intensive. Previous works attempt to automate this process using the +Next-Best-View (NBV) policy for active 3D reconstruction. However, the existing +NBV policies heavily rely on hand-crafted criteria, limited action space, or +per-scene optimized representations. These constraints limit their +cross-dataset generalizability. To overcome them, we propose GenNBV, an +end-to-end generalizable NBV policy. Our policy adopts a reinforcement learning +(RL)-based framework and extends typical limited action space to 5D free space. +It empowers our agent drone to scan from any viewpoint, and even interact with +unseen geometries during training. To boost the cross-dataset generalizability, +we also propose a novel multi-source state embedding, including geometric, +semantic, and action representations. We establish a benchmark using the Isaac +Gym simulator with the Houses3K and OmniObject3D datasets to evaluate this NBV +policy. Experiments demonstrate that our policy achieves a 98.26% and 97.12% +coverage ratio on unseen building-scale objects from these datasets, +respectively, outperforming prior solutions. + +
+
+ comment: CVPR 2024 accepted paper. Project page: http://gennbv.github.io/ +
+
+
+
+
+ + ♻ ☆ COLMAP-Free 3D Gaussian Splatting + + +
+ While neural rendering has led to impressive advances in scene reconstruction +and novel view synthesis, it relies heavily on accurately pre-computed camera +poses. To relax this constraint, multiple efforts have been made to train +Neural Radiance Fields (NeRFs) without pre-processed camera poses. However, the +implicit representations of NeRFs provide extra challenges to optimize the 3D +structure and camera poses at the same time. On the other hand, the recently +proposed 3D Gaussian Splatting provides new opportunities given its explicit +point cloud representations. This paper leverages both the explicit geometric +representation and the continuity of the input video stream to perform novel +view synthesis without any SfM preprocessing. We process the input frames in a +sequential manner and progressively grow the 3D Gaussians set by taking one +input frame at a time, without the need to pre-compute the camera poses. Our +method significantly improves over previous approaches in view synthesis and +camera pose estimation under large motion changes. Our project page is +https://oasisyang.github.io/colmap-free-3dgs + +
+
+ comment: Project Page: https://oasisyang.github.io/colmap-free-3dgs +
+
+
+
+
+ + ♻ ☆ Chain of Visual Perception: Harnessing Multimodal Large Language Models + for Zero-shot Camouflaged Object Detection + + +
+ In this paper, we introduce a novel multimodal camo-perceptive framework +(MMCPF) aimed at handling zero-shot Camouflaged Object Detection (COD) by +leveraging the powerful capabilities of Multimodal Large Language Models +(MLLMs). Recognizing the inherent limitations of current COD methodologies, +which predominantly rely on supervised learning models demanding extensive and +accurately annotated datasets, resulting in weak generalization, our research +proposes a zero-shot MMCPF that circumvents these challenges. Although MLLMs +hold significant potential for broad applications, their effectiveness in COD +is hindered, as they tend to misinterpret camouflaged objects. To +address this challenge, we further propose a strategic enhancement called the +Chain of Visual Perception (CoVP), which significantly improves the perceptual +capabilities of MLLMs in camouflaged scenes by leveraging both linguistic and +visual cues more effectively. We validate the effectiveness of MMCPF on five +widely used COD datasets, including CAMO, COD10K, NC4K, MoCA-Mask and OVCamo. +Experiments show that MMCPF can outperform all existing state-of-the-art +zero-shot COD methods, and achieve competitive performance compared to +weakly-supervised and fully-supervised methods, which demonstrates the +potential of MMCPF. The GitHub link of this paper is +https://github.com/luckybird1994/MMCPF. + +
+
+ comment: Accepted by ACMMM2024 +
+
+
+
+
+ + ♻ ☆ Advancing Prompt Learning through an External Layer + + +
+ Prompt learning represents a promising method for adapting pre-trained +vision-language models (VLMs) to various downstream tasks by learning a set of +text embeddings. One challenge inherent to these methods is the poor +generalization performance due to the invalidity of the learned text embeddings +for unseen tasks. A straightforward approach to bridge this gap is to freeze +the text embeddings in prompts, which results in a lack of capacity to adapt +VLMs for downstream tasks. To address this dilemma, we propose a paradigm +called EnPrompt with a novel External Layer (EnLa). Specifically, we propose a +textual external layer and learnable visual embeddings for adapting VLMs to +downstream tasks. The learnable external layer is built upon valid embeddings +of pre-trained CLIP. This design considers the balance of learning capabilities +between the two branches. To align the textual and visual features, we propose +a novel two-pronged approach: i) we introduce the optimal transport as the +discrepancy metric to align the vision and text modalities, and ii) we +introduce a novel strengthening feature to enhance the interaction between +these two modalities. Four representative experiments (i.e., base-to-novel +generalization, few-shot learning, cross-dataset generalization, domain shifts +generalization) across 15 datasets demonstrate that our method outperforms the +existing prompt learning method. + +
+
+
+
+
+ + ♻ ☆ ScreenQA: Large-Scale Question-Answer Pairs over Mobile App Screenshots + + +
+ We present a new benchmark and dataset, ScreenQA, for screen content
+understanding via question answering. Existing screen datasets focus either on
+structure and component-level understanding, or on much higher-level composite
+tasks such as navigation and task completion. We attempt to bridge the gap
+between these two by annotating 86K question-answer pairs over the RICO dataset
+in the hope of benchmarking screen reading comprehension capacity. This work is
+also the first to annotate answers for different application scenarios,
+including both full sentences and short forms, as well as supporting UI
+contents on screen and their bounding boxes. With the rich annotation, we
+discuss and define the evaluation metrics of the benchmark, show applications
+of the dataset, and provide a few baselines using closed and open source
+models.
+
</p>
+
+
+
+
+ + ♻ ☆ A Symmetric Regressor for MRI-Based Assessment of Striatal Dopamine + Transporter Uptake in Parkinson's Disease + + +
+ Dopamine transporter (DAT) imaging is commonly used for monitoring
+Parkinson's disease (PD), where the striatal DAT uptake amount is computed to
+assess PD severity. However, DAT imaging has a high cost and a risk of
+radiation exposure, and is not available in general clinics. Recently, MRI
+patches of the nigral region have been proposed as a safer and easier
+alternative. This paper proposes a symmetric regressor for predicting the DAT
+uptake amount from the nigral MRI patch. Acknowledging the symmetry between the
+right and left nigrae, the proposed regressor incorporates a paired
+input-output model that simultaneously predicts the DAT uptake amounts for both
+the right and left striata. Moreover, it employs a symmetric loss that imposes
+a constraint on the difference between right and left predictions, reflecting
+the high correlation in DAT uptake amounts between the two lateral sides.
+Additionally, we propose a symmetric Monte-Carlo (MC) dropout method that
+exploits the same symmetry to provide an informative uncertainty estimate of
+the DAT uptake prediction. We evaluated the proposed approach on 734 nigral
+patches; the symmetric regressor demonstrated significantly improved
+performance compared with standard regressors while giving better
+explainability and feature representation. The symmetric MC dropout also gave
+precise uncertainty ranges with a high probability of including the true DAT
+uptake amounts within the range.
+
</p>
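+ To make the paired-output idea concrete, the sketch below shows one plausible
+way to attach right/left DAT-uptake heads to a shared backbone and to penalize
+disagreement between the predicted and observed right-left differences. It is
+an illustration in PyTorch under our own assumptions: layer sizes, the weight
+`lam`, and the exact form of the symmetry penalty are not specified in the
+abstract.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PairedNigralRegressor(nn.Module):
+    """Illustrative paired-output regressor: one shared backbone, two heads."""
+    def __init__(self, in_features=2048, hidden=256):
+        super().__init__()
+        self.backbone = nn.Sequential(nn.Linear(in_features, hidden), nn.ReLU())
+        self.head_right = nn.Linear(hidden, 1)
+        self.head_left = nn.Linear(hidden, 1)
+
+    def forward(self, x):
+        h = self.backbone(x)
+        return self.head_right(h), self.head_left(h)
+
+def symmetric_loss(pred_r, pred_l, y_r, y_l, lam=0.1):
+    # Standard regression terms for each striatum, plus a penalty that keeps the
+    # predicted right-left difference close to the observed right-left difference.
+    return (F.mse_loss(pred_r, y_r) + F.mse_loss(pred_l, y_l)
+            + lam * F.mse_loss(pred_r - pred_l, y_r - y_l))
+```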
+
+
+
+
+ + ♻ ☆ DocDeshadower: Frequency-Aware Transformer for Document Shadow Removal + + +
+ Shadows in scanned documents pose significant challenges for document
+analysis and recognition tasks due to their negative impact on visual quality
+and readability. Current shadow removal techniques, including traditional
+methods and deep learning approaches, face limitations in handling varying
+shadow intensities and preserving document details. To address these issues, we
+propose DocDeshadower, a novel multi-frequency Transformer-based model built
+upon the Laplacian Pyramid. By decomposing the shadow image into multiple
+frequency bands and employing two critical modules, the Attention-Aggregation
+Network for low-frequency shadow removal and the Gated Multi-scale Fusion
+Transformer for global refinement, DocDeshadower effectively removes shadows at
+different scales while preserving document content. Extensive experiments
+demonstrate DocDeshadower's superior performance compared to state-of-the-art
+methods, highlighting its potential to significantly improve document shadow
+removal techniques. The code is available at
+https://github.com/leiyingtie/DocDeshadower.
+
</p>
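+ For readers unfamiliar with the Laplacian-pyramid decomposition the model is
+built upon, the sketch below (using OpenCV; the number of levels is an
+arbitrary choice, not the paper's) splits an image into band-pass detail layers
+plus a low-frequency residual, which is where soft document shadows mostly live.
+
+```python
+import cv2
+import numpy as np
+
+def laplacian_pyramid(image, levels=3):
+    """Decompose an image into `levels` band-pass layers plus a low-frequency residual."""
+    pyramid, current = [], image.astype(np.float32)
+    for _ in range(levels):
+        down = cv2.pyrDown(current)
+        up = cv2.pyrUp(down, dstsize=(current.shape[1], current.shape[0]))
+        pyramid.append(current - up)   # detail lost by downsampling at this scale
+        current = down
+    pyramid.append(current)            # low-frequency residual
+    return pyramid
+```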
+
+ comment: Accepted by IEEE International Conference on Systems, Man, and + Cybernetics 2024 +
+
+
+
+
+ + ♻ ☆ Evidential Uncertainty Sets in Deep Classifiers Using Conformal + Prediction + + +
+ In this paper, we propose the Evidential Conformal Prediction (ECP) method
+for image classifiers to generate conformal prediction sets. Our method is
+designed based on a non-conformity score function that has its roots in
+Evidential Deep Learning (EDL), a method for quantifying model (epistemic)
+uncertainty in DNN classifiers. We use evidence derived from the logit values
+of target labels to compute the components of our non-conformity score
+function: the heuristic notion of uncertainty in CP, uncertainty surprisal, and
+expected utility. Our extensive experimental evaluation demonstrates that ECP
+outperforms three state-of-the-art methods for generating CP sets in terms of
+set size and adaptivity while maintaining the coverage of true labels.
+
</p>
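+ The abstract does not spell out the exact non-conformity score, so the sketch
+below only illustrates the generic split-conformal step that any such score
+(including an evidential one) plugs into: calibrate a quantile on held-out
+scores, then include every label whose score stays below it.
+
+```python
+import numpy as np
+
+def conformal_prediction_sets(cal_scores, cal_labels, test_scores, alpha=0.1):
+    """Split conformal prediction with a generic per-class non-conformity score.
+
+    cal_scores / test_scores: (n, K) arrays of non-conformity scores per class.
+    Returns, for each test example, the set of class indices kept at level alpha.
+    """
+    n = len(cal_labels)
+    s = cal_scores[np.arange(n), cal_labels]              # score at the true label
+    level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)  # finite-sample correction
+    q = np.quantile(s, level, method="higher")
+    return [np.where(row <= q)[0] for row in test_scores]
+```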
+
+ comment: Accepted in 13th Symposium on Conformal and Probabilistic Prediction + with Applications (COPA2024). To be published in the Proceedings of Machine + Learning Research (PMLR), vol. 230, 2024 (25 Pages) +
+
+
+
+
+ + ♻ ☆ Fast-Poly: A Fast Polyhedral Framework For 3D Multi-Object Tracking + + +
+ 3D Multi-Object Tracking (MOT) captures stable and comprehensive motion +states of surrounding obstacles, essential for robotic perception. However, +current 3D trackers face issues with accuracy and latency consistency. In this +paper, we propose Fast-Poly, a fast and effective filter-based method for 3D +MOT. Building upon our previous work Poly-MOT, Fast-Poly addresses object +rotational anisotropy in 3D space, enhances local computation densification, +and leverages parallelization technique, improving inference speed and +precision. Fast-Poly is extensively tested on two large-scale tracking +benchmarks with Python implementation. On the nuScenes dataset, Fast-Poly +achieves new state-of-the-art performance with 75.8% AMOTA among all methods +and can run at 34.2 FPS on a personal CPU. On the Waymo dataset, Fast-Poly +exhibits competitive accuracy with 63.6% MOTA and impressive inference speed +(35.5 FPS). The source code is publicly available at +https://github.com/lixiaoyu2000/FastPoly. + +
+
+ comment: 1st on the NuScenes Tracking benchmark with 75.8 AMOTA and 34.2 FPS +
+
+
+
+
+ + ♻ ☆ Harnessing the power of longitudinal medical imaging for eye disease + prognosis using Transformer-based sequence modeling + + +
+ Deep learning has enabled breakthroughs in automated diagnosis from medical +imaging, with many successful applications in ophthalmology. However, standard +medical image classification approaches only assess disease presence at the +time of acquisition, neglecting the common clinical setting of longitudinal +imaging. For slow, progressive eye diseases like age-related macular +degeneration (AMD) and primary open-angle glaucoma (POAG), patients undergo +repeated imaging over time to track disease progression and forecasting the +future risk of developing disease is critical to properly plan treatment. Our +proposed Longitudinal Transformer for Survival Analysis (LTSA) enables dynamic +disease prognosis from longitudinal medical imaging, modeling the time to +disease from sequences of fundus photography images captured over long, +irregular time periods. Using longitudinal imaging data from the Age-Related +Eye Disease Study (AREDS) and Ocular Hypertension Treatment Study (OHTS), LTSA +significantly outperformed a single-image baseline in 19/20 head-to-head +comparisons on late AMD prognosis and 18/20 comparisons on POAG prognosis. A +temporal attention analysis also suggested that, while the most recent image is +typically the most influential, prior imaging still provides additional +prognostic value. + +
+
+ comment: Accepted to npj Digital Medicine +
+
+
+
+
+ + ♻ ☆ SpatialBot: Precise Spatial Understanding with Vision Language Models + + +
+ Vision Language Models (VLMs) have achieved impressive performance in 2D
+image understanding; however, they still struggle with spatial understanding,
+which is the foundation of Embodied AI. In this paper, we propose SpatialBot
+for better spatial understanding by feeding both RGB and depth images.
+Additionally, we have constructed the SpatialQA dataset, which involves
+multi-level depth-related questions to train VLMs for depth understanding.
+Finally, we present SpatialBench to comprehensively evaluate VLMs' capabilities
+in spatial understanding at different levels. Extensive experiments on our
+spatial-understanding benchmark, general VLM benchmarks and Embodied AI tasks
+demonstrate the remarkable improvements of SpatialBot trained on SpatialQA. The
+model, code and data are available at https://github.com/BAAI-DCAI/SpatialBot.
+
</p>
+
+
+
+
+ + ♻ ☆ MMWorld: Towards Multi-discipline Multi-faceted World Model Evaluation + in Videos + + +
+ Multimodal Large Language Models (MLLMs) demonstrate the emerging abilities
+of "world models" -- interpreting and reasoning about complex real-world
+dynamics. To assess these abilities, we posit videos are the ideal medium, as
+they encapsulate rich representations of real-world dynamics and causalities.
+To this end, we introduce MMWorld, a new benchmark for multi-discipline,
+multi-faceted multimodal video understanding. MMWorld distinguishes itself from
+previous video understanding benchmarks with two unique advantages: (1)
+multi-discipline, covering various disciplines that often require domain
+expertise for comprehensive understanding; (2) multi-faceted reasoning,
+including explanation, counterfactual thinking, future prediction, etc. MMWorld
+consists of a human-annotated dataset to evaluate MLLMs with questions about
+the whole videos and a synthetic dataset to analyze MLLMs within a single
+modality of perception. Together, MMWorld encompasses 1,910 videos across seven
+broad disciplines and 69 subdisciplines, complete with 6,627 question-answer
+pairs and associated captions. The evaluation includes 2 proprietary and 10
+open-source MLLMs, which struggle on MMWorld (e.g., GPT-4V performs the best
+with only 52.3\% accuracy), showing large room for improvement. Further
+ablation studies reveal other interesting findings such as models' different
+skill sets from humans. We hope MMWorld can serve as an essential step towards
+world model evaluation in videos.
+
</p>
+
+
+
+
+ + ♻ ☆ Multi-Modality Co-Learning for Efficient Skeleton-based Action + Recognition + + +
+ Skeleton-based action recognition has garnered significant attention due to +the utilization of concise and resilient skeletons. Nevertheless, the absence +of detailed body information in skeletons restricts performance, while other +multimodal methods require substantial inference resources and are inefficient +when using multimodal data during both training and inference stages. To +address this and fully harness the complementary multimodal features, we +propose a novel multi-modality co-learning (MMCL) framework by leveraging the +multimodal large language models (LLMs) as auxiliary networks for efficient +skeleton-based action recognition, which engages in multi-modality co-learning +during the training stage and keeps efficiency by employing only concise +skeletons in inference. Our MMCL framework primarily consists of two modules. +First, the Feature Alignment Module (FAM) extracts rich RGB features from video +frames and aligns them with global skeleton features via contrastive learning. +Second, the Feature Refinement Module (FRM) uses RGB images with temporal +information and text instruction to generate instructive features based on the +powerful generalization of multimodal LLMs. These instructive text features +will further refine the classification scores and the refined scores will +enhance the model's robustness and generalization in a manner similar to soft +labels. Extensive experiments on NTU RGB+D, NTU RGB+D 120 and Northwestern-UCLA +benchmarks consistently verify the effectiveness of our MMCL, which outperforms +the existing skeleton-based action recognition methods. Meanwhile, experiments +on UTD-MHAD and SYSU-Action datasets demonstrate the commendable generalization +of our MMCL in zero-shot and domain-adaptive action recognition. Our code is +publicly available at: https://github.com/liujf69/MMCL-Action. + +
+
+
+
+
+ + ♻ ☆ Large Language Models Powered Context-aware Motion Prediction in + Autonomous Driving + + +
+ Motion prediction is among the most fundamental tasks in autonomous driving. +Traditional methods of motion forecasting primarily encode vector information +of maps and historical trajectory data of traffic participants, lacking a +comprehensive understanding of overall traffic semantics, which in turn affects +the performance of prediction tasks. In this paper, we utilized Large Language +Models (LLMs) to enhance the global traffic context understanding for motion +prediction tasks. We first conducted systematic prompt engineering, visualizing +complex traffic environments and historical trajectory information of traffic +participants into image prompts -- Transportation Context Map (TC-Map), +accompanied by corresponding text prompts. Through this approach, we obtained +rich traffic context information from the LLM. By integrating this information +into the motion prediction model, we demonstrate that such context can enhance +the accuracy of motion predictions. Furthermore, considering the cost +associated with LLMs, we propose a cost-effective deployment strategy: +enhancing the accuracy of motion prediction tasks at scale with 0.7\% +LLM-augmented datasets. Our research offers valuable insights into enhancing +the understanding of traffic scenes of LLMs and the motion prediction +performance of autonomous driving. The source code is available at +\url{https://github.com/AIR-DISCOVER/LLM-Augmented-MTR} and +\url{https://aistudio.baidu.com/projectdetail/7809548}. + +
+
+ comment: 6 pages, 4 figures
+</p>
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Studio-like Avatar Creation from a Monocular Phone + Capture ECCV 2024 + + +
+ Creating photorealistic avatars for individuals traditionally involves +extensive capture sessions with complex and expensive studio devices like the +LightStage system. While recent strides in neural representations have enabled +the generation of photorealistic and animatable 3D avatars from quick phone +scans, they have the capture-time lighting baked-in, lack facial details and +have missing regions in areas such as the back of the ears. Thus, they lag in +quality compared to studio-captured avatars. In this paper, we propose a method +that bridges this gap by generating studio-like illuminated texture maps from +short, monocular phone captures. We do this by parameterizing the phone texture +maps using the $W^+$ space of a StyleGAN2, enabling near-perfect +reconstruction. Then, we finetune a StyleGAN2 by sampling in the $W^+$ +parameterized space using a very small set of studio-captured textures as an +adversarial training signal. To further enhance the realism and accuracy of +facial details, we super-resolve the output of the StyleGAN2 using carefully +designed diffusion model that is guided by image gradients of the +phone-captured texture map. Once trained, our method excels at producing +studio-like facial texture maps from casual monocular smartphone videos. +Demonstrating its capabilities, we showcase the generation of photorealistic, +uniformly lit, complete avatars from monocular phone captures. The project page +can be found at http://shahrukhathar.github.io/2024/07/22/Bridging.html + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ AutoRG-Brain: Grounded Report Generation for Brain MRI + + +
+ Radiologists are tasked with interpreting a large number of images on a
+daily basis, with the responsibility of generating corresponding reports. This
+demanding workload elevates the risk of human error, potentially leading to
+treatment delays, increased healthcare costs, revenue loss, and operational
+inefficiencies. To address these challenges, we initiate a series of work on
+grounded Automatic Report Generation (AutoRG), starting from the brain MRI
+interpretation system, which supports the delineation of brain structures, the
+localization of anomalies, and the generation of well-organized findings. We
+make contributions in the following aspects. First, on dataset construction,
+we release a comprehensive dataset encompassing segmentation masks of anomaly
+regions and manually authored reports, termed RadGenome-Brain MRI. This data
+resource is intended to catalyze ongoing research and development in the field
+of AI-assisted report generation systems. Second, on system design, we propose
+AutoRG-Brain, the first brain MRI report generation system with pixel-level
+grounded visual clues. Third, for evaluation, we conduct quantitative
+assessments and human evaluations of brain structure segmentation, anomaly
+localization, and report generation tasks to provide evidence of its
+reliability and accuracy. This system has been integrated into real clinical
+scenarios, where radiologists were instructed to write reports based on our
+generated findings and anomaly segmentation masks. The results demonstrate that
+our system enhances the report-writing skills of junior doctors, aligning their
+performance more closely with senior doctors, thereby boosting overall
+productivity.
+
</p>
+
+
+
+
+ + ♻ ☆ MimiQ: Low-Bit Data-Free Quantization of Vision Transformers with + Encouraging Inter-Head Attention Similarity + + +
+ Data-free quantization (DFQ) is a technique that creates a lightweight
+network from its full-precision counterpart without the original training data,
+often through a synthetic dataset. Although several DFQ methods have been
+proposed for vision transformer (ViT) architectures, they fail to achieve
+efficacy in low-bit settings. Examining the existing methods, we identify that
+their synthetic data produce misaligned attention maps, while those of the real
+samples are highly aligned. From the observation of aligned attention, we find
+that aligning attention maps of synthetic data helps to improve the overall
+performance of quantized ViTs. Motivated by this finding, we devise MimiQ, a
+novel DFQ method designed for ViTs that focuses on inter-head attention
+similarity. First, we generate synthetic data by aligning head-wise attention
+responses in relation to spatial query patches. Then, we apply head-wise
+structural attention distillation to align the attention maps of the quantized
+network to those of the full-precision teacher. The experimental results show
+that the proposed method significantly outperforms baselines, setting a new
+state-of-the-art performance for data-free ViT quantization.
+
</p>
+
+ comment: Author Preprint +
+
+
+
+
+ + ♻ ☆ DreamCar: Leveraging Car-specific Prior for in-the-wild 3D Car + Reconstruction + + +
+ Self-driving industries usually employ professional artists to build
+exquisite 3D cars. However, it is expensive to craft large-scale digital
+assets. Since there are already numerous datasets available that contain a vast
+number of images of cars, we focus on reconstructing high-quality 3D car models
+from these datasets. However, these datasets only contain one side of each car,
+captured in forward-moving scenes. We try to use existing generative models to
+provide more supervision information, but they struggle to generalize well to
+cars since they are trained on synthetic datasets that are not car-specific. In
+addition, the reconstructed 3D car texture is misaligned due to large errors in
+camera pose estimation when dealing with in-the-wild images. These restrictions
+make it challenging for previous methods to reconstruct complete 3D cars. To
+address these problems, we propose a novel method, named DreamCar, which can
+reconstruct high-quality 3D cars given a few images or even a single image. To
+generalize the generative model, we collect a car dataset, named Car360, with
+over 5,600 vehicles. With this dataset, we make the generative model more
+robust to cars. We use this car-specific generative prior to guide the
+reconstruction via Score Distillation Sampling. To further complement the
+supervision information, we utilize the geometric and appearance symmetry of
+cars. Finally, we propose a pose optimization method that rectifies poses to
+tackle texture misalignment. Extensive experiments demonstrate that our method
+significantly outperforms existing methods in reconstructing high-quality 3D
+cars. \href{https://xiaobiaodu.github.io/dreamcar-project/}{Our code is
+available.}
+
</p>
+
+ comment: Project Page: https://xiaobiaodu.github.io/dreamcar-project/
+</p>
+
+
+
+
+ + ♻ ☆ Multi-view X-ray Image Synthesis with Multiple Domain Disentanglement + from CT Scans ACM MM2024 + + +
+ X-ray images play a vital role in intraoperative processes due to their
+high resolution and fast imaging speed, and they greatly promote subsequent
+segmentation, registration and reconstruction. However, excessive X-ray doses
+pose potential risks to human health. Data-driven algorithms that map volume
+scans to X-ray images are restricted by the scarcity of paired X-ray and volume
+data. Existing methods are mainly realized by modelling the whole X-ray imaging
+procedure. In this study, we propose a learning-based approach termed CT2X-GAN
+to synthesize X-ray images in an end-to-end manner using content and style
+disentanglement from three different image domains. Our method decouples
+anatomical structure information from CT scans and style information from
+unpaired real X-ray images / digitally reconstructed radiography (DRR) images
+via a series of decoupling encoders. Additionally, we introduce a novel
+consistency regularization term to improve the stylistic resemblance between
+synthesized X-ray images and real X-ray images. Meanwhile, we also impose
+supervision by computing the similarity between real DRR and synthesized DRR
+images. We further develop a pose attention module to fully strengthen the
+comprehensive information in the decoupled content code from CT scans,
+facilitating high-quality multi-view image synthesis in the lower 2D space.
+Extensive experiments were conducted on the publicly available CTSpine1K
+dataset, achieving 97.8350, 0.0842 and 3.0938 in terms of FID, KID and a
+user-scored X-ray similarity, respectively. In comparison with 3D-aware methods
+($\pi$-GAN, EG3D), CT2X-GAN is superior in synthesis quality and produces
+results closer to real X-ray images.
+
</p>
+
+ comment: 13 pages, 10 figures, ACM MM2024 +
+
+
+
+
+ + ♻ ☆ Optimizing LaneSegNet for Real-Time Lane Topology Prediction in + Autonomous Vehicles + + +
+ With the increasing prevalence of autonomous vehicles, it is essential for +computer vision algorithms to accurately assess road features in real-time. +This study explores the LaneSegNet architecture, a new approach to lane +topology prediction which integrates topological information with lane-line +data to provide a more contextual understanding of road environments. The +LaneSegNet architecture includes a feature extractor, lane encoder, lane +decoder, and prediction head, leveraging components from ResNet-50, BEVFormer, +and various attention mechanisms. We experimented with optimizations to the +LaneSegNet architecture through feature extractor modification and transformer +encoder-decoder stack modification. We found that modifying the encoder and +decoder stacks offered an interesting tradeoff between training time and +prediction accuracy, with certain combinations showing promising results. Our +implementation, trained on a single NVIDIA Tesla A100 GPU, found that a 2:4 +ratio reduced training time by 22.3% with only a 7.1% drop in mean average +precision, while a 4:8 ratio increased training time by only 11.1% but improved +mean average precision by a significant 23.7%. These results indicate that +strategic hyperparameter tuning can yield substantial improvements depending on +the resources of the user. This study provides valuable insights for optimizing +LaneSegNet according to available computation power, making it more accessible +for users with limited resources and increasing the capabilities for users with +more powerful resources. + +
+
+ comment: 18 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ CriSp: Leveraging Tread Depth Maps for Enhanced Crime-Scene Shoeprint + Matching + + +
+ Shoeprints are a common type of evidence found at crime scenes and are used +regularly in forensic investigations. However, existing methods cannot +effectively employ deep learning techniques to match noisy and occluded +crime-scene shoeprints to a shoe database due to a lack of training data. +Moreover, all existing methods match crime-scene shoeprints to clean reference +prints, yet our analysis shows matching to more informative tread depth maps +yields better retrieval results. The matching task is further complicated by +the necessity to identify similarities only in corresponding regions (heels, +toes, etc) of prints and shoe treads. To overcome these challenges, we leverage +shoe tread images from online retailers and utilize an off-the-shelf predictor +to estimate depth maps and clean prints. Our method, named CriSp, matches +crime-scene shoeprints to tread depth maps by training on this data. CriSp +incorporates data augmentation to simulate crime-scene shoeprints, an encoder +to learn spatially-aware features, and a masking module to ensure only visible +regions of crime-scene prints affect retrieval results. To validate our +approach, we introduce two validation sets by reprocessing existing datasets of +crime-scene shoeprints and establish a benchmarking protocol for comparison. On +this benchmark, CriSp significantly outperforms state-of-the-art methods in +both automated shoeprint matching and image retrieval tailored to this task. + +
+
+
+
+
+ + ♻ ☆ NU-AIR -- A Neuromorphic Urban Aerial Dataset for Detection and + Localization of Pedestrians and Vehicles + + +
+ This paper presents an open-source aerial neuromorphic dataset that captures +pedestrians and vehicles moving in an urban environment. The dataset, titled +NU-AIR, features 70.75 minutes of event footage acquired with a 640 x 480 +resolution neuromorphic sensor mounted on a quadrotor operating in an urban +environment. Crowds of pedestrians, different types of vehicles, and street +scenes featuring busy urban environments are captured at different elevations +and illumination conditions. Manual bounding box annotations of vehicles and +pedestrians contained in the recordings are provided at a frequency of 30 Hz, +yielding 93,204 labels in total. Evaluation of the dataset's fidelity is +performed through comprehensive ablation study for three Spiking Neural +Networks (SNNs) and training ten Deep Neural Networks (DNNs) to validate the +quality and reliability of both the dataset and corresponding annotations. All +data and Python code to voxelize the data and subsequently train SNNs/DNNs has +been open-sourced. + +
+
+ comment: 24 pages, 8 figures +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ Learn by Selling: Equipping Large Language Models with Product Knowledge + for Context-Driven Recommendations + + +
+ The rapid evolution of large language models (LLMs) has opened up new +possibilities for applications such as context-driven product recommendations. +However, the effectiveness of these models in this context is heavily reliant +on their comprehensive understanding of the product inventory. This paper +presents a novel approach to equipping LLMs with product knowledge by training +them to respond contextually to synthetic search queries that include product +IDs. We delve into an extensive analysis of this method, evaluating its +effectiveness, outlining its benefits, and highlighting its constraints. The +paper also discusses the potential improvements and future directions for this +approach, providing a comprehensive understanding of the role of LLMs in +product recommendations. + +
+
+
+
+
+ + ☆ JaColBERTv2.5: Optimising Multi-Vector Retrievers to Create + State-of-the-Art Japanese Retrievers with Constrained Resources + + +
+ Neural Information Retrieval has advanced rapidly in high-resource languages, +but progress in lower-resource ones such as Japanese has been hindered by data +scarcity, among other challenges. Consequently, multilingual models have +dominated Japanese retrieval, despite their computational inefficiencies and +inability to capture linguistic nuances. While recent multi-vector monolingual +models like JaColBERT have narrowed this gap, they still lag behind +multilingual methods in large-scale evaluations. This work addresses the +suboptimal training methods of multi-vector retrievers in lower-resource +settings, focusing on Japanese. We systematically evaluate and improve key +aspects of the inference and training settings of JaColBERT, and more broadly, +multi-vector models. We further enhance performance through a novel checkpoint +merging step, showcasing it to be an effective way of combining the benefits of +fine-tuning with the generalization capabilities of the original checkpoint. +Building on our analysis, we introduce a novel training recipe, resulting in +the JaColBERTv2.5 model. JaColBERTv2.5, with only 110 million parameters and +trained in under 15 hours on 4 A100 GPUs, significantly outperforms all +existing methods across all common benchmarks, reaching an average score of +0.754, significantly above the previous best of 0.720. To support future +research, we make our final models, intermediate checkpoints and all data used +publicly available. + +
+
+
+
+
+ + ☆ RevGNN: Negative Sampling Enhanced Contrastive Graph Learning for + Academic Reviewer Recommendation + + +
+ Acquiring reviewers for academic submissions is a challenging recommendation +scenario. Recent graph learning-driven models have made remarkable progress in +the field of recommendation, but their performance in the academic reviewer +recommendation task may suffer from a significant false negative issue. This +arises from the assumption that unobserved edges represent negative samples. In +fact, the mechanism of anonymous review results in inadequate exposure of +interactions between reviewers and submissions, leading to a higher number of +unobserved interactions compared to those caused by reviewers declining to +participate. Therefore, investigating how to better comprehend the negative +labeling of unobserved interactions in academic reviewer recommendations is a +significant challenge. This study aims to tackle the ambiguous nature of +unobserved interactions in academic reviewer recommendations. Specifically, we +propose an unsupervised Pseudo Neg-Label strategy to enhance graph contrastive +learning (GCL) for recommending reviewers for academic submissions, which we +call RevGNN. RevGNN utilizes a two-stage encoder structure that encodes both +scientific knowledge and behavior using Pseudo Neg-Label to approximate review +preference. Extensive experiments on three real-world datasets demonstrate that +RevGNN outperforms all baselines across four metrics. Additionally, detailed +further analyses confirm the effectiveness of each component in RevGNN. + +
+
+ comment: Accepted by ACM Transactions on Information Systems (TOIS) +
+
+
+
+
+ + ☆ Powerful A/B-Testing Metrics and Where to Find Them RecSys '24 + + +
+ Online controlled experiments, colloquially known as A/B-tests, are the bread +and butter of real-world recommender system evaluation. Typically, end-users +are randomly assigned some system variant, and a plethora of metrics are then +tracked, collected, and aggregated throughout the experiment. A North Star +metric (e.g. long-term growth or revenue) is used to assess which system +variant should be deemed superior. As a result, most collected metrics are +supporting in nature, and serve to either (i) provide an understanding of how +the experiment impacts user experience, or (ii) allow for confident +decision-making when the North Star metric moves insignificantly (i.e. a false +negative or type-II error). The latter is not straightforward: suppose a +treatment variant leads to fewer but longer sessions, with more views but fewer +engagements; should this be considered a positive or negative outcome? + The question then becomes: how do we assess a supporting metric's utility +when it comes to decision-making using A/B-testing? Online platforms typically +run dozens of experiments at any given time. This provides a wealth of +information about interventions and treatment effects that can be used to +evaluate metrics' utility for online evaluation. We propose to collect this +information and leverage it to quantify type-I, type-II, and type-III errors +for the metrics of interest, alongside a distribution of measurements of their +statistical power (e.g. $z$-scores and $p$-values). We present results and +insights from building this pipeline at scale for two large-scale short-video +platforms: ShareChat and Moj; leveraging hundreds of past experiments to find +online metrics with high statistical power. + +
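+ As a rough illustration of the kind of computation involved (not the authors'
+pipeline), the snippet below takes a supporting metric's observed treatment
+effects and standard errors from a set of past experiments and reports its
+z-scores, p-values, and the fraction of experiments in which it reached
+significance, a crude empirical-power proxy.
+
+```python
+import numpy as np
+from scipy import stats
+
+def metric_power_from_past_experiments(deltas, std_errs, alpha=0.05):
+    """Summarize a supporting metric's sensitivity across logged A/B tests.
+
+    deltas: observed treatment-minus-control differences, one per experiment.
+    std_errs: the corresponding standard errors of those differences.
+    """
+    z = np.asarray(deltas, dtype=float) / np.asarray(std_errs, dtype=float)
+    p = 2 * stats.norm.sf(np.abs(z))          # two-sided p-values
+    empirical_power = float(np.mean(p < alpha))
+    return z, p, empirical_power
+```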
+
+ comment: Accepted to the Industry Track of the 2024 ACM Conference on + Recommender Systems (RecSys '24) +
+
+
+
+
+ + ☆ GenRec: Generative Personalized Sequential Recommendation + + +
+ Sequential recommendation is a task to capture hidden user preferences from
+historical user-item interaction data. Significant progress has been made in
+this domain by leveraging classification-based learning methods. Inspired by
+the recent paradigm of 'pretrain, prompt and predict' in NLP, we consider
+sequential recommendation as a sequence-to-sequence generation task and propose
+a novel model named Generative Recommendation (GenRec). Unlike classification
+based models that learn explicit user and item representations, GenRec utilizes
+the sequence modeling capability of Transformer and adopts the masked item
+prediction objective to effectively learn the hidden bidirectional sequential
+patterns. Different from existing generative sequential recommendation models,
+GenRec does not rely on manually designed hard prompts. The input to GenRec is
+a textual user-item sequence and the output is the top-ranked next items.
+Moreover, GenRec is lightweight and requires only a few hours to train
+effectively in low-resource settings, making it highly applicable to real-world
+scenarios and helping to democratize large language models in the sequential
+recommendation domain. Our extensive experiments demonstrate that GenRec
+generalizes on various public real-world datasets and achieves state-of-the-art
+results. Our experiments also validate the effectiveness of the proposed masked
+item prediction objective, which improves the model performance by a large
+margin.
+
</p>
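+ A minimal sketch of a masked-item-prediction objective of the kind described
+(the mask token, masking probability, and item naming are illustrative
+assumptions, not details from the paper):
+
+```python
+import random
+
+MASK_TOKEN = "[MASK]"  # hypothetical mask token
+
+def mask_item_sequence(item_sequence, mask_prob=0.15, seed=None):
+    """Randomly mask items in a user's interaction sequence; masked positions
+    become prediction targets, unmasked positions are ignored by the loss."""
+    rng = random.Random(seed)
+    inputs, labels = [], []
+    for item in item_sequence:
+        if rng.random() < mask_prob:
+            inputs.append(MASK_TOKEN)
+            labels.append(item)     # predict the original item here
+        else:
+            inputs.append(item)
+            labels.append(None)     # no loss at this position
+    return inputs, labels
+
+# Example: mask_item_sequence(["item_42", "item_7", "item_1024"], seed=0)
+```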
+
+
+
+
+ + ♻ ☆ Preliminary Study on Incremental Learning for Large Language Model-based + Recommender Systems CIKM 2024 + + +
+ Adapting Large Language Models for Recommendation (LLM4Rec) has shown +promising results. However, the challenges of deploying LLM4Rec in real-world +scenarios remain largely unexplored. In particular, recommender models need +incremental adaptation to evolving user preferences, while the suitability of +traditional incremental learning methods within LLM4Rec remains ambiguous due +to the unique characteristics of Large Language Models (LLMs). + In this study, we empirically evaluate two commonly employed incremental +learning strategies (full retraining and fine-tuning) for LLM4Rec. +Surprisingly, neither approach shows significant improvements in the +performance of LLM4Rec. Instead of dismissing the role of incremental learning, +we attribute the lack of anticipated performance enhancement to a mismatch +between the LLM4Rec architecture and incremental learning: LLM4Rec employs a +single adaptation module for learning recommendations, limiting its ability to +simultaneously capture long-term and short-term user preferences in the +incremental learning context. To test this speculation, we introduce a Long- +and Short-term Adaptation-aware Tuning (LSAT) framework for incremental +learning in LLM4Rec. Unlike the single adaptation module approach, LSAT +utilizes two distinct adaptation modules to independently learn long-term and +short-term user preferences. Empirical results verify that LSAT enhances +performance, thereby validating our speculation. We release our code at: +https://github.com/TianhaoShi2001/LSAT. + +
+
+ comment: accepted in the short paper track of the 2024 ACM International + Conference on Information and Knowledge Management (CIKM 2024) +
+
+
+
+
+ + ♻ ☆ EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively + Exploring Electronic Health Records ACL 2024 + + +
+ In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL +dataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to +address critical yet underexplored aspects in text-to-SQL parsing: +interactivity, compositionality, and efficiency. To the best of our knowledge, +EHR-SeqSQL is not only the largest but also the first medical text-to-SQL +dataset benchmark to include sequential and contextual questions. We provide a +data split and the new test set designed to assess compositional generalization +ability. Our experiments demonstrate the superiority of a multi-turn approach +over a single-turn approach in learning compositionality. Additionally, our +dataset integrates specially crafted tokens into SQL queries to improve +execution efficiency. With EHR-SeqSQL, we aim to bridge the gap between +practical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is +available at https://github.com/seonhee99/EHR-SeqSQL. + +
+
+ comment: ACL 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Repeated Padding for Sequential Recommendation RecSys 2024 + + +
+ Sequential recommendation aims to provide users with personalized suggestions +based on their historical interactions. When training sequential models, +padding is a widely adopted technique for two main reasons: 1) The vast +majority of models can only handle fixed-length sequences; 2) Batching-based +training needs to ensure that the sequences in each batch have the same length. +The special value \emph{0} is usually used as the padding content, which does +not contain the actual information and is ignored in the model calculations. +This common-sense padding strategy leads us to a problem that has never been +explored before: \emph{Can we fully utilize this idle input space by padding +other content to further improve model performance and training efficiency?} + In this paper, we propose a simple yet effective padding method called +\textbf{Rep}eated \textbf{Pad}ding (\textbf{RepPad}). Specifically, we use the +original interaction sequences as the padding content and fill it to the +padding positions during model training. This operation can be performed a +finite number of times or repeated until the input sequences' length reaches +the maximum limit. Our RepPad can be viewed as a sequence-level data +augmentation strategy. Unlike most existing works, our method contains no +trainable parameters or hyperparameters and is a plug-and-play data +augmentation operation. Extensive experiments on various categories of +sequential models and five real-world datasets demonstrate the effectiveness +and efficiency of our approach. The average recommendation performance +improvement is up to 60.3\% on GRU4Rec and 24.3\% on SASRec. We also provide +in-depth analysis and explanation of what makes RepPad effective from multiple +perspectives. Our datasets and codes are available at +\url{https://github.com/KingGugu/RepPad}. + +
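+ A minimal sketch of the repeated-padding idea follows; where the copies are
+placed and how any leftover slots are filled are our own assumptions, not
+necessarily the paper's exact choices.
+
+```python
+def repeated_padding(sequence, max_len):
+    """Fill idle padding positions by repeating the interaction sequence
+    instead of zeros; remaining slots fall back to conventional zero padding."""
+    if not sequence or len(sequence) >= max_len:
+        return sequence[-max_len:]
+    padded = list(sequence)
+    while len(padded) + len(sequence) <= max_len:
+        padded = list(sequence) + padded      # prepend one more full copy
+    return [0] * (max_len - len(padded)) + padded
+
+# repeated_padding([5, 9, 3], max_len=10) -> [0, 5, 9, 3, 5, 9, 3, 5, 9, 3]
+```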
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ♻ ☆ C-RAG: Certified Generation Risks for Retrieval-Augmented Language + Models ICML 2024 + + +
+ Despite the impressive capabilities of large language models (LLMs) across
+diverse applications, they still suffer from trustworthiness issues, such as
+hallucinations and misalignments. Retrieval-augmented language models (RAG)
+have been proposed to enhance the credibility of generations by grounding them
+in external knowledge, but the theoretical understanding of their generation
+risks remains unexplored. In this paper, we answer: 1) whether RAG can indeed
+lead to low generation risks, 2) how to provide provable guarantees on the
+generation risks of RAG and vanilla LLMs, and 3) what sufficient conditions
+enable RAG models to reduce generation risks. We propose C-RAG, the first
+framework to certify generation risks for RAG models. Specifically, we provide
+conformal risk analysis for RAG models and certify an upper confidence bound of
+generation risks, which we refer to as conformal generation risk. We also
+provide theoretical guarantees on conformal generation risks for general
+bounded risk functions under test distribution shifts. We prove that RAG
+achieves a lower conformal generation risk than that of a single LLM when the
+quality of the retrieval model and transformer is non-trivial. Our intensive
+empirical results demonstrate the soundness and tightness of our conformal
+generation risk guarantees across four widely-used NLP datasets on four
+state-of-the-art retrieval models.
+
</p>
+
+ comment: Accepted to ICML 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Behavior Generative Recommendation CIKM 2024 + + +
+ Multi-behavior sequential recommendation (MBSR) aims to incorporate behavior +types of interactions for better recommendations. Existing approaches focus on +the next-item prediction objective, neglecting the value of integrating the +target behavior type into the learning objective. In this paper, we propose +MBGen, a novel Multi-Behavior sequential Generative recommendation framework. +We formulate the MBSR task into a consecutive two-step process: (1) given item +sequences, MBGen first predicts the next behavior type to frame the user +intention, (2) given item sequences and a target behavior type, MBGen then +predicts the next items. To model such a two-step process, we tokenize both +behaviors and items into tokens and construct one single token sequence with +both behaviors and items placed interleaved. Furthermore, MBGen learns to +autoregressively generate the next behavior and item tokens in a unified +generative recommendation paradigm, naturally enabling a multi-task capability. +Additionally, we exploit the heterogeneous nature of token sequences in the +generative recommendation and propose a position-routed sparse architecture to +efficiently and effectively scale up models. Extensive experiments on public +datasets demonstrate that MBGen significantly outperforms existing MBSR models +across multiple tasks. + +
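+ The two-step formulation relies on a single token stream in which behavior and
+item tokens alternate; a toy sketch of such a tokenization (the token naming is
+illustrative):
+
+```python
+def interleave_behavior_item_tokens(interactions):
+    """Turn [(behavior, item_id), ...] into one interleaved token sequence,
+    e.g. [('click', 12), ('buy', 12)] -> ['<click>', '<item_12>', '<buy>', '<item_12>']."""
+    tokens = []
+    for behavior, item_id in interactions:
+        tokens.append(f"<{behavior}>")
+        tokens.append(f"<item_{item_id}>")
+    return tokens
+```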
+
+ comment: Camera ready; accepted by CIKM 2024 +
+
+
+
+
+ + ♻ ☆ REAPER: Reasoning based Retrieval Planning for Complex RAG Systems + + +
+ Complex dialog systems often use retrieved evidence to facilitate factual +responses. Such RAG (Retrieval Augmented Generation) systems retrieve from +massive heterogeneous data stores that are usually architected as multiple +indexes or APIs instead of a single monolithic source. For a given query, +relevant evidence needs to be retrieved from one or a small subset of possible +retrieval sources. Complex queries can even require multi-step retrieval. For +example, a conversational agent on a retail site answering customer questions +about past orders will need to retrieve the appropriate customer order first +and then the evidence relevant to the customer's question in the context of the +ordered product. Most RAG Agents handle such Chain-of-Thought (CoT) tasks by +interleaving reasoning and retrieval steps. However, each reasoning step +directly adds to the latency of the system. For large models this latency cost +is significant -- in the order of multiple seconds. Multi-agent systems may +classify the query to a single Agent associated with a retrieval source, though +this means that a (small) classification model dictates the performance of a +large language model. In this work we present REAPER (REAsoning-based PlannER) +- an LLM based planner to generate retrieval plans in conversational systems. +We show significant gains in latency over Agent-based systems and are able to +scale easily to new and unseen use cases as compared to classification-based +planning. Though our method can be applied to any RAG system, we show our +results in the context of a conversational shopping assistant. + +
+
+
+
+
+
+
+
+ + Machine Learning 152 + +
+
+
+ + ☆ CLEFT: Language-Image Contrastive Learning with Efficient Large Language + Model and Prompt Fine-Tuning MICCAI 2024 + + +
+ Recent advancements in Contrastive Language-Image Pre-training (CLIP) have
+demonstrated notable success in self-supervised representation learning across
+various tasks. However, the existing CLIP-like approaches often demand
+extensive GPU resources and prolonged training times due to the considerable
+size of the model and dataset, making them ill-suited for medical applications,
+in which large datasets are not always available. Meanwhile, the language model
+prompts are mainly manually derived from labels tied to images, potentially
+overlooking the richness of information within training samples. We introduce a
+novel language-image Contrastive Learning method with an Efficient large
+language model and prompt Fine-Tuning (CLEFT) that harnesses the strengths of
+extensive pre-trained language and visual models. Furthermore, we present an
+efficient strategy for learning context-based prompts that mitigates the gap
+between informative clinical diagnostic data and simple class labels. Our
+method demonstrates state-of-the-art performance on multiple chest X-ray and
+mammography datasets compared with various baselines. The proposed parameter
+efficient framework can reduce the total trainable model size by 39% and reduce
+the trainable language model to only 4% compared with the current BERT encoder.
+
</p>
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ GABInsight: Exploring Gender-Activity Binding Bias in Vision-Language + Models + + +
+ Vision-language models (VLMs) are intensively used in many downstream tasks, +including those requiring assessments of individuals appearing in the images. +While VLMs perform well in simple single-person scenarios, in real-world +applications, we often face complex situations in which there are persons of +different genders doing different activities. We show that in such cases, VLMs +are biased towards identifying the individual with the expected gender +(according to ingrained gender stereotypes in the model or other forms of +sample selection bias) as the performer of the activity. We refer to this bias +in associating an activity with the gender of its actual performer in an image +or text as the Gender-Activity Binding (GAB) bias and analyze how this bias is +internalized in VLMs. To assess this bias, we have introduced the GAB dataset +with approximately 5500 AI-generated images that represent a variety of +activities, addressing the scarcity of real-world images for some scenarios. To +have extensive quality control, the generated images are evaluated for their +diversity, quality, and realism. We have tested 12 renowned pre-trained VLMs on +this dataset in the context of text-to-image and image-to-text retrieval to +measure the effect of this bias on their predictions. Additionally, we have +carried out supplementary experiments to quantify the bias in VLMs' text +encoders and to evaluate VLMs' capability to recognize activities. Our +experiments indicate that VLMs experience an average performance decline of +about 13.2% when confronted with gender-activity binding bias. + +
+
+
+
+
+ + ☆ MoFO: Momentum-Filtered Optimizer for Mitigating Forgetting in LLM + Fine-Tuning + + +
+ Recently, large language models (LLMs) have demonstrated remarkable +capabilities in a wide range of tasks. Typically, an LLM is pre-trained on +large corpora and subsequently fine-tuned on task-specific datasets. However, +during finetuning, LLMs may forget the knowledge acquired in the pretraining +stage, leading to a decline in general capabilities. To address this issue, we +propose a new fine-tuning algorithm termed Momentum-Filtered Optimizer (MoFO). +The key idea of MoFO is to iteratively select and update the model parameters +with the largest momentum magnitudes. Compared to full-parameter training, MoFO +achieves similar fine-tuning performance while keeping parameters closer to the +pre-trained model, thereby mitigating knowledge forgetting. Unlike most +existing methods for forgetting mitigation, MoFO combines the following two +advantages. First, MoFO does not require access to pre-training data. This +makes MoFO particularly suitable for fine-tuning scenarios where pre-training +data is unavailable, such as fine-tuning checkpoint-only open-source LLMs. +Second, MoFO does not alter the original loss function. This could avoid +impairing the model performance on the fine-tuning tasks. We validate MoFO +through rigorous convergence analysis and extensive experiments, demonstrating +its superiority over existing methods in mitigating forgetting and enhancing +fine-tuning performance. + +
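+ A rough sketch of the momentum-filtering idea on top of an Adam-style
+optimizer is shown below. The `exp_avg` buffer name assumes torch.optim.Adam or
+AdamW, and the keep ratio and per-tensor (rather than global) selection are our
+own assumptions rather than details taken from the abstract.
+
+```python
+import torch
+
+@torch.no_grad()
+def apply_momentum_filter(optimizer, keep_ratio=0.1):
+    """Zero the gradient everywhere except the fraction of entries with the
+    largest first-moment magnitude, so the next optimizer.step() only moves
+    those parameters. Uses the momentum buffers from the previous step."""
+    for group in optimizer.param_groups:
+        for p in group["params"]:
+            if p.grad is None:
+                continue
+            state = optimizer.state.get(p, {})
+            momentum = state.get("exp_avg", p.grad)   # fall back to grad on step 0
+            k = max(1, int(keep_ratio * momentum.numel()))
+            kth = momentum.abs().flatten().kthvalue(momentum.numel() - k + 1).values
+            p.grad.mul_((momentum.abs() >= kth).to(p.grad.dtype))
+
+# Illustrative training loop:
+#   loss.backward(); apply_momentum_filter(optimizer); optimizer.step(); optimizer.zero_grad()
+```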
+
+
+
+
+ + ☆ From Feature Importance to Natural Language Explanations Using LLMs with + RAG + + +
+ As machine learning becomes increasingly integral to autonomous +decision-making processes involving human interaction, the necessity of +comprehending the model's outputs through conversational means increases. Most +recently, foundation models are being explored for their potential as post hoc +explainers, providing a pathway to elucidate the decision-making mechanisms of +predictive models. In this work, we introduce traceable question-answering, +leveraging an external knowledge repository to inform the responses of Large +Language Models (LLMs) to user queries within a scene understanding task. This +knowledge repository comprises contextual details regarding the model's output, +containing high-level features, feature importance, and alternative +probabilities. We employ subtractive counterfactual reasoning to compute +feature importance, a method that entails analysing output variations resulting +from decomposing semantic features. Furthermore, to maintain a seamless +conversational flow, we integrate four key characteristics - social, causal, +selective, and contrastive - drawn from social science research on human +explanations into a single-shot prompt, guiding the response generation +process. Our evaluation demonstrates that explanations generated by the LLMs +encompassed these elements, indicating its potential to bridge the gap between +complex model outputs and natural language expressions. + +
+
+
+
+
+ + ☆ Contrasting Deep Learning Models for Direct Respiratory Insufficiency + Detection Versus Blood Oxygen Saturation Estimation + + +
+ We contrast the high effectiveness of state-of-the-art deep learning
+architectures designed for general audio classification tasks, refined here for
+respiratory insufficiency (RI) detection, with their performance on blood
+oxygen saturation (SpO2) estimation and classification through automated audio
+analysis. Recently, multiple deep learning architectures have been proposed to
+detect RI in COVID patients through audio analysis, achieving accuracy above
+95% and F1-score above 0.93. RI is a condition associated with low SpO2 levels,
+commonly defined by the threshold SpO2 < 92%. While SpO2 serves as a crucial
+determinant of RI, a medical doctor's diagnosis typically relies on multiple
+factors. These include respiratory frequency, heart rate, and SpO2 levels,
+among others. Here we study pretrained audio neural networks (CNN6, CNN10 and
+CNN14) and the Masked Autoencoder (Audio-MAE) for RI detection, where these
+models achieve near-perfect accuracy, surpassing previous results. Yet, for the
+regression task of estimating SpO2 levels, the models achieve root mean square
+error values exceeding the accepted clinical range of 3.5% for finger
+oximeters. Additionally, Pearson correlation coefficients fail to surpass 0.3.
+As deep learning models perform better in classification than regression, we
+transform SpO2 regression into a SpO2-threshold binary classification problem,
+with a threshold of 92%. However, this task still yields an F1-score below
+0.65. Thus, audio analysis offers valuable insights into a patient's RI status,
+but does not provide accurate information about actual SpO2 levels, indicating
+a separation of domains in which voice and speech biomarkers may and may not be
+useful in medical diagnostics under current technologies.
+
</p>
+
+ comment: 23 pages, 4 figures, in review at Journal of Biomedical Signal + Processing and Control +
+
+
+
+
+ + ☆ Learning Ordinality in Semantic Segmentation + + +
+ Semantic segmentation consists of predicting a semantic label for each image +pixel. Conventional deep learning models do not take advantage of ordinal +relations that might exist in the domain at hand. For example, it is known that +the pupil is inside the iris, and the lane markings are inside the road. Such +domain knowledge can be employed as constraints to make the model more robust. +The current literature on this topic has explored pixel-wise ordinal +segmentation methods, which treat each pixel as an independent observation and +promote ordinality in its representation. This paper proposes novel spatial +ordinal segmentation methods, which take advantage of the structured image +space by considering each pixel as an observation dependent on its neighborhood +context to also promote ordinal spatial consistency. When evaluated with five +biomedical datasets and multiple configurations of autonomous driving datasets, +ordinal methods resulted in more ordinally-consistent models, with substantial +improvements in ordinal metrics and some increase in the Dice coefficient. It +was also shown that the incorporation of ordinal consistency results in models +with better generalization abilities. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ An Effective Dynamic Gradient Calibration Method for Continual Learning + + +
+ Continual learning (CL) is a fundamental topic in machine learning, where the +goal is to train a model with continuously incoming data and tasks. Due to the +memory limit, we cannot store all the historical data, and therefore confront +the ``catastrophic forgetting'' problem, i.e., the performance on the previous +tasks can substantially decrease because of the missing information in the +latter period. Though a number of elegant methods have been proposed, the +catastrophic forgetting phenomenon still cannot be well avoided in practice. In +this paper, we study the problem from the gradient perspective, where our aim +is to develop an effective algorithm to calibrate the gradient in each updating +step of the model; namely, our goal is to guide the model to be updated in the +right direction under the situation that a large amount of historical data are +unavailable. Our idea is partly inspired by the seminal stochastic variance +reduction methods (e.g., SVRG and SAGA) for reducing the variance of gradient +estimation in stochastic gradient descent algorithms. Another benefit is that +our approach can be used as a general tool, which is able to be incorporated +with several existing popular CL methods to achieve better performance. We also +conduct a set of experiments on several benchmark datasets to evaluate the +performance in practice. + +
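+ For reference, the classic SVRG-style correction the authors cite as
+inspiration looks like the following; this is the textbook variance-reduction
+step, not the calibration algorithm proposed in the paper.
+
+```python
+def svrg_calibrated_gradient(grad_batch_current, grad_batch_snapshot, full_grad_snapshot):
+    """Textbook SVRG correction: the mini-batch gradient at the current weights is
+    calibrated by subtracting the same mini-batch's gradient at a stored snapshot
+    and adding the full-data gradient computed at that snapshot."""
+    return [g - g_old + g_full
+            for g, g_old, g_full in zip(grad_batch_current,
+                                        grad_batch_snapshot,
+                                        full_grad_snapshot)]
+```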
+
+
+
+
+ + ☆ How to Choose a Reinforcement-Learning Algorithm + + +
+ The field of reinforcement learning offers a large variety of concepts and +methods to tackle sequential decision-making problems. This variety has become +so large that choosing an algorithm for a task at hand can be challenging. In +this work, we streamline the process of choosing reinforcement-learning +algorithms and action-distribution families. We provide a structured overview +of existing methods and their properties, as well as guidelines for when to +choose which methods. An interactive version of these guidelines is available +online at https://rl-picker.github.io/. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ What Are Good Positional Encodings for Directed Graphs? + + +
+ Positional encodings (PE) for graphs are essential in constructing powerful +and expressive graph neural networks and graph transformers as they effectively +capture relative spatial relations between nodes. While PEs for undirected +graphs have been extensively studied, those for directed graphs remain largely +unexplored, despite the fundamental role of directed graphs in representing +entities with strong logical dependencies, such as those in program analysis +and circuit designs. This work studies the design of PEs for directed graphs +that are expressive to represent desired directed spatial relations. We first +propose walk profile, a generalization of walk counting sequence to directed +graphs. We identify limitations in existing PE methods, including symmetrized +Laplacian PE, Singular Value Decomposition PE, and Magnetic Laplacian PE, in +their ability to express walk profiles. To address these limitations, we +propose the Multi-q Magnetic Laplacian PE, which extends Magnetic Laplacian PE +with multiple potential factors. This simple variant turns out to be capable of +provably expressing walk profiles. Furthermore, we generalize previous +basis-invariant and stable networks to handle complex-domain PEs decomposed +from Magnetic Laplacians. Our numerical experiments demonstrate the +effectiveness of Multi-q Magnetic Laplacian PE with a stable neural +architecture, outperforming previous PE methods (with stable networks) on +predicting directed distances/walk profiles, sorting network satisfiability, +and on general circuit benchmarks. Our code is available at +https://github.com/Graph-COM/Multi-q-Maglap. + +
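+ For intuition only, a small numpy sketch of a magnetic Laplacian evaluated at
+several potentials q, which is how we read the "Multi-q" idea; this is not the
+authors' implementation, and the chosen q values and eigenvector count are
+placeholders:
+    import numpy as np
+
+    def magnetic_laplacian(A, q):
+        """A: dense adjacency matrix of a directed graph; returns Hermitian L(q)."""
+        A_sym = 0.5 * (A + A.T)                  # symmetrized weights
+        theta = 2.0 * np.pi * q * (A - A.T)      # antisymmetric phase
+        D = np.diag(A_sym.sum(axis=1))
+        return D - A_sym * np.exp(1j * theta)
+
+    def multi_q_positional_encoding(A, qs=(0.0, 0.05, 0.1), k=4):
+        """Concatenate the k lowest eigenvectors of L(q) for each potential q."""
+        blocks = []
+        for q in qs:
+            _, eigvecs = np.linalg.eigh(magnetic_laplacian(A, q))
+            blocks.append(eigvecs[:, :k])
+        return np.concatenate(blocks, axis=1)    # complex-valued node features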
+
+
+
+
+ + ☆ Machine learning surrogates for efficient hydrologic modeling: Insights + from stochastic simulations of managed aquifer recharge + + +
+ Process-based hydrologic models are invaluable tools for understanding the
+terrestrial water cycle and addressing modern water resources problems.
+However, many hydrologic models are computationally expensive and, depending
+on the resolution and scale, simulations can take on the order of hours to
+days to complete. While techniques such as uncertainty quantification and
+optimization have become valuable tools for supporting management decisions,
+these analyses typically require hundreds of model simulations, which are too
+computationally expensive to perform with a process-based hydrologic model. To
+address this gap, we propose a hybrid modeling workflow in which a
+process-based model is used to generate an initial set of simulations and a
+machine learning (ML) surrogate model is then trained to perform the remaining
+simulations required for downstream analysis. As a case study, we apply this
+workflow to simulations of variably saturated groundwater flow at a
+prospective managed aquifer recharge (MAR) site. We compare the accuracy and
+computational efficiency of several ML architectures, including deep
+convolutional networks, recurrent neural networks, vision transformers, and
+networks with Fourier transforms. Our results demonstrate that ML surrogate
+models can achieve under 10% mean absolute percentage error and yield
+order-of-magnitude runtime savings over process-based models. We also offer
+practical recommendations for training hydrologic surrogate models, including
+implementing data normalization to improve accuracy, using a normalized loss
+function to improve training stability, and downsampling input features to
+decrease memory requirements.
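+ A minimal sketch of the reported error metric and of one way to read the
+"normalized loss" recommendation (the function names and the variance-based
+normalization are our assumptions):
+    import numpy as np
+
+    def mape(y_true, y_pred, eps=1e-8):
+        """Mean absolute percentage error, in percent."""
+        return 100.0 * np.mean(np.abs((y_true - y_pred) / (np.abs(y_true) + eps)))
+
+    def normalized_mse(y_true, y_pred, eps=1e-8):
+        """MSE scaled by the target variance so output fields with very different
+        magnitudes contribute comparably to the training loss."""
+        return np.mean((y_true - y_pred) ** 2) / (np.var(y_true) + eps)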
+
+ comment: 32 pages, 14 figures, 11 tables +
+
+
+
+
+ + ☆ MambaCapsule: Towards Transparent Cardiac Disease Diagnosis with + Electrocardiography Using Mamba Capsule Network + + +
+ Cardiac arrhythmia, a condition characterized by irregular heartbeats, often
+serves as an early indication of various heart ailments. With the advent of
+deep learning, numerous innovative models have been introduced for diagnosing
+arrhythmias using Electrocardiogram (ECG) signals. However, recent studies
+solely focus on the performance of models, neglecting the interpretation of
+their results. This leads to a considerable lack of transparency, posing a
+significant risk in the actual diagnostic process. To solve this problem, this
+paper introduces MambaCapsule, a deep neural network for ECG arrhythmia
+classification, which increases the explainability of the model while
+enhancing the accuracy. Our model utilizes Mamba for feature extraction and
+Capsule networks for prediction, providing not only a confidence score but
+also signal features. Akin to the processing mechanism of the human brain, the
+model learns signal features and the relationships between them by
+reconstructing ECG signals from the predicted selection. The model evaluation
+was conducted on the MIT-BIH and PTB datasets, following the AAMI standard.
+MambaCapsule achieved a total accuracy of 99.54% and 99.59% on the respective
+test sets. These results demonstrate the promising performance of the model
+under the standard test protocol.
+
+
+
+
+ + ☆ Bayesian Low-Rank LeArning (Bella): A Practical Approach to Bayesian + Neural Networks + + +
+ The computational complexity of Bayesian learning is impeding its adoption in
+practical, large-scale tasks. Despite demonstrations of significant merits
+such as improved robustness and resilience to unseen or out-of-distribution
+inputs over their non-Bayesian counterparts, Bayesian methods have seen their
+practical use fade to near insignificance. In this study, we introduce an
+innovative framework to mitigate the computational burden of Bayesian neural
+networks (BNNs). Our approach follows the principle of Bayesian techniques
+based on deep ensembles, but significantly reduces their cost via multiple
+low-rank perturbations of parameters arising from a pre-trained neural
+network. Both the vanilla version of ensembles and more sophisticated schemes
+such as Bayesian learning with Stein Variational Gradient Descent (SVGD),
+previously deemed impractical for large models, can be seamlessly implemented
+within the proposed framework, called Bayesian Low-Rank LeArning (Bella). In a
+nutshell, i) Bella achieves a dramatic reduction in the number of trainable
+parameters required to approximate a Bayesian posterior; and ii) it not only
+maintains, but in some instances surpasses, the performance of conventional
+Bayesian learning methods and non-Bayesian baselines. Our results on
+large-scale tasks such as ImageNet, CAMELYON17, DomainNet, VQA with CLIP, and
+LLaVA demonstrate the effectiveness and versatility of Bella in building
+highly scalable and practical Bayesian deep models for real-world
+applications.
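+ A hedged sketch of the core construction as we understand it from the
+abstract: an ensemble of trainable low-rank perturbations around a frozen
+pre-trained weight matrix (the class name and parameterization below are ours,
+not the released Bella code):
+    import torch
+    import torch.nn as nn
+
+    class LowRankPerturbedLinear(nn.Module):
+        def __init__(self, pretrained_linear, rank=4, n_members=5):
+            super().__init__()
+            self.weight0 = pretrained_linear.weight.detach()   # frozen base weights
+            self.bias = pretrained_linear.bias.detach()
+            out_f, in_f = self.weight0.shape
+            # one trainable (B, A) pair per ensemble member
+            self.B = nn.Parameter(torch.zeros(n_members, out_f, rank))
+            self.A = nn.Parameter(torch.randn(n_members, rank, in_f) * 0.01)
+
+        def forward(self, x, member):
+            delta = self.B[member] @ self.A[member]            # low-rank perturbation
+            return nn.functional.linear(x, self.weight0 + delta, self.bias)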
+
+ comment: 25 pages, 14 figures, 11 tables +
+
+
+
+
+ + ☆ Co-Neighbor Encoding Schema: A Light-cost Structure Encoding Method for + Dynamic Link Prediction + + +
+ Structure encoding has proven to be a key feature for distinguishing links
+in a graph. However, structure encoding in a temporal graph keeps changing as
+the graph evolves, and repeatedly computing such features can be
+time-consuming due to the high-order subgraph construction. We develop the
+Co-Neighbor Encoding Schema (CNES) to address this issue. Instead of
+recomputing the features for each link, CNES stores information in memory to
+avoid redundant calculations. Besides, unlike existing memory-based dynamic
+graph learning methods that store node hidden states, we introduce a
+hashtable-based memory to compress the adjacency matrix for efficient
+structure feature construction and updating with parallel vector computation.
+Furthermore, CNES introduces a Temporal-Diverse Memory to generate long-term
+and short-term structure encodings for neighbors with different structural
+information. A dynamic graph learning framework, Co-Neighbor Encoding Network
+(CNE-N), is proposed using the aforementioned techniques. Extensive
+experiments on thirteen public datasets verify the effectiveness and
+efficiency of the proposed method.
+
+
+
+
+ + ☆ Breaking Agents: Compromising Autonomous LLM Agents Through Malfunction + Amplification + + +
+ Recently, autonomous agents built on large language models (LLMs) have +experienced significant development and are being deployed in real-world +applications. These agents can extend the base LLM's capabilities in multiple +ways. For example, a well-built agent using GPT-3.5-Turbo as its core can +outperform the more advanced GPT-4 model by leveraging external components. +More importantly, the usage of tools enables these systems to perform actions +in the real world, moving from merely generating text to actively interacting +with their environment. Given the agents' practical applications and their +ability to execute consequential actions, it is crucial to assess potential +vulnerabilities. Such autonomous systems can cause more severe damage than a +standalone language model if compromised. While some existing research has +explored harmful actions by LLM agents, our study approaches the vulnerability +from a different perspective. We introduce a new type of attack that causes +malfunctions by misleading the agent into executing repetitive or irrelevant +actions. We conduct comprehensive evaluations using various attack methods, +surfaces, and properties to pinpoint areas of susceptibility. Our experiments +reveal that these attacks can induce failure rates exceeding 80\% in multiple +scenarios. Through attacks on implemented and deployable agents in multi-agent +scenarios, we accentuate the realistic risks associated with these +vulnerabilities. To mitigate such attacks, we propose self-examination +detection methods. However, our findings indicate these attacks are difficult +to detect effectively using LLMs alone, highlighting the substantial risks +associated with this vulnerability. + +
+
+
+
+
+ + ☆ Assessing Graphical Perception of Image Embedding Models using Channel + Effectiveness + + +
+ Recent advancements in vision models have greatly improved their ability to +handle complex chart understanding tasks, like chart captioning and question +answering. However, it remains challenging to assess how these models process +charts. Existing benchmarks only roughly evaluate model performance without +evaluating the underlying mechanisms, such as how models extract image +embeddings. This limits our understanding of the model's ability to perceive +fundamental graphical components. To address this, we introduce a novel +evaluation framework to assess the graphical perception of image embedding +models. For chart comprehension, we examine two main aspects of channel +effectiveness: accuracy and discriminability of various visual channels. +Channel accuracy is assessed through the linearity of embeddings, measuring how +well the perceived magnitude aligns with the size of the stimulus. +Discriminability is evaluated based on the distances between embeddings, +indicating their distinctness. Our experiments with the CLIP model show that it +perceives channel accuracy differently from humans and shows unique +discriminability in channels like length, tilt, and curvature. We aim to +develop this work into a broader benchmark for reliable visual encoders, +enhancing models for precise chart comprehension and human-like perception in +future applications. + +
+
+ comment: In Proceedings of the 2024 IEEE Visualization and Visual Analytics + (VIS) +
+
+
+
+
+ + ☆ Federated Knowledge Recycling: Privacy-Preserving Synthetic Data Sharing + + +
+ Federated learning has emerged as a paradigm for collaborative learning, +enabling the development of robust models without the need to centralise +sensitive data. However, conventional federated learning techniques have +privacy and security vulnerabilities due to the exposure of models, parameters +or updates, which can be exploited as an attack surface. This paper presents +Federated Knowledge Recycling (FedKR), a cross-silo federated learning approach +that uses locally generated synthetic data to facilitate collaboration between +institutions. FedKR combines advanced data generation techniques with a dynamic +aggregation process to provide greater security against privacy attacks than +existing methods, significantly reducing the attack surface. Experimental +results on generic and medical datasets show that FedKR achieves competitive +performance, with an average improvement in accuracy of 4.24% compared to +training models from local data, demonstrating particular effectiveness in data +scarcity scenarios. + +
+
+
+
+
+ + ☆ How to Measure the Intelligence of Large Language Models? + + +
+ With the release of ChatGPT and other large language models (LLMs), the
+discussion about the intelligence, possibilities, and risks of current and
+future models has received much attention. This discussion included
+much-debated scenarios about the imminent rise of so-called "super-human" AI,
+i.e., AI systems that are orders of magnitude smarter than humans. In the
+spirit of Alan Turing, there is no doubt that current state-of-the-art
+language models already pass his famous test. Moreover, current models
+outperform humans in several benchmark tests, so that publicly available LLMs
+have already become versatile companions that connect everyday life, industry
+and science. Despite their impressive capabilities, LLMs sometimes fail
+completely at tasks that are thought to be trivial for humans. In other cases,
+the trustworthiness of LLMs becomes much more elusive and difficult to
+evaluate. Taking the example of academia, language models are capable of
+writing convincing research articles on a given topic with only little input.
+Yet, the lack of trustworthiness in terms of factual consistency or the
+existence of persistent hallucinations in AI-generated text bodies has led to
+a range of restrictions for AI-based content in many scientific journals. In
+view of these observations, the question arises as to whether the same metrics
+that apply to human intelligence can also be applied to computational methods;
+this question has been discussed extensively. In fact, the choice of metrics
+has already been shown to dramatically influence assessments of potential
+intelligence emergence. Here, we argue that the intelligence of LLMs should
+not only be assessed by task-specific statistical metrics, but separately in
+terms of qualitative and quantitative measures.
+
+ comment: 3 pages, 1 figure +
+
+
+
+
+ + ☆ DyGKT: Dynamic Graph Learning for Knowledge Tracing + + +
+ Knowledge Tracing aims to assess student learning states by predicting their +performance in answering questions. Different from the existing research which +utilizes fixed-length learning sequence to obtain the student states and +regards KT as a static problem, this work is motivated by three dynamical +characteristics: 1) The scales of students answering records are constantly +growing; 2) The semantics of time intervals between the records vary; 3) The +relationships between students, questions and concepts are evolving. The three +dynamical characteristics above contain the great potential to revolutionize +the existing knowledge tracing methods. Along this line, we propose a Dynamic +Graph-based Knowledge Tracing model, namely DyGKT. In particular, a +continuous-time dynamic question-answering graph for knowledge tracing is +constructed to deal with the infinitely growing answering behaviors, and it is +worth mentioning that it is the first time dynamic graph learning technology is +used in this field. Then, a dual time encoder is proposed to capture long-term +and short-term semantics among the different time intervals. Finally, a +multiset indicator is utilized to model the evolving relationships between +students, questions, and concepts via the graph structural feature. Numerous +experiments are conducted on five real-world datasets, and the results +demonstrate the superiority of our model. All the used resources are publicly +available at https://github.com/PengLinzhi/DyGKT. + +
+
+
+
+
+ + ☆ Robust Load Prediction of Power Network Clusters Based on + Cloud-Model-Improved Transformer + + +
+ Load data from power network clusters indicates economic development in each +area, crucial for predicting regional trends and guiding power enterprise +decisions. The Transformer model, a leading method for load prediction, faces +challenges modeling historical data due to variables like weather, events, +festivals, and data volatility. To tackle this, the cloud model's fuzzy feature +is utilized to manage uncertainties effectively. Presenting an innovative +approach, the Cloud Model Improved Transformer (CMIT) method integrates the +Transformer model with the cloud model utilizing the particle swarm +optimization algorithm, with the aim of achieving robust and precise power load +predictions. Through comparative experiments conducted on 31 real datasets +within a power network cluster, it is demonstrated that CMIT significantly +surpasses the Transformer model in terms of prediction accuracy, thereby +highlighting its effectiveness in enhancing forecasting capabilities within the +power network cluster sector. + +
+
+
+
+
+ + ☆ ARCLE: The Abstraction and Reasoning Corpus Learning Environment for + Reinforcement Learning + + +
+ This paper introduces ARCLE, an environment designed to facilitate +reinforcement learning research on the Abstraction and Reasoning Corpus (ARC). +Addressing this inductive reasoning benchmark with reinforcement learning +presents these challenges: a vast action space, a hard-to-reach goal, and a +variety of tasks. We demonstrate that an agent with proximal policy +optimization can learn individual tasks through ARCLE. The adoption of +non-factorial policies and auxiliary losses led to performance enhancements, +effectively mitigating issues associated with action spaces and goal +attainment. Based on these insights, we propose several research directions and +motivations for using ARCLE, including MAML, GFlowNets, and World Models. + +
+
+ comment: Accepted by CoLLAs 2024, Project page: + https://github.com/confeitoHS/arcle +
+
+
+
+
+ + ☆ AhmedML: High-Fidelity Computational Fluid Dynamics Dataset for + Incompressible, Low-Speed Bluff Body Aerodynamics + + +
+ The development of Machine Learning (ML) methods for Computational Fluid
+Dynamics (CFD) is currently limited by the lack of openly available training
+data. This paper presents a new open-source dataset comprising high-fidelity,
+scale-resolving CFD simulations of 500 geometric variations of the Ahmed Car
+Body - a simplified car-like shape that exhibits many of the flow topologies
+that are present on bluff bodies such as road vehicles. The dataset contains
+simulation results that exhibit a broad set of fundamental flow physics such
+as geometry- and pressure-induced flow separation as well as 3D vortical
+structures. Each variation of the Ahmed car body was run using a
+high-fidelity, time-accurate, hybrid Reynolds-Averaged Navier-Stokes (RANS) -
+Large-Eddy Simulation (LES) turbulence modelling approach using the
+open-source CFD code OpenFOAM. The dataset contains boundary, volume,
+geometry, and time-averaged forces/moments in widely used open-source formats.
+In addition, the OpenFOAM case setup is provided so that others can reproduce
+or extend the dataset. This represents, to the authors' knowledge, the first
+open-source large-scale dataset using high-fidelity CFD methods for the widely
+used Ahmed car body that is available to freely download with a permissive
+license (CC-BY-SA).
+
+ comment: arXiv admin note: text overlap with arXiv:2407.19320 +
+
+
+
+
+ + ☆ Diffusion Augmented Agents: A Framework for Efficient Exploration and + Transfer Learning + + +
+ We introduce Diffusion Augmented Agents (DAAG), a novel framework that +leverages large language models, vision language models, and diffusion models +to improve sample efficiency and transfer learning in reinforcement learning +for embodied agents. DAAG hindsight relabels the agent's past experience by +using diffusion models to transform videos in a temporally and geometrically +consistent way to align with target instructions with a technique we call +Hindsight Experience Augmentation. A large language model orchestrates this +autonomous process without requiring human supervision, making it well-suited +for lifelong learning scenarios. The framework reduces the amount of +reward-labeled data needed to 1) finetune a vision language model that acts as +a reward detector, and 2) train RL agents on new tasks. We demonstrate the +sample efficiency gains of DAAG in simulated robotics environments involving +manipulation and navigation. Our results show that DAAG improves learning of +reward detectors, transferring past experience, and acquiring new tasks - key +abilities for developing efficient lifelong learning agents. Supplementary +material and visualizations are available on our website +https://sites.google.com/view/diffusion-augmented-agents/ + +
+
+ comment: Published at 3rd Conference on Lifelong Learning Agents (CoLLAs), + 2024 +
+
+
+
+
+ + ☆ Be aware of overfitting by hyperparameter optimization! + + +
+ Hyperparameter optimization is very frequently employed in machine learning.
+However, an optimization over a large space of parameters could result in
+overfitting of models. In recent studies on solubility prediction, the authors
+collected seven thermodynamic and kinetic solubility datasets from different
+data sources. They used state-of-the-art graph-based methods and compared
+models developed for each dataset using different data cleaning protocols and
+hyperparameter optimization. In our study, we showed that hyperparameter
+optimization did not always result in better models, possibly due to
+overfitting when using the same statistical measures. Similar results could be
+obtained using pre-set hyperparameters, reducing the computational effort by
+around 10,000 times. We also extended the previous analysis by adding a
+representation learning method based on Natural Language Processing of SMILES
+called Transformer CNN. We show that across all analyzed sets, using exactly
+the same protocol, Transformer CNN provided better results than graph-based
+methods for 26 out of 28 pairwise comparisons, while using only a tiny
+fraction of the time compared to other methods. Last but not least, we
+stressed the importance of comparing calculation results using exactly the
+same statistical measures.
+
+ comment: 19 pages, 5 Tables +
+
+
+
+
+ + ☆ Interpretable Pre-Trained Transformers for Heart Time-Series Data + + +
+ Decoder-only transformers are the backbone of the popular generative
+pre-trained transformer (GPT) series of large language models. In this work,
+we apply the same framework to periodic heart time-series data to create two
+pre-trained general purpose cardiac models, namely PPG-PT and ECG-PT. We
+demonstrate that both such pre-trained models are fully interpretable. This is
+achieved firstly through aggregate attention maps which show that the model
+focuses on similar points in previous cardiac cycles in order to make
+predictions and gradually broadens its attention in deeper layers. Next,
+tokens with the same value that occur at different distinct points in the ECG
+and PPG cycle form separate clusters in high dimensional space based on their
+phase as they propagate through the transformer blocks. Finally, we highlight
+that individual attention heads respond to specific physiologically relevant
+features, such as the dicrotic notch in PPG and the P-wave in ECG. It is also
+demonstrated that these pre-trained models can be easily fine-tuned for tasks
+such as classification of atrial fibrillation. In this specific example, the
+fine-tuning took 11 minutes of computer time and achieved leave-one-subject-out
+AUCs of 0.99 and 0.93 for ECG and PPG, respectively. Importantly, these
+fine-tuned models are also fully explainable, with attention shifting to
+regions in the context that are strongly indicative of atrial fibrillation.
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ HyperMM : Robust Multimodal Learning with Varying-sized Inputs + + +
+ Combining multiple modalities carrying complementary information through +multimodal learning (MML) has shown considerable benefits for diagnosing +multiple pathologies. However, the robustness of multimodal models to missing +modalities is often overlooked. Most works assume modality completeness in the +input data, while in clinical practice, it is common to have incomplete +modalities. Existing solutions that address this issue rely on modality +imputation strategies before using supervised learning models. These +strategies, however, are complex, computationally costly and can strongly +impact subsequent prediction models. Hence, they should be used with parsimony +in sensitive applications such as healthcare. We propose HyperMM, an end-to-end +framework designed for learning with varying-sized inputs. Specifically, we +focus on the task of supervised MML with missing imaging modalities without +using imputation before training. We introduce a novel strategy for training a +universal feature extractor using a conditional hypernetwork, and propose a +permutation-invariant neural network that can handle inputs of varying +dimensions to process the extracted features, in a two-phase task-agnostic +framework. We experimentally demonstrate the advantages of our method in two +tasks: Alzheimer's disease detection and breast cancer classification. We +demonstrate that our strategy is robust to high rates of missing data and that +its flexibility allows it to handle varying-sized datasets beyond the scenario +of missing modalities. + +
+
+
+
+
+ + ☆ Efficient Quantum One-Class Support Vector Machines for Anomaly + Detection Using Randomized Measurements and Variable Subsampling + + +
+ Quantum one-class support vector machines leverage the advantage of quantum +kernel methods for semi-supervised anomaly detection. However, their quadratic +time complexity with respect to data size poses challenges when dealing with +large datasets. In recent work, quantum randomized measurements kernels and +variable subsampling were proposed, as two independent methods to address this +problem. The former achieves higher average precision, but suffers from +variance, while the latter achieves linear complexity to data size and has +lower variance. The current work focuses instead on combining these two +methods, along with rotated feature bagging, to achieve linear time complexity +both to data size and to number of features. Despite their instability, the +resulting models exhibit considerably higher performance and faster training +and testing times. + +
+
+ comment: Submitted to Springer Nature CS +
+
+
+
+
+ + ☆ Improving PINNs By Algebraic Inclusion of Boundary and Initial + Conditions + + +
+ "AI for Science" aims to solve fundamental scientific problems using AI +techniques. As most physical phenomena can be described as Partial Differential +Equations (PDEs) , approximating their solutions using neural networks has +evolved as a central component of scientific-ML. Physics-Informed Neural +Networks (PINNs) is the general method that has evolved for this task but its +training is well-known to be very unstable. In this work we explore the +possibility of changing the model being trained from being just a neural +network to being a non-linear transformation of it - one that algebraically +includes the boundary/initial conditions. This reduces the number of terms in +the loss function than the standard PINN losses. We demonstrate that our +modification leads to significant performance gains across a range of benchmark +tasks, in various dimensions and without having to tweak the training +algorithm. Our conclusions are based on conducting hundreds of experiments, in +the fully unsupervised setting, over multiple linear and non-linear PDEs set to +exactly solvable scenarios, which lends to a concrete measurement of our +performance gains in terms of order(s) of magnitude lower fractional errors +being achieved, than by standard PINNs. The code accompanying this manuscript +is publicly available at, +https://github.com/MorganREN/Improving-PINNs-By-Algebraic-Inclusion-of-Boundary-and-Initial-Conditions + +
+
+ comment: 48 Pages, 25 Figures +
+
+
+
+
+ + ☆ Efficient Pareto Manifold Learning with Low-Rank Structure ICML 2024 + + +
+ Multi-task learning, which optimizes performance across multiple tasks, is +inherently a multi-objective optimization problem. Various algorithms are +developed to provide discrete trade-off solutions on the Pareto front. +Recently, continuous Pareto front approximations using a linear combination of +base networks have emerged as a compelling strategy. However, it suffers from +scalability issues when the number of tasks is large. To address this issue, we +propose a novel approach that integrates a main network with several low-rank +matrices to efficiently learn the Pareto manifold. It significantly reduces the +number of parameters and facilitates the extraction of shared features. We also +introduce orthogonal regularization to further bolster performance. Extensive +experimental results demonstrate that the proposed approach outperforms +state-of-the-art baselines, especially on datasets with a large number of +tasks. + +
+
+ comment: ICML 2024 (Spotlight) +
+
+
+
+
+ + ☆ Persistent Sampling: Unleashing the Potential of Sequential Monte Carlo + + +
+ Sequential Monte Carlo (SMC) methods are powerful tools for Bayesian +inference but suffer from requiring many particles for accurate estimates, +leading to high computational costs. We introduce persistent sampling (PS), an +extension of SMC that mitigates this issue by allowing particles from previous +iterations to persist. This generates a growing, weighted ensemble of particles +distributed across iterations. In each iteration, PS utilizes multiple +importance sampling and resampling from the mixture of all previous +distributions to produce the next generation of particles. This addresses +particle impoverishment and mode collapse, resulting in more accurate posterior +approximations. Furthermore, this approach provides lower-variance marginal +likelihood estimates for model comparison. Additionally, the persistent +particles improve transition kernel adaptation for efficient exploration. +Experiments on complex distributions show that PS consistently outperforms +standard methods, achieving lower squared bias in posterior moment estimation +and significantly reduced marginal likelihood errors, all at a lower +computational cost. PS offers a robust, efficient, and scalable framework for +Bayesian inference. + +
+
+ comment: 30 pages, 9 figures, 4 tables. Submitted to Statistics & Computing +
+
+
+
+
+ + ☆ PIP: Prototypes-Injected Prompt for Federated Class Incremental Learning CIKM + + +
+ Federated Class Incremental Learning (FCIL) is a new direction in continual
+learning (CL) for addressing catastrophic forgetting and non-IID data
+distribution simultaneously. Existing FCIL methods call for high communication
+costs and exemplars from previous classes. We propose a novel rehearsal-free
+method for FCIL named prototypes-injected prompt (PIP) that involves 3 main
+ideas: a) prototype injection on prompt learning, b) prototype augmentation,
+and c) weighted Gaussian aggregation on the server side. Our experimental
+results show that the proposed method outperforms the current state of the
+art (SOTA) with a significant improvement (up to 33%) on the CIFAR100,
+MiniImageNet and TinyImageNet datasets. Our extensive analysis demonstrates
+the robustness of PIP across different task sizes and its advantage of
+requiring fewer participating local clients and fewer global rounds. For
+further study, the source code of PIP, baselines, and experimental logs are
+shared publicly at https://github.com/anwarmaxsum/PIP.
+
+ comment: Conference on Information and Knowledge Management (CIKM) 2024 + (Accepted) +
+
+
+
+
+ + ☆ Industrial-Grade Smart Troubleshooting through Causal Technical Language + Processing: a Proof of Concept KDD 2024 + + +
+ This paper describes the development of a causal diagnosis approach for +troubleshooting an industrial environment on the basis of the technical +language expressed in Return on Experience records. The proposed method +leverages the vectorized linguistic knowledge contained in the distributed +representation of a Large Language Model, and the causal associations entailed +by the embedded failure modes and mechanisms of the industrial assets. The +paper presents the elementary but essential concepts of the solution, which is +conceived as a causality-aware retrieval augmented generation system, and +illustrates them experimentally on a real-world Predictive Maintenance setting. +Finally, it discusses avenues of improvement for the maturity of the utilized +causal technology to meet the robustness challenges of increasingly complex +scenarios in the industry. + +
+
+ comment: 2nd Workshop on Causal Inference and Machine Learning in Practice at + the KDD 2024 Conference. arXiv admin note: text overlap with arXiv:2407.11056 +
+
+
+
+
+ + ☆ Weak neural variational inference for solving Bayesian inverse problems + without forward models: applications in elastography + + +
+ In this paper, we introduce a novel, data-driven approach for solving
+high-dimensional Bayesian inverse problems based on partial differential
+equations (PDEs), called Weak Neural Variational Inference (WNVI). The method
+complements real measurements with virtual observations derived from the
+physical model. In particular, weighted residuals are employed as probes to
+the governing PDE in order to formulate and solve a Bayesian inverse problem
+without ever formulating or solving a forward model. The formulation treats
+the state variables of the physical model as latent variables, inferred using
+Stochastic Variational Inference (SVI), along with the usual unknowns. The
+approximate posterior employed uses neural networks to approximate the inverse
+mapping from state variables to the unknowns. We illustrate the proposed
+method in a biomedical setting where we infer spatially varying material
+properties from noisy tissue deformation data. We demonstrate that WNVI is not
+only as accurate as and more efficient than traditional methods that rely on
+repeatedly solving the (non)linear forward problem as a black box, but that it
+can also handle ill-posed forward problems (e.g., with insufficient boundary
+conditions).
+
+
+
+
+ + ☆ Time Series Anomaly Detection with CNN for Environmental Sensors in + Healthcare-IoT + + +
+ This research develops a new method to detect anomalies in time series data
+using Convolutional Neural Networks (CNNs) in healthcare-IoT. The proposed
+method creates a Distributed Denial of Service (DDoS) attack using an IoT
+network simulator, Cooja, which emulates environmental sensors such as
+temperature and humidity. CNNs detect anomalies in time series data, resulting
+in a 92% accuracy in identifying possible attacks.
+
+
+
+
+ + ☆ Detecting Causality in the Frequency Domain with Cross-Mapping Coherence + + +
+ Understanding causal relationships within a system is crucial for uncovering +its underlying mechanisms. Causal discovery methods, which facilitate the +construction of such models from time-series data, hold the potential to +significantly advance scientific and engineering fields. + This study introduces the Cross-Mapping Coherence (CMC) method, designed to +reveal causal connections in the frequency domain between time series. CMC +builds upon nonlinear state-space reconstruction and extends the Convergent +Cross-Mapping algorithm to the frequency domain by utilizing coherence metrics +for evaluation. We tested the Cross-Mapping Coherence method using simulations +of logistic maps, Lorenz systems, Kuramoto oscillators, and the Wilson-Cowan +model of the visual cortex. CMC accurately identified the direction of causal +connections in all simulated scenarios. When applied to the Wilson-Cowan model, +CMC yielded consistent results similar to spectral Granger causality. + Furthermore, CMC exhibits high sensitivity in detecting weak connections, +demonstrates sample efficiency, and maintains robustness in the presence of +noise. + In conclusion, the capability to determine directed causal influences across +different frequency bands allows CMC to provide valuable insights into the +dynamics of complex, nonlinear systems. + +
+
+
+
+
+ + ☆ The Susceptibility of Example-Based Explainability Methods to Class + Outliers + + +
+ This study explores the impact of class outliers on the effectiveness of
+example-based explainability methods for black-box machine learning models. We
+reformulate existing explainability evaluation metrics, such as correctness
+and relevance, specifically for example-based methods, and introduce a new
+metric, distinguishability. Using these metrics, we highlight the shortcomings
+of current example-based explainability methods, including those that attempt
+to suppress class outliers. We conduct experiments on two datasets, a text
+classification dataset and an image classification dataset, and evaluate the
+performance of four state-of-the-art explainability methods. Our findings
+underscore the need for robust techniques to tackle the challenges posed by
+class outliers.
+
+
+
+
+ + ☆ Rethinking the Function of Neurons in KANs + + +
+ The neurons of Kolmogorov-Arnold Networks (KANs) perform a simple summation
+motivated by the Kolmogorov-Arnold representation theorem, which asserts that
+sum is the only fundamental multivariate function. In this work, we
+investigate the potential for identifying an alternative multivariate function
+for KAN neurons that may offer increased practical utility. Our empirical
+research involves testing various multivariate functions in KAN neurons across
+a range of benchmark Machine Learning tasks.
+ Our findings indicate that substituting the sum with the average function in
+KAN neurons results in significant performance enhancements compared to
+traditional KANs. Our study demonstrates that this minor modification
+contributes to the stability of training by confining the input to the spline
+within the effective range of the activation function. Our implementation and
+experiments are available at: https://github.com/Ghaith81/dropkan
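+ The modification itself is small; an illustrative toy sketch (our notation:
+phi stands for the per-edge univariate function outputs feeding a neuron):
+    import torch
+
+    def kan_neuron_sum(phi):     # standard KAN neuron: sum of incoming activations
+        return phi.sum(dim=-1)
+
+    def kan_neuron_mean(phi):    # studied variant: the mean keeps the aggregate
+        return phi.mean(dim=-1)  # within the next activation's effective range
+
+    phi = torch.randn(8, 16)     # 16 incoming edge activations for 8 samples
+    print(kan_neuron_sum(phi).shape, kan_neuron_mean(phi).shape)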
+
+
+
+
+ + ☆ DocXPand-25k: a large and diverse benchmark dataset for identity + documents analysis + + +
+ Identity document (ID) image analysis has become essential for many online
+services, like bank account opening or insurance subscription. In recent
+years, much research has been conducted on subjects like document
+localization, text recognition and fraud detection, to achieve a level of
+accuracy reliable enough to automate identity verification. However, there are
+only a few available datasets to benchmark ID analysis methods, mainly because
+of privacy restrictions, security requirements and legal reasons.
+ In this paper, we present the DocXPand-25k dataset, which consists of 24,994
+richly labeled ID images, generated using custom-made vectorial templates
+representing nine fictitious ID designs, including four identity card, two
+residence permit and three passport designs. These synthetic IDs feature
+artificially generated personal information (names, dates, identifiers, faces,
+barcodes, ...), and present a rich diversity in the visual layouts and textual
+contents.
+ We collected about 5.8k diverse backgrounds coming from real-world photos,
+scans and screenshots of IDs to guarantee the variety of the backgrounds. The
+software we wrote to generate these images has been published
+(https://github.com/QuickSign/docxpand/) under the terms of the MIT license,
+and our dataset has been published
+(https://github.com/QuickSign/docxpand/releases/tag/v1.0.0) under the terms of
+the CC-BY-NC-SA 4.0 License.
+
+
+
+
+ + ☆ Prompt-Driven Contrastive Learning for Transferable Adversarial Attacks ECCV 2024 + + +
+ Recent vision-language foundation models, such as CLIP, have demonstrated +superior capabilities in learning representations that can be transferable +across diverse range of downstream tasks and domains. With the emergence of +such powerful models, it has become crucial to effectively leverage their +capabilities in tackling challenging vision tasks. On the other hand, only a +few works have focused on devising adversarial examples that transfer well to +both unknown domains and model architectures. In this paper, we propose a novel +transfer attack method called PDCL-Attack, which leverages the CLIP model to +enhance the transferability of adversarial perturbations generated by a +generative model-based attack framework. Specifically, we formulate an +effective prompt-driven feature guidance by harnessing the semantic +representation power of text, particularly from the ground-truth class labels +of input images. To the best of our knowledge, we are the first to introduce +prompt learning to enhance the transferable generative attacks. Extensive +experiments conducted across various cross-domain and cross-model settings +empirically validate our approach, demonstrating its superiority over +state-of-the-art methods. + +
+
+ comment: Accepted to ECCV 2024, Project Page: https://PDCL-Attack.github.io +
+
+
+
+
+ + ☆ Efficient Multi-Objective Neural Architecture Search via Pareto + Dominance-based Novelty Search GECCO 2024 + + +
+ Neural Architecture Search (NAS) aims to automate the discovery of +high-performing deep neural network architectures. Traditional objective-based +NAS approaches typically optimize a certain performance metric (e.g., +prediction accuracy), overlooking large parts of the architecture search space +that potentially contain interesting network configurations. Furthermore, +objective-driven population-based metaheuristics in complex search spaces often +quickly exhaust population diversity and succumb to premature convergence to +local optima. This issue becomes more complicated in NAS when performance +objectives do not fully align with the actual performance of the candidate +architectures, as is often the case with training-free metrics. While +training-free metrics have gained popularity for their rapid performance +estimation of candidate architectures without incurring computation-heavy +network training, their effective incorporation into NAS remains a challenge. +This paper presents the Pareto Dominance-based Novelty Search for +multi-objective NAS with Multiple Training-Free metrics (MTF-PDNS). Unlike +conventional NAS methods that optimize explicit objectives, MTF-PDNS promotes +population diversity by utilizing a novelty score calculated based on multiple +training-free performance and complexity metrics, thereby yielding a broader +exploration of the search space. Experimental results on standard NAS benchmark +suites demonstrate that MTF-PDNS outperforms conventional methods driven by +explicit objectives in terms of convergence speed, diversity maintenance, +architecture transferability, and computational costs. + +
+
+ comment: 10 pages, 4 figures. Accepted as full paper at GECCO 2024 +
+
+
+
+
+ + ☆ FACL-Attack: Frequency-Aware Contrastive Learning for Transferable + Adversarial Attacks AAAI 2024 + + +
+ Deep neural networks are known to be vulnerable to security risks due to the +inherent transferable nature of adversarial examples. Despite the success of +recent generative model-based attacks demonstrating strong transferability, it +still remains a challenge to design an efficient attack strategy in a +real-world strict black-box setting, where both the target domain and model +architectures are unknown. In this paper, we seek to explore a feature +contrastive approach in the frequency domain to generate adversarial examples +that are robust in both cross-domain and cross-model settings. With that goal +in mind, we propose two modules that are only employed during the training +phase: a Frequency-Aware Domain Randomization (FADR) module to randomize +domain-variant low- and high-range frequency components and a +Frequency-Augmented Contrastive Learning (FACL) module to effectively separate +domain-invariant mid-frequency features of clean and perturbed image. We +demonstrate strong transferability of our generated adversarial perturbations +through extensive cross-domain and cross-model experiments, while keeping the +inference time complexity. + +
+
+ comment: Accepted to AAAI 2024, Project Page: https://FACL-Attack.github.io +
+
+
+
+
+ + ☆ Towards Generalizable Reinforcement Learning via Causality-Guided + Self-Adaptive Representations + + +
+ General intelligence requires quick adaption across tasks. While existing +reinforcement learning (RL) methods have made progress in generalization, they +typically assume only distribution changes between source and target domains. +In this paper, we explore a wider range of scenarios where both the +distribution and environment spaces may change. For example, in Atari games, we +train agents to generalize to tasks with different levels of mode and +difficulty, where there could be new state or action variables that never +occurred in previous environments. To address this challenging setting, we +introduce a causality-guided self-adaptive representation-based approach, +called CSR, that equips the agent to generalize effectively and efficiently +across a sequence of tasks with evolving dynamics. Specifically, we employ +causal representation learning to characterize the latent causal variables and +world models within the RL system. Such compact causal representations uncover +the structural relationships among variables, enabling the agent to +autonomously determine whether changes in the environment stem from +distribution shifts or variations in space, and to precisely locate these +changes. We then devise a three-step strategy to fine-tune the model under +different scenarios accordingly. Empirical experiments show that CSR +efficiently adapts to the target domains with only a few samples and +outperforms state-of-the-art baselines on a wide range of scenarios, including +our simulated environments, Cartpole, and Atari games. + +
+
+
+
+
+ + ☆ No learning rates needed: Introducing SALSA -- Stable Armijo Line Search + Adaptation IJCNN 2024 + + +
+ In recent studies, line search methods have been demonstrated to
+significantly enhance the performance of conventional stochastic gradient
+descent techniques across various datasets and architectures, while making the
+otherwise critical choice of a learning rate schedule superfluous. In this
+paper, we identify problems with current state-of-the-art line search methods,
+propose enhancements, and rigorously assess their effectiveness. Furthermore,
+we evaluate these methods on orders of magnitude larger datasets and more
+complex data domains than previously done. More specifically, we enhance the
+Armijo line search method by speeding up its computation and incorporating a
+momentum term into the Armijo criterion, making it better suited for
+stochastic mini-batching. Our optimization approach outperforms both the
+previous Armijo implementation and a tuned learning rate schedule for the Adam
+and SGD optimizers. Our evaluation covers a diverse range of architectures,
+such as Transformers, CNNs, and MLPs, as well as data domains, including NLP
+and image data.
+ Our work is publicly available as a Python package, which provides a simple
+PyTorch optimizer.
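+ For reference, a plain backtracking Armijo line search on a mini-batch loss (a
+generic sketch only; the momentum-augmented criterion and the speed-ups
+described above are not reproduced here):
+    import numpy as np
+
+    def armijo_step(loss_fn, w, grad, t0=1.0, c=0.1, beta=0.5, max_backtracks=20):
+        """Shrink the step size until the sufficient-decrease condition holds."""
+        f0 = loss_fn(w)
+        g_sq = float(np.dot(grad, grad))
+        t = t0
+        for _ in range(max_backtracks):
+            if loss_fn(w - t * grad) <= f0 - c * t * g_sq:
+                break
+            t *= beta
+        return w - t * grad, t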
+
+ comment: published in IJCNN 2024. arXiv admin note: text overlap with + arXiv:2403.18519 +
+
+
+
+
+ + ☆ Leveraging Multi-facet Paths for Heterogeneous Graph Representation + Learning + + +
+ Recent advancements in graph neural networks (GNNs) and heterogeneous GNNs +(HGNNs) have advanced node embeddings and relationship learning for various +tasks. However, existing methods often rely on domain-specific predefined +meta-paths, which are coarse-grained and focus solely on aspects like node +type, limiting their ability to capture complex interactions. We introduce +MF2Vec, a model that uses multi-faceted (fine-grained) paths instead of +predefined meta-paths. MF2Vec extracts paths via random walks and generates +multi-faceted vectors, ignoring predefined schemas. This method learns diverse +aspects of nodes and their relationships, constructs a homogeneous network, and +creates node embeddings for classification, link prediction, and clustering. +Extensive experiments show that MF2Vec outperforms existing methods, offering a +more flexible and comprehensive framework for analyzing complex networks. The +code is available at https://anonymous.4open.science/r/MF2Vec-6ABC. + +
+
+ comment: 9pages +
+
+
+
+
+ + ☆ Improved Bounds for Pure Private Agnostic Learning: Item-Level and + User-Level Privacy + + +
+ Machine Learning has made remarkable progress in a wide range of fields. In +many scenarios, learning is performed on datasets involving sensitive +information, in which privacy protection is essential for learning algorithms. +In this work, we study pure private learning in the agnostic model -- a +framework reflecting the learning process in practice. We examine the number of +users required under item-level (where each user contributes one example) and +user-level (where each user contributes multiple examples) privacy and derive +several improved upper bounds. For item-level privacy, our algorithm achieves a +near optimal bound for general concept classes. We extend this to the +user-level setting, rendering a tighter upper bound than the one proved by +Ghazi et al. (2023). Lastly, we consider the problem of learning thresholds +under user-level privacy and present an algorithm with a nearly tight user +complexity. + +
+
+
+
+
+ + ☆ SharkTrack: an accurate, generalisable software for streamlining shark + and ray underwater video analysis + + +
+ Elasmobranchs (sharks and rays) can be important components of marine +ecosystems but are experiencing global population declines. Effective +monitoring of these populations is essential to their protection. Baited Remote +Underwater Video Stations (BRUVS) have been a key tool for monitoring, but +require time-consuming manual analysis. To address these challenges, we +developed SharkTrack, an AI-enhanced BRUVS analysis software. SharkTrack uses +Convolutional Neural Networks and Multi-Object Tracking to detect and track +elasmobranchs and provides an annotation pipeline to manually classify +elasmobranch species and compute MaxN, the standard metric of relative +abundance. We tested SharkTrack on BRUVS footage from locations unseen by the +model during training. SharkTrack computed MaxN with 89% accuracy over 207 +hours of footage. The semi-automatic SharkTrack pipeline required two minutes +of manual classification per hour of video, a 97% reduction of manual BRUVS +analysis time compared to traditional methods, estimated conservatively at one +hour per hour of video. Furthermore, we demonstrate SharkTrack application +across diverse marine ecosystems and elasmobranch species, an advancement +compared to previous models, which were limited to specific species or +locations. SharkTrack applications extend beyond BRUVS analysis, facilitating +rapid annotation of unlabeled videos, aiding the development of further models +to classify elasmobranch species. We provide public access to the software and +an unprecedentedly diverse dataset, facilitating future research in an +important area of marine conservation. + +
+
+
+
+
+ + ☆ Accelerated forward-backward and Douglas-Rachford splitting dynamics + + +
+ We examine convergence properties of continuous-time variants of accelerated +Forward-Backward (FB) and Douglas-Rachford (DR) splitting algorithms for +nonsmooth composite optimization problems. When the objective function is given +by the sum of a quadratic and a nonsmooth term, we establish accelerated +sublinear and exponential convergence rates for convex and strongly convex +problems, respectively. Moreover, for FB splitting dynamics, we demonstrate +that accelerated exponential convergence rate carries over to general strongly +convex problems. In our Lyapunov-based analysis we exploit the variable-metric +gradient interpretations of FB and DR splittings to obtain smooth Lyapunov +functions that allow us to establish accelerated convergence rates. We provide +computational experiments to demonstrate the merits and the effectiveness of +our analysis. + +
+
+ comment: 10 pages; 2 figures +
+
+
+
+
+ + ☆ The Entrapment Problem in Random Walk Decentralized Learning + + +
+ This paper explores decentralized learning in a graph-based setting, where
+data is distributed across nodes. We investigate a decentralized SGD algorithm
+that utilizes a random walk to update a global model based on local data. Our
+focus is on designing the transition probability matrix to speed up
+convergence. While importance sampling can enhance centralized learning, its
+decentralized counterpart, using the Metropolis-Hastings (MH) algorithm, can
+lead to the entrapment problem, where the random walk becomes stuck at certain
+nodes, slowing convergence. To address this, we propose the Metropolis-Hastings
+with Lévy Jumps (MHLJ) algorithm, which incorporates random perturbations
+(jumps) to overcome entrapment. We theoretically establish the convergence
+rate and error gap of MHLJ and validate our findings through numerical
+experiments.
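+ A simplified, hypothetical sketch of such a walk: uniform restarts stand in
+for the Lévy jumps, and these jumps perturb the stationary distribution, which
+is exactly the kind of bias an error-gap analysis must account for:
+    import random
+
+    def mh_walk_with_jumps(neighbors, pi, start, steps, jump_prob=0.05):
+        """neighbors: dict node -> list of neighbors; pi: dict node -> target weight."""
+        nodes = list(neighbors)
+        v, visits = start, []
+        for _ in range(steps):
+            if random.random() < jump_prob:
+                v = random.choice(nodes)            # long-range jump to escape traps
+            else:
+                w = random.choice(neighbors[v])     # uniform-neighbor proposal
+                accept = min(1.0, (pi[w] * len(neighbors[v])) / (pi[v] * len(neighbors[w])))
+                if random.random() < accept:
+                    v = w
+            visits.append(v)
+        return visits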
+
+ comment: 10 pages, accepted by 2024 IEEE International Symposium on + Information Theory. The associated presentation of this paper can be found in + https://www.youtube.com/watch?v=et0sR4lJK_s&ab_channel=LiuZonghong +
+
+
+
+
+ + ☆ Investigating Sparsity in Recurrent Neural Networks + + +
+ In the past few years, neural networks have evolved from simple Feedforward
+Neural Networks to more complex neural networks, such as Convolutional Neural
+Networks and Recurrent Neural Networks. While CNNs are a perfect fit for tasks
+where sequence is not important, such as image recognition, RNNs are useful
+when order is important, such as in machine translation. Increasing the number
+of layers in a neural network is one way to improve its performance, but it
+also increases its complexity, making it much more time- and power-consuming
+to train. One way to tackle this problem is to introduce sparsity into the
+architecture of the neural network. Pruning is one of the many methods to make
+a neural network architecture sparse by clipping out weights below a certain
+threshold while keeping the performance close to the original. Another way is
+to generate arbitrary structures using random graphs and embed them between an
+input and output layer of an Artificial Neural Network. Many researchers in
+past years have focused on pruning mainly CNNs, while hardly any research has
+been done on the same for RNNs. The same also holds for creating sparse
+architectures for RNNs by generating and embedding arbitrary structures.
+Therefore, this thesis focuses on investigating the effects of the
+aforementioned two techniques on the performance of RNNs. We first describe
+the pruning of RNNs, its impact on the performance of RNNs, and the number of
+training epochs required to regain accuracy after the pruning is performed.
+Next, we continue with the creation and training of Sparse Recurrent Neural
+Networks and identify the relation between the performance and the graph
+properties of the underlying arbitrary structure. We perform these experiments
+on RNNs with Tanh nonlinearity (RNN-Tanh), RNNs with ReLU nonlinearity
+(RNN-ReLU), GRU, and LSTM. Finally, we analyze and discuss the results from
+both experiments.
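+ As an illustration of the pruning technique discussed above, a simple
+magnitude-pruning sketch for a PyTorch RNN layer (the sparsity level and mask
+bookkeeping are our choices, not the thesis' setup):
+    import torch
+    import torch.nn as nn
+
+    def magnitude_prune_(module, sparsity=0.8):
+        """Zero out the smallest-magnitude weights and return binary masks so the
+        sparsity pattern can be re-applied after each fine-tuning update."""
+        masks = {}
+        for name, p in module.named_parameters():
+            if "weight" in name:
+                k = max(1, int(sparsity * p.numel()))
+                threshold = p.detach().abs().flatten().kthvalue(k).values
+                mask = (p.detach().abs() > threshold).float()
+                p.data.mul_(mask)
+                masks[name] = mask
+        return masks
+
+    rnn = nn.RNN(input_size=32, hidden_size=64, nonlinearity="tanh", batch_first=True)
+    masks = magnitude_prune_(rnn, sparsity=0.8)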
+
+
+
+
+ + ☆ Joint Diffusion Processes as an Inductive Bias in Sheaf Neural Networks + + +
+ Sheaf Neural Networks (SNNs) naturally extend Graph Neural Networks (GNNs) by +endowing a cellular sheaf over the graph, equipping nodes and edges with vector +spaces and defining linear mappings between them. While the attached geometric +structure has proven to be useful in analyzing heterophily and oversmoothing, +so far the methods by which the sheaf is computed do not always guarantee a +good performance in such settings. In this work, drawing inspiration from +opinion dynamics concepts, we propose two novel sheaf learning approaches that +(i) provide a more intuitive understanding of the involved structure maps, (ii) +introduce a useful inductive bias for heterophily and oversmoothing, and (iii) +infer the sheaf in a way that does not scale with the number of features, thus +using fewer learnable parameters than existing methods. In our evaluation, we +show the limitations of the real-world benchmarks used so far on SNNs, and +design a new synthetic task -- leveraging the symmetries of n-dimensional +ellipsoids -- that enables us to better assess the strengths and weaknesses of +sheaf-based models. Our extensive experimentation on these novel datasets +reveals valuable insights into the scenarios and contexts where SNNs in general +-- and our proposed approaches in particular -- can be beneficial. + +
+
+
+
+
+ + ☆ Benchmarking Histopathology Foundation Models for Ovarian Cancer + Bevacizumab Treatment Response Prediction from Whole Slide Images + + +
+ Bevacizumab is a widely studied targeted therapeutic drug used in conjunction with standard chemotherapy for the treatment of recurrent ovarian cancer. While its administration has been shown to increase progression-free survival (PFS) in patients with advanced-stage ovarian cancer, the lack of identifiable biomarkers for predicting patient response has been a major roadblock to its effective adoption in personalized medicine. In this work, we leverage the latest histopathology foundation models trained on large-scale whole slide image (WSI) datasets to extract ovarian tumor tissue features for predicting bevacizumab response from WSIs. Our extensive experiments across a combination of different histopathology foundation models and multiple instance learning (MIL) strategies demonstrate the capability of these large models to predict bevacizumab response in ovarian cancer patients, with the best models achieving an AUC score of 0.86 and an accuracy score of 72.5%. Furthermore, our survival models are able to stratify high- and low-risk cases with statistical significance (p < 0.05), even among patients with the aggressive high-grade serous ovarian carcinoma subtype. This work highlights the utility of histopathology foundation models for the task of ovarian bevacizumab response prediction from WSIs. The high-attention regions of the WSIs highlighted by these models not only aid model explainability but also serve as promising imaging biomarkers for treatment prognosis.
+
+
+
+
+ + ☆ Invariant deep neural networks under the finite group for solving + partial differential equations + + +
+ Utilizing physics-informed neural networks (PINNs) to solve partial differential equations (PDEs) has become a hot topic and shown great power, but it still suffers from limited predictive accuracy within the sampling domain and poor prediction ability beyond it; these issues are usually mitigated by adding physical properties of the PDEs to the loss function or by employing smart techniques to change the form of the loss function for special PDEs. In this paper, we design a symmetry-enhanced deep neural network (sDNN) whose architecture is invariant under a finite group: if the group has matrix representations, the dimensions of the weight matrices and bias vectors in each hidden layer are expanded by the order of the finite group; otherwise, the set of input data and all hidden layers except the first are extended by the order of the finite group. Owing to this symmetric architecture, the total number of training parameters is only about the size of the original PINN divided by the order of the finite group. Furthermore, we give special forms of the weight matrices and bias vectors of the sDNN, and rigorously prove that the architecture itself is invariant under the finite group and that the sDNN has the universal approximation ability to learn functions invariant under that group. Numerical results show that the sDNN has strong predictive ability both within and beyond the sampling domain and performs far better than the vanilla PINN with fewer training points and a simpler architecture.
+
+
+
+
+ + ☆ CELLM: An Efficient Communication in Large Language Models Training for + Federated Learning + + +
+ Federated Learning (FL) is a recent model training paradigm in which client devices collaboratively train a model without ever aggregating their data. Crucially, this scheme offers users potential privacy and security benefits by only ever communicating updates to the model weights to a central server, as opposed to traditional machine learning (ML) training, which directly communicates and aggregates data. However, FL training suffers from statistical heterogeneity, as clients may have differing local data distributions. Large language models (LLMs) offer a potential solution to this issue of heterogeneity, given that they have consistently been shown to be able to learn from vast amounts of noisy data. While LLMs are a promising development for resolving the persistent issue of non-i.i.d. data, clients in federated settings face two other bottlenecks in FL: limited local computing and expensive communication. This thesis aims to develop efficient training methods for LLMs in FL. To this end, we employ two critical techniques to enable efficient training. First, we use low-rank adaptation (LoRA) to reduce the computational load of local model training. Second, we communicate sparse updates throughout training to significantly cut down communication costs. Taken together, our method reduces communication costs by up to 10x over vanilla LoRA and up to 5x over more complex sparse LoRA baselines while achieving greater utility. We emphasize the importance of carefully applying sparsity and picking effective rank and sparsity configurations for federated LLM training.
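+ As an illustration of the second technique, here is a minimal sketch of top-k magnitude sparsification of a client's LoRA update before it is communicated to the server; the thesis' exact sparsity scheme and configurations may differ, so treat the names and the keep ratio as hypothetical.
+
+ import torch
+
+ def sparsify_update(delta: torch.Tensor, keep_ratio: float = 0.1) -> torch.Tensor:
+     """Keep only the largest-magnitude entries of a (LoRA) weight update."""
+     k = max(1, int(delta.numel() * keep_ratio))
+     threshold = torch.topk(delta.abs().flatten(), k).values.min()
+     return delta * (delta.abs() >= threshold)
+
+ lora_A_update = torch.randn(8, 4096)               # hypothetical rank-8 LoRA factor update
+ sparse_update = sparsify_update(lora_A_update, keep_ratio=0.1)
+ # Only ~10% of the entries (plus their indices) need to be sent to the server each round.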
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ DiffusionCounterfactuals: Inferring High-dimensional Counterfactuals + with Guidance of Causal Representations + + +
+ Accurate estimation of counterfactual outcomes in high-dimensional data is +crucial for decision-making and understanding causal relationships and +intervention outcomes in various domains, including healthcare, economics, and +social sciences. However, existing methods often struggle to generate accurate +and consistent counterfactuals, particularly when the causal relationships are +complex. We propose a novel framework that incorporates causal mechanisms and +diffusion models to generate high-quality counterfactual samples guided by +causal representation. Our approach introduces a novel, theoretically grounded +training and sampling process that enables the model to consistently generate +accurate counterfactual high-dimensional data under multiple intervention +steps. Experimental results on various synthetic and real benchmarks +demonstrate the proposed approach outperforms state-of-the-art methods in +generating accurate and high-quality counterfactuals, using different +evaluation metrics. + +
+
+
+
+
+ + ☆ Neuromorphic on-chip reservoir computing with spiking neural network + architectures + + +
+ Reservoir computing is a promising approach for harnessing the computational +power of recurrent neural networks while dramatically simplifying training. +This paper investigates the application of integrate-and-fire neurons within +reservoir computing frameworks for two distinct tasks: capturing chaotic +dynamics of the H\'enon map and forecasting the Mackey-Glass time series. +Integrate-and-fire neurons can be implemented in low-power neuromorphic +architectures such as Intel Loihi. We explore the impact of network topologies +created through random interactions on the reservoir's performance. Our study +reveals task-specific variations in network effectiveness, highlighting the +importance of tailored architectures for distinct computational tasks. To +identify optimal network configurations, we employ a meta-learning approach +combined with simulated annealing. This method efficiently explores the space +of possible network structures, identifying architectures that excel in +different scenarios. The resulting networks demonstrate a range of behaviors, +showcasing how inherent architectural features influence task-specific +capabilities. We study the reservoir computing performance using a custom +integrate-and-fire code, Intel's Lava neuromorphic computing software +framework, and via an on-chip implementation in Loihi. We conclude with an +analysis of the energy performance of the Loihi architecture. + +
+
+ comment: 19 pages, 9 figures; single column +
+
+
+
+
+ + ☆ Can LLMs be Fooled? Investigating Vulnerabilities in LLMs + + +
+ The advent of Large Language Models (LLMs) has garnered significant popularity and wielded immense power across various domains within Natural Language Processing (NLP). While their capabilities are undeniably impressive, it is crucial to identify and scrutinize their vulnerabilities, especially when those vulnerabilities can have costly consequences. One such LLM, trained to provide concise summaries of medical documents, could leak personal patient data when prompted surreptitiously. This is just one of many unfortunate examples that have been unveiled, and further research is necessary to comprehend the underlying reasons behind such vulnerabilities. In this study, we delve into multiple categories of vulnerabilities, namely model-based, training-time, and inference-time vulnerabilities, and discuss mitigation strategies, including "Model Editing", which aims at modifying LLM behavior, and "Chroma Teaming", which incorporates the synergy of multiple teaming strategies to enhance LLMs' resilience. This paper synthesizes the findings from each vulnerability category and proposes new directions for research and development. By understanding the focal points of current vulnerabilities, we can better anticipate and mitigate future risks, paving the road for more robust and secure LLMs.
+
+ comment: 14 pages, 1 figure. arXiv admin note: text overlap with + arXiv:2403.12503 +
+
+
+
+
+ + ☆ Machine Unlearning in Generative AI: A Survey + + +
+ Generative AI technologies have been deployed in many places, such as (multimodal) large language models and vision generative models. Their remarkable performance is largely attributable to massive training data and emergent reasoning abilities. However, these models can memorize and generate sensitive, biased, or dangerous information originating from the training data, especially data obtained from web crawls. New machine unlearning (MU) techniques are being developed to reduce or eliminate undesirable knowledge and its effects from the models, because techniques designed for traditional classification tasks cannot be directly applied to Generative AI. We offer a comprehensive survey of MU in Generative AI, covering a new problem formulation, evaluation methods, and a structured discussion of the advantages and limitations of different kinds of MU techniques. The survey also presents several critical challenges and promising directions in MU research. A curated list of readings can be found at: https://github.com/franciscoliu/GenAI-MU-Reading.
+
+
+
+
+ + ☆ Unveiling the Potential of Spiking Dynamics in Graph Representation + Learning through Spatial-Temporal Normalization and Coding Strategies + + +
+ In recent years, spiking neural networks (SNNs) have attracted substantial +interest due to their potential to replicate the energy-efficient and +event-driven processing of biological neurons. Despite this, the application of +SNNs in graph representation learning, particularly for non-Euclidean data, +remains underexplored, and the influence of spiking dynamics on graph learning +is not yet fully understood. This work seeks to address these gaps by examining +the unique properties and benefits of spiking dynamics in enhancing graph +representation learning. We propose a spike-based graph neural network model +that incorporates spiking dynamics, enhanced by a novel spatial-temporal +feature normalization (STFN) technique, to improve training efficiency and +model stability. Our detailed analysis explores the impact of rate coding and +temporal coding on SNN performance, offering new insights into their advantages +for deep graph networks and addressing challenges such as the oversmoothing +problem. Experimental results demonstrate that our SNN models can achieve +competitive performance with state-of-the-art graph neural networks (GNNs) +while considerably reducing computational costs, highlighting the potential of +SNNs for efficient neuromorphic computing applications in complex graph-based +scenarios. + +
+
+
+
+
+ + ☆ Boosting Efficiency in Task-Agnostic Exploration through Causal + Knowledge IJCAI'24 + + +
+ The effectiveness of model training heavily relies on the quality of +available training resources. However, budget constraints often impose +limitations on data collection efforts. To tackle this challenge, we introduce +causal exploration in this paper, a strategy that leverages the underlying +causal knowledge for both data collection and model training. We, in +particular, focus on enhancing the sample efficiency and reliability of the +world model learning within the domain of task-agnostic reinforcement learning. +During the exploration phase, the agent actively selects actions expected to +yield causal insights most beneficial for world model training. Concurrently, +the causal knowledge is acquired and incrementally refined with the ongoing +collection of data. We demonstrate that causal exploration aids in learning +accurate world models using fewer data and provide theoretical guarantees for +its convergence. Empirical experiments, on both synthetic data and real-world +applications, further validate the benefits of causal exploration. + +
+
+ comment: This paper was accepted by IJCAI'24 +
+
+
+
+
+ + ☆ A federated large language model for long-term time series forecasting + + +
+ Long-term time series forecasting in centralized environments poses unique +challenges regarding data privacy, communication overhead, and scalability. To +address these challenges, we propose FedTime, a federated large language model +(LLM) tailored for long-range time series prediction. Specifically, we +introduce a federated pre-trained LLM with fine-tuning and alignment +strategies. Prior to the learning process, we employ K-means clustering to +partition edge devices or clients into distinct clusters, thereby facilitating +more focused model training. We also incorporate channel independence and +patching to better preserve local semantic information, ensuring that important +contextual details are retained while minimizing the risk of information loss. +We demonstrate the effectiveness of our FedTime model through extensive +experiments on various real-world forecasting benchmarks, showcasing +substantial improvements over recent approaches. In addition, we demonstrate +the efficiency of FedTime in streamlining resource usage, resulting in reduced +communication overhead. + +
+
+
+
+
+ + ☆ Optimizing Long-tailed Link Prediction in Graph Neural Networks through + Structure Representation Enhancement + + +
+ Link prediction, as a fundamental task for graph neural networks (GNNs), has seen significant progress in varied domains. Its success is typically influenced by the expressive power of node representations, but recent developments reveal the inferior performance of low-degree nodes owing to their sparse neighbor connections, known as the degree-based long-tailed problem. Will the degree-based long-tailed distribution similarly constrain the efficacy of GNNs on link prediction? Unexpectedly, our study reveals that only a mild correlation exists between node degree and predictive accuracy and, more importantly, that the number of common neighbors between node pairs exhibits a strong correlation with accuracy. Considering that node pairs with fewer common neighbors, i.e., tail node pairs, make up a substantial fraction of the dataset but achieve worse performance, we propose that link prediction also faces the long-tailed problem. Therefore, link prediction in GNNs is greatly hindered by tail node pairs. Knowing this weakness, a natural question is how we can eliminate the negative effects of the skewed long-tailed distribution of common neighbors so as to improve the performance of link prediction. Towards this end, we introduce our long-tailed framework (LTLP), which is designed to enhance the performance of tail node pairs on link prediction by increasing their common neighbors. Two key modules in LTLP respectively supplement high-quality edges for tail node pairs and enforce representational alignment between head and tail node pairs within the same category, thereby improving the performance of tail node pairs.
+
+
+
+
+ + ☆ Toward Efficient Permutation for Hierarchical N:M Sparsity on GPUs + + +
+ N:M sparsity pruning is a powerful technique for compressing deep neural +networks, utilizing NVIDIA's Sparse Tensor Core technology. This method +benefits from hardware support for sparse indexing, enabling the adoption of +fine-grained sparsity to maintain model accuracy while minimizing the overhead +typically associated with irregular data access. Although restricted to a fixed +level of sparsity due to its reliance on hardware, N:M sparsity can be combined +with coarser sparsity techniques to achieve diverse compression ratios. +Initially, column-wise vector sparsity is applied to a dense model, followed by +row-wise N:M sparsity on the preserved column vectors. We call this multi-level +approach as hierarchical N:M (HiNM) sparsity. Similar to earlier single-level +sparsity techniques, HiNM sparsity necessitates an effective channel +permutation strategy to maximize the accuracy of the compressed networks. +However, it introduces further complexities by requiring the rearrangement of +both input and output channels, addressing challenges such as permutation +sequence, HiNM-sparsity-aware permutation, and maintaining consistency in +channel ordering across layers. In this paper, we introduce a channel +permutation method designed specifically for HiNM sparsity, named +gyro-permutation. This method is crafted to exploit the unique characteristics +of HiNM pruning, incorporating a strategic policy in each permutation phase, +including channel sampling, clustering, and assignment, to circumvent local +minima. Additionally, we have developed a GPU kernel that facilitates +independent layer permutation during the execution of HiNM sparse networks. Our +extensive experimental evaluations on various DNN models demonstrate that our +gyro-permutation significantly enhances the accuracy of HiNM sparse networks, +allowing them to reach performance levels comparable to those of unstructured +sparse networks. + +
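+ For reference, a minimal sketch of plain row-wise N:M pruning (here 2:4) is shown below; the preceding column-wise vector pruning and the proposed gyro-permutation are not reproduced.
+
+ import torch
+
+ def nm_prune(weight: torch.Tensor, n: int = 2, m: int = 4) -> torch.Tensor:
+     """Within every group of m consecutive weights along each row, keep the n largest magnitudes."""
+     rows, cols = weight.shape
+     assert cols % m == 0
+     groups = weight.reshape(rows, cols // m, m)
+     drop = groups.abs().topk(m - n, dim=-1, largest=False).indices   # smallest-magnitude positions
+     mask = torch.ones_like(groups)
+     mask.scatter_(-1, drop, 0.0)
+     return (groups * mask).reshape(rows, cols)
+
+ w = torch.randn(64, 128)
+ w_sparse = nm_prune(w, n=2, m=4)                   # every 4-element group now holds exactly 2 non-zeros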
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ A2SF: Accumulative Attention Scoring with Forgetting Factor for Token + Pruning in Transformer Decoder + + +
+ Recently, large language models (LLMs) based on transformers have been facing memory bottleneck issues due to the KV cache, especially in long-sequence handling. Previous research proposed KV cache compression techniques that identify insignificant tokens based on Accumulative Attention Scores and remove the corresponding items from the KV cache, noting that only a few tokens play an important role in attention operations. However, we have observed that the existing Accumulative Attention Score is not suitable for the transformer decoder structure. In the decoder model, the number of times an Attention Score accumulates varies depending on the order of token appearance due to the effect of masking, causing an uneven comparison between tokens. To solve this, we propose the Accumulative Attention Score with Forgetting Factor (A2SF) technique, which introduces a Forgetting Factor into the Attention Score accumulation process. A2SF applies a penalty to past Attention Scores generated from old tokens by repeatedly multiplying the Forgetting Factor into the Attention Score over time. Therefore, older tokens receive a larger penalty, providing fairness among tokens of different ages. Through this fair comparison among tokens, we can more effectively select important tokens. We verify the accuracy improvement of A2SF on the OPT and LLaMA models; A2SF improves the accuracy of LLaMA 2 by up to 7.8% and 5.1% in the 1-shot and 0-shot settings, respectively.
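+ A minimal sketch of the forgetting-factor accumulation described above (the value of the forgetting factor and its integration into KV cache eviction follow the paper):
+
+ import torch
+
+ def a2sf_scores(attn_rows, forget: float = 0.9) -> torch.Tensor:
+     """attn_rows[t] holds the attention weights of decoding step t over the first t+1 tokens.
+     Accumulated scores are decayed by `forget` at every step, so early tokens are not favored
+     merely because they have been attended more times."""
+     scores = torch.zeros(len(attn_rows))
+     for t, row in enumerate(attn_rows):
+         scores[: t + 1] = forget * scores[: t + 1] + row   # decay, then add this step's attention
+     return scores            # tokens with the lowest scores are candidates for eviction
+
+ rows = [torch.softmax(torch.randn(t + 1), dim=0) for t in range(6)]
+ print(a2sf_scores(rows, forget=0.9))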
+
+ comment: 11 pages(9 pages + reference 2 pages), 6 figures +
+
+
+
+
+ + ☆ Distribution Learning for Molecular Regression + + +
+ Using "soft" targets to improve model performance has been shown to be +effective in classification settings, but the usage of soft targets for +regression is a much less studied topic in machine learning. The existing +literature on the usage of soft targets for regression fails to properly assess +the method's limitations, and empirical evaluation is quite limited. In this +work, we assess the strengths and drawbacks of existing methods when applied to +molecular property regression tasks. Our assessment outlines key biases present +in existing methods and proposes methods to address them, evaluated through +careful ablation studies. We leverage these insights to propose Distributional +Mixture of Experts (DMoE): A model-independent, and data-independent method for +regression which trains a model to predict probability distributions of its +targets. Our proposed loss function combines the cross entropy between +predicted and target distributions and the L1 distance between their expected +values to produce a loss function that is robust to the outlined biases. We +evaluate the performance of DMoE on different molecular property prediction +datasets -- Open Catalyst (OC20), MD17, and QM9 -- across different backbone +model architectures -- SchNet, GemNet, and Graphormer. Our results demonstrate +that the proposed method is a promising alternative to classical regression for +molecular property prediction tasks, showing improvements over baselines on all +datasets and architectures. + +
+
+
+
+
+ + ☆ Relaxed Equivariant Graph Neural Networks + + +
+ 3D Euclidean symmetry equivariant neural networks have demonstrated notable +success in modeling complex physical systems. We introduce a framework for +relaxed $E(3)$ graph equivariant neural networks that can learn and represent +symmetry breaking within continuous groups. Building on the existing e3nn +framework, we propose the use of relaxed weights to allow for controlled +symmetry breaking. We show empirically that these relaxed weights learn the +correct amount of symmetry breaking. + +
+
+ comment: Extended abstract presented at the Geometry-grounded Representation + Learning and Generative Modeling Workshop (GRaM) at the 41st International + Conference on Machine Learning, July 2024, Vienna, Austria +
+
+
+
+
+ + ☆ Adaptive Pre-training Data Detection for Large Language Models via + Surprising Tokens + + +
+ While large language models (LLMs) are extensively used, there are rising concerns regarding privacy, security, and copyright due to their opaque training data, which puts the problem of detecting pre-training data on the table. Current solutions to this problem leverage techniques explored in machine learning privacy, such as Membership Inference Attacks (MIAs), which heavily depend on LLMs' capability for verbatim memorization. However, this reliance presents challenges, especially given the vast amount of training data and the restricted number of effective training epochs. In this paper, we propose an adaptive pre-training data detection method which alleviates this reliance and effectively amplifies the identification signal. Our method adaptively locates surprising tokens of the input. A token is surprising to an LLM if the prediction on the token is "certain but wrong", i.e., the Shannon entropy of the predictive distribution is low while the probability of the ground-truth token is also low at the same time. By using the prediction probability of surprising tokens as the measure of surprisal, the detection method builds on the simple hypothesis that seen data is less surprising to the model than unseen data. The method can be applied without any access to the pre-training data corpus or any additional training such as reference models. Our approach exhibits a consistent enhancement over existing methods in diverse experiments conducted on various benchmarks and models, achieving a maximum improvement of 29.5%. We also introduce a new benchmark, Dolma-Book, developed upon a novel framework, which employs book data collected both before and after model training to provide further evaluation.
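+ A minimal sketch of locating surprising tokens, i.e. positions where the model's next-token distribution has low Shannon entropy yet assigns low probability to the ground-truth token; the thresholds are illustrative, not the paper's.
+
+ import torch
+
+ def surprising_token_mask(logits, token_ids, entropy_max=1.0, prob_max=0.05):
+     """logits: (seq_len, vocab) next-token logits; token_ids: (seq_len,) ground-truth tokens."""
+     log_probs = torch.log_softmax(logits, dim=-1)
+     probs = log_probs.exp()
+     entropy = -(probs * log_probs).sum(dim=-1)                        # Shannon entropy per position
+     truth_prob = probs.gather(-1, token_ids.unsqueeze(-1)).squeeze(-1)
+     return (entropy < entropy_max) & (truth_prob < prob_max)          # "certain but wrong" positions
+
+ logits = torch.randn(10, 32000)
+ tokens = torch.randint(0, 32000, (10,))
+ mask = surprising_token_mask(logits, tokens)        # prediction probabilities at these positions feed the detector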
+
+
+
+
+ + ☆ Informed Correctors for Discrete Diffusion Models + + +
+ Discrete diffusion modeling is a promising framework for modeling and +generating data in discrete spaces. To sample from these models, different +strategies present trade-offs between computation and sample quality. A +predominant sampling strategy is predictor-corrector $\tau$-leaping, which +simulates the continuous time generative process with discretized predictor +steps and counteracts the accumulation of discretization error via corrector +steps. However, for absorbing state diffusion, an important class of discrete +diffusion models, the standard forward-backward corrector can be ineffective in +fixing such errors, resulting in subpar sample quality. To remedy this problem, +we propose a family of informed correctors that more reliably counteracts +discretization error by leveraging information learned by the model. For +further efficiency gains, we also propose $k$-Gillespie's, a sampling algorithm +that better utilizes each model evaluation, while still enjoying the speed and +flexibility of $\tau$-leaping. Across several real and synthetic datasets, we +show that $k$-Gillespie's with informed correctors reliably produces higher +quality samples at lower computational cost. + +
+
+
+
+
+ + ☆ GNUMAP: A Parameter-Free Approach to Unsupervised Dimensionality + Reduction via Graph Neural Networks + + +
+ With the proliferation of Graph Neural Network (GNN) methods stemming from +contrastive learning, unsupervised node representation learning for graph data +is rapidly gaining traction across various fields, from biology to molecular +dynamics, where it is often used as a dimensionality reduction tool. However, +there remains a significant gap in understanding the quality of the +low-dimensional node representations these methods produce, particularly beyond +well-curated academic datasets. To address this gap, we propose here the first +comprehensive benchmarking of various unsupervised node embedding techniques +tailored for dimensionality reduction, encompassing a range of manifold +learning tasks, along with various performance metrics. We emphasize the +sensitivity of current methods to hyperparameter choices -- highlighting a +fundamental issue as to their applicability in real-world settings where there +is no established methodology for rigorous hyperparameter selection. Addressing +this issue, we introduce GNUMAP, a robust and parameter-free method for +unsupervised node representation learning that merges the traditional UMAP +approach with the expressivity of the GNN framework. We show that GNUMAP +consistently outperforms existing state-of-the-art GNN embedding methods in a +variety of contexts, including synthetic geometric datasets, citation networks, +and real-world biomedical data -- making it a simple but reliable +dimensionality reduction tool. + +
+
+
+
+
+ + ☆ Towards an Integrated Performance Framework for Fire Science and + Management Workflows + + +
+ Reliable performance metrics are necessary prerequisites to building +large-scale end-to-end integrated workflows for collaborative scientific +research, particularly within context of use-inspired decision making platforms +with many concurrent users and when computing real-time and urgent results +using large data. This work is a building block for the National Data Platform, +which leverages multiple use-cases including the WIFIRE Data and Model Commons +for wildfire behavior modeling and the EarthScope Consortium for collaborative +geophysical research. This paper presents an artificial intelligence and +machine learning (AI/ML) approach to performance assessment and optimization of +scientific workflows. An associated early AI/ML framework spanning performance +data collection, prediction and optimization is applied to wildfire science +applications within the WIFIRE BurnPro3D (BP3D) platform for proactive fire +management and mitigation. + +
+
+
+
+
+ + ☆ DeepBaR: Fault Backdoor Attack on Deep Neural Network Layers + + +
+ Machine Learning using neural networks has received prominent attention +recently because of its success in solving a wide variety of computational +tasks, in particular in the field of computer vision. However, several works +have drawn attention to potential security risks involved with the training and +implementation of such networks. In this work, we introduce DeepBaR, a novel +approach that implants backdoors on neural networks by faulting their behavior +at training, especially during fine-tuning. Our technique aims to generate +adversarial samples by optimizing a custom loss function that mimics the +implanted backdoors while adding an almost non-visible trigger in the image. We +attack three popular convolutional neural network architectures and show that +DeepBaR attacks have a success rate of up to 98.30\%. Furthermore, DeepBaR does +not significantly affect the accuracy of the attacked networks after deployment +when non-malicious inputs are given. Remarkably, DeepBaR allows attackers to +choose an input that looks similar to a given class, from a human perspective, +but that will be classified as belonging to an arbitrary target class. + +
+
+
+
+
+ + ☆ NeuroSEM: A hybrid framework for simulating multiphysics problems by + coupling PINNs and spectral elements + + +
+ Multiphysics problems that are characterized by complex interactions among +fluid dynamics, heat transfer, structural mechanics, and electromagnetics, are +inherently challenging due to their coupled nature. While experimental data on +certain state variables may be available, integrating these data with numerical +solvers remains a significant challenge. Physics-informed neural networks +(PINNs) have shown promising results in various engineering disciplines, +particularly in handling noisy data and solving inverse problems. However, +their effectiveness in forecasting nonlinear phenomena in multiphysics regimes +is yet to be fully established. This study introduces NeuroSEM, a hybrid +framework integrating PINNs with the high-fidelity Spectral Element Method +(SEM) solver, Nektar++. NeuroSEM leverages strengths of both PINNs and SEM, +providing robust solutions for multiphysics problems. PINNs are trained to +assimilate data and model physical phenomena in specific subdomains, which are +then integrated into Nektar++. We demonstrate the efficiency and accuracy of +NeuroSEM for thermal convection in cavity flow and flow past a cylinder. The +framework effectively handles data assimilation by addressing those subdomains +and state variables where data are available. We applied NeuroSEM to the +Rayleigh-B\'enard convection system, including cases with missing thermal +boundary conditions. Our results indicate that NeuroSEM accurately models the +physical phenomena and assimilates the data within the specified subdomains. +The framework's plug-and-play nature facilitates its extension to other +multiphysics or multiscale problems. Furthermore, NeuroSEM is optimized for an +efficient execution on emerging integrated GPU-CPU architectures. This hybrid +approach enhances the accuracy and efficiency of simulations, making it a +powerful tool for tackling complex engineering challenges in various scientific +domains. + +
+
+
+
+
+ + ☆ Diffusion-Based Generation of Neural Activity from Disentangled Latent + Codes + + +
+ Recent advances in recording technology have allowed neuroscientists to +monitor activity from thousands of neurons simultaneously. Latent variable +models are increasingly valuable for distilling these recordings into compact +and interpretable representations. Here we propose a new approach to neural +data analysis that leverages advances in conditional generative modeling to +enable the unsupervised inference of disentangled behavioral variables from +recorded neural activity. Our approach builds on InfoDiffusion, which augments +diffusion models with a set of latent variables that capture important factors +of variation in the data. We apply our model, called Generating Neural +Observations Conditioned on Codes with High Information (GNOCCHI), to time +series neural data and test its application to synthetic and biological +recordings of neural activity during reaching. In comparison to a VAE-based +sequential autoencoder, GNOCCHI learns higher-quality latent spaces that are +more clearly structured and more disentangled with respect to key behavioral +variables. These properties enable accurate generation of novel samples (unseen +behavioral conditions) through simple linear traversal of the latent spaces +produced by GNOCCHI. Our work demonstrates the potential of unsupervised, +information-based models for the discovery of interpretable latent spaces from +neural data, enabling researchers to generate high-quality samples from unseen +conditions. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Analyzing Customer-Facing Vendor Experiences with Time Series + Forecasting and Monte Carlo Techniques + + +
+ eBay partners with external vendors, which allows customers to freely select +a vendor to complete their eBay experiences. However, vendor outages can hinder +customer experiences. Consequently, eBay can disable a problematic vendor to +prevent customer loss. Disabling the vendor too late risks losing customers +willing to switch to other vendors, while disabling it too early risks losing +those unwilling to switch. In this paper, we propose a data-driven solution to +answer whether eBay should disable a problematic vendor and when to disable it. +Our solution involves forecasting customer behavior. First, we use a +multiplicative seasonality model to represent behavior if all vendors are fully +functioning. Next, we use a Monte Carlo simulation to represent behavior if the +problematic vendor remains enabled. Finally, we use a linear model to represent +behavior if the vendor is disabled. By comparing these forecasts, we determine +the optimal time for eBay to disable the problematic vendor. + +
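+ The comparison logic can be sketched as follows, with purely illustrative numbers: a multiplicative-seasonality baseline for fully functioning vendors, a Monte Carlo forecast with the problematic vendor kept enabled, and a linear forecast with it disabled.
+
+ import numpy as np
+
+ rng = np.random.default_rng(1)
+ hours = np.arange(24 * 7)
+ seasonality = 1.0 + 0.3 * np.sin(2 * np.pi * hours / 24)             # daily pattern
+ baseline = 1000 * seasonality                                        # customers if all vendors are healthy
+
+ def forecast_keep_vendor(n_sims=2000, outage_prob=0.1, churn_per_outage=0.02):
+     """Monte Carlo: each simulated outage hour loses a fraction of the remaining customers."""
+     sims = np.empty((n_sims, hours.size))
+     for s in range(n_sims):
+         outages = rng.random(hours.size) < outage_prob
+         retention = np.cumprod(np.where(outages, 1 - churn_per_outage, 1.0))
+         sims[s] = baseline * retention
+     return sims.mean(axis=0)
+
+ def forecast_disable_vendor(switch_fraction=0.7):
+     """Linear model: customers unwilling to switch vendors are lost immediately, the rest stay."""
+     return baseline * switch_fraction
+
+ keep, disable = forecast_keep_vendor(), forecast_disable_vendor()
+ disable_now = disable.sum() > keep.sum()             # disable the vendor once this flips to True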
+
+
+
+
+ + ☆ GenRec: Generative Personalized Sequential Recommendation + + +
+ Sequential recommendation is the task of capturing hidden user preferences from historical user-item interaction data. Significant progress has been made in this domain by leveraging classification-based learning methods. Inspired by the recent 'pretrain, prompt and predict' paradigm in NLP, we treat sequential recommendation as a sequence-to-sequence generation task and propose a novel model named Generative Recommendation (GenRec). Unlike classification-based models that learn explicit user and item representations, GenRec utilizes the sequence modeling capability of the Transformer and adopts a masked item prediction objective to effectively learn hidden bidirectional sequential patterns. Different from existing generative sequential recommendation models, GenRec does not rely on manually designed hard prompts. The input to GenRec is a textual user-item sequence and the output is the top-ranked next items. Moreover, GenRec is lightweight and requires only a few hours to train effectively in low-resource settings, making it highly applicable to real-world scenarios and helping to democratize large language models in the sequential recommendation domain. Our extensive experiments demonstrate that GenRec generalizes across various public real-world datasets and achieves state-of-the-art results. Our experiments also validate the effectiveness of the proposed masked item prediction objective, which improves model performance by a large margin.
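+ A minimal sketch of a masked item prediction objective over a user-item interaction sequence; the tiny embedding model below is only a stand-in for GenRec's Transformer, and the masking ratio is illustrative.
+
+ import torch
+ import torch.nn.functional as F
+
+ def masked_item_loss(model, item_ids, mask_token, mask_prob=0.15):
+     """item_ids: (batch, seq_len) integer item ids. Randomly mask items and predict them back."""
+     mask = torch.rand_like(item_ids, dtype=torch.float) < mask_prob
+     inputs = item_ids.masked_fill(mask, mask_token)
+     logits = model(inputs)                            # (batch, seq_len, num_items)
+     return F.cross_entropy(logits[mask], item_ids[mask])
+
+ class TinyRec(torch.nn.Module):                       # stand-in sequence model
+     def __init__(self, num_items, dim=64):
+         super().__init__()
+         self.emb = torch.nn.Embedding(num_items + 1, dim)   # +1 for the mask token
+         self.out = torch.nn.Linear(dim, num_items)
+     def forward(self, x):
+         return self.out(self.emb(x))
+
+ num_items = 1000
+ model = TinyRec(num_items)
+ seqs = torch.randint(0, num_items, (8, 20))
+ loss = masked_item_loss(model, seqs, mask_token=num_items)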
+
+
+
+
+ + ☆ Multi-task Photonic Reservoir Computing: Wavelength Division + Multiplexing for Parallel Computing with a Silicon Microring Resonator + + +
+ Nowadays, as the ever-increasing demand for more powerful computing resources +continues, alternative advanced computing paradigms are under extensive +investigation. Significant effort has been made to deviate from conventional +Von Neumann architectures. In-memory computing has emerged in the field of +electronics as a possible solution to the infamous bottleneck between memory +and computing processors, which reduces the effective throughput of data. In +photonics, novel schemes attempt to collocate the computing processor and +memory in a single device. Photonics offers the flexibility of multiplexing +streams of data not only spatially and in time, but also in frequency or, +equivalently, in wavelength, which makes it highly suitable for parallel +computing. Here, we numerically show the use of time and wavelength division +multiplexing (WDM) to solve four independent tasks at the same time in a single +photonic chip, serving as a proof of concept for our proposal. The system is a +time-delay reservoir computing (TDRC) based on a microring resonator (MRR). The +addressed tasks cover different applications: Time-series prediction, waveform +signal classification, wireless channel equalization, and radar signal +prediction. The system is also tested for simultaneous computing of up to 10 +instances of the same task, exhibiting excellent performance. The footprint of +the system is reduced by using time-division multiplexing of the nodes that act +as the neurons of the studied neural network scheme. WDM is used for the +parallelization of wavelength channels, each addressing a single task. By +adjusting the input power and frequency of each optical channel, we can achieve +levels of performance for each of the tasks that are comparable to those quoted +in state-of-the-art reports focusing on single-task operation... + +
+
+ comment: Main text: 11 figures, 3 tables. Supplementary material: 2 figures, 4 + tables. The pre-print is under review in Frontiers: Advanced Optical + Technologies. The abstract is shorter than in the PDF file to comply with + arXiv requirements +
+
+
+
+
+ + ☆ Amelia: A Large Model and Dataset for Airport Surface Movement + Forecasting + + +
+ The growing demand for air travel requires technological advancements in air +traffic management as well as mechanisms for monitoring and ensuring safe and +efficient operations. In terminal airspaces, predictive models of future +movements and traffic flows can help with proactive planning and efficient +coordination; however, varying airport topologies, and interactions with other +agents, among other factors, make accurate predictions challenging. Data-driven +predictive models have shown promise for handling numerous variables to enable +various downstream tasks, including collision risk assessment, taxi-out time +prediction, departure metering, and emission estimations. While data-driven +methods have shown improvements in these tasks, prior works lack large-scale +curated surface movement datasets within the public domain and the development +of generalizable trajectory forecasting models. In response to this, we propose +two contributions: (1) Amelia-48, a large surface movement dataset collected +using the System Wide Information Management (SWIM) Surface Movement Event +Service (SMES). With data collection beginning in Dec 2022, the dataset +provides more than a year's worth of SMES data (~30TB) and covers 48 airports +within the US National Airspace System. In addition to releasing this data in +the public domain, we also provide post-processing scripts and associated +airport maps to enable research in the forecasting domain and beyond. (2) +Amelia-TF model, a transformer-based next-token-prediction large multi-agent +multi-airport trajectory forecasting model trained on 292 days or 9.4 billion +tokens of position data encompassing 10 different airports with varying +topology. The open-sourced model is validated on unseen airports with +experiments showcasing the different prediction horizon lengths, ego-agent +selection strategies, and training recipes to demonstrate the generalization +capabilities. + +
+
+ comment: 24 pages, 9 figures, 8 tables +
+
+
+
+
+ + ☆ Optical Computing for Deep Neural Network Acceleration: Foundations, + Recent Developments, and Emerging Directions + + +
+ Emerging artificial intelligence applications across the domains of computer +vision, natural language processing, graph processing, and sequence prediction +increasingly rely on deep neural networks (DNNs). These DNNs require +significant compute and memory resources for training and inference. +Traditional computing platforms such as CPUs, GPUs, and TPUs are struggling to +keep up with the demands of the increasingly complex and diverse DNNs. Optical +computing represents an exciting new paradigm for light-speed acceleration of +DNN workloads. In this article, we discuss the fundamentals and +state-of-the-art developments in optical computing, with an emphasis on DNN +acceleration. Various promising approaches are described for engineering +optical devices, enhancing optical circuits, and designing architectures that +can adapt optical computing to a variety of DNN workloads. Novel techniques for +hardware/software co-design that can intelligently tune and map DNN models to +improve performance and energy-efficiency on optical computing platforms across +high performance and resource constrained embedded, edge, and IoT platforms are +also discussed. Lastly, several open problems and future directions for +research in this domain are highlighted. + +
+
+
+
+
+ + ☆ DKL-KAN: Scalable Deep Kernel Learning using Kolmogorov-Arnold Networks + + +
+ The need for scalable and expressive models in machine learning is paramount, particularly in applications requiring both structural depth and flexibility. Traditional deep learning methods, such as multilayer perceptrons (MLPs), offer depth but lack the ability to integrate the structural characteristics of deep learning architectures with the non-parametric flexibility of kernel methods. To address this, deep kernel learning (DKL) was introduced, where the inputs to a base kernel are transformed using a deep learning architecture. These kernels can replace standard kernels, allowing both expressive power and scalability. The advent of Kolmogorov-Arnold Networks (KANs) has generated considerable attention and discussion among researchers in the scientific domain. In this paper, we introduce a scalable deep kernel using KANs (DKL-KAN) as an effective alternative to DKL using MLPs (DKL-MLP). Our approach involves simultaneously optimizing these kernel attributes using the marginal likelihood within a Gaussian process framework. We analyze two variants of DKL-KAN for a fair comparison with DKL-MLP: one with the same number of neurons and layers as DKL-MLP, and another with approximately the same number of trainable parameters. To handle large datasets, we use kernel interpolation for scalable structured Gaussian processes (KISS-GP) for low-dimensional inputs and KISS-GP with product kernels for high-dimensional inputs. The efficacy of DKL-KAN is evaluated in terms of computational training time and test prediction accuracy across a wide range of applications. Additionally, the effectiveness of DKL-KAN is examined in modeling discontinuities and accurately estimating prediction uncertainty. The results indicate that DKL-KAN outperforms DKL-MLP on datasets with a low number of observations. Conversely, DKL-MLP exhibits better scalability and higher test prediction accuracy on datasets with a large number of observations.
+
+
+
+
+ + ☆ Embedding Space Selection for Detecting Memorization and Fingerprinting + in Generative Models + + +
+ In the rapidly evolving landscape of artificial intelligence, generative models such as Generative Adversarial Networks (GANs) and Diffusion Models have become cornerstone technologies, driving innovation in diverse fields from art creation to healthcare. Despite their potential, these models face the significant challenge of data memorization, which poses risks to privacy and the integrity of generated content. Among various metrics for memorization detection, our study delves into memorization scores calculated from encoder-layer embeddings, which involves measuring distances between samples in the embedding spaces. In particular, we find that memorization scores calculated from the layer embeddings of Vision Transformers (ViTs) show a notable trend: the later (deeper) the layer, the less memorization is measured. We find that memorization scores from the early layers' embeddings are more sensitive to low-level memorization (e.g., colors and simple patterns in an image), while those from the later layers are more sensitive to high-level memorization (e.g., the semantic meaning of an image). We also observe that, for a specific model architecture, its degree of memorization at different levels of information is unique; it can be viewed as an inherent property of the architecture. Building upon this insight, we introduce a unique fingerprinting methodology. This method capitalizes on the distinct distributions of the memorization score across different layers of ViTs, providing a novel approach to identifying models involved in generating deepfakes and malicious content. Our approach demonstrates a marked 30% improvement in identification accuracy over existing baseline methods, offering a more effective tool for combating digital misinformation.
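+ A minimal sketch of one such embedding-space memorization score, assuming the distance-to-nearest-training-sample variant (the paper's exact metric may differ):
+
+ import torch
+
+ def memorization_scores(gen_emb: torch.Tensor, train_emb: torch.Tensor) -> torch.Tensor:
+     """gen_emb: (n_gen, d) and train_emb: (n_train, d) embeddings from one ViT layer.
+     Smaller distance to the nearest training sample indicates stronger memorization."""
+     dists = torch.cdist(gen_emb, train_emb)           # pairwise Euclidean distances
+     return dists.min(dim=1).values
+
+ gen = torch.randn(100, 768)                           # embeddings of generated samples (one layer)
+ train = torch.randn(5000, 768)                        # embeddings of training samples (same layer)
+ scores = memorization_scores(gen, train)              # repeating this per layer yields the fingerprint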
+
+
+
+
+ + ☆ Private Collaborative Edge Inference via Over-the-Air Computation + + +
+ We consider collaborative inference at the wireless edge, where each client's +model is trained independently on their local datasets. Clients are queried in +parallel to make an accurate decision collaboratively. In addition to +maximizing the inference accuracy, we also want to ensure the privacy of local +models. To this end, we leverage the superposition property of the multiple +access channel to implement bandwidth-efficient multi-user inference methods. +Specifically, we propose different methods for ensemble and multi-view +classification that exploit over-the-air computation. We show that these +schemes perform better than their orthogonal counterparts with statistically +significant differences while using fewer resources and providing privacy +guarantees. We also provide experimental results verifying the benefits of the +proposed over-the-air multi-user inference approach and perform an ablation +study to demonstrate the effectiveness of our design choices. We share the +source code of the framework publicly on Github to facilitate further research +and reproducibility. + +
+
+ comment: 15 pages, 8 figures. This work extends from our preliminary study + presented at the 2022 IEEE International Symposium on Information Theory [1]. + arXiv admin note: text overlap with arXiv:2202.03129 +
+
+
+
+
+ + ☆ Enhancing Deep Hedging of Options with Implied Volatility Surface + Feedback Information + + +
+ We present a dynamic hedging scheme for S&P 500 options, where rebalancing +decisions are enhanced by integrating information about the implied volatility +surface dynamics. The optimal hedging strategy is obtained through a deep +policy gradient-type reinforcement learning algorithm, with a novel hybrid +neural network architecture improving the training performance. The favorable +inclusion of forward-looking information embedded in the volatility surface +allows our procedure to outperform several conventional benchmarks such as +practitioner and smiled-implied delta hedging procedures, both in simulation +and backtesting experiments. + +
+
+
+
+
+ + ☆ Computational music analysis from first principles + + +
+ We use coupled hidden Markov models to automatically annotate the 371 Bach +chorales in the Riemenschneider edition, a corpus containing approximately +100,000 notes and 20,000 chords. We give three separate analyses that achieve +progressively greater accuracy at the cost of making increasingly strong +assumptions about musical syntax. Although our method makes almost no use of +human input, we are able to identify both chords and keys with an accuracy of +85% or greater when compared to an expert human analysis, resulting in +annotations accurate enough to be used for a range of music-theoretical +purposes, while also being free of subjective human judgments. Our work bears +on longstanding debates about the objective reality of the structures +postulated by standard Western harmonic theory, as well as on specific +questions about the nature of Western harmonic syntax. + +
+
+
+
+
+ + ☆ Zero Shot Health Trajectory Prediction Using Transformer + + +
+ Integrating modern machine learning and clinical decision-making has great +promise for mitigating healthcare's increasing cost and complexity. We +introduce the Enhanced Transformer for Health Outcome Simulation (ETHOS), a +novel application of the transformer deep-learning architecture for analyzing +high-dimensional, heterogeneous, and episodic health data. ETHOS is trained +using Patient Health Timelines (PHTs)-detailed, tokenized records of health +events-to predict future health trajectories, leveraging a zero-shot learning +approach. ETHOS represents a significant advancement in foundation model +development for healthcare analytics, eliminating the need for labeled data and +model fine-tuning. Its ability to simulate various treatment pathways and +consider patient-specific factors positions ETHOS as a tool for care +optimization and addressing biases in healthcare delivery. Future developments +will expand ETHOS' capabilities to incorporate a wider range of data types and +data sources. Our work demonstrates a pathway toward accelerated AI development +and deployment in healthcare. + +
+
+
+
+
+ + ☆ Taming the Frequency Factory of Sinusoidal Networks + + +
+ This work investigates the structure and representation capacity of +$sinusoidal$ MLPs, which have recently shown promising results in encoding +low-dimensional signals. This success can be attributed to its smoothness and +high representation capacity. The first allows the use of the network's +derivatives during training, enabling regularization. However, defining the +architecture and initializing its parameters to achieve a desired capacity +remains an empirical task. This work provides theoretical and experimental +results justifying the capacity property of sinusoidal MLPs and offers control +mechanisms for their initialization and training. + We approach this from a Fourier series perspective and link the training with +the model's spectrum. Our analysis is based on a $harmonic$ expansion of the +sinusoidal MLP, which says that the composition of sinusoidal layers produces a +large number of new frequencies expressed as integer linear combinations of the +input frequencies (weights of the input layer). We use this novel $identity$ to +initialize the input neurons which work as a sampling in the signal spectrum. +We also note that each hidden neuron produces the same frequencies with +amplitudes completely determined by the hidden weights. Finally, we give an +upper bound for these amplitudes, which results in a $bounding$ scheme for the +network's spectrum during training. + +
+
+
+
+
+ + ☆ Palu: Compressing KV-Cache with Low-Rank Projection + + +
+ KV-Cache compression methods generally sample a KV-Cache of effectual tokens or quantize it into lower bits. However, these methods cannot exploit the redundancy in the hidden dimension of the KV tensors. This paper investigates a unique hidden-dimension approach called Palu, a novel KV-Cache compression framework that utilizes low-rank projection. Palu decomposes the linear layers into low-rank matrices, caches the smaller intermediate states, and reconstructs the full keys and values on the fly. To improve accuracy, compression rate, and efficiency, Palu further encompasses (1) a medium-grained low-rank decomposition scheme, (2) an efficient rank search algorithm, (3) a low-rank-aware quantization algorithm, and (4) matrix fusion with optimized GPU kernels. Our extensive experiments with popular LLMs show that Palu can compress the KV-Cache by more than 91.25% while maintaining significantly better accuracy (up to 1.19 lower perplexity) than state-of-the-art KV-Cache quantization methods at similar or even higher memory usage. When compressing the KV-Cache by 50%, Palu delivers up to a 1.61x end-to-end speedup for the attention module. Our code is publicly available at https://github.com/shadowpa0327/Palu.
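+ A minimal sketch of the core low-rank idea: factor a key projection with a truncated SVD, cache the small latent instead of the full key, and reconstruct on the fly. Palu's medium-grained decomposition, rank search, quantization, and fused kernels are not shown, and the dimensions are illustrative.
+
+ import torch
+
+ def low_rank_factor(W: torch.Tensor, rank: int):
+     """W: (d_model, d_out) projection weight -> A: (d_model, rank), B: (rank, d_out)."""
+     U, S, Vh = torch.linalg.svd(W, full_matrices=False)
+     A = U[:, :rank] * S[:rank]                        # absorb singular values into the down-projection
+     B = Vh[:rank, :]
+     return A, B
+
+ d_model, d_out, rank = 4096, 128, 32
+ W_k = torch.randn(d_model, d_out) / d_model ** 0.5
+ A, B = low_rank_factor(W_k, rank)
+ h = torch.randn(1, d_model)                           # hidden state of one new token
+ latent = h @ A                                        # cached: `rank` values instead of `d_out`
+ approx_key = latent @ B                               # reconstructed on the fly, close to h @ W_k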
+
+
+
+
+ + ♻ ☆ Mixture of Nested Experts: Adaptive Processing of Visual Tokens + + +
+ The visual medium (images and videos) naturally contains a large amount of +information redundancy, thereby providing a great opportunity for leveraging +efficiency in processing. While Vision Transformer (ViT) based models scale +effectively to large data regimes, they fail to capitalize on this inherent +redundancy, leading to higher computational costs. Mixture of Experts (MoE) +networks demonstrate scalability while maintaining same inference-time costs, +but they come with a larger parameter footprint. We present Mixture of Nested +Experts (MoNE), which utilizes a nested structure for experts, wherein +individual experts fall on an increasing compute-accuracy curve. Given a +compute budget, MoNE learns to dynamically choose tokens in a priority order, +and thus redundant tokens are processed through cheaper nested experts. Using +this framework, we achieve equivalent performance as the baseline models, while +reducing inference time compute by over two-fold. We validate our approach on +standard image and video datasets - ImageNet-21K, Kinetics400, and +Something-Something-v2. We further highlight MoNE$'$s adaptability by +showcasing its ability to maintain strong performance across different +inference-time compute budgets on videos, using only a single trained model. + +
+
+
+
+
+ + ♻ ☆ Transfer learning for conflict and duplicate detection in software + requirement pairs + + +
+ Consistent and holistic expression of software requirements is important for +the success of software projects. In this study, we aim to enhance the +efficiency of the software development processes by automatically identifying +conflicting and duplicate software requirement specifications. We formulate the +conflict and duplicate detection problem as a requirement pair classification +task. We design a novel transformers-based architecture, SR-BERT, which +incorporates Sentence-BERT and Bi-encoders for the conflict and duplicate +identification task. Furthermore, we apply supervised multi-stage fine-tuning +to the pre-trained transformer models. We test the performance of different +transfer models using four different datasets. We find that sequentially +trained and fine-tuned transformer models perform well across the datasets with +SR-BERT achieving the best performance for larger datasets. We also explore the +cross-domain performance of conflict detection models and adopt a rule-based +filtering approach to validate the model classifications. Our analysis +indicates that the sentence pair classification approach and the proposed +transformer-based natural language processing strategies can contribute +significantly to achieving automation in conflict and duplicate detection + +
+
+
+
+
+ + ♻ ☆ On the Exploitation of DCT-Traces in the Generative-AI Domain + + +
+ Deepfakes represent one of the toughest challenges in the world of Cybersecurity and Digital Forensics, especially considering the high-quality results obtained with recent generative AI-based solutions. Almost all generative models leave unique traces in synthetic data that, if analyzed and identified in detail, can be exploited to improve the generalization limitations of existing deepfake detectors. In this paper we analyze, in the frequency domain, deepfake images generated by both GAN and Diffusion Model engines, examining in detail the underlying statistical distribution of Discrete Cosine Transform (DCT) coefficients. Recognizing that not all coefficients contribute equally to image detection, we hypothesize the existence of a unique "discriminative fingerprint" embedded in specific combinations of coefficients. To identify them, Machine Learning classifiers were trained on various combinations of coefficients. In addition, the Explainable AI (XAI) LIME algorithm was used to search for intrinsically discriminative combinations of coefficients. Finally, we performed a robustness test to analyze the persistence of the traces by applying JPEG compression. The experimental results reveal the existence of traces left by the generative models that are more discriminative and more persistent under JPEG attacks. Code and dataset are available at https://github.com/opontorno/dcts_analysis_deepfakes.
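+ A minimal sketch of extracting block-wise DCT coefficient statistics, the kind of per-coefficient features such classifiers can be trained on; the discriminative coefficient combinations themselves are learned in the paper, so this is only an illustrative feature extractor.
+
+ import numpy as np
+ from scipy.fft import dctn
+
+ def block_dct_stats(gray_image: np.ndarray, block: int = 8) -> np.ndarray:
+     """Per-coefficient standard deviation over all non-overlapping 8x8 DCT blocks of a grayscale image."""
+     h, w = gray_image.shape
+     h, w = h - h % block, w - w % block
+     blocks = gray_image[:h, :w].reshape(h // block, block, w // block, block).transpose(0, 2, 1, 3)
+     coeffs = dctn(blocks, axes=(-2, -1), norm="ortho")
+     return coeffs.reshape(-1, block * block).std(axis=0)   # 64 features per image
+
+ img = np.random.rand(256, 256)                        # stand-in for a real or generated image
+ features = block_dct_stats(img)                       # fed to an ML classifier (real vs. generated)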
+
+
+
+
+ + ♻ ☆ Sequential Knockoffs for Variable Selection in Reinforcement Learning + + +
+ In real-world applications of reinforcement learning, it is often challenging +to obtain a state representation that is parsimonious and satisfies the Markov +property without prior knowledge. Consequently, it is common practice to +construct a state larger than necessary, e.g., by concatenating measurements +over contiguous time points. However, needlessly increasing the dimension of +the state may slow learning and obfuscate the learned policy. We introduce the +notion of a minimal sufficient state in a Markov decision process (MDP) as the +subvector of the original state under which the process remains an MDP and +shares the same reward function as the original process. We propose a novel +SEquEntial Knockoffs (SEEK) algorithm that estimates the minimal sufficient +state in a system with high-dimensional complex nonlinear dynamics. In large +samples, the proposed method achieves selection consistency. As the method is +agnostic to the reinforcement learning algorithm being applied, it benefits +downstream tasks such as policy learning. Empirical experiments verify +theoretical results and show the proposed approach outperforms several +competing methods regarding variable selection accuracy and regret. + +
+
+
+
+
+ + ♻ ☆ Graph Reinforcement Learning in Power Grids: A Survey + + +
+ The challenges posed by renewable energy and distributed electricity
+generation motivate the development of deep learning approaches to overcome the
+lack of flexibility of traditional methods in power grid use cases. The
+application of graph neural networks (GNNs) is particularly promising due to
+their ability to learn from graph-structured data present in power grids.
+Combined with reinforcement learning (RL), they can serve as control approaches
+to determine remedial grid actions. This review analyses the ability of graph
+reinforcement learning (GRL) to capture the inherent graph structure of power
+grids to improve representation learning and decision making in different power
+grid use cases. It distinguishes between common problems in transmission and
+distribution grids and explores the synergy between RL and GNNs. In
+transmission grids, GRL typically addresses automated grid management and
+topology control, whereas on the distribution side, GRL concentrates more on
+voltage regulation. We analyzed the selected papers based on their graph
+structure and GNN model, the applied RL algorithm, and their overall
+contributions. Although GRL demonstrates adaptability in the face of
+unpredictable events and noisy or incomplete data, it primarily serves as a
+proof of concept at this stage. There are multiple open challenges and
+limitations that need to be addressed when considering the application of RL to
+real power grid operation.
+
+
+
+
+
+ + ♻ ☆ Learning reduced-order Quadratic-Linear models in Process Engineering + using Operator Inference + + +
+ In this work, we address the challenge of efficiently modeling dynamical
+systems in process engineering. We use reduced-order model learning,
+specifically operator inference, a non-intrusive, data-driven method for
+learning dynamical systems from time-domain data. We demonstrate its potential
+on carbon dioxide methanation, an important reaction within the Power-to-X
+framework. The numerical results show the ability of the reduced-order models
+constructed with operator inference to provide a reduced yet accurate surrogate
+solution. This represents an important milestone towards the implementation of
+fast and reliable digital twin architectures.
+
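Operator inference itself reduces to a linear least-squares problem, which the following sketch illustrates on synthetic data: given reduced states and their time derivatives, it fits a quadratic-linear model dx/dt ≈ A x + H (x ⊗ x) + c. The toy snapshots stand in for projected process-engineering data; regularisation and input terms are omitted.

```python
# Minimal sketch of operator inference: fit a quadratic-linear model
#   dx/dt ≈ A x + H (x ⊗ x) + c
# to reduced-state snapshots by linear least squares. Data are synthetic.
import numpy as np

rng = np.random.default_rng(1)
r, n_t = 3, 200                      # reduced dimension, number of snapshots
X = rng.normal(size=(r, n_t))        # reduced states (columns are snapshots)
Xdot = rng.normal(size=(r, n_t))     # corresponding time derivatives

# Build the data matrix [x; x⊗x; 1] for every snapshot.
quad = np.einsum("it,jt->ijt", X, X).reshape(r * r, n_t)   # Kronecker terms
D = np.vstack([X, quad, np.ones((1, n_t))])                # (r + r^2 + 1, n_t)

# Solve min ||O D - Xdot||_F for the stacked operator O = [A, H, c].
O, *_ = np.linalg.lstsq(D.T, Xdot.T, rcond=None)
O = O.T
A, H, c = O[:, :r], O[:, r:r + r * r], O[:, -1]
print(A.shape, H.shape, c.shape)     # (3, 3) (3, 9) (3,)
```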
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Fast Multipole Attention: A Divide-and-Conquer Attention Mechanism for + Long Sequences + + +
+ Transformer-based models have achieved state-of-the-art performance in many +areas. However, the quadratic complexity of self-attention with respect to the +input length hinders the applicability of Transformer-based models to long +sequences. To address this, we present Fast Multipole Attention, a new +attention mechanism that uses a divide-and-conquer strategy to reduce the time +and memory complexity of attention for sequences of length $n$ from +$\mathcal{O}(n^2)$ to $\mathcal{O}(n \log n)$ or $O(n)$, while retaining a +global receptive field. The hierarchical approach groups queries, keys, and +values into $\mathcal{O}( \log n)$ levels of resolution, where groups at +greater distances are increasingly larger in size and the weights to compute +group quantities are learned. As such, the interaction between tokens far from +each other is considered in lower resolution in an efficient hierarchical +manner. The overall complexity of Fast Multipole Attention is $\mathcal{O}(n)$ +or $\mathcal{O}(n \log n)$, depending on whether the queries are down-sampled +or not. This multi-level divide-and-conquer strategy is inspired by fast +summation methods from $n$-body physics and the Fast Multipole Method. We +perform evaluation on autoregressive and bidirectional language modeling tasks +and compare our Fast Multipole Attention model with other efficient attention +variants on medium-size datasets. We find empirically that the Fast Multipole +Transformer performs much better than other efficient transformers in terms of +memory size and accuracy. The Fast Multipole Attention mechanism has the +potential to empower large language models with much greater sequence lengths, +taking the full context into account in an efficient, naturally hierarchical +manner during training and when generating long sequences. + +
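A drastically simplified two-level variant conveys the divide-and-conquer idea (the actual method uses learned, multi-level groupings): each query attends at full resolution to keys in its own block and at coarse resolution to one pooled key/value per distant block. Block size and mean pooling are illustrative assumptions.

```python
# Toy two-level sketch of the divide-and-conquer idea (not the full Fast
# Multipole Attention): queries see their own block at full resolution and
# distant blocks only through one pooled key/value each.
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def two_level_attention(Q, K, V, block=4):
    n, d = Q.shape
    nb = n // block
    Kb, Vb = K.reshape(nb, block, d), V.reshape(nb, block, d)
    K_coarse, V_coarse = Kb.mean(axis=1), Vb.mean(axis=1)   # one summary per block
    out = np.zeros_like(Q)
    for b in range(nb):
        sl = slice(b * block, (b + 1) * block)
        far = np.arange(nb) != b          # distant blocks seen only in coarse form
        keys = np.vstack([K[sl], K_coarse[far]])
        vals = np.vstack([V[sl], V_coarse[far]])
        attn = softmax(Q[sl] @ keys.T / np.sqrt(d))
        out[sl] = attn @ vals
    return out

rng = np.random.default_rng(0)
Q, K, V = (rng.normal(size=(16, 8)) for _ in range(3))
print(two_level_attention(Q, K, V).shape)   # (16, 8)
```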
+
+
+
+
+ + ♻ ☆ Semi-supervised learning via DQN for log anomaly detection + + +
+ Log anomaly detection is a critical component in modern software system
+security and maintenance, serving as a crucial support and basis for system
+monitoring, operation, and troubleshooting. It aids operations personnel in the
+timely identification and resolution of issues. However, current methods in log
+anomaly detection still face challenges such as underutilization of unlabeled
+data, imbalance between normal and anomaly class data, and high rates of false
+positives and false negatives, leading to insufficient effectiveness in anomaly
+recognition. In this study, we propose a semi-supervised log anomaly detection
+method named DQNLog, which integrates deep reinforcement learning to enhance
+anomaly detection performance by leveraging a small amount of labeled data and
+large-scale unlabeled data. To address issues of imbalanced data and
+insufficient labeling, we design a state transition function biased towards
+anomalies based on cosine similarity, aiming to capture semantically similar
+anomalies rather than favoring the majority class. To enhance the model's
+capability in learning anomalies, we devise a joint reward function that
+encourages the model to utilize labeled anomalies and explore unlabeled
+anomalies, thereby reducing false positives and false negatives. Additionally,
+to prevent the model from deviating from normal trajectories due to
+misestimation, we introduce a regularization term in the loss function to
+ensure the model retains prior knowledge during updates. We evaluate DQNLog on
+three widely used datasets, demonstrating its ability to effectively utilize
+large-scale unlabeled data and achieve promising results across all
+experimental datasets.
+
+
+
+
+
+ + ♻ ☆ Bayesian Hierarchical Probabilistic Forecasting of Intraday Electricity + Prices + + +
+ We present a first study of Bayesian forecasting of electricity prices traded
+on the German continuous intraday market which fully incorporates parameter
+uncertainty. A particularly large set of endogenous and exogenous covariables
+is used, handled through feature selection with Orthogonal Matching Pursuit
+(OMP) and regularising priors. Our target variable is the IDFull price index,
+and forecasts are given in terms of posterior predictive distributions. For
+validation we use the exceedingly volatile electricity prices of 2022, which
+have hardly been the subject of forecasting studies before. As a benchmark
+model, we use all available intraday transactions at the time of forecast
+creation to compute a current value for the IDFull. According to the weak-form
+efficiency hypothesis, it would not be possible to significantly improve this
+benchmark built from last price information. We do, however, observe
+statistically significant improvement in terms of both point measures and
+probability scores. Finally, we challenge the declared gold standard of using
+LASSO for feature selection in electricity price forecasting by presenting
+strong statistical evidence that OMP leads to better forecasting performance.
+
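The feature-selection step described here can be reproduced with scikit-learn's OrthogonalMatchingPursuit; the sketch below recovers a small set of informative covariates from synthetic data standing in for the intraday covariables, with the number of non-zero coefficients chosen arbitrarily.

```python
# Minimal sketch of the feature-selection step: Orthogonal Matching Pursuit
# picks a sparse set of covariates for a price-like target before any
# downstream Bayesian model is fit. Data below are synthetic stand-ins.
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 40))                    # candidate covariates
beta = np.zeros(40); beta[[2, 7, 19]] = [1.5, -2.0, 0.8]
y = X @ beta + rng.normal(scale=0.5, size=500)    # IDFull-like target (toy)

omp = OrthogonalMatchingPursuit(n_nonzero_coefs=5).fit(X, y)
selected = np.flatnonzero(omp.coef_)
print("selected covariates:", selected)           # should include 2, 7, 19
```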
+
+ comment: 22 pages, 14 figures, 4 tables. Revised version with an added + schematic figure. Under review for Applied Energy +
+
+
+
+
+ + ♻ ☆ Accounting for shared covariates in semi-parametric Bayesian additive + regression trees + + +
+ We propose some extensions to semi-parametric models based on Bayesian +additive regression trees (BART). In the semi-parametric BART paradigm, the +response variable is approximated by a linear predictor and a BART model, where +the linear component is responsible for estimating the main effects and BART +accounts for non-specified interactions and non-linearities. Previous +semi-parametric models based on BART have assumed that the set of covariates in +the linear predictor and the BART model are mutually exclusive in an attempt to +avoid poor coverage properties and reduce bias in the estimates of the +parameters in the linear predictor. The main novelty in our approach lies in +the way we change the tree-generation moves in BART to deal with this bias and +resolve non-identifiability issues between the parametric and non-parametric +components, even when they have covariates in common. This allows us to model +complex interactions involving the covariates of primary interest, both among +themselves and with those in the BART component. Our novel method is developed +with a view to analysing data from an international education assessment, where +certain predictors of students' achievements in mathematics are of particular +interpretational interest. Through additional simulation studies and another +application to a well-known benchmark dataset, we also show competitive +performance when compared to regression models, alternative formulations of +semi-parametric BART, and other tree-based methods. The implementation of the +proposed method is available at \url{https://github.com/ebprado/CSP-BART}. + +
+
+ comment: 48 pages, 8 tables, 10 figures +
+
+
+
+
+ + ♻ ☆ Versatile audio-visual learning for emotion recognition + + +
+ Most current audio-visual emotion recognition models lack the flexibility
+needed for deployment in practical applications. We envision a multimodal
+system that works even when only one modality is available and can be
+implemented interchangeably for either predicting emotional attributes or
+recognizing categorical emotions. Achieving such flexibility in a multimodal
+emotion recognition system is difficult due to the inherent challenges in
+accurately interpreting and integrating varied data sources. It is also a
+challenge to robustly handle missing or partial information while allowing a
+direct switch between regression and classification tasks. This study proposes
+a versatile audio-visual learning (VAVL) framework for handling unimodal and
+multimodal systems for emotion regression or emotion classification tasks. We
+implement an audio-visual framework that can be trained even when paired
+audio-visual data are not available for part of the training set (i.e., only
+audio or only video is present). We achieve this effective representation
+learning with audio-visual shared layers, residual connections over shared
+layers, and a unimodal reconstruction task. Our experimental results reveal
+that our architecture significantly outperforms strong baselines on the
+CREMA-D, MSP-IMPROV, and CMU-MOSEI corpora. Notably, VAVL attains a new
+state-of-the-art performance in the emotional attribute prediction task on the
+MSP-IMPROV corpus.
+
+
+ comment: 18 pages, 4 Figures, 3 tables (published at IEEE Transactions on + Affective Computing) +
+
+
+
+
+ + ♻ ☆ Large Language Models Assume People are More Rational than We Really are + + +
+ In order for AI systems to communicate effectively with people, they must +understand how we make decisions. However, people's decisions are not always +rational, so the implicit internal models of human decision-making in Large +Language Models (LLMs) must account for this. Previous empirical evidence seems +to suggest that these implicit models are accurate -- LLMs offer believable +proxies of human behavior, acting how we expect humans would in everyday +interactions. However, by comparing LLM behavior and predictions to a large +dataset of human decisions, we find that this is actually not the case: when +both simulating and predicting people's choices, a suite of cutting-edge LLMs +(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more +rational than we really are. Specifically, these models deviate from human +behavior and align more closely with a classic model of rational choice -- +expected value theory. Interestingly, people also tend to assume that other +people are rational when interpreting their behavior. As a consequence, when we +compare the inferences that LLMs and people draw from the decisions of others +using another psychological dataset, we find that these inferences are highly +correlated. Thus, the implicit decision-making models of LLMs appear to be +aligned with the human expectation that other people will act rationally, +rather than with how people actually act. + +
+
+
+
+
+ + ♻ ☆ Neural networks for bifurcation and linear stability analysis of steady + states in partial differential equations + + +
+ This research introduces an extended application of neural networks for +solving nonlinear partial differential equations (PDEs). A neural network, +combined with a pseudo-arclength continuation, is proposed to construct +bifurcation diagrams from parameterized nonlinear PDEs. Additionally, a neural +network approach is also presented for solving eigenvalue problems to analyze +solution linear stability, focusing on identifying the largest eigenvalue. The +effectiveness of the proposed neural network is examined through experiments on +the Bratu equation and the Burgers equation. Results from a finite difference +method are also presented as comparison. Varying numbers of grid points are +employed in each case to assess the behavior and accuracy of both the neural +network and the finite difference method. The experimental results demonstrate +that the proposed neural network produces better solutions, generates more +accurate bifurcation diagrams, has reasonable computational times, and proves +effective for linear stability analysis. + +
+
+ comment: Accepted for publication in Applied Mathematics and Computation +
+
+
+
+
+ + ♻ ☆ Light and Optimal Schrödinger Bridge Matching + + +
+ Schr\"odinger Bridges (SB) have recently gained the attention of the ML +community as a promising extension of classic diffusion models which is also +interconnected to the Entropic Optimal Transport (EOT). Recent solvers for SB +exploit the pervasive bridge matching procedures. Such procedures aim to +recover a stochastic process transporting the mass between distributions given +only a transport plan between them. In particular, given the EOT plan, these +procedures can be adapted to solve SB. This fact is heavily exploited by recent +works giving rise to matching-based SB solvers. The cornerstone here is +recovering the EOT plan: recent works either use heuristical approximations +(e.g., the minibatch OT) or establish iterative matching procedures which by +the design accumulate the error during the training. We address these +limitations and propose a novel procedure to learn SB which we call the +\textbf{optimal Schr\"odinger bridge matching}. It exploits the optimal +parameterization of the diffusion process and provably recovers the SB process +\textbf{(a)} with a single bridge matching step and \textbf{(b)} with arbitrary +transport plan as the input. Furthermore, we show that the optimal bridge +matching objective coincides with the recently discovered energy-based modeling +(EBM) objectives to learn EOT/SB. Inspired by this observation, we develop a +light solver (which we call LightSB-M) to implement optimal matching in +practice using the Gaussian mixture parameterization of the adjusted +Schr\"odinger potential. We experimentally showcase the performance of our +solver in a range of practical tasks. The code for our solver can be found at +https://github.com/SKholkin/LightSB-Matching. + +
+
+
+
+
+ + ♻ ☆ WindsorML: High-Fidelity Computational Fluid Dynamics Dataset For + Automotive Aerodynamics + + +
+ This paper presents a new open-source high-fidelity dataset for Machine
+Learning (ML) containing 355 geometric variants of the Windsor body, to help
+the development and testing of ML surrogate models for external automotive
+aerodynamics. Each Computational Fluid Dynamics (CFD) simulation was run with
+GPU-native high-fidelity Wall-Modeled Large-Eddy Simulation (WMLES) using a
+Cartesian immersed-boundary method with more than 280M cells to ensure the
+greatest possible accuracy. The dataset contains geometry variants that exhibit
+a wide range of flow characteristics representative of those observed on road
+cars. The dataset itself contains the 3D time-averaged volume & boundary data
+as well as the geometry and force & moment coefficients. This paper discusses
+the validation of the underlying CFD methods as well as the contents and
+structure of the dataset. To the authors' knowledge, this represents the first
+large-scale, high-fidelity CFD dataset for the Windsor body with a permissive
+open-source license (CC-BY-SA).
+
+
+
+
+
+ + ♻ ☆ Synthetic Image Learning: Preserving Performance and Preventing + Membership Inference Attacks + + +
+ Generative artificial intelligence has transformed the generation of +synthetic data, providing innovative solutions to challenges like data scarcity +and privacy, which are particularly critical in fields such as medicine. +However, the effective use of this synthetic data to train high-performance +models remains a significant challenge. This paper addresses this issue by +introducing Knowledge Recycling (KR), a pipeline designed to optimise the +generation and use of synthetic data for training downstream classifiers. At +the heart of this pipeline is Generative Knowledge Distillation (GKD), the +proposed technique that significantly improves the quality and usefulness of +the information provided to classifiers through a synthetic dataset +regeneration and soft labelling mechanism. The KR pipeline has been tested on a +variety of datasets, with a focus on six highly heterogeneous medical image +datasets, ranging from retinal images to organ scans. The results show a +significant reduction in the performance gap between models trained on real and +synthetic data, with models based on synthetic data outperforming those trained +on real data in some cases. Furthermore, the resulting models show almost +complete immunity to Membership Inference Attacks, manifesting privacy +properties missing in models trained with conventional techniques. + +
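The soft-labelling mechanism at the heart of GKD can be sketched as standard knowledge distillation applied to synthetic samples: a teacher trained on real data provides temperature-smoothed labels and the downstream classifier is trained against them. Model sizes, temperature, and data are toy assumptions; the full KR pipeline also regenerates the synthetic dataset, which is not shown.

```python
# Minimal sketch of the soft-labelling idea: a teacher assigns temperature-
# smoothed soft labels to synthetic samples and a student classifier is
# trained on those soft labels with a KL-divergence loss.
import torch
import torch.nn as nn
import torch.nn.functional as F

teacher = nn.Linear(32, 5)                 # stands in for a trained classifier
student = nn.Linear(32, 5)
opt = torch.optim.Adam(student.parameters(), lr=1e-2)
T = 2.0                                    # softening temperature

synthetic = torch.randn(256, 32)           # stands in for generated images
with torch.no_grad():
    soft_labels = F.softmax(teacher(synthetic) / T, dim=1)

for _ in range(100):
    opt.zero_grad()
    log_probs = F.log_softmax(student(synthetic) / T, dim=1)
    loss = F.kl_div(log_probs, soft_labels, reduction="batchmean") * T * T
    loss.backward()
    opt.step()
print("final distillation loss:", loss.item())
```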
+
+
+
+
+ + ♻ ☆ Multi-Agent, Human-Agent and Beyond: A Survey on Cooperation in Social + Dilemmas + + +
+ The study of cooperation within social dilemmas has long been a fundamental +topic across various disciplines, including computer science and social +science. Recent advancements in Artificial Intelligence (AI) have significantly +reshaped this field, offering fresh insights into understanding and enhancing +cooperation. This survey examines three key areas at the intersection of AI and +cooperation in social dilemmas. First, focusing on multi-agent cooperation, we +review the intrinsic and external motivations that support cooperation among +rational agents, and the methods employed to develop effective strategies +against diverse opponents. Second, looking into human-agent cooperation, we +discuss the current AI algorithms for cooperating with humans and the human +biases towards AI agents. Third, we review the emergent field of leveraging AI +agents to enhance cooperation among humans. We conclude by discussing future +research avenues, such as using large language models, establishing unified +theoretical frameworks, revisiting existing theories of human cooperation, and +exploring multiple real-world applications. + +
+
+
+
+
+ + ♻ ☆ Forecasting Tropical Cyclones with Cascaded Diffusion Models ICLR 2024 + + +
+ As tropical cyclones become more intense due to climate change, the rise of
+AI-based modelling provides a more affordable and accessible approach compared
+to traditional methods based on mathematical models. This work leverages
+generative diffusion models to forecast cyclone trajectories and precipitation
+patterns by integrating satellite imaging, remote sensing, and atmospheric
+data. It employs a cascaded approach that incorporates three main tasks:
+forecasting, super-resolution, and precipitation modelling. The training
+dataset includes 51 cyclones from six major tropical cyclone basins from
+January 2019 - March 2023. Experiments demonstrate that the final forecasts
+from the cascaded models show accurate predictions up to a 36-hour rollout,
+with excellent Structural Similarity (SSIM) and Peak Signal-to-Noise Ratio
+(PSNR) values exceeding 0.5 and 20 dB, respectively, for all three tasks. The
+36-hour forecasts can be produced in as little as 30 minutes on a single Nvidia
+A30/RTX 2080 Ti. This work also highlights the promising efficiency of AI
+methods such as diffusion models for high-performance needs in weather
+forecasting, such as tropical cyclone forecasting, while remaining
+computationally affordable, making them ideal for highly vulnerable regions
+with critical forecasting needs and financial limitations. Code accessible at
+https://github.com/nathzi1505/forecast-diffmodels.
+
+
+ comment: Accepted for poster presentation at the ICLR 2024 workshop on + Tackling Climate Change with Machine Learning. 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ DualTime: A Dual-Adapter Multimodal Language Model for Time Series + Representation + + +
+ The recent rapid development of language models (LMs) has attracted attention
+in the field of time series, including multimodal time series modeling.
+However, we note that current time series multimodal methods are biased, often
+assigning a primary role to one modality while the other assumes a secondary
+role. They overlook the mutual benefits and complementarity of different
+modalities. For example, in seizure diagnosis, relying solely on textual
+clinical reports makes it difficult to pinpoint the area and type of the
+disease, while electroencephalograms (EEGs) alone cannot provide an accurate
+diagnosis without considering the symptoms. In this study, based on the
+complementary information mining of time series multimodal data, we propose
+DualTime, a Dual-adapter multimodal language model for Time series
+representation implementing temporal-primary and textual-primary modeling
+simultaneously. By injecting lightweight adaptation tokens, the LM pipeline
+shared by dual adapters encourages embedding alignment and achieves efficient
+fine-tuning. Empirically, our method outperforms state-of-the-art models in
+both supervised and unsupervised settings, highlighting the complementary
+benefits of different modalities. In addition, we conduct few-shot label
+transfer experiments, which further verify the transferability and
+expressiveness of our proposed DualTime.
+
+
+ comment: 15 pages, 12 figure, 5 tables +
+
+
+
+
+ + ♻ ☆ Improving Zero-shot Generalization of Learned Prompts via Unsupervised + Knowledge Distillation ECCV24 + + +
+ Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization +to unseen tasks, but fall short of the performance of supervised methods in +generalizing to downstream tasks with limited data. Prompt learning is emerging +as a parameter-efficient method for adapting VLMs, but state-of-the-art +approaches require annotated samples. In this paper we propose a novel approach +to prompt learning based on unsupervised knowledge distillation from more +powerful models. Our approach, which we call Knowledge Distillation Prompt +Learning (KDPL), can be integrated into existing prompt learning techniques and +eliminates the need for labeled examples during adaptation. Our experiments on +more than ten standard benchmark datasets demonstrate that KDPL is very +effective at improving generalization of learned prompts for zero-shot domain +generalization, zero-shot cross-dataset generalization, and zero-shot +base-to-novel class generalization problems. KDPL requires no ground-truth +labels for adaptation, and moreover we show that even in the absence of any +knowledge of training class names it can be used to effectively transfer +knowledge. The code is publicly available at https://github.com/miccunifi/KDPL. + +
+
+ comment: Accepted for publication at ECCV24 +
+
+
+
+
+ + ♻ ☆ Adaptive Bounding Box Uncertainties via Two-Step Conformal Prediction ECCV + + +
+ Quantifying a model's predictive uncertainty is essential for safety-critical +applications such as autonomous driving. We consider quantifying such +uncertainty for multi-object detection. In particular, we leverage conformal +prediction to obtain uncertainty intervals with guaranteed coverage for object +bounding boxes. One challenge in doing so is that bounding box predictions are +conditioned on the object's class label. Thus, we develop a novel two-step +conformal approach that propagates uncertainty in predicted class labels into +the uncertainty intervals of bounding boxes. This broadens the validity of our +conformal coverage guarantees to include incorrectly classified objects, thus +offering more actionable safety assurances. Moreover, we investigate novel +ensemble and quantile regression formulations to ensure the bounding box +intervals are adaptive to object size, leading to a more balanced coverage. +Validating our two-step approach on real-world datasets for 2D bounding box +localization, we find that desired coverage levels are satisfied with +practically tight predictive uncertainty intervals. + +
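The basic split-conformal machinery underlying the approach is short to write down; the sketch below calibrates an interval for a single box coordinate from absolute-error nonconformity scores. The two-step propagation of class-label uncertainty, which is the paper's contribution, is not shown; data are synthetic.

```python
# Minimal sketch of split conformal intervals for one box coordinate:
# nonconformity scores from a calibration set give a quantile that widens a
# point prediction into an interval with finite-sample coverage.
import numpy as np

rng = np.random.default_rng(0)
n_cal, alpha = 500, 0.1
y_true = rng.uniform(0, 100, size=n_cal)              # calibration ground truth
y_pred = y_true + rng.normal(scale=3.0, size=n_cal)   # detector predictions

scores = np.abs(y_true - y_pred)                      # nonconformity scores
q_level = np.ceil((n_cal + 1) * (1 - alpha)) / n_cal
qhat = np.quantile(scores, q_level, method="higher")

new_pred = 42.0                                       # a new box coordinate
print(f"{1 - alpha:.0%} interval: [{new_pred - qhat:.2f}, {new_pred + qhat:.2f}]")
```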
+
+ comment: European Conference on Computer Vision (ECCV) 2024; 37 pages, 14 + figures, 6 tables (incl. appendix) +
+
+
+
+
+ + ♻ ☆ F-KANs: Federated Kolmogorov-Arnold Networks + + +
+ In this paper, we present an innovative federated learning (FL) approach that
+utilizes Kolmogorov-Arnold Networks (KANs) for classification tasks. By
+utilizing the adaptive activation capabilities of KANs in a federated
+framework, we aim to improve classification capabilities while preserving
+privacy. The study evaluates the performance of federated KANs (F-KANs)
+compared to traditional Multi-Layer Perceptrons (MLPs) on a classification
+task. The results show that the F-KANs model significantly outperforms the
+federated MLP model in terms of accuracy, precision, recall, F1 score and
+stability, paving the way for more efficient and privacy-preserving predictive
+analytics.
+
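The federated side of the method follows the usual FedAvg pattern; the sketch below averages client parameters weighted by local dataset size, with plain MLPs standing in for the KAN clients (an assumption, since KAN layer implementations vary and the paper's code should be consulted for the actual models).

```python
# Minimal sketch of the federated part: FedAvg-style weighted averaging of
# client model parameters. Plain MLPs stand in for the KAN clients.
import copy
import torch
import torch.nn as nn

def make_model():
    return nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 3))

def fed_avg(global_model, client_models, client_sizes):
    total = sum(client_sizes)
    new_state = copy.deepcopy(global_model.state_dict())
    for key in new_state:
        new_state[key] = sum(
            m.state_dict()[key] * (n / total)
            for m, n in zip(client_models, client_sizes)
        )
    global_model.load_state_dict(new_state)
    return global_model

global_model = make_model()
clients = [copy.deepcopy(global_model) for _ in range(3)]
# ... each client would run local training on its own data here ...
global_model = fed_avg(global_model, clients, client_sizes=[100, 250, 75])
print("aggregated parameters:", sum(p.numel() for p in global_model.parameters()))
```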
+
+ comment: This work has been submitted to IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible. Related Code: https://github.com/ezeydan/F-KANs.git +
+
+
+
+
+ + ♻ ☆ ALERT-Transformer: Bridging Asynchronous and Synchronous Machine + Learning for Real-Time Event-based Spatio-Temporal Data ICML 2024 + + +
+ We seek to enable classic processing of continuous ultra-sparse
+spatiotemporal data generated by event-based sensors with dense machine
+learning models. We propose a novel hybrid pipeline composed of asynchronous
+sensing and synchronous processing that combines several ideas: (1) an
+embedding based on PointNet models -- the ALERT module -- that can continuously
+integrate new and dismiss old events thanks to a leakage mechanism, (2) a
+flexible readout of the embedded data that allows any downstream model to be
+fed with always up-to-date features at any sampling rate, (3) exploiting the
+input sparsity in a patch-based approach inspired by Vision Transformer to
+optimize the efficiency of the method. These embeddings are then processed by a
+transformer model trained for object and gesture recognition. Using this
+approach, we achieve state-of-the-art performance with lower latency than
+competitors. We also demonstrate that our asynchronous model can operate at any
+desired sampling rate.
+
+
+ comment: Originally published in the Proceedings of Machine Learning Research + ICML 2024 +
+
+
+
+
+ + ♻ ☆ Generative Learning for Simulation of Vehicle Faults + + +
+ We develop a novel generative model to simulate vehicle health and forecast +faults, conditioned on practical operational considerations. The model, trained +on data from the US Army's Predictive Logistics program, aims to support +predictive maintenance. It forecasts faults far enough in advance to execute a +maintenance intervention before a breakdown occurs. The model incorporates +real-world factors that affect vehicle health. It also allows us to understand +the vehicle's condition by analyzing operating data, and characterizing each +vehicle into discrete states. Importantly, the model predicts the time to first +fault with high accuracy. We compare its performance to other models and +demonstrate its successful training. + +
+
+
+
+
+ + ♻ ☆ ECATS: Explainable-by-design concept-based anomaly detection for time + series + + +
+ Deep learning methods for time series have already reached excellent +performances in both prediction and classification tasks, including anomaly +detection. However, the complexity inherent in Cyber Physical Systems (CPS) +creates a challenge when it comes to explainability methods. To overcome this +inherent lack of interpretability, we propose ECATS, a concept-based +neuro-symbolic architecture where concepts are represented as Signal Temporal +Logic (STL) formulae. Leveraging kernel-based methods for STL, concept +embeddings are learnt in an unsupervised manner through a cross-attention +mechanism. The network makes class predictions through these concept +embeddings, allowing for a meaningful explanation to be naturally extracted for +each input. Our preliminary experiments with a simple CPS-based dataset show +that our model is able to achieve great classification performance while +ensuring local interpretability. + +
+
+ comment: 14 pages, 8 figures, accepted to 18th International Conference on + Neural-Symbolic Learning and Reasoning (NeSy 2024) +
+
+
+
+
+ + ♻ ☆ A survey of machine learning techniques in medical applications + + +
+ In recent years, machine learning (ML) has emerged as a powerful tool for +solving a wide range of problems, including medical decision-making. The +exponential growth of medical data over the past two decades has surpassed the +capacity for manual analysis, prompting increased interest in automated data +analysis and processing. ML algorithms, capable of learning from data with +minimal human intervention, are particularly well-suited for medical data +analysis and interpretation. One significant advantage of ML is the reduced +cost of collecting labeled training data necessary for supervised learning. +While numerous studies have explored the applications of ML in medicine, this +survey specifically focuses on the use of ML across various medical research +fields. We provide a comprehensive technical overview of existing studies on ML +applications in medicine, highlighting the strengths and limitations of these +approaches. Additionally, we discuss potential research directions for future +exploration. These include the development of more sophisticated reward +functions, as the accuracy of the reward function is crucial for ML +performance, the integration of ML with other techniques, and the application +of ML to new and emerging areas in genomics research. Finally, we summarize our +findings and present the current state of the field and the future outlook for +ML in medical application. + +
+
+
+
+
+ + ♻ ☆ Attacking Cooperative Multi-Agent Reinforcement Learning by Adversarial + Minority Influence + + +
+ This study probes the vulnerabilities of cooperative multi-agent
+reinforcement learning (c-MARL) under adversarial attacks, a critical
+determinant of c-MARL's worst-case performance prior to real-world
+implementation. Current observation-based attacks, constrained by white-box
+assumptions, overlook c-MARL's complex multi-agent interactions and cooperative
+objectives, resulting in impractical and limited attack capabilities. To
+address these shortcomings, we propose Adversarial Minority Influence (AMI), a
+practical and strong attack for c-MARL. AMI is practical as a black-box attack
+that can be launched without knowing victim parameters. AMI is also strong in
+that it considers the complex multi-agent interaction and the cooperative goal
+of agents, enabling a single adversarial agent to unilaterally mislead the
+majority of victims into forming targeted worst-case cooperation. This mirrors
+minority influence phenomena in social psychology. To achieve maximum deviation
+in victim policies under complex agent-wise interactions, our unilateral attack
+aims to characterize and maximize the impact of the adversary on the victims.
+This is achieved by adapting a unilateral agent-wise relation metric derived
+from mutual information, thereby mitigating the adverse effects of victim
+influence on the adversary. To lead the victims into a jointly detrimental
+scenario, our targeted attack deceives victims into a long-term, cooperatively
+harmful situation by guiding each victim towards a specific target, determined
+through a trial-and-error process executed by a reinforcement learning agent.
+Through AMI, we achieve the first successful attack against real-world robot
+swarms and effectively fool agents in simulated environments into collectively
+worst-case scenarios, including Starcraft II and Multi-agent Mujoco. The source
+code and demonstrations can be found at: https://github.com/DIG-Beihang/AMI.
+
+
+
+
+
+ + ♻ ☆ Can we Constrain Concept Bottleneck Models to Learn Semantically + Meaningful Input Features? + + +
+ Concept Bottleneck Models (CBMs) are regarded as inherently interpretable
+because they first predict a set of human-defined concepts which are used to
+predict a task label. For inherent interpretability to be fully realised, and
+to ensure trust in a model's output, it is desirable for concept predictions to
+use semantically meaningful input features. For instance, in an image, pixels
+representing a broken bone should contribute to predicting a fracture. However,
+current literature suggests that concept predictions often rely on irrelevant
+input features. We hypothesise that this occurs when dataset labels include
+inaccurate concept annotations, or the relationship between input features and
+concepts is unclear. In general, the effect of dataset labelling on concept
+representations remains an understudied area. In this paper, we demonstrate
+that CBMs can learn to map concepts to semantically meaningful input features,
+by utilising datasets with a clear link between the input features and the
+desired concept predictions. This is achieved, for instance, by ensuring
+multiple concepts do not always co-occur and therefore provide a clear
+training signal for the CBM to distinguish the relevant input features for each
+concept. We validate our hypothesis on both synthetic and real-world image
+datasets, and demonstrate that, under the correct conditions, CBMs can learn to
+attribute semantically meaningful input features to the correct concept
+predictions.
+
+
+ comment: Main paper: 8 pages, 9 figures, Appendix: 14 pages, 21 figures. This + paper is a preprint +
+
+
+
+
+ + ♻ ☆ Data Imputation from the Perspective of Graph Dirichlet Energy + + +
+ Data imputation is a crucial task due to the widespread occurrence of missing +data. Many methods adopt a two-step approach: initially crafting a preliminary +imputation (the "draft") and then refining it to produce the final missing data +imputation result, commonly referred to as "draft-then-refine". In our study, +we examine this prevalent strategy through the lens of graph Dirichlet energy. +We observe that a basic "draft" imputation tends to decrease the Dirichlet +energy. Therefore, a subsequent "refine" step is necessary to restore the +overall energy balance. Existing refinement techniques, such as the Graph +Convolutional Network (GCN), often result in further energy reduction. To +address this, we introduce a new framework, the Graph Laplacian Pyramid Network +(GLPN). GLPN incorporates a U-shaped autoencoder and residual networks to +capture both global and local details effectively. Through extensive +experiments on multiple real-world datasets, GLPN consistently outperforms +state-of-the-art methods across three different missing data mechanisms. The +code is available at https://github.com/liguanlue/GLPN. + +
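The quantity driving this analysis, the graph Dirichlet energy x^T L x, is simple to compute. The toy example below shows how a smooth neighbour-averaging "draft" imputation reduces it, which is the effect the "refine" step must counteract; the graph and features are made up for illustration.

```python
# Minimal sketch of the graph Dirichlet energy x^T L x and of how a smooth
# neighbour-mean "draft" imputation tends to decrease it.
import numpy as np

A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)      # adjacency of a small graph
L = np.diag(A.sum(axis=1)) - A                 # graph Laplacian

def dirichlet_energy(x, L):
    return float(x.T @ L @ x)

x = np.array([3.0, -1.0, 2.0, 5.0])            # node feature (one channel)
x_draft = A @ x / A.sum(axis=1)                # neighbour-mean "draft" imputation

print("energy before draft:", dirichlet_energy(x, L))
print("energy after draft: ", dirichlet_energy(x_draft, L))
```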
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Spiking Framework for Graph Neural Networks + + +
+ The integration of Spiking Neural Networks (SNNs) and Graph Neural Networks
+(GNNs) is gradually attracting attention due to the low power consumption and
+high efficiency in processing the non-Euclidean data represented by graphs.
+However, as a common problem, dynamic graph representation learning faces
+challenges such as high complexity and large memory overheads. Current work
+often uses SNNs instead of Recurrent Neural Networks (RNNs), adopting binary
+features instead of continuous ones for efficient training, which overlooks
+graph structure information and leads to the loss of details during
+propagation. Additionally, optimizing dynamic spiking models typically requires
+propagation of information across time steps, which increases memory
+requirements. To address these challenges, we present a framework named Dynamic
+Spiking Graph Neural Networks. To mitigate the information loss problem, the
+proposed framework propagates early-layer information directly to the last
+layer for information compensation. To accommodate the memory requirements, we
+apply implicit differentiation on the equilibrium state, which does not rely on
+the exact reverse of the forward computation. While traditional implicit
+differentiation methods are usually used for static situations, our framework
+extends them to the dynamic graph setting. Extensive experiments on three
+large-scale real-world dynamic graph datasets validate the effectiveness of the
+proposed framework on dynamic node classification tasks with lower
+computational costs.
+
+
+
+
+
+ + ♻ ☆ A Survey of Imitation Learning Methods, Environments and Metrics + + +
+ Imitation learning is an approach in which an agent learns how to execute a +task by trying to mimic how one or more teachers perform it. This learning +approach offers a compromise between the time it takes to learn a new task and +the effort needed to collect teacher samples for the agent. It achieves this by +balancing learning from the teacher, who has some information on how to perform +the task, and deviating from their examples when necessary, such as states not +present in the teacher samples. Consequently, the field of imitation learning +has received much attention from researchers in recent years, resulting in many +new methods and applications. However, with this increase in published work and +past surveys focusing mainly on methodology, a lack of standardisation became +more prominent in the field. This non-standardisation is evident in the use of +environments, which appear in no more than two works, and evaluation processes, +such as qualitative analysis, that have become rare in current literature. In +this survey, we systematically review current imitation learning literature and +present our findings by (i) classifying imitation learning techniques, +environments and metrics by introducing novel taxonomies; (ii) reflecting on +main problems from the literature; and (iii) presenting challenges and future +directions for researchers. + +
+
+
+
+
+ + ♻ ☆ Classification of freshwater snails of the genus Radomaniola with + multimodal triplet networks ICML 2024 + + +
+ In this paper, we present our first proposal of a machine learning system for +the classification of freshwater snails of the genus Radomaniola. We elaborate +on the specific challenges encountered during system design, and how we tackled +them; namely a small, very imbalanced dataset with a high number of classes and +high visual similarity between classes. We then show how we employed triplet +networks and the multiple input modalities of images, measurements, and genetic +information to overcome these challenges and reach a performance comparable to +that of a trained domain expert. + +
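The multimodal triplet setup can be sketched as two embedding branches fused into one metric space and trained with a triplet margin loss; all dimensions are toy assumptions, and a genetic-information branch would be added analogously as a third input.

```python
# Minimal sketch of a multimodal triplet network: image and measurement
# features are embedded, fused, and trained with a triplet margin loss so that
# same-class specimens are pulled together and different classes pushed apart.
import torch
import torch.nn as nn

class SnailEmbedder(nn.Module):
    def __init__(self, img_dim=512, meas_dim=10, emb_dim=64):
        super().__init__()
        self.img_branch = nn.Linear(img_dim, emb_dim)
        self.meas_branch = nn.Linear(meas_dim, emb_dim)
        self.fuse = nn.Linear(2 * emb_dim, emb_dim)

    def forward(self, img_feat, meas_feat):
        z = torch.cat([self.img_branch(img_feat), self.meas_branch(meas_feat)], dim=1)
        return nn.functional.normalize(self.fuse(z), dim=1)

model = SnailEmbedder()
loss_fn = nn.TripletMarginLoss(margin=0.2)

anchor = model(torch.randn(8, 512), torch.randn(8, 10))
positive = model(torch.randn(8, 512), torch.randn(8, 10))   # same class as anchor
negative = model(torch.randn(8, 512), torch.randn(8, 10))   # different class
loss = loss_fn(anchor, positive, negative)
loss.backward()
print("triplet loss:", loss.item())
```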
+
+ comment: Spotlight at ICML 2024 AI for Science workshop +
+
+
+
+
+ + ♻ ☆ Optimizing Adaptive Experiments: A Unified Approach to Regret + Minimization and Best-Arm Identification + + +
+ Practitioners conducting adaptive experiments often encounter two competing +priorities: maximizing total welfare (or `reward') through effective treatment +assignment and swiftly concluding experiments to implement population-wide +treatments. Current literature addresses these priorities separately, with +regret minimization studies focusing on the former and best-arm identification +research on the latter. This paper bridges this divide by proposing a unified +model that simultaneously accounts for within-experiment performance and +post-experiment outcomes. We provide a sharp theory of optimal performance in +large populations that not only unifies canonical results in the literature but +also uncovers novel insights. Our theory reveals that familiar algorithms, such +as the recently proposed top-two Thompson sampling algorithm, can optimize a +broad class of objectives if a single scalar parameter is appropriately +adjusted. In addition, we demonstrate that substantial reductions in experiment +duration can often be achieved with minimal impact on both within-experiment +and post-experiment regret. + +
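The top-two Thompson sampling algorithm mentioned here, with its single scalar parameter, fits in a few lines for Bernoulli arms; the sketch below uses a leader-probability of 0.5 and toy arm means, and is only a sketch of the standard algorithm rather than the paper's unified objective.

```python
# Minimal sketch of top-two Thompson sampling for Bernoulli arms: with
# probability beta play the posterior leader, otherwise resample until a
# different "challenger" arm leads the posterior draw.
import numpy as np

rng = np.random.default_rng(0)
true_means = np.array([0.30, 0.50, 0.55])
alpha = np.ones(3); beta_param = np.ones(3)    # Beta(1, 1) priors
beta_ratio = 0.5                               # probability of playing the leader

for t in range(2000):
    leader = int(np.argmax(rng.beta(alpha, beta_param)))
    arm = leader
    if rng.random() >= beta_ratio:
        while arm == leader:                   # resample until a challenger leads
            arm = int(np.argmax(rng.beta(alpha, beta_param)))
    reward = rng.random() < true_means[arm]
    alpha[arm] += reward
    beta_param[arm] += 1 - reward

print("posterior means:", alpha / (alpha + beta_param))
```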
+
+
+
+
+ + ♻ ☆ Noise-Tolerant Few-Shot Unsupervised Adapter for Vision-Language Models BMVC 2024 + + +
+ Recent advances in large-scale vision-language models have achieved +impressive performance in various zero-shot image classification tasks. While +prior studies have demonstrated significant improvements by introducing +few-shot labelled target samples, they still require labelling of target +samples, which greatly degrades their scalability and generalizability while +handling various visual recognition tasks. We design NtUA, a Noise-tolerant +Unsupervised Adapter that allows the learning of effective target models with +few unlabelled target samples. NtUA works as a key-value cache that formulates +visual features and predicted pseudo-labels of the few unlabelled target +samples as key-value pairs. It consists of two complementary designs. The first +is adaptive cache formation that combats pseudo-label noises by weighting the +key-value pairs according to their prediction confidence. The second is +knowledge-guided cache refinement, which refines pair values (i.e., +pseudo-labels) and cache weights by leveraging knowledge distillation from +large-scale vision language models. Extensive experiments show that NtUA +achieves superior performance consistently across multiple widely adopted +benchmarks. + +
+
+ comment: Accepted at BMVC 2024 +
+
+
+
+
+ + ♻ ☆ DOMAIN: MilDly COnservative Model-BAsed OfflINe Reinforcement Learning + + +
+ Model-based reinforcement learning (RL), which learns an environment model
+from an offline dataset and generates more out-of-distribution model data, has
+become an effective approach to the problem of distribution shift in offline
+RL. Due to the gap between the learned and actual environment, conservatism
+should be incorporated into the algorithm to balance accurate offline data and
+imprecise model data. The conservatism of current algorithms mostly relies on
+model uncertainty estimation. However, uncertainty estimation is unreliable and
+leads to poor performance in certain scenarios, and previous methods ignore
+differences between the model data, which results in excessive conservatism.
+Therefore, this paper proposes a milDly cOnservative Model-bAsed offlINe RL
+algorithm (DOMAIN) that does not require estimating model uncertainty, to
+address the above issues. DOMAIN introduces an adaptive sampling distribution
+of model samples, which can adaptively adjust the model data penalty. We
+theoretically demonstrate that the Q value learned by DOMAIN outside the data
+region is a lower bound of the true Q value, that DOMAIN is less conservative
+than previous model-based offline RL algorithms, and that it carries a safe
+policy improvement guarantee. The results of extensive experiments show that
+DOMAIN outperforms prior RL algorithms on the D4RL dataset benchmark.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Robust Federated Learning for Wireless Networks: A Demonstration with + Channel Estimation + + +
+ Federated learning (FL) offers a privacy-preserving collaborative approach
+for training models in wireless networks, with channel estimation emerging as a
+promising application. Despite extensive studies on FL-empowered channel
+estimation, the security concerns associated with FL require meticulous
+attention. In a scenario where small base stations (SBSs) serve as local models
+trained on cached data, and a macro base station (MBS) functions as the global
+model setting, an attacker can exploit the vulnerability of FL by launching
+various adversarial attacks or deployment tactics. In this paper, we analyze
+such vulnerabilities, put forward corresponding solutions, and validate them
+through simulation.
+
+
+ comment: Submitted to IEEE GLOBECOM 2024 +
+
+
+
+
+ + ♻ ☆ SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed + Semi-Supervised Learning ICML2024 + + +
+ Recent advancements in semi-supervised learning have focused on a more +realistic yet challenging task: addressing imbalances in labeled data while the +class distribution of unlabeled data remains both unknown and potentially +mismatched. Current approaches in this sphere often presuppose rigid +assumptions regarding the class distribution of unlabeled data, thereby +limiting the adaptability of models to only certain distribution ranges. In +this study, we propose a novel approach, introducing a highly adaptable +framework, designated as SimPro, which does not rely on any predefined +assumptions about the distribution of unlabeled data. Our framework, grounded +in a probabilistic model, innovatively refines the expectation-maximization +(EM) algorithm by explicitly decoupling the modeling of conditional and +marginal class distributions. This separation facilitates a closed-form +solution for class distribution estimation during the maximization phase, +leading to the formulation of a Bayes classifier. The Bayes classifier, in +turn, enhances the quality of pseudo-labels in the expectation phase. +Remarkably, the SimPro framework not only comes with theoretical guarantees but +also is straightforward to implement. Moreover, we introduce two novel class +distributions broadening the scope of the evaluation. Our method showcases +consistent state-of-the-art performance across diverse benchmarks and data +distribution scenarios. Our code is available at +https://github.com/LeapLabTHU/SimPro. + +
+
+ comment: ICML2024 camera-ready version +
+
+
+
+
+ + ♻ ☆ ISMRNN: An Implicitly Segmented RNN Method with Mamba for Long-Term Time + Series Forecasting + + +
+ Long time series forecasting aims to utilize historical information to
+forecast future states over extended horizons. Traditional RNN-based series
+forecasting methods struggle to effectively address long-term dependencies and
+gradient issues in long time series problems. Recently, SegRNN has emerged as a
+leading RNN-based model tailored for long-term series forecasting,
+demonstrating state-of-the-art performance while maintaining a streamlined
+architecture through innovative segmentation and parallel decoding techniques.
+Nevertheless, SegRNN has several limitations: its fixed segmentation disrupts
+data continuity and fails to effectively leverage information across different
+segments, and its segmentation strategy does not fundamentally address the
+issue of information loss within the recurrent structure. To address these
+issues, we propose the ISMRNN method with three key enhancements: we introduce
+an implicit segmentation structure to decompose the time series and map it to
+segmented hidden states, resulting in denser information exchange during the
+segmentation phase. Additionally, we incorporate residual structures in the
+encoding layer to mitigate information loss within the recurrent structure. To
+extract information more effectively, we further integrate the Mamba
+architecture to enhance time series information extraction. Experiments on
+several real-world long time series forecasting datasets demonstrate that our
+model surpasses the performance of current state-of-the-art models.
+
+
+
+
+
+ + ♻ ☆ Deep Causal Learning: Representation, Discovery and Inference + + +
+ Causal learning has garnered significant attention in recent years because it +reveals the essential relationships that underpin phenomena and delineates the +mechanisms by which the world evolves. Nevertheless, traditional causal +learning methods face numerous challenges and limitations, including +high-dimensional, unstructured variables, combinatorial optimization problems, +unobserved confounders, selection biases, and estimation inaccuracies. Deep +causal learning, which leverages deep neural networks, offers innovative +insights and solutions for addressing these challenges. Although numerous deep +learning-based methods for causal discovery and inference have been proposed, +there remains a dearth of reviews examining the underlying mechanisms by which +deep learning can enhance causal learning. In this article, we comprehensively +review how deep learning can contribute to causal learning by tackling +traditional challenges across three key dimensions: representation, discovery, +and inference. We emphasize that deep causal learning is pivotal for advancing +the theoretical frontiers and broadening the practical applications of causal +science. We conclude by summarizing open issues and outlining potential +directions for future research. + +
+
+
+
+
+ + ♻ ☆ Computable learning of natural hypothesis classes + + +
+ This paper is about the recent notion of computably probably approximately
+correct learning, which lies between statistical learning theory, where there
+is no computational requirement on the learner, and efficient PAC learning,
+where the learner must be polynomially bounded. Examples have recently been
+given of hypothesis classes which are PAC learnable but not computably PAC
+learnable, but these hypothesis classes are unnatural or non-canonical in the
+sense that they depend on a numbering of proofs, formulas, or programs. We use
+the on-a-cone machinery from computability theory to prove that, under mild
+assumptions such as the hypothesis class being computably listable, any natural
+hypothesis class which is learnable must be computably learnable. Thus the
+counterexamples given previously are necessarily unnatural.
+
+
+ comment: This is a replacement of the earlier submission to just update the + funding information +
+
+
+
+
+ + ♻ ☆ Adaptive Self-supervised Robust Clustering for Unstructured Data with + Unknown Cluster Number + + +
+ We introduce a novel self-supervised deep clustering approach tailored for
+unstructured data without requiring prior knowledge of the number of clusters,
+termed Adaptive Self-supervised Robust Clustering (ASRC). In particular, ASRC
+adaptively learns the graph structure and edge weights to capture both local
+and global structural information. The obtained graph enables us to learn
+clustering-friendly feature representations by an enhanced graph auto-encoder
+with contrastive learning technique. It further leverages the clustering
+results adaptively obtained by robust continuous clustering (RCC) to generate
+prototypes for negative sampling, which can further contribute to promoting
+consistency among positive pairs and enlarging the gap between positive and
+negative samples. ASRC obtains the final clustering results by applying RCC to
+the learned feature representations with their consistent graph structure and
+edge weights. Extensive experiments conducted on seven benchmark datasets
+demonstrate the efficacy of ASRC and its superior performance over other
+popular clustering models. Notably, ASRC even outperforms methods that rely on
+prior knowledge of the number of clusters, highlighting its effectiveness in
+addressing the challenges of clustering unstructured data.
+
+
+
+
+
+ + ♻ ☆ Advocating for the Silent: Enhancing Federated Generalization for + Non-Participating Clients + + +
+ Federated Learning (FL) has surged in prominence due to its capability of +collaborative model training without direct data sharing. However, the vast +disparity in local data distributions among clients, often termed the +Non-Independent Identically Distributed (Non-IID) challenge, poses a +significant hurdle to FL's generalization efficacy. The scenario becomes even +more complex when not all clients participate in the training process, a common +occurrence due to unstable network connections or limited computational +capacities. This can greatly complicate the assessment of the trained models' +generalization abilities. While a plethora of recent studies has centered on +the generalization gap pertaining to unseen data from participating clients +with diverse distributions, the distinction between the training distributions +of participating clients and the testing distributions of non-participating +ones has been largely overlooked. In response, our paper unveils an +information-theoretic generalization framework for FL. Specifically, it +quantifies generalization errors by evaluating the information entropy of local +distributions and discerning discrepancies across these distributions. Inspired +by our deduced generalization bounds, we introduce a weighted aggregation +approach and a duo of client selection strategies. These innovations are +designed to strengthen FL's ability to generalize and thus ensure that trained +models perform better on non-participating clients by incorporating a more +diverse range of client data distributions. Our extensive empirical evaluations +reaffirm the potency of our proposed methods, aligning seamlessly with our +theoretical construct. + +
+
+ comment: Submitted to IEEE TNNLS, under minor revision +
+
+
+
+
+ + ♻ ☆ Evidential Uncertainty Sets in Deep Classifiers Using Conformal + Prediction + + +
+ In this paper, we propose the Evidential Conformal Prediction (ECP) method
+for image classifiers to generate conformal prediction sets. Our method is
+designed based on a non-conformity score function that has its roots in
+Evidential Deep Learning (EDL) as a method of quantifying model (epistemic)
+uncertainty in DNN classifiers. We use evidence derived from the logit values
+of target labels to compute the components of our non-conformity score
+function: the heuristic notion of uncertainty in CP, uncertainty surprisal, and
+expected utility. Our extensive experimental evaluation demonstrates that ECP
+outperforms three state-of-the-art methods for generating CP sets, in terms of
+their set sizes and adaptivity while maintaining the coverage of true labels.
+
+
+ comment: Accepted in 13th Symposium on Conformal and Probabilistic Prediction + with Applications (COPA2024). To be published in the Proceedings of Machine + Learning Research (PMLR), vol. 230, 2024 (25 Pages) +
+
+
+
+
+ + ♻ ☆ Improved Robustness and Hyperparameter Selection in Modern Hopfield + Networks + + +
+ The modern Hopfield network generalizes the classical Hopfield network by +allowing for sharper interaction functions. This increases the capacity of the +network as an autoassociative memory as nearby learned attractors will not +interfere with one another. However, the implementation of the network relies +on applying large exponents to the dot product of memory vectors and probe +vectors. If the dimension of the data is large the calculation can be very +large and result in problems when using floating point numbers in a practical +implementation. We describe this problem in detail, modify the original network +description to mitigate the problem, and show the modification will not alter +the networks' dynamics during update or training. We also show our modification +greatly improves hyperparameter selection for the modern Hopfield network, +removing hyperparameter dependence on the interaction vertex and resulting in +an optimal region of hyperparameters that does not significantly change with +the interaction vertex as it does in the original network. + +
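The floating-point problem and the usual remedy can be shown directly: the retrieval step weights stored memories by exp(beta * <memory, probe>), which overflows for high-dimensional data, whereas computing the same weights through a max-shifted (log-sum-exp style) softmax is finite and leaves the update unchanged. Dimensions and beta in the sketch are illustrative, and the modification described in the abstract may differ in detail.

```python
# Minimal sketch of the overflow issue in the modern Hopfield update and the
# max-shifted softmax that computes the same attention weights stably.
import numpy as np

rng = np.random.default_rng(0)
d, n_mem, beta = 512, 20, 8.0
memories = rng.normal(size=(n_mem, d))
probe = memories[3] + 0.1 * rng.normal(size=d)   # noisy version of memory 3

scores = beta * memories @ probe                 # dot products scale with dimension
with np.errstate(over="ignore"):
    naive = np.exp(scores)                       # overflows to inf for large scores
print("naive weights all finite:", np.isfinite(naive).all())

stable = np.exp(scores - scores.max())           # same softmax after normalising
weights = stable / stable.sum()

retrieved = weights @ memories                   # one update step of the network
print("recovered memory 3:", np.allclose(retrieved, memories[3], atol=0.5))
```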
+
+ comment: Add subsection on exponential interaction function +
+
+
+
+
+ + ♻ ☆ Cartesian atomic cluster expansion for machine learning interatomic + potentials + + +
+ Machine learning interatomic potentials are revolutionizing large-scale, accurate atomistic modelling in materials science and chemistry. Many potentials use atomic cluster expansion or equivariant message passing frameworks. Such frameworks typically use spherical harmonics as angular basis functions, and then use Clebsch-Gordan contraction to maintain rotational symmetry, which may introduce redundancies in representations and computational overhead. We propose an alternative: a Cartesian-coordinates-based atomic density expansion. This approach provides a complete set of polynomially independent features of atomic environments while maintaining interaction body orders. Additionally, we integrate low-dimensional embeddings of various chemical elements and inter-atomic message passing. The resulting potential, named Cartesian Atomic Cluster Expansion (CACE), exhibits good accuracy, stability, and generalizability. We validate its performance in diverse systems, including bulk water, small molecules, and 25-element high-entropy alloys.
+
+
+
+
+ + ♻ ☆ Thompson sampling for zero-inflated count outcomes with an application + to the Drink Less mobile health study + + +
+ Mobile health (mHealth) interventions often aim to improve distal outcomes, such as clinical conditions, by optimizing proximal outcomes through just-in-time adaptive interventions. Contextual bandits provide a suitable framework for customizing such interventions according to individual time-varying contexts. However, unique challenges, such as modeling count outcomes within bandit frameworks, have hindered the widespread application of contextual bandits to mHealth studies. The current work addresses this challenge by integrating count data models into online decision-making approaches. Specifically, we combine four common offline count data models (Poisson, negative binomial, zero-inflated Poisson, and zero-inflated negative binomial regressions) with Thompson sampling, a popular contextual bandit algorithm. The proposed algorithms are motivated by and evaluated on a real dataset from the Drink Less trial, where they are shown to improve user engagement with the mHealth platform. The proposed methods are further evaluated on simulated data, achieving improvements in cumulative proximal outcomes over existing algorithms. Theoretical results on regret bounds are also derived. The countts R package provides an implementation of our approach.
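To make the mechanism concrete, here is a minimal context-free Thompson sampler for count outcomes using a conjugate Gamma-Poisson model; the paper's algorithms additionally use regression models with time-varying contexts and zero-inflation, which this sketch omits.

import numpy as np

rng = np.random.default_rng(1)
true_rates = np.array([0.5, 1.0, 2.0])   # unknown mean counts of three actions
alpha = np.ones(3)                        # Gamma(alpha, beta) posterior per action
beta = np.ones(3)

for t in range(2000):
    sampled = rng.gamma(alpha, 1.0 / beta)    # one posterior draw per action
    a = int(np.argmax(sampled))               # act greedily on the sampled rates
    y = rng.poisson(true_rates[a])            # observe a count outcome
    alpha[a] += y                             # conjugate Gamma-Poisson update
    beta[a] += 1.0

print(alpha / beta)   # the posterior mean of the most-pulled arm approaches its true rate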
+
+
+
+
+ + ♻ ☆ On the Limitations of Compute Thresholds as a Governance Strategy + + +
+ At face value, this essay is about understanding a fairly esoteric governance +tool called compute thresholds. However, in order to grapple with whether these +thresholds will achieve anything, we must first understand how they came to be. +To do so, we need to engage with a decades-old debate at the heart of computer +science progress, namely, is bigger always better? Does a certain inflection +point of compute result in changes to the risk profile of a model? Hence, this +essay may be of interest not only to policymakers and the wider public but also +to computer scientists interested in understanding the role of compute in +unlocking breakthroughs. This discussion is timely given the wide adoption of +compute thresholds in both the White House Executive Orders on AI Safety (EO) +and the EU AI Act to identify more risky systems. A key conclusion of this +essay is that compute thresholds, as currently implemented, are shortsighted +and likely to fail to mitigate risk. The relationship between compute and risk +is highly uncertain and rapidly changing. Relying upon compute thresholds +overestimates our ability to predict what abilities emerge at different scales. +This essay ends with recommendations for a better way forward. + +
+
+
+
+
+ + ♻ ☆ Geometric Learning with Positively Decomposable Kernels + + +
+ Kernel methods are powerful tools in machine learning. Classical kernel +methods are based on positive-definite kernels, which map data spaces into +reproducing kernel Hilbert spaces (RKHS). For non-Euclidean data spaces, +positive-definite kernels are difficult to come by. In this case, we propose +the use of reproducing kernel Krein space (RKKS) based methods, which require +only kernels that admit a positive decomposition. We show that one does not +need to access this decomposition in order to learn in RKKS. We then +investigate the conditions under which a kernel is positively decomposable. We +show that invariant kernels admit a positive decomposition on homogeneous +spaces under tractable regularity assumptions. This makes them much easier to +construct than positive-definite kernels, providing a route for learning with +kernels for non-Euclidean data. By the same token, this provides theoretical +foundations for RKKS-based methods in general. + +
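For concreteness, the positive decomposition condition referenced above is the standard one for reproducing kernel Krein spaces; in the usual notation it reads

\[
  k(x, y) \;=\; k_+(x, y) \;-\; k_-(x, y),
  \qquad k_+,\ k_- \ \text{positive definite},
\]

so the associated Krein space decomposes as $\mathcal{K} = \mathcal{H}_+ \ominus \mathcal{H}_-$, where $\mathcal{H}_\pm$ are the RKHSs of $k_\pm$. The abstract's point is that learning can proceed knowing only that such a decomposition exists, without computing $k_+$ and $k_-$ explicitly.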
+
+
+
+
+ + ♻ ☆ Enhancing Training Efficiency Using Packing with Flash Attention + + +
+ Padding is often used in tuning LLM models by adding special tokens to +shorter training examples to match the length of the longest sequence in each +batch. While this ensures uniformity for batch processing, it introduces +inefficiencies by including irrelevant padding tokens in the computation and +wastes GPU resources. Hugging Face SFT trainer has always offered the option to +use packing to combine multiple training examples, allowing for maximal +utilization of GPU resources. However, up till now, it did not offer proper +masking of each packed training example. This capability has now been added to +Hugging Face Transformers 4.43. We analyse this new feature and show the +benefits across different variations of packing. + +
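As a framework-agnostic illustration of what packing with proper per-example masking amounts to (the helper below is our own sketch, not the Transformers 4.43 API): examples are concatenated into one row, positions restart at each example boundary, and a per-token sequence id records the boundaries so an attention kernel can avoid cross-example attention.

def pack(examples, max_len, pad_id=0):
    """Pack tokenized examples into one fixed-length row with boundary info."""
    input_ids, position_ids, seq_ids = [], [], []
    for seq_id, ex in enumerate(examples):
        ex = ex[: max_len - len(input_ids)]    # truncate the last example if needed
        input_ids += ex
        position_ids += list(range(len(ex)))   # positions restart for every example
        seq_ids += [seq_id] * len(ex)          # tokens may only attend within their id
        if len(input_ids) == max_len:
            break
    pad = max_len - len(input_ids)
    return (input_ids + [pad_id] * pad,
            position_ids + [0] * pad,
            seq_ids + [-1] * pad)

ids, pos, seg = pack([[5, 6, 7], [8, 9], [10, 11, 12, 13]], max_len=8)
# ids -> [5, 6, 7, 8, 9, 10, 11, 12], seg -> [0, 0, 0, 1, 1, 2, 2, 2]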
+
+
+
+
+ + ♻ ☆ MimiQ: Low-Bit Data-Free Quantization of Vision Transformers with + Encouraging Inter-Head Attention Similarity + + +
+ Data-free quantization (DFQ) is a technique that creates a lightweight network from its full-precision counterpart without the original training data, often through a synthetic dataset. Although several DFQ methods have been proposed for vision transformer (ViT) architectures, they fail to achieve efficacy in low-bit settings. Examining the existing methods, we identify that their synthetic data produce misaligned attention maps, while those of the real samples are highly aligned. From the observation of aligned attention, we find that aligning attention maps of synthetic data helps to improve the overall performance of quantized ViTs. Motivated by this finding, we devise MimiQ, a novel DFQ method designed for ViTs that focuses on inter-head attention similarity. First, we generate synthetic data by aligning head-wise attention responses in relation to spatial query patches. Then, we apply head-wise structural attention distillation to align the attention maps of the quantized network to those of the full-precision teacher. The experimental results show that the proposed method significantly outperforms baselines, setting a new state-of-the-art performance for data-free ViT quantization.
+
+ comment: Author Preprint +
+
+
+
+
+ + ♻ ☆ HyperbolicLR: Epoch insensitive learning rate scheduler + + +
+ This study proposes two novel learning rate schedulers: the Hyperbolic +Learning Rate Scheduler (HyperbolicLR) and the Exponential Hyperbolic Learning +Rate Scheduler (ExpHyperbolicLR). These schedulers attempt to address the +inconsistent learning curves often observed in conventional schedulers when +adjusting the number of epochs. By leveraging the asymptotic behavior of +hyperbolic curves, the proposed schedulers maintain more consistent learning +curves across varying epoch settings. The HyperbolicLR algorithm directly +applies this property to the epoch-learning rate space, while the +ExpHyperbolicLR maps this concept onto the exponential space of epochs and +learning rates. To evaluate the performance of these schedulers, first we found +the optimal hyperparameters for each scheduler on a small number of epochs, +fixed these values, and compared their performance as the number of epochs +increased. Our experimental results on various deep learning tasks and +architectures demonstrate that both HyperbolicLR and ExpHyperbolicLR maintain +more consistent performance improvements compared to conventional schedulers as +the number of epochs increases. These findings suggest that our +hyperbolic-based learning rate schedulers offer a more robust and efficient +approach to training deep neural networks, especially in scenarios where +computational resources or time constraints limit extensive hyperparameter +searches. + +
+
+ comment: 30 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ A Simple and Scalable Representation for Graph Generation ICLR + + +
+ Recently, there has been a surge of interest in employing neural networks for +graph generation, a fundamental statistical learning problem with critical +applications like molecule design and community analysis. However, most +approaches encounter significant limitations when generating large-scale +graphs. This is due to their requirement to output the full adjacency matrices +whose size grows quadratically with the number of nodes. In response to this +challenge, we introduce a new, simple, and scalable graph representation named +gap encoded edge list (GEEL) that has a small representation size that aligns +with the number of edges. In addition, GEEL significantly reduces the +vocabulary size by incorporating the gap encoding and bandwidth restriction +schemes. GEEL can be autoregressively generated with the incorporation of node +positional encoding, and we further extend GEEL to deal with attributed graphs +by designing a new grammar. Our findings reveal that the adoption of this +compact representation not only enhances scalability but also bolsters +performance by simplifying the graph generation process. We conduct a +comprehensive evaluation across ten non-attributed and two molecular graph +generation tasks, demonstrating the effectiveness of GEEL. + +
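A rough sketch of the gap-encoding idea as we read it (the exact tokenization and grammar in GEEL differ; the encoding below is our own illustrative variant): each edge is stored as the gap from the previous source node plus the offset to its target, so token values stay small for small-bandwidth graphs instead of growing with the number of nodes.

def gap_encode(edges):
    """Encode a sorted edge list as (source gap, target offset) pairs."""
    tokens, prev_u = [], 0
    for u, v in sorted(edges):
        tokens.append((u - prev_u, v - u))   # both stay small for banded graphs
        prev_u = u
    return tokens

def gap_decode(tokens):
    edges, u = [], 0
    for du, dv in tokens:
        u += du
        edges.append((u, u + dv))
    return edges

edges = [(0, 1), (1, 2), (1, 3), (2, 3)]
assert gap_decode(gap_encode(edges)) == sorted(edges)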
+
+ comment: International Conference on Learning Representations (ICLR) 2024 +
+
+
+
+
+ + ♻ ☆ Is Hyper-Parameter Optimization Different for Software Analytics? + + +
+ Yes. SE data can have "smoother" boundaries between classes (compared to traditional AI data sets). To be more precise, the magnitude of the second derivative of the loss function found in SE data is typically much smaller. A new hyper-parameter optimizer, called SMOOTHIE, can exploit this idiosyncrasy of SE data. We compare SMOOTHIE and a state-of-the-art AI hyper-parameter optimizer on three tasks: (a) GitHub issue lifetime prediction; (b) detecting false alarms in static code warnings; (c) defect prediction. For completeness, we also show experiments on some standard AI datasets. SMOOTHIE runs faster and predicts better on the SE data, but ties with the AI tool on non-SE data. Hence we conclude that SE data can be different from other kinds of data, and those differences mean that we should use different kinds of algorithms for our data. To support open science and other researchers working in this area, all our scripts and datasets are available on-line at https://github.com/yrahul3910/smoothness-hpo/.
+
+ comment: v2 +
+
+
+
+
+ + ♻ ☆ Fault Tolerant Serverless VFL Over Dynamic Device Environment + + +
+ Vertical Federated Learning (VFL) is a class of FL where each client shares the same set of samples but only owns a subset of the features. Usually, VFL assumes perfect hardware and communication capabilities. However, this assumption hinders the broad deployment of VFL, particularly on a network of edge devices, which are heterogeneous in their in-situ capabilities and may connect to or disconnect from the network over time. To address this gap, we study the test-time performance of VFL under dynamic network conditions, which we call DN-VFL. We first formalize DN-VFL, including a message passing distributed inference algorithm, the corresponding risk, and a serverless setup. We develop a novel DN-VFL approach called Multiple Aggregation with Gossip Rounds and Simulated Faults (MAGS) that synthesizes replication, gossiping, and selective feature omission to improve performance significantly over baselines. Furthermore, we propose metrics and extensively analyze MAGS using a simulated sensor network. The results show that naively using VFL for DN-VFL is not the best approach. Rather, MAGS presents a better alternative for handling changes in the network during inference.
+
+
+
+
+ + ♻ ☆ K-Deep Simplex: Deep Manifold Learning via Local Dictionaries + + +
+ We propose K-Deep Simplex (KDS) which, given a set of data points, learns a dictionary comprising synthetic landmarks, along with representation coefficients supported on a simplex. KDS employs a local weighted $\ell_1$ penalty that encourages each data point to represent itself as a convex combination of nearby landmarks. We solve the proposed optimization program using alternating minimization and design an efficient, interpretable autoencoder using algorithm unrolling. We theoretically analyze the proposed program by relating the weighted $\ell_1$ penalty in KDS to a weighted $\ell_0$ program. Assuming that the data are generated from a Delaunay triangulation, we prove the equivalence of the weighted $\ell_1$ and weighted $\ell_0$ programs. We further show the stability of the representation coefficients under mild geometrical assumptions. If the representation coefficients are fixed, we prove that the sub-problem of minimizing over the dictionary yields a unique solution. Further, we show that low-dimensional representations can be efficiently obtained from the covariance of the coefficient matrix. Experiments show that the algorithm is highly efficient and performs competitively on synthetic and real data sets.
+
+ comment: 33 pages, 17 figures. This expanded version includes detailed + numerical experiments in the supplementary material. Theorem 3 is a new + stability result. The sections have been reorganized, and additional details + have been provided for clarity +
+
+
+
+
+ + ♻ ☆ LLM in a flash: Efficient Large Language Model Inference with Limited + Memory ACL 2024 + + +
+ Large language models (LLMs) are central to modern natural language +processing, delivering exceptional performance in various tasks. However, their +substantial computational and memory requirements present challenges, +especially for devices with limited DRAM capacity. This paper tackles the +challenge of efficiently running LLMs that exceed the available DRAM capacity +by storing the model parameters in flash memory, but bringing them on demand to +DRAM. Our method involves constructing an inference cost model that takes into +account the characteristics of flash memory, guiding us to optimize in two +critical areas: reducing the volume of data transferred from flash and reading +data in larger, more contiguous chunks. Within this hardware-informed +framework, we introduce two principal techniques. First, "windowing" +strategically reduces data transfer by reusing previously activated neurons, +and second, "row-column bundling", tailored to the sequential data access +strengths of flash memory, increases the size of data chunks read from flash +memory. These methods collectively enable running models up to twice the size +of the available DRAM, with a 4-5x and 20-25x increase in inference speed +compared to naive loading approaches in CPU and GPU, respectively. Our +integration of sparsity awareness, context-adaptive loading, and a +hardware-oriented design paves the way for effective inference of LLMs on +devices with limited memory. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Localization from structured distance matrices via low-rank matrix + recovery + + +
+ We study the problem of determining the configuration of $n$ points by using +their distances to $m$ nodes, referred to as anchor nodes. One sampling scheme +is Nystrom sampling, which assumes known distances between the anchors and +between the anchors and the $n$ points, while the distances among the $n$ +points are unknown. For this scheme, a simple adaptation of the Nystrom method, +which is often used for kernel approximation, is a viable technique to estimate +the configuration of the anchors and the $n$ points. In this manuscript, we +propose a modified version of Nystrom sampling, where the distances from every +node to one central node are known, but all other distances are incomplete. In +this setting, the standard Nystrom approach is not applicable, necessitating an +alternative technique to estimate the configuration of the anchors and the $n$ +points. We show that this problem can be framed as the recovery of a low-rank +submatrix of a Gram matrix. Using synthetic and real data, we demonstrate that +the proposed approach can exactly recover configurations of points given +sufficient distance samples. This underscores that, in contrast to methods that +rely on global sampling of distance matrices, the task of estimating the +configuration of points can be done efficiently via structured sampling with +well-chosen reliable anchors. Finally, our main analysis is grounded in a +specific centering of the points. With this in mind, we extend previous work in +Euclidean distance geometry by providing a general dual basis approach for +points centered anywhere. + +
+
+ comment: 20 pages. Introduced a new sampling model. Experimental results on + both synthetic and real data. A new optimization program for structured + distance geometry based on low-rank recovery. The analysis of the previous + sampling model is also discussed. Made changes to improve the clarity and + presentation of the paper +
+
+
+
+
+ + ♻ ☆ Designing Informative Metrics for Few-Shot Example Selection + + +
+ Pretrained language models (PLMs) have shown remarkable few-shot learning capabilities when provided with properly formatted examples. However, selecting the "best" examples remains an open challenge. We propose a complexity-based prompt selection approach for sequence tagging tasks. This approach avoids training a dedicated model for example selection and instead uses certain metrics to align the syntactico-semantic complexity of test sentences and examples. We use both sentence- and word-level metrics to match the complexity of examples to the (test) sentence being considered. Our results demonstrate that our approach extracts greater performance from PLMs: it achieves state-of-the-art performance on few-shot NER, with a 5% absolute improvement in F1 score on the CoNLL2003 dataset for GPT-4. We also see large gains of up to 28.85 points (F1/Acc.) in smaller models like GPT-j-6B.
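As a loose illustration of complexity matching (the two metrics below, sentence length and mean word rarity, are stand-ins for the paper's sentence- and word-level metrics, and the frequency table is assumed to be supplied by the caller):

import math

def complexity(sentence, word_freq):
    """Crude complexity profile: (length, mean negative log word frequency)."""
    words = sentence.lower().split()
    rarity = sum(-math.log(word_freq.get(w, 1e-6)) for w in words) / len(words)
    return (len(words), rarity)

def select_examples(test_sentence, pool, word_freq, k=4):
    """Pick the k pool sentences whose complexity profile is closest to the test sentence."""
    target = complexity(test_sentence, word_freq)
    def dist(s):
        c = complexity(s, word_freq)
        return sum((a - b) ** 2 for a, b in zip(c, target))
    return sorted(pool, key=dist)[:k]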
+
+
+
+
+ + ♻ ☆ In-class Data Analysis Replications: Teaching Students while Testing + Science + + +
+ Science is facing a reproducibility crisis. Previous work has proposed +incorporating data analysis replications into classrooms as a potential +solution. However, despite the potential benefits, it is unclear whether this +approach is feasible, and if so, what the involved stakeholders-students, +educators, and scientists-should expect from it. Can students perform a data +analysis replication over the course of a class? What are the costs and +benefits for educators? And how can this solution help benchmark and improve +the state of science? + In the present study, we incorporated data analysis replications in the +project component of the Applied Data Analysis course (CS-401) taught at EPFL +(N=354 students). Here we report pre-registered findings based on surveys +administered throughout the course. First, we demonstrate that students can +replicate previously published scientific papers, most of them qualitatively +and some exactly. We find discrepancies between what students expect of data +analysis replications and what they experience by doing them along with changes +in expectations about reproducibility, which together serve as evidence of +attitude shifts to foster students' critical thinking. Second, we provide +information for educators about how much overhead is needed to incorporate +replications into the classroom and identify concerns that replications bring +as compared to more traditional assignments. Third, we identify tangible +benefits of the in-class data analysis replications for scientific communities, +such as a collection of replication reports and insights about replication +barriers in scientific work that should be avoided going forward. + Overall, we demonstrate that incorporating replication tasks into a large +data science class can increase the reproducibility of scientific work as a +by-product of data science instruction, thus benefiting both science and +students. + +
+
+
+
+
+ + ♻ ☆ Parametric Matrix Models + + +
+ We present a general class of machine learning algorithms called parametric +matrix models. In contrast with most existing machine learning models that +imitate the biology of neurons, parametric matrix models use matrix equations +that emulate the physics of quantum systems. Similar to how physics problems +are usually solved, parametric matrix models learn the governing equations that +lead to the desired outputs. Parametric matrix models can be efficiently +trained from empirical data, and the equations may use algebraic, differential, +or integral relations. While originally designed for scientific computing, we +prove that parametric matrix models are universal function approximators that +can be applied to general machine learning problems. After introducing the +underlying theory, we apply parametric matrix models to a series of different +challenges that show their performance for a wide range of problems. For all +the challenges tested here, parametric matrix models produce accurate results +within an efficient and interpretable computational framework that allows for +input feature extrapolation. + +
+
+ comment: Exact same content as previous version (v4); corrected author email +
+
+
+
+
+ + ♻ ☆ YourMT3+: Multi-instrument Music Transcription with Enhanced Transformer + Architectures and Cross-dataset Stem Augmentation SP + + +
+ Multi-instrument music transcription aims to convert polyphonic music +recordings into musical scores assigned to each instrument. This task is +challenging for modeling as it requires simultaneously identifying multiple +instruments and transcribing their pitch and precise timing, and the lack of +fully annotated data adds to the training difficulties. This paper introduces +YourMT3+, a suite of models for enhanced multi-instrument music transcription +based on the recent language token decoding approach of MT3. We enhance its +encoder by adopting a hierarchical attention transformer in the time-frequency +domain and integrating a mixture of experts. To address data limitations, we +introduce a new multi-channel decoding method for training with incomplete +annotations and propose intra- and cross-stem augmentation for dataset mixing. +Our experiments demonstrate direct vocal transcription capabilities, +eliminating the need for voice separation pre-processors. Benchmarks across ten +public datasets show our models' competitiveness with, or superiority to, +existing transcription models. Further testing on pop music recordings +highlights the limitations of current models. Fully reproducible code and +datasets are available with demos at \url{https://github.com/mimbres/YourMT3}. + +
+
+ comment: 2024 IEEE International Workshop on Machine Learning for Signal + Processing (MLSP), Sept.\ 22--25, 2024, London, UK +
+
+
+
+
+ + ♻ ☆ A dual basis approach to multidimensional scaling + + +
+ Classical multidimensional scaling (CMDS) is a technique that embeds a set of +objects in a Euclidean space given their pairwise Euclidean distances. The main +part of CMDS involves double centering a squared distance matrix and using a +truncated eigendecomposition to recover the point coordinates. In this paper, +motivated by a study in Euclidean distance geometry, we explore a dual basis +approach to CMDS. We give an explicit formula for the dual basis vectors and +fully characterize the spectrum of an essential matrix in the dual basis +framework. We make connections to a related problem in metric nearness. + +
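For readers unfamiliar with the CMDS step the abstract refers to, the double centering and truncated eigendecomposition fit in a few lines (this is standard CMDS, not the dual basis construction itself):

import numpy as np

def classical_mds(D_squared, dim=2):
    """Embed points from an n x n matrix of squared Euclidean distances."""
    n = D_squared.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n       # centering matrix
    B = -0.5 * J @ D_squared @ J               # double-centered Gram matrix
    vals, vecs = np.linalg.eigh(B)             # eigenvalues in ascending order
    idx = np.argsort(vals)[::-1][:dim]         # keep the top `dim` components
    scale = np.sqrt(np.clip(vals[idx], 0.0, None))
    return vecs[:, idx] * scale                # coordinates, up to rigid motion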
+
+ comment: 7 pages. The proof of dual basis representation is now compact. It is + not constructive compared to the previous version, but it uses + bi-orthogonality relation to establish the result more directly. A minor + error in the proof of the spectrum of the dual basis has been fixed. We also + made few changes for better clarity and presentation +
+
+
+
+
+ + ♻ ☆ LFFR: Logistic Function For (single-output) Regression + + +
+ Privacy-preserving regression in machine learning is a crucial area of research, aimed at enabling the use of powerful machine learning techniques while protecting individuals' privacy. In this paper, we implement privacy-preserving regression training using data encrypted under a fully homomorphic encryption scheme. We first examine the common linear regression algorithm and propose a (simplified) fixed Hessian for linear regression training, which can be applied to any dataset, even one not normalized into the range $[0, 1]$. We also generalize this constant Hessian matrix to the ridge regression version, namely linear regression with a regularization term that penalizes large coefficients. However, our main contribution is to develop a novel and efficient algorithm called LFFR for homomorphic regression using the logistic function, which can model more complex relations between input values and output predictions than linear regression. We also find a constant simplified Hessian to train our LFFR algorithm using a Newton-like method and compare it against our new fixed Hessian linear regression training on two real-world datasets. We suggest normalizing not only the data but also the target predictions, even for the original linear regression used in a privacy-preserving manner; this helps keep the weights in a small range, say $[-5, +5]$, which is convenient for refreshing ciphertexts and setting parameters, and avoids tuning the regularization parameter $\lambda$ via cross validation. Linear regression with normalized predictions could be a viable alternative to ridge regression.
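In the plaintext domain, the fixed-Hessian idea the abstract builds on can be illustrated with logistic regression: the data-dependent Hessian is replaced by a constant matrix that dominates it, so the matrix is inverted once and reused at every Newton-like step. This is only a sketch of that general idea, not the encrypted LFFR algorithm itself.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fixed_hessian_logistic(X, y, iters=30):
    """Newton-like logistic regression with a constant (fixed) Hessian.

    Uses the classical bound H_fixed = X^T X / 4, which dominates the true
    Hessian everywhere; inverting it once is what makes this style of update
    attractive when every arithmetic operation is homomorphically encrypted.
    """
    n, d = X.shape
    H_inv = np.linalg.inv(X.T @ X / 4.0 + 1e-6 * np.eye(d))   # invert once
    w = np.zeros(d)
    for _ in range(iters):
        grad = X.T @ (sigmoid(X @ w) - y)   # gradient of the NLL, y in {0, 1}
        w -= H_inv @ grad                    # reuse the fixed inverse Hessian
    return w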
+
+
+
+
+ + ♻ ☆ Optimizing LaneSegNet for Real-Time Lane Topology Prediction in + Autonomous Vehicles + + +
+ With the increasing prevalence of autonomous vehicles, it is essential for +computer vision algorithms to accurately assess road features in real-time. +This study explores the LaneSegNet architecture, a new approach to lane +topology prediction which integrates topological information with lane-line +data to provide a more contextual understanding of road environments. The +LaneSegNet architecture includes a feature extractor, lane encoder, lane +decoder, and prediction head, leveraging components from ResNet-50, BEVFormer, +and various attention mechanisms. We experimented with optimizations to the +LaneSegNet architecture through feature extractor modification and transformer +encoder-decoder stack modification. We found that modifying the encoder and +decoder stacks offered an interesting tradeoff between training time and +prediction accuracy, with certain combinations showing promising results. Our +implementation, trained on a single NVIDIA Tesla A100 GPU, found that a 2:4 +ratio reduced training time by 22.3% with only a 7.1% drop in mean average +precision, while a 4:8 ratio increased training time by only 11.1% but improved +mean average precision by a significant 23.7%. These results indicate that +strategic hyperparameter tuning can yield substantial improvements depending on +the resources of the user. This study provides valuable insights for optimizing +LaneSegNet according to available computation power, making it more accessible +for users with limited resources and increasing the capabilities for users with +more powerful resources. + +
+
+ comment: 18 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Toward Automated Detection of Biased Social Signals from the Content of + Clinical Conversations + + +
+ Implicit bias can impede patient-provider interactions and lead to inequities +in care. Raising awareness is key to reducing such bias, but its manifestations +in the social dynamics of patient-provider communication are difficult to +detect. In this study, we used automated speech recognition (ASR) and natural +language processing (NLP) to identify social signals in patient-provider +interactions. We built an automated pipeline to predict social signals from +audio recordings of 782 primary care visits that achieved 90.1% average +accuracy across codes, and exhibited fairness in its predictions for white and +non-white patients. Applying this pipeline, we identified statistically +significant differences in provider communication behavior toward white versus +non-white patients. In particular, providers expressed more patient-centered +behaviors towards white patients including more warmth, engagement, and +attentiveness. Our study underscores the potential of automated tools in +identifying subtle communication signals that may be linked with bias and +impact healthcare quality and equity. + +
+
+ comment: Accepted by AMIA 2024 Annual Symposium +
+
+
+
+
+ + ♻ ☆ Prediction Instability in Machine Learning Ensembles ICML2024 + + +
+ In machine learning ensembles, predictions from multiple models are aggregated. Despite the widespread use and strong performance of ensembles in applied problems, little is known about the mathematical properties of aggregating models and the associated consequences for safe, explainable use of such models. In this paper we prove a theorem showing that any ensemble will exhibit at least one of the following forms of prediction instability. It will either ignore agreement among all underlying models, change its mind when none of the underlying models have done so, or be manipulable through inclusion or exclusion of options it would never actually predict. As a consequence, ensemble aggregation procedures will always need to balance the benefits of information use against the risk of these prediction instabilities. This analysis also sheds light on what specific forms of prediction instability to expect from particular ensemble algorithms; for example, popular tree ensembles like random forests or XGBoost will violate basic, intuitive fairness properties. Finally, we show that this can be ameliorated by using consistent models in asymptotic conditions.
+
+ comment: 15 pages, uses a modified version of ICML2024.sty +
+
+
+
+
+ + ♻ ☆ Martian time-series unraveled: A multi-scale nested approach with + factorial variational autoencoders + + +
+ Unsupervised source separation involves unraveling an unknown set of source +signals recorded through a mixing operator, with limited prior knowledge about +the sources, and only access to a dataset of signal mixtures. This problem is +inherently ill-posed and is further challenged by the variety of timescales +exhibited by sources in time series data from planetary space missions. As +such, a systematic multi-scale unsupervised approach is needed to identify and +separate sources at different timescales. Existing methods typically rely on a +preselected window size that determines their operating timescale, limiting +their capacity to handle multi-scale sources. To address this issue, we propose +an unsupervised multi-scale clustering and source separation framework by +leveraging wavelet scattering spectra that provide a low-dimensional +representation of stochastic processes, capable of distinguishing between +different non-Gaussian stochastic processes. Nested within this representation +space, we develop a factorial variational autoencoder that is trained to +probabilistically cluster sources at different timescales. To perform source +separation, we use samples from clusters at multiple timescales obtained via +the factorial variational autoencoder as prior information and formulate an +optimization problem in the wavelet scattering spectra representation space. +When applied to the entire seismic dataset recorded during the NASA InSight +mission on Mars, containing sources varying greatly in timescale, our approach +disentangles such different sources, e.g., minute-long transient one-sided +pulses (known as "glitches") and structured ambient noises resulting from +atmospheric activities that typically last for tens of minutes, and provides an +opportunity to conduct further investigations into the isolated sources. + +
+
+
+
+
+ + ♻ ☆ Switching the Loss Reduces the Cost in Batch Reinforcement Learning + + +
+ We propose training fitted Q-iteration with log-loss (FQI-log) for batch +reinforcement learning (RL). We show that the number of samples needed to learn +a near-optimal policy with FQI-log scales with the accumulated cost of the +optimal policy, which is zero in problems where acting optimally achieves the +goal and incurs no cost. In doing so, we provide a general framework for +proving small-cost bounds, i.e. bounds that scale with the optimal achievable +cost, in batch RL. Moreover, we empirically verify that FQI-log uses fewer +samples than FQI trained with squared loss on problems where the optimal policy +reliably achieves the goal. + +
+
+
+
+
+
+
+
+ + Multimedia 10 + +
+
+
+ + ☆ MMTrail: A Multimodal Trailer Video Dataset with Language and Music + Descriptions + + +
+ Massive multi-modality datasets play a significant role in facilitating the success of large video-language models. However, current video-language datasets primarily provide text descriptions for visual frames, treating audio as weakly related information. They usually overlook the potential of the inherent audio-visual correlation, leading to monotonous annotation within each modality instead of comprehensive and precise descriptions. This neglect makes many cross-modality studies difficult. To fill this gap, we present MMTrail, a large-scale multi-modality video-language dataset incorporating more than 20M trailer clips with visual captions, and 2M high-quality clips with multimodal captions. Trailers preview full-length video works and integrate context, visual frames, and background music. In particular, the trailer has two main advantages: (1) the topics are diverse, and the content characters are of various types, e.g., film, news, and gaming; (2) the corresponding background music is custom-designed, making it more coherent with the visual context. Upon these insights, we propose a systemic captioning framework, achieving various modality annotations with more than 27.1k hours of trailer videos. Here, to ensure that the captions retain the music perspective while preserving the authority of the visual context, we leverage an advanced LLM to merge all annotations adaptively. In this fashion, our MMTrail dataset potentially paves the path for fine-grained large multimodal-language model training. In experiments, we provide evaluation metrics and benchmark results on our dataset, demonstrating the high quality of our annotations and their effectiveness for model training.
+
+ comment: 15 Pages. Dataset report +
+
+
+
+
+ + ☆ Optimizing 5G-Advanced Networks for Time-critical Applications: The Role + of L4S + + +
+ As 5G networks strive to support advanced time-critical applications, such as +immersive Extended Reality (XR), cloud gaming, and autonomous driving, the +demand for Real-time Broadband Communication (RTBC) grows. In this article, we +present the main mechanisms of Low Latency, Low Loss, and Scalable Throughput +(L4S). Subsequently, we investigate the support and challenges of L4S +technology in the latest 3GPP 5G-Advanced Release 18 (R18) standard. Our case +study, using a prototype system for a real-time communication (RTC) +application, demonstrates the superiority of L4S technology. The experimental +results show that, compared with the GCC algorithm, the proposed L4S-GCC +algorithm can reduce the stalling rate by 1.51%-2.80% and increase the +bandwidth utilization by 11.4%-31.4%. The results emphasize the immense +potential of L4S technology in enhancing transmission performance in +time-critical applications. + +
+
+ comment: 7 pages, 3 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Boosting Audio Visual Question Answering via Key Semantic-Aware Cues ACM MM 2024 + + +
+ The Audio Visual Question Answering (AVQA) task aims to answer questions +related to various visual objects, sounds, and their interactions in videos. +Such naturally multimodal videos contain rich and complex dynamic audio-visual +components, with only a portion of them closely related to the given questions. +Hence, effectively perceiving audio-visual cues relevant to the given questions +is crucial for correctly answering them. In this paper, we propose a +Temporal-Spatial Perception Model (TSPM), which aims to empower the model to +perceive key visual and auditory cues related to the questions. Specifically, +considering the challenge of aligning non-declarative questions and visual +representations into the same semantic space using visual-language pretrained +models, we construct declarative sentence prompts derived from the question +template, to assist the temporal perception module in better identifying +critical segments relevant to the questions. Subsequently, a spatial perception +module is designed to merge visual tokens from selected segments to highlight +key latent targets, followed by cross-modal interaction with audio to perceive +potential sound-aware areas. Finally, the significant temporal-spatial cues +from these modules are integrated to answer the question. Extensive experiments +on multiple AVQA benchmarks demonstrate that our framework excels not only in +understanding audio-visual scenes but also in answering complex questions +effectively. Code is available at https://github.com/GeWu-Lab/TSPM. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ EgoSonics: Generating Synchronized Audio for Silent Egocentric Videos + + +
+ We introduce EgoSonics, a method to generate semantically meaningful and +synchronized audio tracks conditioned on silent egocentric videos. Generating +audio for silent egocentric videos could open new applications in virtual +reality, assistive technologies, or for augmenting existing datasets. Existing +work has been limited to domains like speech, music, or impact sounds and +cannot easily capture the broad range of audio frequencies found in egocentric +videos. EgoSonics addresses these limitations by building on the strength of +latent diffusion models for conditioned audio synthesis. We first encode and +process audio and video data into a form that is suitable for generation. The +encoded data is used to train our model to generate audio tracks that capture +the semantics of the input video. Our proposed SyncroNet builds on top of +ControlNet to provide control signals that enables temporal synchronization to +the synthesized audio. Extensive evaluations show that our model outperforms +existing work in audio quality, and in our newly proposed synchronization +evaluation method. Furthermore, we demonstrate downstream applications of our +model in improving video summarization. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Wireless Multi-User Interactive Virtual Reality in Metaverse with + Edge-Device Collaborative Computing + + +
+ The immersive nature of the metaverse presents significant challenges for +wireless multi-user interactive virtual reality (VR), such as ultra-low +latency, high throughput and intensive computing, which place substantial +demands on the wireless bandwidth and rendering resources of mobile edge +computing (MEC). In this paper, we propose a wireless multi-user interactive VR +with edge-device collaborative computing framework to overcome the +motion-to-photon (MTP) threshold bottleneck. Specifically, we model the +serial-parallel task execution in queues within a foreground and background +separation architecture. The rendering indices of background tiles within the +prediction window are determined, and both the foreground and selected +background tiles are loaded into respective processing queues based on the +rendering locations. To minimize the age of sensor information and the power +consumption of mobile devices, we optimize rendering decisions and MEC resource +allocation subject to the MTP constraint. To address this optimization problem, +we design a safe reinforcement learning (RL) algorithm, active queue +management-constrained updated projection (AQM-CUP). AQM-CUP constructs an +environment suitable for queues, incorporating expired tiles actively discarded +in processing buffers into its state and reward system. Experimental results +demonstrate that the proposed framework significantly enhances user immersion +while reducing device power consumption, and the superiority of the proposed +AQM-CUP algorithm over conventional methods in terms of the training +convergence and performance metrics. + +
+
+ comment: submitted to IEEE journal +
+
+
+
+
+ + ♻ ☆ Rethinking Radiology Report Generation via Causal Inspired + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as a vision-and-language task in the biomedical field. Previous works inherited the approach of traditional language generation tasks, aiming to generate paragraphs with high readability as reports. Despite significant progress, the independence between diseases, a specific property of RRG, was neglected, leaving models confused by the co-occurrence of diseases brought on by the biased data distribution and thus generating inaccurate reports. In this paper, to rethink this issue, we first model the causal effects between the variables from a causal perspective, through which we prove that the co-occurrence relationships between diseases on the biased distribution act as confounders, degrading accuracy through two backdoor paths, i.e. the Joint Vision Coupling and the Conditional Sequential Coupling. Then, we propose a novel model-agnostic counterfactual augmentation method that contains two strategies, i.e. the Prototype-based Counterfactual Sample Synthesis (P-CSS) and the Magic-Cube-like Counterfactual Report Reconstruction (Cube), to intervene on the backdoor paths, thus enhancing the accuracy and generalization of RRG models. Experimental results on the widely used MIMIC-CXR dataset demonstrate the effectiveness of our proposed method. Additionally, generalization performance is evaluated on the IU X-Ray dataset, which verifies that our method can effectively reduce the impact of co-occurrences caused by different distributions on the results.
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ♻ ☆ Versatile audio-visual learning for emotion recognition + + +
+ Most current audio-visual emotion recognition models lack the flexibility +needed for deployment in practical applications. We envision a multimodal +system that works even when only one modality is available and can be +implemented interchangeably for either predicting emotional attributes or +recognizing categorical emotions. Achieving such flexibility in a multimodal +emotion recognition system is difficult due to the inherent challenges in +accurately interpreting and integrating varied data sources. It is also a +challenge to robustly handle missing or partial information while allowing +direct switch between regression or classification tasks. This study proposes a +versatile audio-visual learning (VAVL) framework for handling unimodal and +multimodal systems for emotion regression or emotion classification tasks. We +implement an audio-visual framework that can be trained even when audio and +visual paired data is not available for part of the training set (i.e., audio +only or only video is present). We achieve this effective representation +learning with audio-visual shared layers, residual connections over shared +layers, and a unimodal reconstruction task. Our experimental results reveal +that our architecture significantly outperforms strong baselines on the +CREMA-D, MSP-IMPROV, and CMU-MOSEI corpora. Notably, VAVL attains a new +state-of-the-art performance in the emotional attribute prediction task on the +MSP-IMPROV corpus. + +
+
+ comment: 18 pages, 4 Figures, 3 tables (published at IEEE Transactions on + Affective Computing) +
+
+
+
+
+ + ♻ ☆ An Inverse Partial Optimal Transport Framework for Music-guided Movie + Trailer Generation + + +
+ Trailer generation is a challenging video clipping task that aims to select +highlighting shots from long videos like movies and re-organize them in an +attractive way. In this study, we propose an inverse partial optimal transport +(IPOT) framework to achieve music-guided movie trailer generation. In +particular, we formulate the trailer generation task as selecting and sorting +key movie shots based on audio shots, which involves matching the latent +representations across visual and acoustic modalities. We learn a multi-modal +latent representation model in the proposed IPOT framework to achieve this aim. +In this framework, a two-tower encoder derives the latent representations of +movie and music shots, respectively, and an attention-assisted Sinkhorn +matching network parameterizes the grounding distance between the shots' latent +representations and the distribution of the movie shots. Taking the +correspondence between the movie shots and its trailer music shots as the +observed optimal transport plan defined on the grounding distances, we learn +the model by solving an inverse partial optimal transport problem, leading to a +bi-level optimization strategy. We collect real-world movies and their trailers +to construct a dataset with abundant label information called CMTD and, +accordingly, train and evaluate various automatic trailer generators. Compared +with state-of-the-art methods, our IPOT method consistently shows superiority +in subjective visual effects and objective quantitative measurements. + +
+
+ comment: acmmm2024 +
+
+
+
+
+ + ♻ ☆ AxiomVision: Accuracy-Guaranteed Adaptive Visual Model Selection for + Perspective-Aware Video Analytics ACM MM 2024 + + +
+ The rapid evolution of multimedia and computer vision technologies requires +adaptive visual model deployment strategies to effectively handle diverse tasks +and varying environments. This work introduces AxiomVision, a novel framework +that can guarantee accuracy by leveraging edge computing to dynamically select +the most efficient visual models for video analytics under diverse scenarios. +Utilizing a tiered edge-cloud architecture, AxiomVision enables the deployment +of a broad spectrum of visual models, from lightweight to complex DNNs, that +can be tailored to specific scenarios while considering camera source impacts. +In addition, AxiomVision provides three core innovations: (1) a dynamic visual +model selection mechanism utilizing continual online learning, (2) an efficient +online method that efficiently takes into account the influence of the camera's +perspective, and (3) a topology-driven grouping approach that accelerates the +model selection process. With rigorous theoretical guarantees, these +advancements provide a scalable and effective solution for visual tasks +inherent to multimedia systems, such as object detection, classification, and +counting. Empirically, AxiomVision achieves a 25.7\% improvement in accuracy. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ RAVSS: Robust Audio-Visual Speech Separation in Multi-Speaker Scenarios + with Missing Visual Cues + + +
+ While existing Audio-Visual Speech Separation (AVSS) methods primarily +concentrate on the audio-visual fusion strategy for two-speaker separation, +they demonstrate a severe performance drop in the multi-speaker separation +scenarios. Typically, AVSS methods employ guiding videos to sequentially +isolate individual speakers from the given audio mixture, resulting in notable +missing and noisy parts across various segments of the separated speech. In +this study, we propose a simultaneous multi-speaker separation framework that +can facilitate the concurrent separation of multiple speakers within a singular +process. We introduce speaker-wise interactions to establish distinctions and +correlations among speakers. Experimental results on the VoxCeleb2 and LRS3 +datasets demonstrate that our method achieves state-of-the-art performance in +separating mixtures with 2, 3, 4, and 5 speakers, respectively. Additionally, +our model can utilize speakers with complete audio-visual information to +mitigate other visual-deficient speakers, thereby enhancing its resilience to +missing visual cues. We also conduct experiments where visual information for +specific speakers is entirely absent or visual frames are partially missing. +The results demonstrate that our model consistently outperforms others, +exhibiting the smallest performance drop across all settings involving 2, 3, 4, +and 5 speakers. + +
+
+ comment: Accepted by MM 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 74 + +
+
+
+ + ☆ Can Editing LLMs Inject Harm? + + +
+ Knowledge editing techniques have been increasingly adopted to efficiently +correct the false or outdated knowledge in Large Language Models (LLMs), due to +the high cost of retraining from scratch. Meanwhile, one critical but +under-explored question is: can knowledge editing be used to inject harm into +LLMs? In this paper, we propose to reformulate knowledge editing as a new type +of safety threat for LLMs, namely Editing Attack, and conduct a systematic +investigation with a newly constructed dataset EditAttack. Specifically, we +focus on two typical safety risks of Editing Attack including Misinformation +Injection and Bias Injection. For the risk of misinformation injection, we +first categorize it into commonsense misinformation injection and long-tail +misinformation injection. Then, we find that editing attacks can inject both +types of misinformation into LLMs, and the effectiveness is particularly high +for commonsense misinformation injection. For the risk of bias injection, we +discover that not only can biased sentences be injected into LLMs with high +effectiveness, but also one single biased sentence injection can cause a high +bias increase in general outputs of LLMs, which are even highly irrelevant to +the injected sentence, indicating a catastrophic impact on the overall fairness +of LLMs. Then, we further illustrate the high stealthiness of editing attacks, +measured by their impact on the general knowledge and reasoning capacities of +LLMs, and show the hardness of defending editing attacks with empirical +evidence. Our discoveries demonstrate the emerging misuse risks of knowledge +editing techniques on compromising the safety alignment of LLMs. + +
+
+ comment: The first two authors contributed equally. 9 pages for main paper, 36 + pages including appendix. The code, results, dataset for this paper and more + resources are on the project website: https://llm-editing.github.io +
+
+
+
+
+ + ☆ QAEA-DR: A Unified Text Augmentation Framework for Dense Retrieval + + +
+ In dense retrieval, embedding long texts into dense vectors can result in +information loss, leading to inaccurate query-text matching. Additionally, +low-quality texts with excessive noise or sparse key information are unlikely +to align well with relevant queries. Recent studies mainly focus on improving +the sentence embedding model or retrieval process. In this work, we introduce a +novel text augmentation framework for dense retrieval. This framework +transforms raw documents into information-dense text formats, which supplement +the original texts to effectively address the aforementioned issues without +modifying embedding or retrieval methodologies. Two text representations are +generated via large language models (LLMs) zero-shot prompting: question-answer +pairs and element-driven events. We term this approach QAEA-DR: unifying +question-answer generation and event extraction in a text augmentation +framework for dense retrieval. To further enhance the quality of generated +texts, a scoring-based evaluation and regeneration mechanism is introduced in +LLM prompting. Our QAEA-DR model has a positive impact on dense retrieval, +supported by both theoretical analysis and empirical experiments. + +
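The two generated representations can be pictured as two zero-shot prompts sent to an LLM and indexed alongside the raw document; the prompt wording below is our own guess at the flavour of prompt involved, and call_llm stands in for whatever LLM client is used (the paper also adds a scoring-and-regeneration step that this sketch omits).

QA_PROMPT = (
    "Read the document below and write question-answer pairs that together "
    "cover its key information, one pair per line as 'Q: ... A: ...'.\n\n{doc}"
)

EVENT_PROMPT = (
    "Read the document below and list the main events it describes. For each "
    "event, give the participants, action, time and location when present.\n\n{doc}"
)

def augment_document(doc, call_llm):
    """Produce the information-dense views that supplement the raw text."""
    qa_text = call_llm(QA_PROMPT.format(doc=doc))         # question-answer pairs
    event_text = call_llm(EVENT_PROMPT.format(doc=doc))   # element-driven events
    # All three texts are embedded and indexed side by side for dense retrieval.
    return [doc, qa_text, event_text]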
+
+
+
+
+ + ☆ Aligning Query Representation with Rewritten Query and Relevance + Judgments in Conversational Search CIKM 2024 + + +
+ Conversational search supports multi-turn user-system interactions to address complex information needs. Unlike traditional single-turn ad-hoc search, conversational search faces the more challenging problem of context-dependent query understanding with a lengthy and long-tail conversational history context. While conversational query rewriting methods leverage explicit rewritten queries to train a rewriting model that transforms the context-dependent query into a stand-alone search query, this is usually done without considering the quality of search results. Conversational dense retrieval methods use fine-tuning to improve a pre-trained ad-hoc query encoder, but they are limited by the conversational search data available for training. In this paper, we leverage both rewritten queries and relevance judgments in the conversational search data to train a better query representation model. The key idea is to align the query representation with those of rewritten queries and relevant documents. The proposed model, the Query Representation Alignment Conversational Dense Retriever (QRACDR), is tested on eight datasets, including various settings in conversational search and ad-hoc search. The results demonstrate the strong performance of QRACDR compared with state-of-the-art methods, and confirm the effectiveness of representation alignment.
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ MindSearch: Mimicking Human Minds Elicits Deep AI Searcher + + +
+ Information seeking and integration is a complex cognitive task that consumes enormous time and effort. Inspired by the remarkable progress of Large Language Models, recent works attempt to solve this task by combining LLMs and search engines. However, these methods still achieve unsatisfactory performance due to three challenges: (1) complex requests often cannot be accurately and completely retrieved by the search engine in a single query, (2) the corresponding information to be integrated is spread over multiple web pages along with massive noise, and (3) a large number of web pages with long contents may quickly exceed the maximum context length of LLMs. Inspired by the cognitive process by which humans solve these problems, we introduce MindSearch to mimic the human mind in web information seeking and integration, which can be instantiated by a simple yet effective LLM-based multi-agent framework. The WebPlanner models the human mind of multi-step information seeking as a dynamic graph construction process: it decomposes the user query into atomic sub-questions as nodes in the graph and progressively extends the graph based on the search results from WebSearcher. Tasked with each sub-question, WebSearcher performs hierarchical information retrieval with search engines and collects valuable information for WebPlanner. The multi-agent design of MindSearch enables the whole framework to seek and integrate information in parallel from a larger number of web pages (e.g., more than 300) in 3 minutes, which would take roughly 3 hours of human effort. MindSearch demonstrates significant improvement in response quality in terms of depth and breadth, on both close-set and open-set QA problems. Besides, responses from MindSearch based on InternLM2.5-7B are preferred by humans over those of the ChatGPT-Web and Perplexity.ai applications, which implies that MindSearch can already deliver a competitive solution to proprietary AI search engines.
+
+ comment: Technical Report. Project Page: https://mindsearch.netlify.app Code: + https://github.com/InternLM/MindSearch +
+
+
+
+
+ + ☆ AutoScale: Automatic Prediction of Compute-optimal Data Composition for + Training LLMs + + +
+ To ensure performance on a diverse set of downstream tasks, LLMs are
+ pretrained via data mixtures over different domains. In this work, we
+ demonstrate that the optimal data composition for a fixed compute budget
+ varies depending on the scale of the training data, suggesting that the
+ common practice of empirically determining an optimal composition using
+ small-scale experiments will not yield the optimal data mixtures when scaling
+ up to the final model. To address this challenge, we propose *AutoScale*, an
+ automated tool that finds a compute-optimal data composition for training at
+ any desired target scale. AutoScale first determines the optimal composition
+ at a small scale using a novel bilevel optimization framework, Direct Data
+ Optimization (*DDO*), and then fits a predictor to estimate the optimal
+ composition at larger scales. The predictor's design is inspired by our
+ theoretical analysis of scaling laws related to data composition, which could
+ be of independent interest. In empirical studies pre-training 774M
+ decoder-only LMs (GPT-2 Large) on the RedPajama dataset, AutoScale decreases
+ validation perplexity at least 25% faster than any baseline, with up to a 38%
+ speed-up over training without reweighting, achieving the best overall
+ performance across downstream tasks. On pre-training encoder-only LMs (BERT)
+ with masked language modeling, DDO is shown to decrease loss on all domains
+ while visibly improving average task performance on the GLUE benchmark by
+ 8.7% and on the large-scale QA dataset SQuAD by 5.9% compared with no
+ reweighting. AutoScale speeds up training by up to 28%. Our code is
+ open-sourced.
+
+
+
+
+
+ + ☆ An Energy-based Model for Word-level AutoCompletion in Computer-aided + Translation ACL 2024 + + +
+ Word-level AutoCompletion (WLAC) is a rewarding yet challenging task in
+ Computer-aided Translation. Existing work addresses this task through a
+ classification model based on a neural network that maps the hidden vector of
+ the input context into its corresponding label (i.e., the candidate target
+ word is treated as a label). Since the context hidden vector itself does not
+ take the label into account and is projected to the label through a linear
+ classifier, the model cannot sufficiently leverage valuable information from
+ the source sentence, as verified in our experiments, which eventually hinders
+ its overall performance. To alleviate this issue, this work proposes an
+ energy-based model for WLAC, which enables the context hidden vector to
+ capture crucial information from the source sentence. Unfortunately, training
+ and inference suffer from efficiency and effectiveness challenges, so we
+ employ three simple yet effective strategies to put our model into practice.
+ Experiments on four standard benchmarks demonstrate that our reranking-based
+ approach achieves substantial improvements (about 6.07%) over the previous
+ state-of-the-art model. Further analyses show that each strategy of our
+ approach contributes to the final performance.
+
+
+ comment: Accepted to TACL 2024 +
+
+
+
+
+ + ☆ Investigating the Impact of Semi-Supervised Methods with Data + Augmentation on Offensive Language Detection in Romanian Language + + +
+ Offensive language detection is a crucial task in today's digital landscape, +where online platforms grapple with maintaining a respectful and inclusive +environment. However, building robust offensive language detection models +requires large amounts of labeled data, which can be expensive and +time-consuming to obtain. Semi-supervised learning offers a feasible solution +by utilizing labeled and unlabeled data to create more accurate and robust +models. In this paper, we explore a few different semi-supervised methods, as +well as data augmentation techniques. Concretely, we implemented eight +semi-supervised methods and ran experiments for them using only the available +data in the RO-Offense dataset and applying five augmentation techniques before +feeding the data to the models. Experimental results demonstrate that some of +them benefit more from augmentations than others. + +
+
+ comment: 10 pages, 3 figures, 28th International Conference on Knowledge-Based + and Intelligent Information & Engineering Systems +
+
+
+
+
+ + ☆ Exploring Large Language Models to generate Easy to Read content + + +
+ Ensuring text accessibility and understandability are essential goals, +particularly for individuals with cognitive impairments and intellectual +disabilities, who encounter challenges in accessing information across various +mediums such as web pages, newspapers, administrative tasks, or health +documents. Initiatives like Easy to Read and Plain Language guidelines aim to +simplify complex texts; however, standardizing these guidelines remains +challenging and often involves manual processes. This work presents an +exploratory investigation into leveraging Artificial Intelligence (AI) and +Natural Language Processing (NLP) approaches to systematically simplify Spanish +texts into Easy to Read formats, with a focus on utilizing Large Language +Models (LLMs) for simplifying texts, especially in generating Easy to Read +content. The study contributes a parallel corpus of Spanish adapted for Easy To +Read format, which serves as a valuable resource for training and testing text +simplification systems. Additionally, several text simplification experiments +using LLMs and the collected corpus are conducted, involving fine-tuning and +testing a Llama2 model to generate Easy to Read content. A qualitative +evaluation, guided by an expert in text adaptation for Easy to Read content, is +carried out to assess the automatically simplified texts. This research +contributes to advancing text accessibility for individuals with cognitive +impairments, highlighting promising strategies for leveraging LLMs while +responsibly managing energy usage. + +
+
+
+
+
+ + ☆ Do LLMs Really Adapt to Domains? An Ontology Learning Perspective ISWC 2024 + + +
+ Large Language Models (LLMs) have demonstrated unprecedented prowess across
+ various natural language processing tasks in a wide range of application
+ domains. Recent studies show that LLMs can be leveraged to perform lexical
+ semantic tasks, such as Knowledge Base Completion (KBC) or Ontology Learning
+ (OL). However, it has not effectively been verified whether their success is
+ due to their ability to reason over unstructured or semi-structured data, or
+ their effective learning of linguistic patterns and senses alone. This
+ unresolved question is particularly crucial when dealing with domain-specific
+ data, where the lexical senses and their meaning can completely differ from
+ what an LLM has learned during its training stage. This paper investigates
+ the following question: Do LLMs really adapt to domains and remain consistent
+ in the extraction of structured knowledge, or do they only learn lexical
+ senses instead of reasoning? To answer this question, we devise a controlled
+ experiment setup that uses WordNet to synthesize parallel corpora, with
+ English and gibberish terms. We examine the differences in the outputs of
+ LLMs for each corpus in two OL tasks: relation extraction and taxonomy
+ discovery. Empirical results show that, while adapting to the gibberish
+ corpora, off-the-shelf LLMs do not consistently reason over semantic
+ relationships between concepts, and instead leverage senses and their frames.
+ However, fine-tuning improves the performance of LLMs on lexical semantic
+ tasks even when the domain-specific terms are arbitrary and unseen during
+ pre-training, hinting at the applicability of pre-trained LLMs for OL.
+
+
+ comment: Accepted at ISWC 2024 +
+
+
+
+
+ + ☆ Confidence Estimation for Automatic Detection of Depression and + Alzheimer's Disease Based on Clinical Interviews + + +
+ Speech-based automatic detection of Alzheimer's disease (AD) and depression
+ has attracted increased attention. Confidence estimation is crucial for a
+ trustworthy automatic diagnostic system that informs the clinician about the
+ confidence of model predictions and helps reduce the risk of misdiagnosis.
+ This paper investigates confidence estimation for automatic detection of AD
+ and depression based on clinical interviews. A novel Bayesian approach is
+ proposed which uses a dynamic Dirichlet prior distribution to model the
+ second-order probability of the predictive distribution. Experimental results
+ on the publicly available ADReSS and DAIC-WOZ datasets demonstrate that the
+ proposed method outperforms a range of baselines for both classification
+ accuracy and confidence estimation.
+
+
+ comment: Accepted by Interspeech 2024 +
+
+
+
+
+ + ☆ A Temporal Psycholinguistics Approach to Identity Resolution of Social + Media Users + + +
+ In this thesis, we propose an approach to identity resolution across social +media platforms using the topics, sentiments, and timings of the posts on the +platforms. After collecting the public posts of around 5000 profiles from +Disqus and Twitter, we analyze their posts to match their profiles across the +two platforms. We pursue both temporal and non-temporal methods in our +analysis. While neither approach proves definitively superior, the temporal +approach generally performs better. We found that the temporal window size +influences results more than the shifting amount. On the other hand, our +sentiment analysis shows that the inclusion of sentiment makes little +difference, probably due to flawed data extraction methods. We also +experimented with a distance-based reward-and-punishment-focused scoring model, +which achieved an accuracy of 24.198% and an average rank of 158.217 out of +2525 in our collected corpus. Future work includes refining sentiment analysis +by evaluating sentiments per topic, extending temporal analysis with additional +phases, and improving the scoring model through weight adjustments and modified +rewards. + +
+
+
+
+
+ + ☆ Inference acceleration for large language models using "stairs" assisted + greedy generation + + +
+ Large Language Models (LLMs) with billions of parameters are known for their
+ impressive predictive capabilities but require substantial resources to run.
+ With their massive rise in popularity, even a small reduction in required
+ resources could have an impact on the environment. On the other hand, smaller
+ models require fewer resources but may sacrifice accuracy. In this work, we
+ propose an implementation of "stairs"-assisted greedy generation. It is a
+ modified assisted generation methodology that makes use of a smaller model's
+ fast generation, a larger model's batch prediction, and "stairs" validation
+ in order to achieve a speed-up in prediction generation. Results show between
+ 9.58 and 17.24 percent inference time reduction compared to a stand-alone
+ large LLM prediction in a text generation task, without a loss in accuracy.
+
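+ The abstract describes a draft-and-verify style of assisted generation; a
+ generic sketch of that pattern (not the authors' "stairs" validation schedule)
+ is shown below, with small_next_token and large_greedy_tokens as assumed
+ callables wrapping the two models.
+ def assisted_greedy_generate(small_next_token, large_greedy_tokens,
+                              prompt_ids, max_new_tokens=64, draft_len=4):
+     """small_next_token(ids) -> int; large_greedy_tokens(ids, k) -> list[int],
+     the large model's greedy continuation of length k (one batched call)."""
+     ids = list(prompt_ids)
+     produced = 0
+     while produced < max_new_tokens:
+         # 1) Draft draft_len tokens cheaply with the small model.
+         draft, ctx = [], list(ids)
+         for _ in range(draft_len):
+             token = small_next_token(ctx)
+             draft.append(token)
+             ctx.append(token)
+         # 2) Verify the whole draft with a single call to the large model.
+         target = large_greedy_tokens(ids, len(draft))
+         accepted = 0
+         for d, t in zip(draft, target):
+             if d != t:
+                 break
+             accepted += 1
+         # 3) Keep the matching prefix plus the large model's next token.
+         ids.extend(target[:accepted + 1])
+         produced += accepted + 1
+     return ids
+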
+
+ comment: Accepted at the 29th International Conference on Information Society + and University Studies (IVUS 2024) +
+
+
+
+
+ + ☆ Sentiment Analysis of Lithuanian Online Reviews Using Large Language + Models + + +
+ Sentiment analysis is a widely researched area within Natural Language +Processing (NLP), attracting significant interest due to the advent of +automated solutions. Despite this, the task remains challenging because of the +inherent complexity of languages and the subjective nature of sentiments. It is +even more challenging for less-studied and less-resourced languages such as +Lithuanian. Our review of existing Lithuanian NLP research reveals that +traditional machine learning methods and classification algorithms have limited +effectiveness for the task. In this work, we address sentiment analysis of +Lithuanian five-star-based online reviews from multiple domains that we collect +and clean. We apply transformer models to this task for the first time, +exploring the capabilities of pre-trained multilingual Large Language Models +(LLMs), specifically focusing on fine-tuning BERT and T5 models. Given the +inherent difficulty of the task, the fine-tuned models perform quite well, +especially when the sentiments themselves are less ambiguous: 80.74% and 89.61% +testing recognition accuracy of the most popular one- and five-star reviews +respectively. They significantly outperform current commercial state-of-the-art +general-purpose LLM GPT-4. We openly share our fine-tuned LLMs online. + +
+
+ comment: Accepted at the 29th International Conference on Information Society + and University Studies (IVUS 2024) +
+
+
+
+
+ + ☆ BEExAI: Benchmark to Evaluate Explainable AI + + +
+ Recent research in explainability has given rise to numerous post-hoc +attribution methods aimed at enhancing our comprehension of the outputs of +black-box machine learning models. However, evaluating the quality of +explanations lacks a cohesive approach and a consensus on the methodology for +deriving quantitative metrics that gauge the efficacy of explainability +post-hoc attribution methods. Furthermore, with the development of increasingly +complex deep learning models for diverse data applications, the need for a +reliable way of measuring the quality and correctness of explanations is +becoming critical. We address this by proposing BEExAI, a benchmark tool that +allows large-scale comparison of different post-hoc XAI methods, employing a +set of selected evaluation metrics. + +
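+ One common family of quantitative checks that such a benchmark can include
+ (offered here as an assumption about the metric style, not BEExAI's exact
+ metric list) is a deletion curve: remove the most-attributed features first
+ and track how quickly the model's confidence drops.
+ import numpy as np
+
+ def deletion_auc(predict_proba, x, attribution, target, baseline=0.0, steps=20):
+     """predict_proba(x) -> class probabilities for a single 1-D feature vector;
+     attribution: one importance score per feature (same shape as x)."""
+     order = np.argsort(-np.asarray(attribution))       # most important first
+     x_cur = np.array(x, dtype=float)
+     scores = [predict_proba(x_cur)[target]]
+     chunk = max(1, len(order) // steps)
+     for i in range(0, len(order), chunk):
+         x_cur[order[i:i + chunk]] = baseline           # "delete" top features
+         scores.append(predict_proba(x_cur)[target])
+     # Lower area under this curve indicates a more faithful attribution.
+     return np.trapz(scores, dx=1.0 / (len(scores) - 1))
+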
+
+
+
+
+ + ☆ Preliminary WMT24 Ranking of General MT Systems and LLMs + + +
+ This is the preliminary ranking of WMT24 General MT systems based on +automatic metrics. The official ranking will be a human evaluation, which is +superior to the automatic ranking and supersedes it. The purpose of this report +is not to interpret any findings but only provide preliminary results to the +participants of the General MT task that may be useful during the writing of +the system submission. + +
+
+
+
+
+ + ☆ Detecting and Understanding Vulnerabilities in Language Models via + Mechanistic Interpretability + + +
+ Large Language Models (LLMs), characterized by being trained on broad amounts
+ of data in a self-supervised manner, have shown impressive performance across
+ a wide range of tasks. Indeed, their generative abilities have aroused
+ interest in the application of LLMs across a wide range of contexts. However,
+ neural networks in general, and LLMs in particular, are known to be
+ vulnerable to adversarial attacks, where an imperceptible change to the input
+ can mislead the output of the model. This is a serious concern that impedes
+ the use of LLMs in high-stakes applications, such as healthcare, where a
+ wrong prediction can have serious consequences. Even though there are many
+ efforts to make LLMs more robust to adversarial attacks, almost no works
+ study \emph{how} and \emph{where} the vulnerabilities that make LLMs prone to
+ adversarial attacks arise. Motivated by these facts, we explore how to
+ localize and understand vulnerabilities, and propose a method, based on
+ Mechanistic Interpretability (MI) techniques, to guide this process.
+ Specifically, this method enables us to detect vulnerabilities related to a
+ concrete task by (i) obtaining the subset of the model that is responsible
+ for that task, (ii) generating adversarial samples for that task, and (iii)
+ using MI techniques together with the previous samples to discover and
+ understand the possible vulnerabilities. We showcase our method on a
+ pretrained GPT-2 Small model carrying out the task of predicting 3-letter
+ acronyms to demonstrate its effectiveness in locating and understanding
+ concrete vulnerabilities of the model.
+
+
+
+
+
+ + ☆ ATHAR: A High-Quality and Diverse Dataset for Classical Arabic to + English Translation + + +
+ Classical Arabic represents a significant era, encompassing the golden age of +Arab culture, philosophy, and scientific literature. With a broad consensus on +the importance of translating these literatures to enrich knowledge +dissemination across communities, the advent of large language models (LLMs) +and translation systems offers promising tools to facilitate this goal. +However, we have identified a scarcity of translation datasets in Classical +Arabic, which are often limited in scope and topics, hindering the development +of high-quality translation systems. In response, we present the ATHAR dataset, +comprising 66,000 high-quality Classical Arabic to English translation samples +that cover a wide array of subjects including science, culture, and philosophy. +Furthermore, we assess the performance of current state-of-the-art LLMs under +various settings, concluding that there is a need for such datasets in current +systems. Our findings highlight how models can benefit from fine-tuning or +incorporating this dataset into their pretraining pipelines. The dataset is +publicly available on the HuggingFace Data Hub at +\url{https://huggingface.co/datasets/mohamed-khalil/ATHAR}. + +
+
+
+
+
+ + ☆ ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2 + + +
+ Multimodal Large Language Models (MLLMs) have attracted much attention due to
+ their multifunctionality. However, traditional Transformer architectures
+ incur significant overhead due to their quadratic computational complexity.
+ To address this issue, we introduce ML-Mamba, a multimodal language model
+ that utilizes the recent and efficient Mamba-2 model for inference. Mamba-2
+ is known for its linear scaling and fast processing of long sequences. We
+ replace the Transformer-based backbone with a pre-trained Mamba-2 model and
+ explore methods for integrating 2D visual selective scanning mechanisms into
+ multimodal learning. We also try various visual encoders and Mamba-2 model
+ variants. Our extensive experiments on various multimodal benchmarks
+ demonstrate the competitive performance of ML-Mamba and highlight the
+ potential of state space models in multimodal tasks. The experimental results
+ show that: (1) ML-Mamba achieves performance comparable to state-of-the-art
+ methods such as TinyLaVA and MobileVLM v2 through its linear sequential
+ modeling, while also having faster inference speed; (2) ML-Mamba performs
+ well in visual hallucination and spatial relationship judgment in closed-set
+ benchmark tests; (3) ML-Mamba achieves performance comparable to LLaVA while
+ reducing the number of parameters by 40\%; and (4) compared to the multimodal
+ model using the original Mamba model, the Mamba-2-based large-scale
+ multimodal language model has stronger inference performance and
+ effectiveness.
+
+
+
+
+
+ + ☆ Concise Thoughts: Impact of Output Length on LLM Reasoning and Cost + + +
+ Today's large language models (LLMs) can solve challenging question-answering +tasks, and prompt engineering techniques, such as chain-of-thought (CoT), have +gained attention for enhancing the explanation and correctness of outputs. +Nevertheless, models require significant time to generate answers augmented +with lengthy reasoning details. To address this issue, this paper analyzes the +impact of output lengths on LLM inference pipelines and proposes novel metrics +to evaluate them in terms of \textit{correct conciseness}. It also examines the +impact of controlling output length through a refined prompt engineering +strategy, Constrained-CoT (CCoT), which encourages the model to limit output +length. Experiments on pre-trained LLMs demonstrated the benefit of the +proposed metrics and the effectiveness of CCoT across different models. For +instance, constraining the reasoning of LLaMA2-70b to 100 words improves the +accuracy from 36.01\% (CoT) to 41.07\% (CCoT) on the GSM8K dataset, while +reducing the average output length by 28 words. + +
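+ Since CCoT is described as a prompt-level constraint on reasoning length, a
+ minimal sketch of such a prompt and the accompanying conciseness bookkeeping
+ might look as follows; the wording and the 100-word budget are illustrative,
+ not the paper's exact template.
+ def ccot_prompt(question, word_budget=100):
+     return (f"Answer the question below. Think step by step, but keep your "
+             f"reasoning within {word_budget} words, then state the final answer.\n\n"
+             f"Question: {question}\nReasoning:")
+
+ def score_outputs(records):
+     """records: list of dicts with keys 'output' (model text) and 'is_correct'."""
+     n = len(records)
+     accuracy = sum(r["is_correct"] for r in records) / n
+     avg_words = sum(len(r["output"].split()) for r in records) / n
+     # Correctness and average length together capture "correct conciseness".
+     return {"accuracy": accuracy, "avg_output_words": avg_words}
+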
+
+ comment: Preprint version, under review +
+
+
+
+
+ + ☆ Comparative Analysis of Encoder-Based NER and Large Language Models for + Skill Extraction from Russian Job Vacancies + + +
+ The labor market is undergoing rapid changes, with increasing demands on job +seekers and a surge in job openings. Identifying essential skills and +competencies from job descriptions is challenging due to varying employer +requirements and the omission of key skills. This study addresses these +challenges by comparing traditional Named Entity Recognition (NER) methods +based on encoders with Large Language Models (LLMs) for extracting skills from +Russian job vacancies. Using a labeled dataset of 4,000 job vacancies for +training and 1,472 for testing, the performance of both approaches is +evaluated. Results indicate that traditional NER models, especially DeepPavlov +RuBERT NER tuned, outperform LLMs across various metrics including accuracy, +precision, recall, and inference time. The findings suggest that traditional +NER models provide more effective and efficient solutions for skill extraction, +enhancing job requirement clarity and aiding job seekers in aligning their +qualifications with employer expectations. This research contributes to the +field of natural language processing (NLP) and its application in the labor +market, particularly in non-English contexts. + +
+
+
+
+
+ + ☆ Improving Retrieval Augmented Language Model with Self-Reasoning + + +
+ The Retrieval-Augmented Language Model (RALM) has shown remarkable
+ performance on knowledge-intensive tasks by incorporating external knowledge
+ during inference, which mitigates the factual hallucinations inherent in
+ large language models (LLMs). Despite these advancements, challenges persist
+ in the implementation of RALMs, particularly concerning their reliability and
+ traceability. To be specific, irrelevant document retrieval may result in
+ unhelpful response generation or even deteriorate the performance of LLMs,
+ while the lack of proper citations in generated outputs complicates efforts
+ to verify the trustworthiness of the models. To this end, we propose a novel
+ self-reasoning framework aimed at improving the reliability and traceability
+ of RALMs, whose core idea is to leverage reasoning trajectories generated by
+ the LLM itself. The framework involves constructing self-reason trajectories
+ with three processes: a relevance-aware process, an evidence-aware selective
+ process, and a trajectory analysis process. We have evaluated our framework
+ across four public datasets (two short-form QA datasets, one long-form QA
+ dataset, and one fact verification dataset) to demonstrate the superiority of
+ our method, which can outperform existing state-of-the-art models and can
+ achieve comparable performance with GPT-4, while only using 2,000 training
+ samples.
+
+
+
+
+
+ + ☆ Segmentation en phrases : ouvrez les guillemets sans perdre le fil + + +
+ This paper presents a graph cascade for sentence segmentation of XML
+ documents. Our proposal produces nested sentences (sentences inside
+ sentences) for cases introduced by quotation marks and hyphens, and also pays
+ particular attention to parenthetical clauses introduced by parentheses and
+ to lists introduced by colons. We describe how the tool works and compare the
+ results obtained with those available in 2019 on the same dataset, together
+ with an evaluation of the system's performance on a test corpus.
+
+
+ comment: in French language +
+
+
+
+
+ + ☆ Cool-Fusion: Fuse Large Language Models without Training + + +
+ We focus on the problem of fusing two or more heterogeneous large language
+ models (LLMs) to combine their complementary strengths. One of the challenges
+ in model fusion is the high computational load, i.e., fine-tuning or aligning
+ vocabularies via combinatorial optimization. To this end, we propose
+ \emph{Cool-Fusion}, a simple yet effective approach that fuses the knowledge
+ of heterogeneous source LLMs to leverage their complementary strengths.
+ \emph{Cool-Fusion} is the first fusion method that, like ensemble approaches,
+ does not require any type of training; but unlike ensemble methods, it is
+ applicable to any set of source LLMs that have different vocabularies. The
+ basic idea is to have each source LLM individually generate tokens until the
+ tokens can be decoded into a text segment that ends at word boundaries common
+ to all source LLMs. Then, the source LLMs jointly rerank the generated text
+ segments and select the best one, which constitutes one step of the fused
+ text generation. Extensive experiments are conducted across a variety of
+ benchmark datasets. On \emph{GSM8K}, \emph{Cool-Fusion} improves accuracy
+ over three strong source LLMs by a significant 8\%-17.8\%.
+
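+ The fusion loop itself can be sketched directly from the description above:
+ each source model extends the text up to a word boundary shared by all
+ vocabularies, and the candidates are jointly reranked. The generate_segment()
+ and score() methods below are assumed wrappers around the per-model decoding
+ and log-likelihood scoring, not an official API.
+ def cool_fusion_step(models, text):
+     """models: objects with .generate_segment(text) -> str (segment ending at a
+     common word boundary) and .score(text) -> float (log-likelihood)."""
+     candidates = [m.generate_segment(text) for m in models]
+     # Every candidate segment is scored by every source model; scores are averaged.
+     def joint_score(segment):
+         return sum(m.score(text + segment) for m in models) / len(models)
+     best = max(candidates, key=joint_score)
+     return text + best
+
+ def cool_fusion_generate(models, prompt, max_steps=32, stop="</s>"):
+     text = prompt
+     for _ in range(max_steps):
+         text = cool_fusion_step(models, text)
+         if text.endswith(stop):
+             break
+     return text
+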
+
+
+
+
+ + ☆ Teaching LLMs at Charles University: Assignments and Activities ACL 2024 + + +
+ This paper presents teaching materials, particularly assignments and ideas +for classroom activities, from a new course on large language models (LLMs) +taught at Charles University. The assignments include experiments with LLM +inference for weather report generation and machine translation. The classroom +activities include class quizzes, focused research on downstream tasks and +datasets, and an interactive "best paper" session aimed at reading and +comprehension of research papers. + +
+
+ comment: 6th TeachNLP workshop at ACL 2024 +
+
+
+
+
+ + ☆ VolDoGer: LLM-assisted Datasets for Domain Generalization in + Vision-Language Tasks + + +
+ Domain generalizability is a crucial aspect of a deep learning model since it +determines the capability of the model to perform well on data from unseen +domains. However, research on the domain generalizability of deep learning +models for vision-language tasks remains limited, primarily because of the lack +of required datasets. To address these challenges, we propose VolDoGer: +Vision-Language Dataset for Domain Generalization, a dedicated dataset designed +for domain generalization that addresses three vision-language tasks: image +captioning, visual question answering, and visual entailment. We constructed +VolDoGer by extending LLM-based data annotation techniques to vision-language +tasks, thereby alleviating the burden of recruiting human annotators. We +evaluated the domain generalizability of various models, ranging from +fine-tuned models to a recent multimodal large language model, through +VolDoGer. + +
+
+ comment: 31 pages, 5 figures, 20 tables +
+
+
+
+
+ + ☆ Introducing a new hyper-parameter for RAG: Context Window Utilization + + +
+ This paper introduces a new hyper-parameter for Retrieval-Augmented +Generation (RAG) systems called Context Window Utilization. RAG systems enhance +generative models by incorporating relevant information retrieved from external +knowledge bases, improving the factual accuracy and contextual relevance of +generated responses. The size of the text chunks retrieved and processed is a +critical factor influencing RAG performance. This study aims to identify the +optimal chunk size that maximizes answer generation quality. Through systematic +experimentation, we analyze the effects of varying chunk sizes on the +efficiency and effectiveness of RAG frameworks. Our findings reveal that an +optimal chunk size balances the trade-off between providing sufficient context +and minimizing irrelevant information. These insights are crucial for enhancing +the design and implementation of RAG systems, underscoring the importance of +selecting an appropriate chunk size to achieve superior performance. + +
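+ In practice, the study amounts to a sweep over chunk sizes; a hedged sketch of
+ such a sweep is shown below, where build_index, answer_with_rag and
+ grade_answer are placeholders for a concrete retriever, generator, and answer
+ metric, and the candidate sizes are arbitrary examples.
+ def chunk(text, size, overlap=0):
+     step = max(1, size - overlap)
+     return [text[i:i + size] for i in range(0, len(text), step)]
+
+ def sweep_chunk_sizes(corpus, queries, references,
+                       build_index, answer_with_rag, grade_answer,
+                       sizes=(128, 256, 512, 1024)):
+     results = {}
+     for size in sizes:
+         chunks = [c for doc in corpus for c in chunk(doc, size)]
+         index = build_index(chunks)
+         scores = [grade_answer(answer_with_rag(index, q), ref)
+                   for q, ref in zip(queries, references)]
+         results[size] = sum(scores) / len(scores)
+     return results   # the best-scoring size gives the preferred chunking
+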
+
+
+
+
+ + ☆ Synthesizing Scientific Summaries: An Extractive and Abstractive + Approach + + +
+ The availability of a vast array of research papers in any area of study
+ necessitates automated summarisation systems that can present the key
+ research conducted and the corresponding findings. Scientific paper
+ summarisation is a challenging task for various reasons, including token
+ length limits in modern transformer models and the corresponding memory and
+ compute requirements for long text. A significant amount of work has been
+ conducted in this area, with approaches that modify the attention mechanisms
+ of existing transformer models and others that utilise discourse information
+ to capture long-range dependencies in research papers. In this paper, we
+ propose a hybrid methodology for research paper summarisation which
+ incorporates an extractive and an abstractive approach. We use the extractive
+ approach to capture the key findings of the research, and pair it with the
+ introduction of the paper, which captures the motivation for the research. We
+ use two models based on unsupervised learning for the extraction stage and
+ two transformer language models, resulting in four combinations for our
+ hybrid approach. The performance of the models is evaluated on three metrics,
+ and we present our findings in this paper. We find that, using certain
+ combinations of hyperparameters, it is possible for automated summarisation
+ systems to exceed the abstractiveness of summaries written by humans.
+ Finally, we outline our future research on extending this methodology to the
+ summarisation of generalised long documents.
+
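+ A minimal sketch of an extract-then-abstract pipeline of the kind described
+ above is given below: salient sentences are extracted from the body,
+ prepended with the introduction, and passed to an abstractive summariser. The
+ TF-IDF extractor and the Hugging Face model name are illustrative choices,
+ not the paper's exact models.
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from transformers import pipeline
+
+ def extract_sentences(sentences, k=10):
+     # Unsupervised extraction: rank sentences by their total TF-IDF weight.
+     tfidf = TfidfVectorizer().fit_transform(sentences)
+     scores = tfidf.sum(axis=1).A.ravel()
+     top = sorted(range(len(sentences)), key=lambda i: -scores[i])[:k]
+     return [sentences[i] for i in sorted(top)]        # keep original order
+
+ def hybrid_summary(introduction, body_sentences, model="facebook/bart-large-cnn"):
+     summarizer = pipeline("summarization", model=model)
+     extract = " ".join(extract_sentences(body_sentences))
+     source = introduction + " " + extract
+     return summarizer(source, max_length=200, min_length=60,
+                       truncation=True)[0]["summary_text"]
+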
+
+ comment: the paper consists of 10 pages , 5 figures and 4 tables +
+
+
+
+
+ + ☆ Model Agnostic Hybrid Sharding For Heterogeneous Distributed Inference + + +
+ The rapid growth of large-scale AI models, particularly large language
+ models, has brought significant challenges in data privacy, computational
+ resources, and accessibility. Traditional centralized architectures often
+ struggle to meet the required data security and scalability needs, which
+ hinders the democratization of AI systems. Nesa introduces a model-agnostic
+ sharding framework designed for decentralized AI inference. Our framework
+ uses blockchain-based sequential deep neural network sharding to distribute
+ computational tasks across a diverse network of nodes based on a personalised
+ heuristic and routing mechanism. This enables efficient distributed training
+ and inference for recent large-scale models even on consumer-grade hardware.
+ We use compression techniques like dynamic blockwise quantization and mixed
+ matrix decomposition to reduce data transfer and memory needs. We also
+ integrate robust security measures, including hardware-based trusted
+ execution environments, to ensure data integrity and confidentiality.
+ Evaluating our system across various natural language processing and vision
+ tasks shows that these compression strategies do not compromise model
+ accuracy. Our results highlight the potential to democratize access to
+ cutting-edge AI technologies by enabling secure and efficient inference on a
+ decentralized network.
+
+
+
+
+
+ + ☆ Legal Minds, Algorithmic Decisions: How LLMs Apply Constitutional + Principles in Complex Scenarios + + +
+ In this paper, we conduct an empirical analysis of how large language models +(LLMs), specifically GPT-4, interpret constitutional principles in complex +decision-making scenarios. We examine rulings from the Italian Constitutional +Court on bioethics issues that involve trade-offs between competing values and +compare model-generated legal arguments on these issues to those presented by +the State, the Court, and the applicants. Our results indicate that GPT-4 +consistently aligns more closely with progressive interpretations of the +Constitution, often overlooking competing values and mirroring the applicants' +views rather than the more conservative perspectives of the State or the +Court's moderate positions. Our experiments reveal a distinct tendency of GPT-4 +to favor progressive legal interpretations, underscoring the influence of +underlying data biases. We thus underscore the importance of testing alignment +in real-world scenarios and considering the implications of deploying LLMs in +decision-making processes. + +
+
+ comment: Accepted at AIES24 +
+
+
+
+
+ + ☆ KNOWCOMP POKEMON Team at DialAM-2024: A Two-Stage Pipeline for Detecting + Relations in Dialogical Argument Mining + + +
+ Dialogical Argument Mining (DialAM) is an important branch of Argument Mining
+ (AM). DialAM-2024 is a shared task focusing on dialogical argument mining,
+ which requires us to identify argumentative relations and illocutionary
+ relations among proposition nodes and locution nodes. To accomplish this, we
+ propose a two-stage pipeline, which includes the Two-Step S-Node Prediction
+ Model in Stage 1 and the YA-Node Prediction Model in Stage 2. We also augment
+ the training data in both stages and introduce context in Stage 2. We
+ successfully completed the task and achieved good results. Our team Pokemon
+ ranked 1st in the ARI Focused score and 4th in the Global Focused score.
+
+
+ comment: Published on the 11th Workshop on Argument Mining +
+
+
+
+
+ + ☆ Do Text-to-Vis Benchmarks Test Real Use of Visualisations? + + +
+ Large language models are able to generate code for visualisations in
+ response to user requests. This is a useful application, and an appealing one
+ for NLP research because plots of data provide grounding for language.
+ However, there are relatively few benchmarks, and it is unknown whether those
+ that exist are representative of what people do in practice. This paper aims
+ to answer that question through an empirical study comparing benchmark
+ datasets and code from public repositories. Our findings reveal a substantial
+ gap in datasets, with evaluations not testing the same distribution of chart
+ types, attributes, and number of actions. The only representative dataset
+ requires modification to become an end-to-end and practical benchmark. This
+ shows that new, more representative benchmarks are needed to support the
+ development of systems that truly address users' visualisation needs. These
+ observations will guide future data creation, highlighting which features
+ hold genuine significance for users.
+
+
+ comment: ARR AE score of 4 +
+
+
+
+
+ + ☆ CollectiveSFT: Scaling Large Language Models for Chinese Medical + Benchmark with Collective Instructions in Healthcare + + +
+ The rapid progress in Large Language Models (LLMs) has prompted the creation
+ of numerous benchmarks to evaluate their capabilities. This study focuses on
+ the Comprehensive Medical Benchmark in Chinese (CMB), showcasing how dataset
+ diversity and distribution in supervised fine-tuning (SFT) may enhance LLM
+ performance. Remarkably, we successfully trained a smaller base model to
+ achieve scores comparable to larger models, indicating that a diverse and
+ well-distributed dataset can optimize performance regardless of model size.
+ This study suggests that even smaller models may reach high performance
+ levels with carefully curated and varied datasets. By integrating a wide
+ range of instructional content, our approach addresses potential issues such
+ as data quality inconsistencies. Our results imply that a broader spectrum of
+ training data may enhance a model's ability to generalize and perform
+ effectively across different medical scenarios, highlighting the importance
+ of dataset quality and diversity in fine-tuning processes.
+
+
+
+
+
+ + ☆ Efficiently and Effectively: A Two-stage Approach to Balance Plaintext + and Encrypted Text for Traffic Classification + + +
+ Encrypted traffic classification is the task of identifying the application
+ or service associated with encrypted network traffic. One effective approach
+ for this task is to use deep learning methods to encode the raw traffic bytes
+ directly and automatically extract features for classification (byte-based
+ models). However, current byte-based models input raw traffic bytes, whether
+ plaintext or encrypted text, for automated feature extraction, neglecting the
+ distinct impacts of plaintext and encrypted text on downstream tasks.
+ Additionally, these models primarily focus on improving classification
+ accuracy, with little emphasis on model efficiency. In this paper, for the
+ first time, we analyze the impact of plaintext and encrypted text on the
+ model's effectiveness and efficiency. Based on our observations and findings,
+ we propose a two-stage approach to balance the trade-off between plaintext
+ and encrypted text in traffic classification. Specifically, Stage one is to
+ Determine whether the Plain text is enough to be accurately Classified (DPC)
+ using the proposed DPC Selector. This stage quickly identifies samples that
+ can be classified using plaintext, leveraging explicit byte features in
+ plaintext to enhance the model's efficiency. Stage two adaptively makes the
+ final classification based on the result from stage one, incorporating
+ encrypted text information for samples that cannot be classified using
+ plaintext alone and ensuring the model's effectiveness on traffic
+ classification tasks. Experiments on two datasets demonstrate that our
+ proposed model achieves state-of-the-art results in both effectiveness and
+ efficiency.
+
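+ The routing logic of the two-stage approach can be sketched in a few lines;
+ the plaintext classifier, the full classifier, and the confidence threshold
+ below are placeholders for the DPC Selector and the downstream model
+ described above.
+ def classify_flow(plain_bytes, enc_bytes, plain_clf, full_clf, threshold=0.9):
+     """plain_clf(bytes) -> (label, confidence); full_clf(plain, enc) -> label."""
+     label, confidence = plain_clf(plain_bytes)
+     if confidence >= threshold:
+         # Stage one: the plaintext features alone are judged sufficient.
+         return label, "stage-1 (plaintext only)"
+     # Stage two: fall back to the heavier model that also sees the encrypted bytes.
+     return full_clf(plain_bytes, enc_bytes), "stage-2 (plaintext + encrypted)"
+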
+
+
+
+
+ + ☆ SeaLLMs 3: Open Foundation and Chat Multilingual Large Language Models + for Southeast Asian Languages + + +
+ Large Language Models (LLMs) have shown remarkable abilities across various +tasks, yet their development has predominantly centered on high-resource +languages like English and Chinese, leaving low-resource languages underserved. +To address this disparity, we present SeaLLMs 3, the latest iteration of the +SeaLLMs model family, tailored for Southeast Asian languages. This region, +characterized by its rich linguistic diversity, has lacked adequate language +technology support. SeaLLMs 3 aims to bridge this gap by covering a +comprehensive range of languages spoken in this region, including English, +Chinese, Indonesian, Vietnamese, Thai, Tagalog, Malay, Burmese, Khmer, Lao, +Tamil, and Javanese. Leveraging efficient language enhancement techniques and a +specially constructed instruction tuning dataset, SeaLLMs 3 significantly +reduces training costs while maintaining high performance and versatility. Our +model excels in tasks such as world knowledge, mathematical reasoning, +translation, and instruction following, achieving state-of-the-art performance +among similarly sized models. Additionally, we prioritized safety and +reliability by addressing both general and culture-specific considerations and +incorporated mechanisms to reduce hallucinations. This work underscores the +importance of inclusive AI, showing that advanced LLM capabilities can benefit +underserved linguistic and cultural communities. + +
+
+
+
+
+ + ☆ Overview of PerpectiveArg2024: The First Shared Task on Perspective + Argument Retrieval + + +
+ Argument retrieval is the task of finding relevant arguments for a given +query. While existing approaches rely solely on the semantic alignment of +queries and arguments, this first shared task on perspective argument retrieval +incorporates perspectives during retrieval, accounting for latent influences in +argumentation. We present a novel multilingual dataset covering demographic and +socio-cultural (socio) variables, such as age, gender, and political attitude, +representing minority and majority groups in society. We distinguish between +three scenarios to explore how retrieval systems consider explicitly (in both +query and corpus) and implicitly (only in query) formulated perspectives. This +paper provides an overview of this shared task and summarizes the results of +the six submitted systems. We find substantial challenges in incorporating +perspectivism, especially when aiming for personalization based solely on the +text of arguments without explicitly providing socio profiles. Moreover, +retrieval systems tend to be biased towards the majority group but partially +mitigate bias for the female gender. While we bootstrap perspective argument +retrieval, further research is essential to optimize retrieval systems to +facilitate personalization and reduce polarization. + +
+
+
+
+
+ + ☆ mGTE: Generalized Long-Context Text Representation and Reranking Models + for Multilingual Text Retrieval + + +
+ We present systematic efforts in building a long-context multilingual text
+ representation model (TRM) and a reranker from scratch for text retrieval. We
+ first introduce a text encoder (base size) enhanced with RoPE and unpadding,
+ pre-trained in a native 8192-token context (longer than the 512 tokens of
+ previous multilingual encoders). Then we construct a hybrid TRM and a
+ cross-encoder reranker by contrastive learning. Evaluations show that our
+ text encoder outperforms the same-sized previous state-of-the-art XLM-R.
+ Meanwhile, our TRM and reranker match the performance of the large-sized
+ state-of-the-art BGE-M3 models and achieve better results on long-context
+ retrieval benchmarks. Further analysis demonstrates that our proposed models
+ exhibit higher efficiency during both training and inference. We believe
+ their efficiency and effectiveness could benefit various research and
+ industrial applications.
+
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ From Pre-training Corpora to Large Language Models: What Factors + Influence LLM Performance in Causal Discovery Tasks? + + +
+ Recent advances in artificial intelligence have seen Large Language Models +(LLMs) demonstrate notable proficiency in causal discovery tasks. This study +explores the factors influencing the performance of LLMs in causal discovery +tasks. Utilizing open-source LLMs, we examine how the frequency of causal +relations within their pre-training corpora affects their ability to accurately +respond to causal discovery queries. Our findings reveal that a higher +frequency of causal mentions correlates with better model performance, +suggesting that extensive exposure to causal information during training +enhances the models' causal discovery capabilities. Additionally, we +investigate the impact of context on the validity of causal relations. Our +results indicate that LLMs might exhibit divergent predictions for identical +causal relations when presented in different contexts. This paper provides the +first comprehensive analysis of how different factors contribute to LLM +performance in causal discovery tasks. + +
+
+
+
+
+ + ☆ LoginMEA: Local-to-Global Interaction Network for Multi-modal Entity + Alignment ECAI 2024 + + +
+ Multi-modal entity alignment (MMEA) aims to identify equivalent entities
+ between two multi-modal knowledge graphs (MMKGs), whose entities can be
+ associated with relational triples and related images. Most previous studies
+ treat the graph structure as a special modality and fuse the different
+ modality information with separate uni-modal encoders, neglecting valuable
+ relational associations in the modalities. Other studies refine each type of
+ uni-modal information with graph structures, but may introduce unnecessary
+ relations in specific modalities. To this end, we propose a novel
+ local-to-global interaction network for MMEA, termed LoginMEA. Particularly,
+ we first fuse local multi-modal interactions to generate holistic entity
+ semantics and then refine them with global relational interactions of entity
+ neighbors. In this design, the uni-modal information is fused adaptively, and
+ can be refined with relations accordingly. To enrich local interactions of
+ multi-modal entity information, we devise modality weights and low-rank
+ interactive fusion, allowing diverse impacts and element-level interactions
+ among modalities. To capture global interactions of graph structures, we
+ adopt relation reflection graph attention networks, which fully capture
+ relational associations between entities. Extensive experiments demonstrate
+ superior results of our method over 5 cross-KG or bilingual benchmark
+ datasets, indicating the effectiveness of capturing local and global
+ interactions.
+
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ☆ TopicTag: Automatic Annotation of NMF Topic Models Using Chain of + Thought and Prompt Tuning with LLMs + + +
+ Topic modeling is a technique for organizing and extracting themes from large +collections of unstructured text. Non-negative matrix factorization (NMF) is a +common unsupervised approach that decomposes a term frequency-inverse document +frequency (TF-IDF) matrix to uncover latent topics and segment the dataset +accordingly. While useful for highlighting patterns and clustering documents, +NMF does not provide explicit topic labels, necessitating subject matter +experts (SMEs) to assign labels manually. We present a methodology for +automating topic labeling in documents clustered via NMF with automatic model +determination (NMFk). By leveraging the output of NMFk and employing prompt +engineering, we utilize large language models (LLMs) to generate accurate topic +labels. Our case study on over 34,000 scientific abstracts on Knowledge Graphs +demonstrates the effectiveness of our method in enhancing knowledge management +and document organization. + +
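+ A condensed sketch of the pipeline described above: factorise TF-IDF with
+ NMF, collect each topic's top terms, and ask an LLM for a short label. The
+ fixed topic count and the llm() callable are placeholders; NMFk's automatic
+ selection of the number of topics is not reproduced here.
+ from sklearn.decomposition import NMF
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ def label_topics(documents, llm, n_topics=10, top_n=10):
+     """llm(prompt) -> str is a stand-in for a chat/completion API call."""
+     vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
+     X = vectorizer.fit_transform(documents)
+     model = NMF(n_components=n_topics, init="nndsvd", random_state=0).fit(X)
+     terms = vectorizer.get_feature_names_out()
+     labels = []
+     for topic in model.components_:
+         top_terms = [terms[i] for i in topic.argsort()[::-1][:top_n]]
+         prompt = ("These terms describe one topic in a document collection: "
+                   + ", ".join(top_terms)
+                   + ". Reply with a short, human-readable topic label.")
+         labels.append(llm(prompt).strip())
+     return labels
+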
+
+ comment: Accepted to ACM Symposium on Document Engineering 2024 (DocEng 24), + 2024 +
+
+
+
+
+ + ☆ CoMMIT: Coordinated Instruction Tuning for Multimodal Large Language + Models + + +
+ Instruction tuning in multimodal large language models (MLLMs) aims to +smoothly integrate a backbone LLM with a pre-trained feature encoder for +downstream tasks. The major challenge is how to efficiently find the synergy +through cooperative learning where LLMs adapt their reasoning abilities in +downstream tasks while feature encoders adjust their encoding to provide more +relevant modal information. In this paper, we analyze the MLLM instruction +tuning from both theoretical and empirical perspectives, where we find +unbalanced learning between the two components, i.e., the feature encoder and +the LLM, can cause diminishing learning gradients that slow the model +convergence and often lead to sub-optimal results due to insufficient learning. +Inspired by our findings, we propose a measurement to quantitatively evaluate +the learning balance, based on which we further design a dynamic learning +scheduler that better coordinates the learning. In addition, we introduce an +auxiliary loss regularization method to promote updating of the generation +distribution of MLLMs considering the learning state of each model component, +which potentially prevents each component from gradient diminishing and enables +a more accurate estimation of the learning balance coefficient. We conduct +experiments with multiple LLM backbones and feature encoders, where our +techniques are model-agnostic and can be generically integrated with various +MLLM backbones. Experiment results on multiple downstream tasks and modalities +in vision and audio, demonstrate the proposed method's better efficiency and +effectiveness in MLLM instruction tuning. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Generating Gender Alternatives in Machine Translation + + +
+ Machine translation (MT) systems often translate terms with ambiguous gender +(e.g., English term "the nurse") into the gendered form that is most prevalent +in the systems' training data (e.g., "enfermera", the Spanish term for a female +nurse). This often reflects and perpetuates harmful stereotypes present in +society. With MT user interfaces in mind that allow for resolving gender +ambiguity in a frictionless manner, we study the problem of generating all +grammatically correct gendered translation alternatives. We open source train +and test datasets for five language pairs and establish benchmarks for this +task. Our key technical contribution is a novel semi-supervised solution for +generating alternatives that integrates seamlessly with standard MT models and +maintains high performance without requiring additional components or +increasing inference overhead. + +
+
+ comment: GeBNLP 2024 +
+
+
+
+
+ + ☆ Through the Looking Glass, and what Horn Clause Programs Found There + + +
+ Dual Horn clauses mirror key properties of Horn clauses. This paper explores +the ``other side of the looking glass'' to reveal some expected and unexpected +symmetries and their practical uses. + We revisit Dual Horn clauses as enablers of a form of constructive negation +that supports goal-driven forward reasoning and is valid both +intuitionistically and classically. In particular, we explore the ability to +falsify a counterfactual hypothesis in the context of a background theory +expressed as a Dual Horn clause program. + With Dual Horn clause programs, by contrast to negation as failure, the +variable bindings in their computed answers provide explanations for the +reasons why a statement is successfully falsified. Moreover, in the +propositional case, by contrast to negation as failure as implemented with +stable models semantics in ASP systems, and similarly to Horn clause programs, +Dual Horn clause programs have polynomial complexity. + After specifying their execution model with a metainterpreter, we devise a +compilation scheme from Dual Horn clause programs to Horn clause programs, +ensuring their execution with no performance penalty and we design the embedded +SymLP language to support combined Horn clause and Dual Horn clause programs. + As a (motivating) application, we cast LLM reasoning chains into +propositional Horn and Dual Horn clauses that work together to constructively +prove and disprove goals and enhance Generative AI with explainability of +reasoning chains. + +
+
+
+
+
+ + ☆ What if Red Can Talk? Dynamic Dialogue Generation Using Large Language + Models ACL + + +
+ Role-playing games (RPGs) provide players with a rich, interactive world to
+ explore. Dialogue serves as the primary means of communication between
+ developers and players, manifesting in various forms such as guides, NPC
+ interactions, and storytelling. While most games rely on written scripts to
+ define the main story and character personalities, player immersion can be
+ significantly enhanced through casual interactions between characters. With
+ the advent of large language models (LLMs), we introduce a dialogue filler
+ framework that utilizes LLMs enhanced by knowledge graphs to generate dynamic
+ and contextually appropriate character interactions. We test this framework
+ within the environments of Final Fantasy VII Remake and Pokemon, providing
+ qualitative and quantitative evidence that demonstrates GPT-4's capability to
+ act with defined personalities and generate dialogue. However, some flaws
+ remain: GPT-4 can be overly positive, and more subtle personalities, such as
+ maturity, tend to be rendered with lower quality than more overt traits like
+ timidity. This study aims to assist developers in crafting more nuanced
+ filler dialogues, thereby enriching player immersion and enhancing the
+ overall RPG experience.
+
+
+ comment: ACL Wordplay 2024 +
+
+
+
+
+ + ☆ Gender, Race, and Intersectional Bias in Resume Screening via Language + Model Retrieval AAAI + + +
+ Artificial intelligence (AI) hiring tools have revolutionized resume +screening, and large language models (LLMs) have the potential to do the same. +However, given the biases which are embedded within LLMs, it is unclear whether +they can be used in this scenario without disadvantaging groups based on their +protected attributes. In this work, we investigate the possibilities of using +LLMs in a resume screening setting via a document retrieval framework that +simulates job candidate selection. Using that framework, we then perform a +resume audit study to determine whether a selection of Massive Text Embedding +(MTE) models are biased in resume screening scenarios. We simulate this for +nine occupations, using a collection of over 500 publicly available resumes and +500 job descriptions. We find that the MTEs are biased, significantly favoring +White-associated names in 85.1\% of cases and female-associated names in only +11.1\% of cases, with a minority of cases showing no statistically significant +differences. Further analyses show that Black males are disadvantaged in up to +100\% of cases, replicating real-world patterns of bias in employment settings, +and validate three hypotheses of intersectionality. We also find an impact of +document length as well as the corpus frequency of names in the selection of +resumes. These findings have implications for widely used AI tools that are +automating employment, fairness, and tech policy. + +
+
+ comment: To be published in Proceedings of the 2024 AAAI/ACM Conference on AI, + Ethics, and Society; code available at + https://github.com/kyrawilson/Resume-Screening-Bias +
+
+
+
+
+ + ☆ BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger + Visual Cues ECCV 2024 + + +
+ Effectively aligning with human judgment when evaluating machine-generated +image captions represents a complex yet intriguing challenge. Existing +evaluation metrics like CIDEr or CLIP-Score fall short in this regard as they +do not take into account the corresponding image or lack the capability of +encoding fine-grained details and penalizing hallucinations. To overcome these +issues, in this paper, we propose BRIDGE, a new learnable and reference-free +image captioning metric that employs a novel module to map visual features into +dense vectors and integrates them into multi-modal pseudo-captions which are +built during the evaluation process. This approach results in a multimodal +metric that properly incorporates information from the input image without +relying on reference captions, bridging the gap between human judgment and +machine-generated image captions. Experiments spanning several datasets +demonstrate that our proposal achieves state-of-the-art results compared to +existing reference-free evaluation scores. Our source code and trained models +are publicly available at: https://github.com/aimagelab/bridge-score. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Physics of Language Models: Part 2.1, Grade-School Math and the Hidden + Reasoning Process ICML 2024 + + +
+ Recent advances in language models have demonstrated their capability to +solve mathematical reasoning problems, achieving near-perfect accuracy on +grade-school level math benchmarks like GSM8K. In this paper, we formally study +how language models solve these problems. We design a series of controlled +experiments to address several fundamental questions: (1) Can language models +truly develop reasoning skills, or do they simply memorize templates? (2) What +is the model's hidden (mental) reasoning process? (3) Do models solve math +questions using skills similar to or different from humans? (4) Do models +trained on GSM8K-like datasets develop reasoning skills beyond those necessary +for solving GSM8K problems? (5) What mental process causes models to make +reasoning mistakes? (6) How large or deep must a model be to effectively solve +GSM8K-level math questions? + Our study uncovers many hidden mechanisms by which language models solve +mathematical questions, providing insights that extend beyond current +understandings of LLMs. + +
+
+ comment: video appeared in ICML 2024 tutorial +
+
+
+
+
+ + ☆ Genetic Instruct: Scaling up Synthetic Generation of Coding Instructions + for Large Language Models + + +
+ Large Language Models (LLMs) rely on instruction samples for alignment, but +creating these datasets poses challenges, particularly in expert-dependent +tasks like coding, which can be cost-prohibitive. One approach to mitigate +these challenges is synthesizing data using another LLM. In this paper, we +introduce a scalable method for generating synthetic instructions to enhance +the code generation capability of LLMs. The proposed algorithm, +Genetic-Instruct, mimics evolutionary processes, utilizing self-instruction to +create numerous synthetic samples from a limited number of seeds. +Genetic-Instruct is designed for efficient scaling of the generation process. +Fine-tuning multiple coding LLMs with the synthetic samples demonstrates a +significant improvement in their code generation accuracy compared to the +baselines. + +
+
+
+
+
+ + ☆ Apple Intelligence Foundation Language Models + + +
+ We present foundation language models developed to power Apple Intelligence +features, including a ~3 billion parameter model designed to run efficiently on +devices and a large server-based language model designed for Private Cloud +Compute. These models are designed to perform a wide range of tasks +efficiently, accurately, and responsibly. This report describes the model +architecture, the data used to train the model, the training process, how the +models are optimized for inference, and the evaluation results. We highlight +our focus on Responsible AI and how the principles are applied throughout the +model development. + +
+
+
+
+
+ + ♻ ☆ Matryoshka Multimodal Models + + +
+ Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in
+visual-linguistic reasoning. These models first embed images into a fixed large
+number of visual tokens and then feed them into a Large Language Model (LLM).
+However, this design causes an excessive number of tokens for dense visual
+scenarios such as high-resolution images and videos, leading to great
+inefficiency. While token pruning/merging methods do exist, they produce a
+single-length output for each image and do not afford flexibility in trading
+off information density vs. efficiency. Inspired by the concept of Matryoshka
+Dolls, we propose M3: Matryoshka Multimodal Models, which learns to represent
+visual content as nested sets of visual tokens that capture information across
+multiple coarse-to-fine granularities. Our approach offers several unique
+benefits for LMMs: (1) One can explicitly control the visual granularity per
+test instance during inference, e.g., adjusting the number of tokens used to
+represent an image based on the anticipated complexity or simplicity of the
+content; (2) M3 provides a framework for analyzing the granularity needed for
+existing datasets, where we find that COCO-style benchmarks only need around 9
+visual tokens to obtain accuracy similar to that of using all 576 tokens; (3)
+Our approach provides a foundation to explore the best trade-off between
+performance and visual token length at the sample level, where our
+investigation reveals that a large gap exists between the oracle upper bound
+and current fixed-scale representations.
+
+
+ comment: Project Page: https://matryoshka-mm.github.io/ +
+
+
+
+
+ + ♻ ☆ Harnessing the Power of Artificial Intelligence to Vitalize Endangered + Indigenous Languages: Technologies and Experiences + + +
+ Since 2022 we have been exploring application areas and technologies in which
+Artificial Intelligence (AI) and modern Natural Language Processing (NLP), such
+as Large Language Models (LLMs), can be employed to foster the usage and
+facilitate the documentation of Indigenous languages that are in danger of
+disappearing. We start by discussing the decreasing diversity of languages in
+the world and how working with Indigenous languages poses unique ethical
+challenges for AI and NLP. To address those challenges, we propose an
+alternative AI development cycle based on community engagement and usage. Then,
+we report encouraging results in the development of high-quality machine
+learning translators for Indigenous languages by fine-tuning state-of-the-art
+(SOTA) translators with tiny amounts of data and discuss how to avoid some
+common pitfalls in the process. We also present prototypes we have built in
+projects done in 2023 and 2024 with Indigenous communities in Brazil, aimed at
+facilitating writing, and discuss the development of Indigenous Language Models
+(ILMs) as a replicable and scalable way to create spell-checkers, next-word
+predictors, and similar tools. Finally, we discuss how we envision a future for
+language documentation where dying languages are preserved as interactive
+language models.
+
+
+
+
+
+ + ♻ ☆ Prompt Leakage effect and defense strategies for multi-turn LLM + interactions + + +
+ Prompt leakage poses a compelling security and privacy threat in LLM
+applications. Leakage of system prompts may compromise intellectual property
+and act as adversarial reconnaissance for an attacker. A systematic evaluation
+of prompt leakage threats and mitigation strategies is lacking, especially for
+multi-turn LLM interactions. In this paper, we systematically investigate LLM
+vulnerabilities against prompt leakage for 10 closed- and open-source LLMs,
+across four domains. We design a unique threat model which leverages the LLM
+sycophancy effect and elevates the average attack success rate (ASR) from 17.7%
+to 86.2% in a multi-turn setting. Our standardized setup further allows
+dissecting leakage of specific prompt contents such as task instructions and
+knowledge documents. We measure the mitigation effect of 7 black-box defense
+strategies, along with finetuning an open-source model to defend against
+leakage attempts. We present different combinations of defenses against our
+threat model, including a cost analysis. Our study highlights key takeaways for
+building secure LLM applications and provides directions for research in
+multi-turn LLM interactions.
+
+
+
+
+
+ + ♻ ☆ Large Language Models as Carriers of Hidden Messages + + +
+ With the help of simple fine-tuning, one can artificially embed hidden text +into large language models (LLMs). This text is revealed only when triggered by +a specific query to the LLM. Two primary applications are LLM fingerprinting +and steganography. In the context of LLM fingerprinting, a unique text +identifier (fingerprint) is embedded within the model to verify licensing +compliance. In the context of steganography, the LLM serves as a carrier for +hidden messages that can be disclosed through a chosen trigger question. + Our work demonstrates that embedding hidden text in the LLM via fine-tuning, +though seemingly secure due to the vast number of potential triggers (any +sequence of characters or tokens could serve as a trigger), is susceptible to +extraction through analysis of the LLM's output decoding process. We propose an +extraction attack called Unconditional Token Forcing (UTF). It is premised on +the hypothesis that iteratively feeding each token from the LLM's vocabulary +into the model should reveal output sequences with abnormally high token +probabilities, indicating potential hidden text candidates. We also present a +defense method to hide text in such a way that it is resistant to both UTF and +attacks based on sampling decoding methods, which we named Unconditional Token +Forcing Confusion (UTFC). To the best of our knowledge, there is no attack +method that can extract text hidden with UTFC. UTFC has both benign +applications (improving LLM fingerprinting) and malign applications (using LLMs +to create covert communication channels). Code is available at +github.com/j-hoscilowic/zurek-stegano + +
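+ As a rough illustration of the extraction idea above, the sketch below iterates over a
+model's vocabulary, greedily decodes a short continuation from each single-token prompt,
+and flags continuations with abnormally high token probabilities. The stand-in model,
+threshold, and loop structure are illustrative assumptions, not the paper's implementation.
+
+```python
+# Hypothetical Unconditional-Token-Forcing-style probe (assumes a Hugging Face causal LM).
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL = "gpt2"        # stand-in model; the paper targets fine-tuned LLMs
+HIGH_PROB = 0.95      # flag continuations this confident as suspicious (illustrative)
+MAX_NEW = 8           # number of tokens to force-decode per probe
+
+tok = AutoTokenizer.from_pretrained(MODEL)
+model = AutoModelForCausalLM.from_pretrained(MODEL).eval()
+
+candidates = []
+with torch.no_grad():
+    for token_id in range(len(tok)):                 # iterate the whole vocabulary
+        ids = torch.tensor([[token_id]])
+        step_probs = []
+        for _ in range(MAX_NEW):                     # greedy continuation from one token
+            logits = model(ids).logits[0, -1]
+            p = torch.softmax(logits, dim=-1)
+            nxt = int(torch.argmax(p))
+            step_probs.append(float(p[nxt]))
+            ids = torch.cat([ids, torch.tensor([[nxt]])], dim=1)
+        if min(step_probs) > HIGH_PROB:              # abnormally confident => hidden-text candidate
+            candidates.append(tok.decode(ids[0]))
+```
+
+In practice such a probe would be batched and restricted to a subset of the vocabulary;
+a full-vocabulary loop like this is slow but conveys the idea.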
+
+ comment: Work in progress. Code is available at + https://github.com/j-hoscilowic/zurek-stegano +
+
+
+
+
+ + ♻ ☆ Publicly Shareable Clinical Large Language Model Built on Synthetic + Clinical Notes ACL 2024 + + +
+ The development of large language models tailored for handling patients' +clinical notes is often hindered by the limited accessibility and usability of +these notes due to strict privacy regulations. To address these challenges, we +first create synthetic large-scale clinical notes using publicly available case +reports extracted from biomedical literature. We then use these synthetic notes +to train our specialized clinical large language model, Asclepius. While +Asclepius is trained on synthetic data, we assess its potential performance in +real-world applications by evaluating it using real clinical notes. We +benchmark Asclepius against several other large language models, including +GPT-3.5-turbo and other open-source alternatives. To further validate our +approach using synthetic notes, we also compare Asclepius with its variants +trained on real clinical notes. Our findings convincingly demonstrate that +synthetic clinical notes can serve as viable substitutes for real ones when +constructing high-performing clinical language models. This conclusion is +supported by detailed evaluations conducted by both GPT-4 and medical +professionals. All resources including weights, codes, and data used in the +development of Asclepius are made publicly accessible for future research. +(https://github.com/starmpcc/Asclepius) + +
+
+ comment: ACL 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Agent-OM: Leveraging LLM Agents for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM agents have +revolutionised data engineering and have been applied creatively in many +domains, their potential for OM remains underexplored. This study introduces a +novel agent-powered LLM-based design paradigm for OM systems. With +consideration of several specific challenges in leveraging LLM agents for OM, +we propose a generic framework, namely Agent-OM (w.r.t. Agent for Ontology +Matching), consisting of two Siamese agents for retrieval and matching, with a +set of simple OM tools. Our framework is implemented in a proof-of-concept +system. Evaluations of three Ontology Alignment Evaluation Initiative (OAEI) +tracks over state-of-the-art OM systems show that our system can achieve +results very close to the long-standing best performance on simple OM tasks and +can significantly improve the performance on complex and few-shot OM tasks. + +
+
+ comment: 19 pages, 13 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ MICL: Improving In-Context Learning through Multiple-Label Words in + Demonstration + + +
+ In-context learning (ICL) enables large language models (LLMs) to perform new
+tasks by using sample-label pairs as demonstrations. However, variations in
+demonstrations can lead to significantly different performances. Current
+research mainly focuses on selecting demonstration samples, assuming the
+class name to be the label word when creating sample-label pairs. However, the
+choice of label words is crucial for ICL performance. In addition, we observe
+that using a single class name in demonstrations may not yield optimal results.
+In this paper, we propose to use multiple label words in one sample-label pair
+to enhance ICL performance. Further, we select and order sample-label pairs
+based on the LLM's output distribution, aiming to optimize the demonstration
+examples from both the samples' and labels' perspectives. Evaluation results on
+seven classification datasets show that the use of multiple label words,
+strategically organized by their selection, order, and quantity, improves ICL
+performance through diverse label information.
+
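+ As a toy, hypothetical illustration of the multiple-label-word idea, the snippet below
+builds demonstrations that pair each sample with several label words for its class instead
+of the single class name. The task, label words, and prompt format are invented for
+illustration and are not the paper's exact setup.
+
+```python
+# Toy sentiment prompt where each demonstration carries multiple label words per class.
+LABEL_WORDS = {
+    "positive": ["positive", "great", "good"],
+    "negative": ["negative", "bad", "poor"],
+}
+
+def build_demo(sample: str, label: str) -> str:
+    # one demonstration: the sample followed by all label words for its class
+    return f"Review: {sample}\nSentiment: {', '.join(LABEL_WORDS[label])}\n"
+
+prompt = (
+    build_demo("The film was a delight.", "positive")
+    + build_demo("A tedious, joyless slog.", "negative")
+    + "Review: An uneven but charming debut.\nSentiment:"   # query to be completed by the LLM
+)
+print(prompt)
+```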
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ AutoManual: Generating Instruction Manuals by LLM Agents via Interactive + Environmental Learning + + +
+ Large Language Models (LLM) based agents have shown promise in autonomously +completing tasks across various domains, e.g., robotics, games, and web +navigation. However, these agents typically require elaborate design and expert +prompts to solve tasks in specific domains, which limits their adaptability. We +introduce AutoManual, a framework enabling LLM agents to autonomously build +their understanding through interaction and adapt to new environments. +AutoManual categorizes environmental knowledge into diverse rules and optimizes +them in an online fashion by two agents: 1) The Planner codes actionable plans +based on current rules for interacting with the environment. 2) The Builder +updates the rules through a well-structured rule system that facilitates online +rule management and essential detail retention. To mitigate hallucinations in +managing rules, we introduce a case-conditioned prompting strategy for the +Builder. Finally, the Formulator agent compiles these rules into a +comprehensive manual. The self-generated manual can not only improve the +adaptability but also guide the planning of smaller LLMs while being +human-readable. Given only one simple demonstration, AutoManual significantly +improves task success rates, achieving 97.4\% with GPT-4-turbo and 86.2\% with +GPT-3.5-turbo on ALFWorld benchmark tasks. The code is available at +https://github.com/minghchen/automanual. + +
+
+
+
+
+ + ♻ ☆ HealMe: Harnessing Cognitive Reframing in Large Language Models for + Psychotherapy + + +
+ Large Language Models (LLMs) can play a vital role in psychotherapy by +adeptly handling the crucial task of cognitive reframing and overcoming +challenges such as shame, distrust, therapist skill variability, and resource +scarcity. Previous LLMs in cognitive reframing mainly converted negative +emotions to positive ones, but these approaches have limited efficacy, often +not promoting clients' self-discovery of alternative perspectives. In this +paper, we unveil the Helping and Empowering through Adaptive Language in Mental +Enhancement (HealMe) model. This novel cognitive reframing therapy method +effectively addresses deep-rooted negative thoughts and fosters rational, +balanced perspectives. Diverging from traditional LLM methods, HealMe employs +empathetic dialogue based on psychotherapeutic frameworks. It systematically +guides clients through distinguishing circumstances from feelings, +brainstorming alternative viewpoints, and developing empathetic, actionable +suggestions. Moreover, we adopt the first comprehensive and expertly crafted +psychological evaluation metrics, specifically designed to rigorously assess +the performance of cognitive reframing, in both AI-simulated dialogues and +real-world therapeutic conversations. Experimental results show that our model +outperforms others in terms of empathy, guidance, and logical coherence, +demonstrating its effectiveness and potential positive impact on psychotherapy. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ MedExpQA: Multilingual Benchmarking of Large Language Models for Medical + Question Answering + + +
+ Large Language Models (LLMs) have the potential of facilitating the +development of Artificial Intelligence technology to assist medical experts for +interactive decision support, which has been demonstrated by their competitive +performances in Medical QA. However, while impressive, the required quality bar +for medical applications remains far from being achieved. Currently, LLMs +remain challenged by outdated knowledge and by their tendency to generate +hallucinated content. Furthermore, most benchmarks to assess medical knowledge +lack reference gold explanations which means that it is not possible to +evaluate the reasoning of LLMs predictions. Finally, the situation is +particularly grim if we consider benchmarking LLMs for languages other than +English which remains, as far as we know, a totally neglected topic. In order +to address these shortcomings, in this paper we present MedExpQA, the first +multilingual benchmark based on medical exams to evaluate LLMs in Medical +Question Answering. To the best of our knowledge, MedExpQA includes for the +first time reference gold explanations written by medical doctors which can be +leveraged to establish various gold-based upper-bounds for comparison with LLMs +performance. Comprehensive multilingual experimentation using both the gold +reference explanations and Retrieval Augmented Generation (RAG) approaches show +that performance of LLMs still has large room for improvement, especially for +languages other than English. Furthermore, and despite using state-of-the-art +RAG methods, our results also demonstrate the difficulty of obtaining and +integrating readily available medical knowledge that may positively impact +results on downstream evaluations for Medical Question Answering. So far the +benchmark is available in four languages, but we hope that this work may +encourage further development to other languages. + +
+
+
+
+
+ + ♻ ☆ The Power of Combining Data and Knowledge: GPT-4o is an Effective + Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of + Lung Cancer + + +
+ Lymph node metastasis (LNM) is a crucial factor in determining the initial +treatment for patients with lung cancer, yet accurate preoperative diagnosis of +LNM remains challenging. Recently, large language models (LLMs) have garnered +significant attention due to their remarkable text generation capabilities. +Leveraging the extensive medical knowledge learned from vast corpora, LLMs can +estimate probabilities for clinical problems, though their performance has +historically been inferior to data-driven machine learning models. In this +paper, we propose a novel ensemble method that combines the medical knowledge +acquired by LLMs with the latent patterns identified by machine learning models +to enhance LNM prediction performance. Initially, we developed machine learning +models using patient data. We then designed a prompt template to integrate the +patient data with the predicted probability from the machine learning model. +Subsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI, +to estimate the likelihood of LNM based on patient data and then adjust the +estimate using the machine learning output. Finally, we collected three outputs +from the GPT-4o using the same prompt and ensembled these results as the final +prediction. Using the proposed method, our models achieved an AUC value of +0.765 and an AP value of 0.415 for LNM prediction, significantly improving +predictive performance compared to baseline machine learning models. The +experimental results indicate that GPT-4o can effectively leverage its medical +knowledge and the probabilities predicted by machine learning models to achieve +more accurate LNM predictions. These findings demonstrate that LLMs can perform +well in clinical risk prediction tasks, offering a new paradigm for integrating +medical knowledge and patient data in clinical predictions. + +
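+ A hedged sketch of the knowledge-plus-data ensembling recipe described above: the machine
+learning model's predicted probability is embedded in a prompt, the LLM is queried three
+times, and the numeric answers are averaged. The prompt wording and the `ask_llm` helper
+are placeholders, not the authors' exact prompt template or API calls.
+
+```python
+# Illustrative ensemble of an ML probability with repeated LLM estimates.
+from statistics import mean
+
+PROMPT_TEMPLATE = (
+    "Patient features: {features}\n"
+    "A machine-learning model estimates the probability of lymph node metastasis "
+    "as {ml_prob:.2f}.\n"
+    "Using your medical knowledge, adjust this estimate and answer with a single "
+    "probability between 0 and 1."
+)
+
+def ask_llm(prompt: str) -> float:
+    """Placeholder for a GPT-4o call; replace with a real chat-completion request."""
+    raise NotImplementedError
+
+def predict_lnm(features: dict, ml_prob: float, n_samples: int = 3) -> float:
+    prompt = PROMPT_TEMPLATE.format(features=features, ml_prob=ml_prob)
+    # ensemble: average the numeric answers from several LLM queries
+    return mean(ask_llm(prompt) for _ in range(n_samples))
+```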
+
+
+
+
+ + ♻ ☆ Technical Report on the Pangram AI-Generated Text Classifier + + +
+ We present Pangram Text, a transformer-based neural network trained to +distinguish text written by large language models from text written by humans. +Pangram Text outperforms zero-shot methods such as DetectGPT as well as leading +commercial AI detection tools with over 38 times lower error rates on a +comprehensive benchmark comprised of 10 text domains (student writing, creative +writing, scientific writing, books, encyclopedias, news, email, scientific +papers, short-form Q&A) and 8 open- and closed-source large language models. We +propose a training algorithm, hard negative mining with synthetic mirrors, that +enables our classifier to achieve orders of magnitude lower false positive +rates on high-data domains such as reviews. Finally, we show that Pangram Text +is not biased against nonnative English speakers and generalizes to domains and +models unseen during training. + +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Structure as Prompt: Improving Small Language Models + Capabilities for Knowledge-based Causal Discovery ISWC'24 + + +
+ Causal discovery aims to estimate causal structures among variables based on +observational data. Large Language Models (LLMs) offer a fresh perspective to +tackle the causal discovery problem by reasoning on the metadata associated +with variables rather than their actual data values, an approach referred to as +knowledge-based causal discovery. In this paper, we investigate the +capabilities of Small Language Models (SLMs, defined as LLMs with fewer than 1 +billion parameters) with prompt-based learning for knowledge-based causal +discovery. Specifically, we present KG Structure as Prompt, a novel approach +for integrating structural information from a knowledge graph, such as common +neighbor nodes and metapaths, into prompt-based learning to enhance the +capabilities of SLMs. Experimental results on three types of biomedical and +open-domain datasets under few-shot settings demonstrate the effectiveness of +our approach, surpassing most baselines and even conventional fine-tuning +approaches trained on full datasets. Our findings further highlight the strong +capabilities of SLMs: in combination with knowledge graphs and prompt-based +learning, SLMs demonstrate the potential to surpass LLMs with larger number of +parameters. Our code and datasets are available on GitHub. + +
+
+ comment: accepted at ISWC'24 +
+
+
+
+
+ + ♻ ☆ MVMR: A New Framework for Evaluating Faithfulness of Video Moment + Retrieval against Multiple Distractors CIKM 2024 + + +
+ With the explosion of multimedia content, video moment retrieval (VMR), which +aims to detect a video moment that matches a given text query from a video, has +been studied intensively as a critical problem. However, the existing VMR +framework evaluates video moment retrieval performance, assuming that a video +is given, which may not reveal whether the models exhibit overconfidence in the +falsely given video. In this paper, we propose the MVMR (Massive Videos Moment +Retrieval for Faithfulness Evaluation) task that aims to retrieve video moments +within a massive video set, including multiple distractors, to evaluate the +faithfulness of VMR models. For this task, we suggest an automated massive +video pool construction framework to categorize negative (distractors) and +positive (false-negative) video sets using textual and visual semantic distance +verification methods. We extend existing VMR datasets using these methods and +newly construct three practical MVMR datasets. To solve the task, we further +propose a strong informative sample-weighted learning method, CroCs, which +employs two contrastive learning mechanisms: (1) weakly-supervised potential +negative learning and (2) cross-directional hard-negative learning. +Experimental results on the MVMR datasets reveal that existing VMR models are +easily distracted by the misinformation (distractors), whereas our model shows +significantly robust performance, demonstrating that CroCs is essential to +distinguishing positive moments against distractors. Our code and datasets are +publicly available: https://github.com/yny0506/Massive-Videos-Moment-Retrieval. + +
+
+ comment: accepted to CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Predictive Pipelined Decoding: A Compute-Latency Trade-off for Exact LLM + Decoding ICML 2023 + + +
+ This paper presents "Predictive Pipelined Decoding (PPD)," an approach that +speeds up greedy decoding in Large Language Models (LLMs) while maintaining the +exact same output as the original decoding. Unlike conventional strategies, PPD +employs additional compute resources to parallelize the initiation of +subsequent token decoding during the current token decoding. This method +reduces decoding latency and reshapes the understanding of trade-offs in LLM +decoding strategies. We have developed a theoretical framework that allows us +to analyze the trade-off between computation and latency. Using this framework, +we can analytically estimate the potential reduction in latency associated with +our proposed method, achieved through the assessment of the match rate, +represented as p_correct. The results demonstrate that the use of extra +computational resources has the potential to accelerate LLM decoding. +Additionally, we implement PPD and conduct preliminary experiments to +empirically validate its efficacy, addressing potential practical overheads not +covered by theoretical analysis. + +
+
+ comment: ES-FoMo Workshop at ICML 2023 / Published in TMLR +
+
+
+
+
+ + ♻ ☆ InstructIE: A Bilingual Instruction-based Information Extraction Dataset ISWC 2024 + + +
+ Large language models can perform well on general natural language tasks, but +their effectiveness is still suboptimal for information extraction (IE). Recent +works indicate that the main reason lies in the lack of extensive data on IE +instructions. Note that the existing datasets on IE instructions not only have +limited coverage but also involve high construction costs. To address this +issue, we introduce InstructIE, a bilingual instruction-based IE dataset, which +covers 12 diverse domains. We propose KG2Instruction, a framework specifically +for the automatic generation of such datasets. Additionally, we manually +annotate the test set. Experimental results demonstrate that large language +models trained with InstructIE can not only obtain better IE capabilities but +also enhance zero-shot performance compared with baselines. + +
+
+ comment: ISWC 2024; project homepage: + https://www.zjukg.org/project/InstructIE/ dataset: + https://huggingface.co/datasets/zjunlp/InstructIE +
+
+
+
+
+ + ♻ ☆ A Role-specific Guided Large Language Model for Ophthalmic Consultation + Based on Stylistic Differentiation + + +
+ Ophthalmology consultations are crucial for diagnosing, treating, and
+preventing eye diseases. However, the growing demand for consultations exceeds
+the availability of ophthalmologists. By leveraging large pre-trained language
+models, we can design effective dialogues for specific scenarios, aiding in
+consultations. Traditional fine-tuning strategies for question-answering tasks
+are impractical due to increasing model sizes and because they often ignore
+patient-doctor role functions during consultations. In this paper, we propose
+EyeDoctor, an ophthalmic medical questioning large language model that enhances
+accuracy through doctor-patient role perception guidance and an augmented
+knowledge base with external disease information. Experimental results show
+EyeDoctor achieves higher question-answering precision in ophthalmology
+consultations. Notably, EyeDoctor demonstrated a 7.25% improvement in Rouge-1
+scores and a 10.16% improvement in F1 scores on multi-round datasets compared
+to the second-best model, ChatGPT, highlighting the importance of
+doctor-patient role differentiation and dynamic knowledge base expansion for
+intelligent medical consultations. EyeDoctor is also offered as a freely
+available web-based service, and the source code is available at
+https://github.com/sperfu/EyeDoc.
+
+
+
+
+
+ + ♻ ☆ PersonaGym: Evaluating Persona Agents and LLMs + + +
+ Persona agents, which are LLM agents that act according to an assigned
+persona, have demonstrated impressive contextual response capabilities across
+various applications. These persona agents offer significant enhancements
+across diverse sectors, such as education, healthcare, and entertainment, where
+model developers can align agent responses to different user requirements,
+thereby broadening the scope of agent applications. However, evaluating persona
+agent performance is incredibly challenging due to the complexity of assessing
+persona adherence in free-form interactions across various environments that
+are relevant to each persona agent. We introduce PersonaGym, the first dynamic
+evaluation framework for assessing persona agents, and PersonaScore, the first
+automated human-aligned metric grounded in decision theory for comprehensive
+large-scale evaluation of persona agents. Our evaluation of 6 open and
+closed-source LLMs, using a benchmark encompassing 200 personas and 10,000
+questions, reveals significant opportunities for advancement in persona agent
+capabilities across state-of-the-art models. For example, Claude 3.5 Sonnet
+shows only a 2.97% relative improvement in PersonaScore over GPT-3.5 despite
+being a much more advanced model. Importantly, we find that increased model
+size and complexity do not necessarily imply enhanced persona agent
+capabilities, thereby highlighting the pressing need for algorithmic and
+architectural innovation towards faithful and performant persona agents.
+
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ How to Engage Your Readers? Generating Guiding Questions to Promote + Active Reading ACL 2024 + + +
+ Using questions in written text is an effective strategy to enhance +readability. However, what makes an active reading question good, what the +linguistic role of these questions is, and what is their impact on human +reading remains understudied. We introduce GuidingQ, a dataset of 10K in-text +questions from textbooks and scientific articles. By analyzing the dataset, we +present a comprehensive understanding of the use, distribution, and linguistic +characteristics of these questions. Then, we explore various approaches to +generate such questions using language models. Our results highlight the +importance of capturing inter-question relationships and the challenge of +question position identification in generating these questions. Finally, we +conduct a human study to understand the implication of such questions on +reading comprehension. We find that the generated questions are of high quality +and are almost as effective as human-written questions in terms of improving +readers' memorization and comprehension. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Direct Preference Optimization: Your Language Model is Secretly a Reward + Model + + +
+ While large-scale unsupervised language models (LMs) learn broad world +knowledge and some reasoning skills, achieving precise control of their +behavior is difficult due to the completely unsupervised nature of their +training. Existing methods for gaining such steerability collect human labels +of the relative quality of model generations and fine-tune the unsupervised LM +to align with these preferences, often with reinforcement learning from human +feedback (RLHF). However, RLHF is a complex and often unstable procedure, first +fitting a reward model that reflects the human preferences, and then +fine-tuning the large unsupervised LM using reinforcement learning to maximize +this estimated reward without drifting too far from the original model. In this +paper we introduce a new parameterization of the reward model in RLHF that +enables extraction of the corresponding optimal policy in closed form, allowing +us to solve the standard RLHF problem with only a simple classification loss. +The resulting algorithm, which we call Direct Preference Optimization (DPO), is +stable, performant, and computationally lightweight, eliminating the need for +sampling from the LM during fine-tuning or performing significant +hyperparameter tuning. Our experiments show that DPO can fine-tune LMs to align +with human preferences as well as or better than existing methods. Notably, +fine-tuning with DPO exceeds PPO-based RLHF in ability to control sentiment of +generations, and matches or improves response quality in summarization and +single-turn dialogue while being substantially simpler to implement and train. + +
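+ For reference, the DPO objective reduces to a binary-classification-style loss over
+preference pairs. The minimal sketch below assumes per-sequence log-probabilities under
+the policy and the frozen reference model have already been computed; it is an
+illustration of the loss, not the authors' full training code.
+
+```python
+# Minimal DPO loss: -log sigmoid of the implicit-reward margin between chosen and rejected responses.
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(policy_chosen_logps, policy_rejected_logps,
+             ref_chosen_logps, ref_rejected_logps, beta: float = 0.1):
+    # implicit rewards are beta-scaled log-ratios between policy and reference
+    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
+    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
+    # simple classification loss on the reward margin (lower is better)
+    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
+
+# toy usage with random per-sequence log-probabilities for a batch of 4 pairs
+loss = dpo_loss(torch.randn(4), torch.randn(4), torch.randn(4), torch.randn(4))
+print(float(loss))
+```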
+
+
+
+
+ + ♻ ☆ Auto-Regressive Next-Token Predictors are Universal Learners + + +
+ Large language models display remarkable capabilities in logical and +mathematical reasoning, allowing them to solve complex tasks. Interestingly, +these abilities emerge in networks trained on the simple task of next-token +prediction. In this work, we present a theoretical framework for studying +auto-regressive next-token predictors. We demonstrate that even simple models +such as linear next-token predictors, trained on Chain-of-Thought (CoT) data, +can approximate any function efficiently computed by a Turing machine. We +introduce a new complexity measure -- length complexity -- which measures the +number of intermediate tokens in a CoT sequence required to approximate some +target function, and analyze the interplay between length complexity and other +notions of complexity. Finally, we show experimentally that simple next-token +predictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs), +display non-trivial performance on text generation and arithmetic tasks. Our +results demonstrate that the power of today's LLMs can be attributed, to a +great extent, to the auto-regressive next-token training scheme, and not +necessarily to a particular choice of architecture. + +
+
+
+
+
+ + ♻ ☆ Auxiliary task demands mask the capabilities of smaller language models + + +
+ Developmental psychologists have argued about when cognitive capacities such +as language understanding or theory of mind emerge. These debates often hinge +on the concept of "task demands" -- the auxiliary challenges associated with +performing a particular evaluation -- that may mask the child's underlying +ability. The same issues arise when measuring the capacities of language models +(LMs): performance on a task is a function of the model's underlying knowledge, +combined with the model's ability to interpret and perform the task given its +available resources. Here, we show that for analogical reasoning, reflective +reasoning, word prediction, and grammaticality judgments, evaluation methods +with greater task demands yield lower performance than evaluations with reduced +demands. This "demand gap" is most pronounced for models with fewer parameters +and less training data. Our results illustrate that LM performance should not +be interpreted as a direct indication of intelligence (or lack thereof), but as +a reflection of capacities seen through the lens of researchers' design +choices. + +
+
+ comment: Published at the 1st Conference on Language Modeling (COLM 2024) +
+
+
+
+
+ + ♻ ☆ Chain of Code: Reasoning with a Language Model-Augmented Code Emulator ICML 2024 + + +
+ Code provides a general syntactic structure to build complex programs and
+perform precise computations when paired with a code interpreter - we
+hypothesize that language models (LMs) can leverage code-writing to improve
+Chain of Thought reasoning not only for logic and arithmetic tasks, but also
+for semantic ones (and in particular, those that are a mix of both). For
+example, consider prompting an LM to write code that counts the number of times
+it detects sarcasm in an essay: the LM may struggle to write an implementation
+for "detect_sarcasm(string)" that can be executed by the interpreter (handling
+the edge cases would be insurmountable). However, LMs may still produce a valid
+solution if they not only write code, but also selectively "emulate" the
+interpreter by generating the expected output of "detect_sarcasm(string)". In
+this work, we propose Chain of Code (CoC), a simple yet surprisingly effective
+extension that improves LM code-driven reasoning. The key idea is to encourage
+LMs to format semantic sub-tasks in a program as flexible pseudocode, such that
+the interpreter can explicitly catch undefined behaviors and hand them off to
+an LM to simulate (as an "LMulator"). Experiments demonstrate that Chain of
+Code outperforms Chain of Thought and other baselines across a variety of
+benchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of 12% over
+Chain of Thought. In a nutshell, CoC broadens the scope of reasoning questions
+that LMs can answer by "thinking in code".
+
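+ A minimal sketch of the interpreter/LM hand-off idea: each line of generated code is run
+with the real interpreter, and only the lines it cannot execute are delegated to a language
+model. The `simulate_with_lm` helper and the line-by-line loop are illustrative assumptions,
+not the paper's implementation.
+
+```python
+# Toy "LMulator": execute what Python can, simulate the rest with an LM.
+def simulate_with_lm(expression: str, state: dict):
+    """Placeholder: ask an LM for the likely value of `expression` given `state`."""
+    raise NotImplementedError
+
+def run_chain_of_code(lines, state=None):
+    state = dict(state or {})
+    for line in lines:
+        try:
+            exec(line, {}, state)          # the interpreter handles precise computation
+        except Exception:
+            # e.g. a semantic pseudo-call like: is_sarcastic = detect_sarcasm(sentence)
+            target, _, expr = line.partition("=")
+            state[target.strip()] = simulate_with_lm(expr.strip(), state)
+    return state
+
+# purely numeric lines run entirely in the interpreter:
+print(run_chain_of_code(["x = 2 + 3", "y = x * 10"]))
+```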
+
+ comment: ICML 2024 Oral; Project webpage: https://chain-of-code.github.io +
+
+
+
+
+ + ♻ ☆ Target conversation extraction: Source separation using turn-taking + dynamics + + +
+ Extracting the speech of participants in a conversation amidst interfering +speakers and noise presents a challenging problem. In this paper, we introduce +the novel task of target conversation extraction, where the goal is to extract +the audio of a target conversation based on the speaker embedding of one of its +participants. To accomplish this, we propose leveraging temporal patterns +inherent in human conversations, particularly turn-taking dynamics, which +uniquely characterize speakers engaged in conversation and distinguish them +from interfering speakers and noise. Using neural networks, we show the +feasibility of our approach on English and Mandarin conversation datasets. In +the presence of interfering speakers, our results show an 8.19 dB improvement +in signal-to-noise ratio for 2-speaker conversations and a 7.92 dB improvement +for 2-4-speaker conversations. Code, dataset available at +https://github.com/chentuochao/Target-Conversation-Extraction. + +
+
+ comment: Accepted by Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Personalized Steering of Large Language Models: Versatile Steering + Vectors Through Bi-directional Preference Optimization + + +
+ Researchers have been studying approaches to steer the behavior of Large +Language Models (LLMs) and build personalized LLMs tailored for various +applications. While fine-tuning seems to be a direct solution, it requires +substantial computational resources and may significantly affect the utility of +the original LLM. Recent endeavors have introduced more lightweight strategies, +focusing on extracting "steering vectors" to guide the model's output toward +desired behaviors by adjusting activations within specific layers of the LLM's +transformer architecture. However, such steering vectors are directly extracted +from the activations of human preference data and thus often lead to suboptimal +results and occasional failures, especially in alignment-related scenarios. +This work proposes an innovative approach that could produce more effective +steering vectors through bi-directional preference optimization. Our method is +designed to allow steering vectors to directly influence the generation +probability of contrastive human preference data pairs, thereby offering a more +precise representation of the target behavior. By carefully adjusting the +direction and magnitude of the steering vector, we enabled personalized control +over the desired behavior across a spectrum of intensities. Extensive +experimentation across various open-ended generation tasks, particularly +focusing on steering AI personas, has validated the efficacy of our approach. +Moreover, we comprehensively investigate critical alignment-concerning +scenarios, such as managing truthfulness, mitigating hallucination, and +addressing jailbreaking attacks. Remarkably, our method can still demonstrate +outstanding steering effectiveness across these scenarios. Furthermore, we +showcase the transferability of our steering vectors across different +models/LoRAs and highlight the synergistic benefits of applying multiple +vectors simultaneously. + +
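+ As a minimal sketch of how an extracted steering vector can be applied at inference time,
+the snippet below adds a fixed vector to the residual-stream output of one transformer
+layer via a forward hook. The layer path, scale, and hook mechanics are illustrative
+assumptions; the bi-directional preference optimization that learns the vector is not
+reproduced here.
+
+```python
+# Activation steering via a PyTorch forward hook on a chosen layer.
+import torch
+
+def add_steering_hook(layer: torch.nn.Module, vector: torch.Tensor, scale: float = 1.0):
+    def hook(_module, _inputs, output):
+        # many transformer blocks return a tuple whose first element is the hidden states
+        hidden = output[0] if isinstance(output, tuple) else output
+        hidden = hidden + scale * vector          # shift every token's activation
+        return (hidden, *output[1:]) if isinstance(output, tuple) else hidden
+    return layer.register_forward_hook(hook)
+
+# hypothetical usage with a Hugging Face-style decoder:
+#   handle = add_steering_hook(model.model.layers[13], steering_vector, scale=1.5)
+#   ... generate ...
+#   handle.remove()   # restore unsteered behavior
+```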
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 147 + +
+
+
+ + ☆ Specify and Edit: Overcoming Ambiguity in Text-Based Image Editing + + +
+ Text-based editing diffusion models exhibit limited performance when the
+user's input instruction is ambiguous. To solve this problem, we propose
+$\textit{Specify ANd Edit}$ (SANE), a zero-shot inference pipeline for
+diffusion-based editing systems. We use a large language model (LLM) to
+decompose the input instruction into specific instructions, i.e., well-defined
+interventions to apply to the input image to satisfy the user's request. We
+benefit from the LLM-derived instructions alongside the original one, thanks to
+a novel denoising guidance strategy specifically designed for the task. Our
+experiments with three baselines and on two datasets demonstrate the benefits
+of SANE in all setups. Moreover, our pipeline improves the interpretability of
+editing models and boosts the output diversity. We also demonstrate that our
+approach can be applied to any edit, whether ambiguous or not. Our code is
+publicly available at https://github.com/fabvio/SANE.
+
+
+
+
+
+ + ☆ SAPG: Split and Aggregate Policy Gradients ICML 2024 + + +
+ Despite extreme sample inefficiency, on-policy reinforcement learning, a.k.a.
+policy gradients, has become a fundamental tool in decision-making problems.
+With the recent advances in GPU-driven simulation, the ability to collect large
+amounts of data for RL training has scaled exponentially. However, we show that
+current RL methods, e.g., PPO, fail to reap the benefit of parallelized
+environments beyond a certain point, and their performance saturates. To
+address this, we propose a new on-policy RL algorithm that can effectively
+leverage large-scale environments by splitting them into chunks and fusing them
+back together via importance sampling. Our algorithm, termed SAPG, shows
+significantly higher performance across a variety of challenging environments
+where vanilla PPO and other strong baselines fail to achieve high performance.
+Website at https://sapg-rl.github.io/
+
+
+ comment: In ICML 2024 (Oral). Website at https://sapg-rl.github.io/ +
+
+
+
+
+ + ☆ Improving 2D Feature Representations by 3D-Aware Fine-Tuning ECCV 2024 + + +
+ Current visual foundation models are trained purely on unstructured 2D data,
+limiting their understanding of the 3D structure of objects and scenes. In this
+work, we show that fine-tuning on 3D-aware data improves the quality of
+emerging semantic features. We design a method to lift semantic 2D features
+into an efficient 3D Gaussian representation, which allows us to re-render them
+for arbitrary views. Using the rendered 3D-aware features, we design a
+fine-tuning strategy to transfer such 3D awareness into a 2D foundation model.
+We demonstrate that models fine-tuned in that way produce features that readily
+improve downstream task performance in semantic segmentation and depth
+estimation through simple linear probing. Notably, though fine-tuned on a
+single indoor dataset, the improvement is transferable to a variety of indoor
+datasets and out-of-domain datasets. We hope our study encourages the community
+to consider injecting 3D awareness when training 2D foundation models. Project
+page: https://ywyue.github.io/FiT3D.
+
+
+ comment: ECCV 2024. Project page: https://ywyue.github.io/FiT3D +
+
+
+
+
+ + ☆ FlexAttention for Efficient High-Resolution Vision-Language Models ECCV 2024 + + +
+ Current high-resolution vision-language models encode images as +high-resolution image tokens and exhaustively take all these tokens to compute +attention, which significantly increases the computational cost. To address +this problem, we propose FlexAttention, a flexible attention mechanism for +efficient high-resolution vision-language models. Specifically, a +high-resolution image is encoded both as high-resolution tokens and +low-resolution tokens, where only the low-resolution tokens and a few selected +high-resolution tokens are utilized to calculate the attention map, which +greatly shrinks the computational cost. The high-resolution tokens are selected +via a high-resolution selection module which could retrieve tokens of relevant +regions based on an input attention map. The selected high-resolution tokens +are then concatenated to the low-resolution tokens and text tokens, and input +to a hierarchical self-attention layer which produces an attention map that +could be used for the next-step high-resolution token selection. The +hierarchical self-attention process and high-resolution token selection process +are performed iteratively for each attention layer. Experiments on multimodal +benchmarks prove that our FlexAttention outperforms existing high-resolution +VLMs (e.g., relatively ~9% in V* Bench, ~7% in TextVQA), while also +significantly reducing the computational cost by nearly 40%. + +
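+ A rough sketch of the high-resolution token selection step described above: high-res tokens
+whose regions receive the most attention mass are kept and concatenated with the low-res and
+text tokens. Tensor shapes, the value of k, and the attention aggregation are illustrative
+assumptions, not the paper's exact configuration.
+
+```python
+# Top-k selection of high-resolution tokens guided by an attention map.
+import torch
+
+def select_hires_tokens(attn_to_image: torch.Tensor, hires_tokens: torch.Tensor, k: int = 64):
+    # attn_to_image: (num_hires_tokens,) aggregated attention mass per high-res region
+    topk = torch.topk(attn_to_image, k=k).indices
+    return hires_tokens[topk]                     # (k, dim)
+
+lowres = torch.randn(576, 1024)                   # low-resolution image tokens
+hires = torch.randn(2304, 1024)                   # high-resolution image tokens
+text = torch.randn(32, 1024)                      # text tokens
+attn = torch.rand(2304)                           # attention mass over high-res regions
+
+selected = select_hires_tokens(attn, hires, k=64)
+sequence = torch.cat([lowres, selected, text], dim=0)   # input to the next attention layer
+print(sequence.shape)
+```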
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Correspondence-Free SE(3) Point Cloud Registration in RKHS via + Unsupervised Equivariant Learning ECCV 2024 + + +
+ This paper introduces a robust unsupervised SE(3) point cloud registration +method that operates without requiring point correspondences. The method frames +point clouds as functions in a reproducing kernel Hilbert space (RKHS), +leveraging SE(3)-equivariant features for direct feature space registration. A +novel RKHS distance metric is proposed, offering reliable performance amidst +noise, outliers, and asymmetrical data. An unsupervised training approach is +introduced to effectively handle limited ground truth data, facilitating +adaptation to real datasets. The proposed method outperforms classical and +supervised methods in terms of registration accuracy on both synthetic +(ModelNet40) and real-world (ETH3D) noisy, outlier-rich datasets. To our best +knowledge, this marks the first instance of successful real RGB-D odometry data +registration using an equivariant method. The code is available at +{https://sites.google.com/view/eccv24-equivalign} + +
+
+ comment: 10 pages, to be published in ECCV 2024 +
+
+
+
+
+ + ☆ Global Structure-from-Motion Revisited ECCV2024 + + +
+ Recovering 3D structure and camera motion from images has been a +long-standing focus of computer vision research and is known as +Structure-from-Motion (SfM). Solutions to this problem are categorized into +incremental and global approaches. Until now, the most popular systems follow +the incremental paradigm due to its superior accuracy and robustness, while +global approaches are drastically more scalable and efficient. With this work, +we revisit the problem of global SfM and propose GLOMAP as a new +general-purpose system that outperforms the state of the art in global SfM. In +terms of accuracy and robustness, we achieve results on-par or superior to +COLMAP, the most widely used incremental SfM, while being orders of magnitude +faster. We share our system as an open-source implementation at +{https://github.com/colmap/glomap}. + +
+
+ comment: accepted at ECCV2024 +
+
+
+
+
+ + ☆ SANGRIA: Surgical Video Scene Graph Optimization for Surgical Workflow + Prediction MICCAI + + +
+ Graph-based holistic scene representations facilitate surgical workflow +understanding and have recently demonstrated significant success. However, this +task is often hindered by the limited availability of densely annotated +surgical scene data. In this work, we introduce an end-to-end framework for the +generation and optimization of surgical scene graphs on a downstream task. Our +approach leverages the flexibility of graph-based spectral clustering and the +generalization capability of foundation models to generate unsupervised scene +graphs with learnable properties. We reinforce the initial spatial graph with +sparse temporal connections using local matches between consecutive frames to +predict temporally consistent clusters across a temporal neighborhood. By +jointly optimizing the spatiotemporal relations and node features of the +dynamic scene graph with the downstream task of phase segmentation, we address +the costly and annotation-burdensome task of semantic scene comprehension and +scene graph generation in surgical videos using only weak surgical phase +labels. Further, by incorporating effective intermediate scene representation +disentanglement steps within the pipeline, our solution outperforms the SOTA on +the CATARACTS dataset by 8% accuracy and 10% F1 score in surgical workflow +recognition + +
+
+ comment: 9 pages, 3 figures, 3 tables, MICCAI GRAIL Workshop paper +
+
+
+
+
+ + ☆ Registering Neural 4D Gaussians for Endoscopic Surgery + + +
+ Recent advances in neural rendering have made it possible to reconstruct
+high-quality 4D scenes using neural networks. Although 4D neural reconstruction
+is popular, registration for such representations remains a challenging task,
+especially for dynamic scene registration in surgical planning and simulation.
+In this paper, we propose a novel strategy for dynamic surgical neural scene
+registration. We first utilize 4D Gaussian Splatting to represent the surgical
+scene and capture both static and dynamic scenes effectively. Then, a spatially
+aware feature aggregation method, Spatially Weight Cluttering (SWC), is
+proposed to accurately align features between surgical scenes, enabling precise
+and realistic surgical simulations. Lastly, we present a novel strategy of
+deformable scene registration to register two dynamic scenes. By incorporating
+both spatial and temporal information for correspondence matching, our approach
+achieves superior performance compared to existing registration methods for
+implicit neural representations. The proposed method has the potential to
+improve surgical planning and training, ultimately leading to better patient
+outcomes.
+
+
+
+
+
+ + ☆ SpaER: Learning Spatio-temporal Equivariant Representations for Fetal + Brain Motion Tracking + + +
+ In this paper, we introduce SpaER, a pioneering method for fetal motion +tracking that leverages equivariant filters and self-attention mechanisms to +effectively learn spatio-temporal representations. Different from conventional +approaches that statically estimate fetal brain motions from pairs of images, +our method dynamically tracks the rigid movement patterns of the fetal head +across temporal and spatial dimensions. Specifically, we first develop an +equivariant neural network that efficiently learns rigid motion sequences +through low-dimensional spatial representations of images. Subsequently, we +learn spatio-temporal representations by incorporating time encoding and +self-attention neural network layers. This approach allows for the capture of +long-term dependencies of fetal brain motion and addresses alignment errors due +to contrast changes and severe motion artifacts. Our model also provides a +geometric deformation estimation that properly addresses image distortions +among all time frames. To the best of our knowledge, our approach is the first +to learn spatial-temporal representations via deep neural networks for fetal +motion tracking without data augmentation. We validated our model using real +fetal echo-planar images with simulated and real motions. Our method carries +significant potential value in accurately measuring, tracking, and correcting +fetal motion in fetal MRI sequences. + +
+
+
+
+
+ + ☆ Theia: Distilling Diverse Vision Foundation Models for Robot Learning + + +
+ Vision-based robot policy learning, which maps visual inputs to actions, +necessitates a holistic understanding of diverse visual tasks beyond +single-task needs like classification or segmentation. Inspired by this, we +introduce Theia, a vision foundation model for robot learning that distills +multiple off-the-shelf vision foundation models trained on varied vision tasks. +Theia's rich visual representations encode diverse visual knowledge, enhancing +downstream robot learning. Extensive experiments demonstrate that Theia +outperforms its teacher models and prior robot learning models using less +training data and smaller model sizes. Additionally, we quantify the quality of +pre-trained visual representations and hypothesize that higher entropy in +feature norm distributions leads to improved robot learning performance. Code +and models are available at https://github.com/bdaiinstitute/theia. + +
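+ A hedged sketch of the multi-teacher feature distillation idea: a shared student encoder
+feeds one projection head per teacher, and each head is trained to regress that teacher's
+features. Feature dimensions and the choice of an MSE loss are illustrative assumptions,
+not Theia's exact recipe.
+
+```python
+# Distilling several teacher feature spaces into one student via per-teacher heads.
+import torch
+import torch.nn as nn
+
+class MultiTeacherStudent(nn.Module):
+    def __init__(self, student_dim: int = 384, teacher_dims=(768, 1024, 512)):
+        super().__init__()
+        # one linear projection head per teacher feature space
+        self.heads = nn.ModuleList([nn.Linear(student_dim, d) for d in teacher_dims])
+
+    def distill_loss(self, student_feats: torch.Tensor, teacher_feats_list):
+        # sum of per-teacher regression losses against the frozen teacher features
+        return sum(
+            nn.functional.mse_loss(head(student_feats), t)
+            for head, t in zip(self.heads, teacher_feats_list)
+        )
+
+model = MultiTeacherStudent()
+student = torch.randn(8, 384)                       # features from the shared student encoder
+teachers = [torch.randn(8, 768), torch.randn(8, 1024), torch.randn(8, 512)]
+print(float(model.distill_loss(student, teachers)))
+```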
+
+
+
+
+ + ☆ Advancing Multimodal Large Language Models in Chart Question Answering + with Visualization-Referenced Instruction Tuning + + +
+ Emerging multimodal large language models (MLLMs) exhibit great potential for +chart question answering (CQA). Recent efforts primarily focus on scaling up +training datasets (i.e., charts, data tables, and question-answer (QA) pairs) +through data collection and synthesis. However, our empirical study on existing +MLLMs and CQA datasets reveals notable gaps. First, current data collection and +synthesis focus on data volume and lack consideration of fine-grained visual +encodings and QA tasks, resulting in unbalanced data distribution divergent +from practical CQA scenarios. Second, existing work follows the training recipe +of the base MLLMs initially designed for natural images, under-exploring the +adaptation to unique chart characteristics, such as rich text elements. To fill +the gap, we propose a visualization-referenced instruction tuning approach to +guide the training dataset enhancement and model development. Specifically, we +propose a novel data engine to effectively filter diverse and high-quality data +from existing datasets and subsequently refine and augment the data using +LLM-based generation techniques to better align with practical QA tasks and +visual encodings. Then, to facilitate the adaptation to chart characteristics, +we utilize the enriched data to train an MLLM by unfreezing the vision encoder +and incorporating a mixture-of-resolution adaptation strategy for enhanced +fine-grained recognition. Experimental results validate the effectiveness of +our approach. Even with fewer training examples, our model consistently +outperforms state-of-the-art CQA models on established benchmarks. We also +contribute a dataset split as a benchmark for future research. Source codes and +datasets of this paper are available at +https://github.com/zengxingchen/ChartQA-MLLM. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ LatentArtiFusion: An Effective and Efficient Histological Artifacts + Restoration Framework MICCAI2024 + + +
+ Histological artifacts pose challenges for both pathologists and +Computer-Aided Diagnosis (CAD) systems, leading to errors in analysis. Current +approaches for histological artifact restoration, based on Generative +Adversarial Networks (GANs) and pixel-level Diffusion Models, suffer from +performance limitations and computational inefficiencies. In this paper, we +propose a novel framework, LatentArtiFusion, which leverages the latent +diffusion model (LDM) to reconstruct histological artifacts with high +performance and computational efficiency. Unlike traditional pixel-level +diffusion frameworks, LatentArtiFusion executes the restoration process in a +lower-dimensional latent space, significantly improving computational +efficiency. Moreover, we introduce a novel regional artifact reconstruction +algorithm in latent space to prevent mistransfer in non-artifact regions, +distinguishing our approach from GAN-based methods. Through extensive +experiments on real-world histology datasets, LatentArtiFusion demonstrates +remarkable speed, outperforming state-of-the-art pixel-level diffusion +frameworks by more than 30X. It also consistently surpasses GAN-based methods +by at least 5% across multiple evaluation metrics. Furthermore, we evaluate the +effectiveness of our proposed framework in downstream tissue classification +tasks, showcasing its practical utility. Code is available at +https://github.com/bugs-creator/LatentArtiFusion. + +
+
+ comment: Accept to DGM4MICCAI2024 +
+
+
+
+
+ + ☆ Diffusion Feedback Helps CLIP See Better + + +
+ Contrastive Language-Image Pre-training (CLIP), which excels at abstracting
+open-world representations across domains and modalities, has become a
+foundation for a variety of vision and multimodal tasks. However, recent
+studies reveal that CLIP has severe visual shortcomings, such as struggling to
+distinguish orientation, quantity, color, and structure. These visual
+shortcomings also limit the perception capabilities of multimodal large
+language models (MLLMs) built on CLIP. The main reason could be that the
+image-text pairs used to train CLIP are inherently biased, due to a lack of
+distinctiveness in the text and diversity in the images. In this work, we
+present a simple post-training approach for CLIP models, which largely
+overcomes its visual shortcomings via a self-supervised diffusion process. We
+introduce DIVA, which uses the DIffusion model as a Visual Assistant for CLIP.
+Specifically, DIVA leverages generative feedback from text-to-image diffusion
+models to optimize CLIP representations, with only images (without
+corresponding text). We demonstrate that DIVA improves CLIP's performance to a
+large extent (e.g., 3-7%) on the challenging MMVP-VLM benchmark, which assesses
+fine-grained visual abilities, and enhances the performance of MLLMs and
+vision models on multimodal understanding and segmentation tasks. Extensive
+evaluation on 29 image classification and retrieval benchmarks confirms that
+our framework preserves CLIP's strong zero-shot capabilities. The code will be
+available at https://github.com/baaivision/DIVA.
+
+
+
+
+
+ + ☆ DDAP: Dual-Domain Anti-Personalization against Text-to-Image Diffusion + Models + + +
+ Diffusion-based personalized visual content generation technologies have +achieved significant breakthroughs, allowing for the creation of specific +objects by just learning from a few reference photos. However, when misused to +fabricate fake news or unsettling content targeting individuals, these +technologies could cause considerable societal harm. To address this problem, +current methods generate adversarial samples by adversarially maximizing the +training loss, thereby disrupting the output of any personalized generation +model trained with these samples. However, the existing methods fail to achieve +effective defense and maintain stealthiness, as they overlook the intrinsic +properties of diffusion models. In this paper, we introduce a novel Dual-Domain +Anti-Personalization framework (DDAP). Specifically, we have developed Spatial +Perturbation Learning (SPL) by exploiting the fixed and perturbation-sensitive +nature of the image encoder in personalized generation. Subsequently, we have +designed a Frequency Perturbation Learning (FPL) method that utilizes the +characteristics of diffusion models in the frequency domain. The SPL disrupts +the overall texture of the generated images, while the FPL focuses on image +details. By alternating between these two methods, we construct the DDAP +framework, effectively harnessing the strengths of both domains. To further +enhance the visual quality of the adversarial samples, we design a localization +module to accurately capture attentive areas while ensuring the effectiveness +of the attack and avoiding unnecessary disturbances in the background. +Extensive experiments on facial benchmarks have shown that the proposed DDAP +enhances the disruption of personalized generation models while also +maintaining high quality in adversarial samples, making it more effective in +protecting privacy in practical applications. + +
+
+ comment: Accepted by IJCB 2024 +
+
+
+
+
+ + ☆ FiCo-ITR: bridging fine-grained and coarse-grained image-text retrieval + for comparative performance analysis + + +
+ In the field of Image-Text Retrieval (ITR), recent advancements have +leveraged large-scale Vision-Language Pretraining (VLP) for Fine-Grained (FG) +instance-level retrieval, achieving high accuracy at the cost of increased +computational complexity. For Coarse-Grained (CG) category-level retrieval, +prominent approaches employ Cross-Modal Hashing (CMH) to prioritise efficiency, +albeit at the cost of retrieval performance. Due to differences in +methodologies, FG and CG models are rarely compared directly within evaluations +in the literature, resulting in a lack of empirical data quantifying the +retrieval performance-efficiency tradeoffs between the two. This paper +addresses this gap by introducing the \texttt{FiCo-ITR} library, which +standardises evaluation methodologies for both FG and CG models, facilitating +direct comparisons. We conduct empirical evaluations of representative models +from both subfields, analysing precision, recall, and computational complexity +across varying data scales. Our findings offer new insights into the +performance-efficiency trade-offs between recent representative FG and CG +models, highlighting their respective strengths and limitations. These findings +provide the foundation necessary to make more informed decisions regarding +model selection for specific retrieval tasks and highlight avenues for future +research into hybrid systems that leverage the strengths of both FG and CG +approaches. + +
+
+ comment: 19 pages, submitted to International Journal of Multimedia + Information Retrieval +
+
+
+
+
+ + ☆ Classification, Regression and Segmentation directly from k-Space in + Cardiac MRI + + +
+ Cardiac Magnetic Resonance Imaging (CMR) is the gold standard for diagnosing cardiovascular diseases. Clinical diagnoses predominantly rely on magnitude-only Digital Imaging and Communications in Medicine (DICOM) images, omitting crucial phase information that might provide additional diagnostic benefits. In contrast, k-space is complex-valued and encompasses both magnitude and phase information, which humans cannot directly perceive. In this work, we propose KMAE, a Transformer-based model specifically designed to process k-space data directly, eliminating conventional intermediary conversion steps to the image domain. KMAE can handle critical cardiac disease classification, relevant phenotype regression, and cardiac morphology segmentation tasks. We utilize this model to investigate the potential of k-space-based diagnosis in cardiac MRI. Notably, this model achieves competitive classification and regression performance compared to image-domain methods such as Masked Autoencoders (MAEs), and delivers satisfactory segmentation performance with a myocardium dice score of 0.884. Last but not least, our model exhibits robust performance with consistent results even when the k-space is 8x undersampled. We encourage the MR community to explore the untapped potential of k-space and pursue end-to-end, automated diagnosis with reduced human intervention.
+
+
+
+
+ + ☆ RSC-SNN: Exploring the Trade-off Between Adversarial Robustness and + Accuracy in Spiking Neural Networks via Randomized Smoothing Coding ACM MM 2024 + + +
+ Spiking Neural Networks (SNNs) have received widespread attention due to their unique neuronal dynamics and low-power nature. Previous research empirically shows that SNNs with Poisson coding are more robust than Artificial Neural Networks (ANNs) on small-scale datasets. However, it is still theoretically unclear how the adversarial robustness of SNNs arises, and whether SNNs can maintain their adversarial robustness advantage on large-scale datasets. This work theoretically demonstrates that the inherent adversarial robustness of SNNs stems from their Poisson coding. We reveal the conceptual equivalence of Poisson coding and randomized smoothing in defense strategies, and analyze in depth the trade-off between accuracy and adversarial robustness in SNNs via the proposed Randomized Smoothing Coding (RSC) method. Experiments demonstrate that the proposed RSC-SNNs show remarkable adversarial robustness, surpassing ANNs and achieving state-of-the-art robustness results on the large-scale ImageNet dataset. Our open-source implementation is available at https://github.com/KemingWu/RSC-SNN.
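+ The stated link between Poisson rate coding and randomized smoothing can be pictured with a generic smoothed classifier: predictions are aggregated over many stochastic encodings of the same input. The sketch below is a minimal illustration of this idea only, not the authors' RSC implementation; the Bernoulli-per-timestep approximation of Poisson coding and the toy `base_model` are assumptions for demonstration.

```python
import numpy as np

def poisson_encode(x, t_steps, rng):
    """Rate-code pixel intensities in [0, 1] as spike trains.

    Independent Bernoulli draws per time step approximate Poisson rate coding;
    over many steps this acts like sampling noise around the input, which is
    the source of the smoothing effect discussed above.
    """
    return (rng.random((t_steps, *x.shape)) < x).astype(np.float32)

def smoothed_predict(base_model, x, n_samples=100, t_steps=8, seed=0):
    """Majority-vote prediction over stochastic encodings (randomized smoothing)."""
    rng = np.random.default_rng(seed)
    votes = {}
    for _ in range(n_samples):
        spikes = poisson_encode(x, t_steps, rng)
        label = base_model(spikes.mean(axis=0))  # assumed: returns a class id
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)

# Toy usage with a hypothetical threshold "classifier".
toy_model = lambda img: int(img.mean() > 0.5)
x = np.clip(np.random.rand(8, 8), 0.0, 1.0)
print(smoothed_predict(toy_model, x))
```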
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Infrared Small Target Detection based on Adjustable Sensitivity Strategy + and Multi-Scale Fusion + + +
+ Recently, deep learning-based single-frame infrared small target (SIRST) +detection technology has made significant progress. However, existing infrared +small target detection methods are often optimized for a fixed image +resolution, a single wavelength, or a specific imaging system, limiting their +breadth and flexibility in practical applications. Therefore, we propose a +refined infrared small target detection scheme based on an adjustable +sensitivity (AS) strategy and multi-scale fusion. Specifically, a multi-scale +model fusion framework based on multi-scale direction-aware network (MSDA-Net) +is constructed, which uses input images of multiple scales to train multiple +models and fuses them. Multi-scale fusion helps characterize the shape, edge, +and texture features of the target from different scales, making the model more +accurate and reliable in locating the target. At the same time, we fully +consider the characteristics of the infrared small target detection task and +construct an edge enhancement difficulty mining (EEDM) loss. The EEDM loss +helps alleviate the problem of category imbalance and guides the network to pay +more attention to difficult target areas and edge features during training. In +addition, we propose an adjustable sensitivity strategy for post-processing. +This strategy significantly improves the detection rate of infrared small +targets while ensuring segmentation accuracy. Extensive experimental results +show that the proposed scheme achieves the best performance. Notably, this +scheme won the first prize in the PRCV 2024 wide-area infrared small target +detection competition. + +
+
+
+
+
+ + ☆ Segmenting Fetal Head with Efficient Fine-tuning Strategies in + Low-resource Settings: an empirical study with U-Net + + +
+ Accurate measurement of fetal head circumference is crucial for estimating fetal growth during routine prenatal screening. Prior to measurement, it is necessary to accurately identify and segment the region of interest, specifically the fetal head, in ultrasound images. Recent advancements in deep learning techniques have shown significant progress in segmenting the fetal head using encoder-decoder models. Among these models, U-Net has become a standard approach for accurate segmentation. However, training an encoder-decoder model can be a time-consuming process that demands substantial computational resources. Moreover, fine-tuning these models is particularly challenging when there is a limited amount of data available. There are still no "best-practice" guidelines for optimal fine-tuning of U-Net for fetal ultrasound image segmentation. This work summarizes existing fine-tuning strategies, evaluating various backbone architectures and model components on ultrasound data from the Netherlands, Spain, Malawi, Egypt, and Algeria. Our study shows that (1) fine-tuning U-Net leads to better performance than training from scratch, (2) fine-tuning strategies targeting the decoder are superior to other strategies, and (3) a network architecture with fewer parameters can achieve similar or better performance. We also demonstrate the effectiveness of fine-tuning strategies in low-resource settings and further expand our experiments into few-shot learning. Lastly, we publicly release our code and specific fine-tuned weights.
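+ Finding (2), that decoder-focused fine-tuning works best, corresponds to freezing the encoder of a pre-trained U-Net and updating only decoder weights. Below is a minimal PyTorch-style sketch of that strategy, assuming a model whose decoder parameters are namespaced under `decoder`; the toy model and naming are illustrative, not the authors' code.

```python
import torch
from torch import nn

def freeze_encoder(model: nn.Module, decoder_prefix: str = "decoder"):
    """Keep only decoder parameters trainable; everything else stays frozen."""
    for name, param in model.named_parameters():
        param.requires_grad = name.startswith(decoder_prefix)
    # Return only the parameters the optimizer should update.
    return [p for p in model.parameters() if p.requires_grad]

# Illustrative toy "U-Net" with an encoder/decoder split.
model = nn.ModuleDict({
    "encoder": nn.Sequential(nn.Conv2d(1, 16, 3, padding=1), nn.ReLU()),
    "decoder": nn.Sequential(nn.Conv2d(16, 1, 3, padding=1)),
})
trainable = freeze_encoder(model)
optimizer = torch.optim.Adam(trainable, lr=1e-4)

x = torch.randn(2, 1, 64, 64)
pred = model["decoder"](model["encoder"](x))  # forward pass through both parts
loss = nn.functional.binary_cross_entropy_with_logits(pred, torch.zeros_like(pred))
loss.backward()
optimizer.step()  # only decoder weights are updated
```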
+
+ comment: 5 figures, 2 tables +
+
+
+
+
+ + ☆ UniTTA: Unified Benchmark and Versatile Framework Towards Realistic + Test-Time Adaptation + + +
+ Test-Time Adaptation (TTA) aims to adapt pre-trained models to the target +domain during testing. In reality, this adaptability can be influenced by +multiple factors. Researchers have identified various challenging scenarios and +developed diverse methods to address these challenges, such as dealing with +continual domain shifts, mixed domains, and temporally correlated or imbalanced +class distributions. Despite these efforts, a unified and comprehensive +benchmark has yet to be established. To this end, we propose a Unified +Test-Time Adaptation (UniTTA) benchmark, which is comprehensive and widely +applicable. Each scenario within the benchmark is fully described by a Markov +state transition matrix for sampling from the original dataset. The UniTTA +benchmark considers both domain and class as two independent dimensions of data +and addresses various combinations of imbalance/balance and +i.i.d./non-i.i.d./continual conditions, covering a total of \( (2 \times 3)^2 = +36 \) scenarios. It establishes a comprehensive evaluation benchmark for +realistic TTA and provides a guideline for practitioners to select the most +suitable TTA method. Alongside this benchmark, we propose a versatile UniTTA +framework, which includes a Balanced Domain Normalization (BDN) layer and a +COrrelated Feature Adaptation (COFA) method--designed to mitigate distribution +gaps in domain and class, respectively. Extensive experiments demonstrate that +our UniTTA framework excels within the UniTTA benchmark and achieves +state-of-the-art performance on average. Our code is available at +\url{https://github.com/LeapLabTHU/UniTTA}. + +
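+ To make the benchmark construction concrete: describing a scenario by a Markov state transition matrix means each test sample's (domain, class) state is drawn conditioned on the previous state, which lets one dial in i.i.d., temporally correlated, or continual streams. The snippet below is a generic sketch of such sampling, not the UniTTA code; the 2-domain, 3-class setup and the transition probabilities are illustrative assumptions.

```python
import numpy as np

def sample_stream(transition, n_steps, rng=None):
    """Sample a sequence of state indices from a Markov transition matrix.

    transition[i, j] is the probability of moving from state i to state j;
    each row must sum to 1. A near-identity matrix yields temporally
    correlated (non-i.i.d.) streams, a uniform matrix yields i.i.d. ones.
    """
    rng = rng or np.random.default_rng(0)
    n_states = transition.shape[0]
    state = rng.integers(n_states)
    stream = []
    for _ in range(n_steps):
        state = rng.choice(n_states, p=transition[state])
        stream.append(int(state))
    return stream

# Illustrative setup: 2 domains x 3 classes = 6 joint states.
n_states = 6
sticky = 0.9  # probability of staying in the same (domain, class) state
transition = np.full((n_states, n_states), (1 - sticky) / (n_states - 1))
np.fill_diagonal(transition, sticky)

stream = sample_stream(transition, n_steps=20)
print(stream)  # long runs of the same state indicate a correlated test stream
```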
+
+
+
+
+ + ☆ Background Semantics Matter: Cross-Task Feature Exchange Network for + Clustered Infrared Small Target Detection With Sky-Annotated Dataset + + +
+ Infrared small target detection poses unique challenges due to the scarcity +of intrinsic target features and the abundance of similar background +distractors. We argue that background semantics play a pivotal role in +distinguishing visually similar objects for this task. To address this, we +introduce a new task -- clustered infrared small target detection, and present +DenseSIRST, a novel benchmark dataset that provides per-pixel semantic +annotations for background regions, enabling the transition from sparse to +dense target detection. Leveraging this dataset, we propose the +Background-Aware Feature Exchange Network (BAFE-Net), which transforms the +detection paradigm from a single task focused on the foreground to a multi-task +architecture that jointly performs target detection and background semantic +segmentation. BAFE-Net introduces a cross-task feature hard-exchange mechanism +to embed target and background semantics between the two tasks. Furthermore, we +propose the Background-Aware Gaussian Copy-Paste (BAG-CP) method, which +selectively pastes small targets into sky regions during training, avoiding the +creation of false alarm targets in complex non-sky backgrounds. Extensive +experiments validate the effectiveness of BAG-CP and BAFE-Net in improving +target detection accuracy while reducing false alarms. The DenseSIRST dataset, +code, and trained models are available at https://github.com/GrokCV/BAFE-Net. + +
+
+
+
+
+ + ☆ SalNAS: Efficient Saliency-prediction Neural Architecture Search with + self-knowledge distillation + + +
+ Recent advancements in deep convolutional neural networks have significantly improved the performance of saliency prediction. However, manual configuration of neural network architectures requires domain knowledge expertise and can still be time-consuming and error-prone. To solve this, we propose a new Neural Architecture Search (NAS) framework for saliency prediction with two contributions. Firstly, a supernet for saliency prediction, termed SalNAS, is built as a weight-sharing network containing all candidate architectures, by integrating a dynamic convolution into the encoder-decoder of the supernet. Secondly, despite being highly efficient (20.98 million parameters), SalNAS can suffer from a lack of generalization. To solve this, we propose a self-knowledge distillation approach, termed Self-KD, that trains the student SalNAS with the weighted average information between the ground truth and the prediction from the teacher model. The teacher model, while sharing the same architecture, contains the best-performing weights chosen by cross-validation. Self-KD can generalize well without the need to compute the gradient in the teacher model, enabling an efficient training system. By utilizing Self-KD, SalNAS outperforms other state-of-the-art saliency prediction models in most evaluation rubrics across seven benchmark datasets while being a lightweight model. The code will be available at https://github.com/chakkritte/SalNAS.
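+ As described, Self-KD supervises the student with a weighted average of the ground truth and the (detached) teacher prediction, so no gradients are propagated through the teacher. A minimal sketch of such a loss is shown below; the choice of MSE and the weighting factor alpha are illustrative assumptions rather than the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def self_kd_loss(student_pred, teacher_pred, ground_truth, alpha=0.5):
    """Self-knowledge-distillation loss against a blended target.

    The target is a weighted average of the ground-truth saliency map and
    the teacher prediction; detaching the teacher means no gradient is
    computed through it, which keeps training cheap.
    """
    target = alpha * ground_truth + (1.0 - alpha) * teacher_pred.detach()
    return F.mse_loss(student_pred, target)

# Toy usage with random "saliency maps" in [0, 1].
student_pred = torch.rand(4, 1, 32, 32, requires_grad=True)
teacher_pred = torch.rand(4, 1, 32, 32)
ground_truth = torch.rand(4, 1, 32, 32)
loss = self_kd_loss(student_pred, teacher_pred, ground_truth)
loss.backward()  # gradients flow only into the student prediction
```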
+
+ comment: Published in Engineering Applications of Artificial Intelligence +
+
+
+
+
+ + ☆ MaskInversion: Localized Embeddings via Optimization of Explainability + Maps + + +
+ Vision-language foundation models such as CLIP have achieved tremendous results in global vision-language alignment, but still show some limitations in creating representations for specific image regions. To address this problem, we propose MaskInversion, a method that leverages the feature representations of pre-trained foundation models, such as CLIP, to generate a context-aware embedding for a query image region specified by a mask at test time. MaskInversion starts by initializing an embedding token and comparing its explainability map, derived from the foundation model, to the query mask. The embedding token is then refined to approximate the query region by minimizing the discrepancy between its explainability map and the query mask. During this process, only the embedding vector is updated, while the underlying foundation model is kept frozen, allowing MaskInversion to be used with any pre-trained model. As deriving the explainability map involves computing its gradient, which can be expensive, we propose a gradient decomposition strategy that simplifies this computation. The learned region representation can be used for a broad range of tasks, including open-vocabulary class retrieval, referring expression comprehension, as well as localized captioning and image generation. We evaluate the proposed method on all of these tasks on several datasets such as PascalVOC, MSCOCO, RefCOCO, and OpenImagesV7 and show its capabilities compared to other SOTA approaches.
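+ The test-time optimization described above can be pictured as a small gradient-descent loop over a single embedding token while the backbone stays frozen. The sketch below illustrates that loop; the `explainability_map` callable and the toy projection used in the usage example are hypothetical stand-ins, not the paper's explainability computation or gradient decomposition.

```python
import torch
import torch.nn.functional as F

def mask_inversion(explainability_map, query_mask, embed_dim=512, steps=100, lr=0.1):
    """Optimize a single embedding token so its explainability map matches a mask.

    `explainability_map(token) -> (H, W) tensor in (0, 1)` is a hypothetical
    stand-in for the map derived from a frozen foundation model; only the
    token is updated, mimicking the test-time refinement described above.
    """
    token = torch.zeros(embed_dim, requires_grad=True)
    optimizer = torch.optim.Adam([token], lr=lr)
    for _ in range(steps):
        optimizer.zero_grad()
        heatmap = explainability_map(token)
        loss = F.binary_cross_entropy(heatmap, query_mask)
        loss.backward()
        optimizer.step()
    return token.detach()

# Toy stand-in: project the token to an 8x8 "map" with a fixed random matrix.
proj = torch.randn(64, 512)
toy_map = lambda tok: torch.sigmoid(proj @ tok).reshape(8, 8)
mask = (torch.rand(8, 8) > 0.5).float()
region_embedding = mask_inversion(toy_map, mask)
```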
+
+ comment: Project page: https://walidbousselham.com/MaskInversion +
+
+
+
+
+ + ☆ MimiQ: Low-Bit Data-Free Quantization of Vision Transformers + + +
+ Data-free quantization (DFQ) is a technique that creates a lightweight network from its full-precision counterpart without the original training data, often through a synthetic dataset. Although several DFQ methods have been proposed for vision transformer (ViT) architectures, they fail to achieve efficacy in low-bit settings. Examining the existing methods, we identify that their synthetic data produce misaligned attention maps, while those of the real samples are highly aligned. From the observation of aligned attention, we find that aligning the attention maps of synthetic data helps to improve the overall performance of quantized ViTs. Motivated by this finding, we devise MimiQ, a novel DFQ method designed for ViTs that focuses on inter-head attention similarity. First, we generate synthetic data by aligning head-wise attention responses in relation to spatial query patches. Then, we apply head-wise structural attention distillation to align the attention maps of the quantized network to those of the full-precision teacher. The experimental results show that the proposed method significantly outperforms baselines, setting a new state-of-the-art performance for data-free ViT quantization.
+
+ comment: Author Preprint +
+
+
+
+
+ + ☆ ImagiNet: A Multi-Content Dataset for Generalizable Synthetic Image + Detection via Contrastive Learning + + +
+ Generative models, such as diffusion models (DMs), variational autoencoders +(VAEs), and generative adversarial networks (GANs), produce images with a level +of authenticity that makes them nearly indistinguishable from real photos and +artwork. While this capability is beneficial for many industries, the +difficulty of identifying synthetic images leaves online media platforms +vulnerable to impersonation and misinformation attempts. To support the +development of defensive methods, we introduce ImagiNet, a high-resolution and +balanced dataset for synthetic image detection, designed to mitigate potential +biases in existing resources. It contains 200K examples, spanning four content +categories: photos, paintings, faces, and uncategorized. Synthetic images are +produced with open-source and proprietary generators, whereas real counterparts +of the same content type are collected from public datasets. The structure of +ImagiNet allows for a two-track evaluation system: i) classification as real or +synthetic and ii) identification of the generative model. To establish a +baseline, we train a ResNet-50 model using a self-supervised contrastive +objective (SelfCon) for each track. The model demonstrates state-of-the-art +performance and high inference speed across established benchmarks, achieving +an AUC of up to 0.99 and balanced accuracy ranging from 86% to 95%, even under +social network conditions that involve compression and resizing. Our data and +code are available at https://github.com/delyan-boychev/imaginet. + +
+
+ comment: 24 pages, 9 figures, 9 tables +
+
+
+
+
+ + ☆ Classification of freshwater snails of the genus \emph{Radomaniola} with + multimodal triplet networks ICML 2024 + + +
+ In this paper, we present our first proposal of a machine learning system for +the classification of freshwater snails of the genus \emph{Radomaniola}. We +elaborate on the specific challenges encountered during system design, and how +we tackled them; namely a small, very imbalanced dataset with a high number of +classes and high visual similarity between classes. We then show how we +employed triplet networks and the multiple input modalities of images, +measurements, and genetic information to overcome these challenges and reach a +performance comparable to that of a trained domain expert. + +
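+ A triplet network of the kind mentioned here pulls an anchor and a positive of the same class together while pushing away a negative from another class; the modalities can simply be embedded and fused before the loss. The sketch below shows the generic pattern with PyTorch's built-in triplet margin loss; the fusion-by-concatenation and layer sizes are illustrative assumptions, not the authors' architecture.

```python
import torch
from torch import nn

class MultimodalEmbedder(nn.Module):
    """Embed image features and tabular measurements, then fuse by concatenation."""
    def __init__(self, img_dim=128, meas_dim=8, out_dim=32):
        super().__init__()
        self.img_net = nn.Sequential(nn.Linear(img_dim, 64), nn.ReLU())
        self.meas_net = nn.Sequential(nn.Linear(meas_dim, 16), nn.ReLU())
        self.head = nn.Linear(64 + 16, out_dim)

    def forward(self, img_feat, measurements):
        fused = torch.cat([self.img_net(img_feat), self.meas_net(measurements)], dim=-1)
        return nn.functional.normalize(self.head(fused), dim=-1)

embedder = MultimodalEmbedder()
triplet_loss = nn.TripletMarginLoss(margin=0.2)

# Toy anchor / positive (same class) / negative (different class) batches.
def embed_batch():
    return embedder(torch.randn(16, 128), torch.randn(16, 8))

anchor, positive, negative = embed_batch(), embed_batch(), embed_batch()
loss = triplet_loss(anchor, positive, negative)
loss.backward()
```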
+
+ comment: Spotlight at ICML 2024 AI for Science workshop +
+
+
+
+
+ + ☆ Reproducibility Study of "ITI-GEN: Inclusive Text-to-Image Generation" + + +
+ Text-to-image generative models often present issues regarding fairness with +respect to certain sensitive attributes, such as gender or skin tone. This +study aims to reproduce the results presented in "ITI-GEN: Inclusive +Text-to-Image Generation" by Zhang et al. (2023a), which introduces a model to +improve inclusiveness in these kinds of models. We show that most of the claims +made by the authors about ITI-GEN hold: it improves the diversity and quality +of generated images, it is scalable to different domains, it has plug-and-play +capabilities, and it is efficient from a computational point of view. However, +ITI-GEN sometimes uses undesired attributes as proxy features and it is unable +to disentangle some pairs of (correlated) attributes such as gender and +baldness. In addition, when the number of considered attributes increases, the +training time grows exponentially and ITI-GEN struggles to generate inclusive +images for all elements in the joint distribution. To solve these issues, we +propose using Hard Prompt Search with negative prompting, a method that does +not require training and that handles negation better than vanilla Hard Prompt +Search. Nonetheless, Hard Prompt Search (with or without negative prompting) +cannot be used for continuous attributes that are hard to express in natural +language, an area where ITI-GEN excels as it is guided by images during +training. Finally, we propose combining ITI-GEN and Hard Prompt Search with +negative prompting. + +
+
+ comment: Accepted to TMLR, see https://openreview.net/forum?id=d3Vj360Wi2 +
+
+
+
+
+ + ☆ More precise edge detections + + +
+ Image Edge Detection (ED) is a base task in computer vision. While the performance of ED algorithms has been greatly improved by introducing CNN-based models, current models still suffer from unsatisfactory precision rates, especially when only a low error tolerance distance is allowed. Therefore, model architectures for more precise predictions still need investigation. On the other hand, the unavoidable noise in training data provided by humans leads to unsatisfactory model predictions even when the inputs are edge maps themselves, which also needs improvement. In this paper, more precise ED models are presented with cascaded skipping density blocks (CSDB). Our models obtain state-of-the-art (SOTA) predictions on several datasets, especially in average precision rate (AP), which is confirmed by extensive experiments. Moreover, our models do not include down-sampling operations, demonstrating that these operations, widely believed to be necessary, are in fact not. Also, a novel modification of data augmentation for training is employed, which allows noiseless data to be used in model training and thus improves the performance of models predicting on edge maps themselves.
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Mixture of Nested Experts: Adaptive Processing of Visual Tokens + + +
+ The visual medium (images and videos) naturally contains a large amount of information redundancy, thereby providing a great opportunity for leveraging efficiency in processing. While Vision Transformer (ViT) based models scale effectively to large data regimes, they fail to capitalize on this inherent redundancy, leading to higher computational costs. Mixture of Experts (MoE) networks demonstrate scalability while maintaining the same inference-time costs, but they come with a larger parameter footprint. We present Mixture of Nested Experts (MoNE), which utilizes a nested structure for experts, wherein individual experts fall on an increasing compute-accuracy curve. Given a compute budget, MoNE learns to dynamically choose tokens in a priority order, so that redundant tokens are processed through cheaper nested experts. Using this framework, we achieve performance equivalent to the baseline models while reducing inference-time compute by over two-fold. We validate our approach on standard image and video datasets - ImageNet-21K, Kinetics400, and Something-Something-v2. We further highlight MoNE's adaptability by showcasing its ability to maintain strong performance across different inference-time compute budgets on videos, using only a single trained model.
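+ The token-routing idea, choosing tokens in priority order so that only important tokens reach the expensive expert, can be sketched as below. This is a generic illustration rather than the MoNE architecture: the router, the two MLP "experts" of different widths, and the fixed budget fraction are all assumptions, and for brevity both experts are evaluated on every token.

```python
import torch
from torch import nn

def route_by_priority(tokens, router, full_expert, cheap_expert, budget_frac=0.5):
    """Process high-priority tokens with the full expert, the rest with a cheap one.

    For clarity both experts are run on all tokens and the outputs are merged;
    a real implementation would gather only the selected tokens so the cheap
    path actually saves compute.
    """
    scores = router(tokens).squeeze(-1)                     # (batch, n_tokens)
    k = max(1, int(budget_frac * tokens.shape[1]))          # tokens given the full expert
    top_idx = scores.topk(k, dim=1).indices
    use_full = torch.zeros_like(scores).scatter_(1, top_idx, 1.0).bool()
    return torch.where(use_full.unsqueeze(-1), full_expert(tokens), cheap_expert(tokens))

d = 64
router = nn.Linear(d, 1)
full_expert = nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))
cheap_expert = nn.Sequential(nn.Linear(d, d // 4), nn.GELU(), nn.Linear(d // 4, d))
out = route_by_priority(torch.randn(2, 16, d), router, full_expert, cheap_expert)
print(out.shape)  # torch.Size([2, 16, 64])
```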
+
+
+
+
+ + ☆ Adversarial Robustness in RGB-Skeleton Action Recognition: Leveraging + Attention Modality Reweighter + + +
+ Deep neural networks (DNNs) have been applied in many computer vision tasks and achieved state-of-the-art (SOTA) performance. However, misclassification will occur when DNNs predict adversarial examples, which are created by adding human-imperceptible adversarial noise to natural examples. This limits the application of DNNs in security-critical fields. In order to enhance the robustness of models, previous research has primarily focused on the unimodal domain, such as image recognition and video understanding. Although multi-modal learning has achieved advanced performance in various tasks, such as action recognition, research on the robustness of RGB-skeleton action recognition models is scarce. In this paper, we systematically investigate how to improve the robustness of RGB-skeleton action recognition models. We initially conducted an empirical analysis of the robustness of different modalities and observed that the skeleton modality is more robust than the RGB modality. Motivated by this observation, we propose the Attention-based Modality Reweighter (AMR), which utilizes an attention layer to re-weight the two modalities, enabling the model to learn more robust features. Our AMR is plug-and-play, allowing easy integration with multimodal models. To demonstrate the effectiveness of AMR, we conducted extensive experiments on various datasets. For example, compared to SOTA methods, AMR exhibits a 43.77% improvement against PGD20 attacks on the NTU-RGB+D 60 dataset. Furthermore, it effectively balances the differences in robustness between different modalities.
+
+ comment: Accepted by IJCB 2024 +
+
+
+
+
+ + ☆ From Flat to Spatial: Comparison of 4 methods constructing 3D, 2 and + 1/2D Models from 2D Plans with neural networks + + +
+ In the field of architecture, the conversion of single images into 2 and 1/2D and 3D meshes is a promising technology that enhances design visualization and efficiency. This paper evaluates four innovative methods: "One-2-3-45," "CRM: Single Image to 3D Textured Mesh with Convolutional Reconstruction Model," "Instant Mesh," and "Image-to-Mesh." These methods are at the forefront of this technology, focusing on their applicability in architectural design and visualization. They streamline the creation of 3D architectural models, enabling rapid prototyping and detailed visualization from minimal initial inputs, such as photographs or simple sketches. One-2-3-45 leverages a diffusion-based approach to generate multi-view reconstructions, ensuring high geometric fidelity and texture quality. CRM utilizes a convolutional network to integrate geometric priors into its architecture, producing detailed and textured meshes quickly and efficiently. Instant Mesh combines the strengths of multi-view diffusion and sparse-view models to offer speed and scalability, suitable for diverse architectural projects. Image-to-Mesh leverages a generative adversarial network (GAN) to produce 3D meshes from single images, focusing on maintaining high texture fidelity and geometric accuracy by incorporating image and depth map data into its training process. It uses a hybrid approach that combines voxel-based representations with surface reconstruction techniques to ensure detailed and realistic 3D models. This comparative study highlights each method's contribution to reducing design cycle times, improving accuracy, and enabling flexible adaptations to various architectural styles and requirements. By providing architects with powerful tools for rapid visualization and iteration, these advancements in 3D mesh generation are set to revolutionize architectural practices.
+
+
+
+
+ + ☆ FedDEO: Description-Enhanced One-Shot Federated Learning with Diffusion + Models + + +
+ In recent years, attention towards One-Shot Federated Learning (OSFL) has been driven by its capacity to minimize communication. With the development of diffusion models (DMs), several methods employ the DM for OSFL, utilizing model parameters, image features, or textual prompts as mediums to transfer the local client knowledge to the server. However, these mediums often require public datasets or a uniform feature extractor, significantly limiting their practicality. In this paper, we propose FedDEO, a Description-Enhanced One-Shot Federated Learning method with DMs, offering a novel exploration of utilizing the DM in OSFL. The core idea of our method involves training local descriptions on the clients, which serve as the medium to transfer the knowledge of the distributed clients to the server. Firstly, we train local descriptions on the client data to capture the characteristics of client distributions, which are then uploaded to the server. On the server, the descriptions are used as conditions to guide the DM in generating synthetic datasets that comply with the distributions of the various clients, enabling the training of the aggregated model. Theoretical analyses and extensive quantitative and visualization experiments on three large-scale real-world datasets demonstrate that, through the training of local descriptions, the server is capable of generating synthetic datasets with high quality and diversity. Consequently, with advantages in communication and privacy protection, the aggregated model outperforms the compared FL and diffusion-based OSFL methods and, on some clients, outperforms the performance ceiling of centralized training.
+
+ comment: Accepted by MM 24 +
+
+
+
+
+ + ☆ Robust Conformal Volume Estimation in 3D Medical Images MICCAI 2024 + + +
+ Volumetry is one of the principal downstream applications of 3D medical image +segmentation, for example, to detect abnormal tissue growth or for surgery +planning. Conformal Prediction is a promising framework for uncertainty +quantification, providing calibrated predictive intervals associated with +automatic volume measurements. However, this methodology is based on the +hypothesis that calibration and test samples are exchangeable, an assumption +that is in practice often violated in medical image applications. A weighted +formulation of Conformal Prediction can be framed to mitigate this issue, but +its empirical investigation in the medical domain is still lacking. A potential +reason is that it relies on the estimation of the density ratio between the +calibration and test distributions, which is likely to be intractable in +scenarios involving high-dimensional data. To circumvent this, we propose an +efficient approach for density ratio estimation relying on the compressed +latent representations generated by the segmentation model. Our experiments +demonstrate the efficiency of our approach to reduce the coverage error in the +presence of covariate shifts, in both synthetic and real-world settings. Our +implementation is available at https://github.com/benolmbrt/wcp_miccai + +
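+ For readers unfamiliar with the machinery: split conformal prediction turns calibration residuals into a predictive interval, and the weighted variant reweights those residuals by an estimated density ratio between calibration and test covariates. The sketch below illustrates the generic weighted-quantile step on scalar volume predictions; it is a simplified illustration, not the authors' implementation, and the absolute-error score and the weights are assumed inputs (in the paper the weights come from latent-representation density-ratio estimation).

```python
import numpy as np

def weighted_conformal_interval(pred_test, preds_cal, vols_cal, w_cal, w_test, alpha=0.1):
    """Weighted split-conformal interval for a scalar volume prediction.

    preds_cal / vols_cal are calibration predictions and true volumes,
    w_cal / w_test are (unnormalized) density-ratio weights, and alpha is
    the target miscoverage level.
    """
    scores = np.abs(vols_cal - preds_cal)                  # nonconformity scores
    order = np.argsort(scores)
    scores, w_cal = scores[order], w_cal[order]
    # Normalized weights include a point mass for the test sample (score = +inf).
    probs = np.append(w_cal, w_test) / (w_cal.sum() + w_test)
    cdf = np.cumsum(probs)
    idx = np.searchsorted(cdf, 1 - alpha)
    q = scores[idx] if idx < len(scores) else np.inf       # weighted (1-alpha) quantile
    return pred_test - q, pred_test + q

# Toy usage with uniform weights (reduces to standard split conformal).
rng = np.random.default_rng(0)
vols = rng.normal(100, 10, size=200)
preds = vols + rng.normal(0, 5, size=200)
lo, hi = weighted_conformal_interval(98.0, preds, vols, np.ones(200), 1.0)
print(lo, hi)
```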
+
+ comment: Early accepted at MICCAI 2024 +
+
+
+
+
+ + ☆ FreeLong: Training-Free Long Video Generation with SpectralBlend + Temporal Attention + + +
+ Video diffusion models have made substantial progress in various video +generation applications. However, training models for long video generation +tasks require significant computational and data resources, posing a challenge +to developing long video diffusion models. This paper investigates a +straightforward and training-free approach to extend an existing short video +diffusion model (e.g. pre-trained on 16-frame videos) for consistent long video +generation (e.g. 128 frames). Our preliminary observation has found that +directly applying the short video diffusion model to generate long videos can +lead to severe video quality degradation. Further investigation reveals that +this degradation is primarily due to the distortion of high-frequency +components in long videos, characterized by a decrease in spatial +high-frequency components and an increase in temporal high-frequency +components. Motivated by this, we propose a novel solution named FreeLong to +balance the frequency distribution of long video features during the denoising +process. FreeLong blends the low-frequency components of global video features, +which encapsulate the entire video sequence, with the high-frequency components +of local video features that focus on shorter subsequences of frames. This +approach maintains global consistency while incorporating diverse and +high-quality spatiotemporal details from local videos, enhancing both the +consistency and fidelity of long video generation. We evaluated FreeLong on +multiple base video diffusion models and observed significant improvements. +Additionally, our method supports coherent multi-prompt generation, ensuring +both visual coherence and seamless transitions between scenes. + +
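+ The SpectralBlend idea can be summarized as: take the low-frequency band of the global (full-sequence) features and the high-frequency band of the local (short-window) features, and merge them in the Fourier domain. Below is a minimal frequency-blending sketch along the temporal axis using NumPy; the cutoff value and the 1D-over-time simplification are assumptions for illustration, not the FreeLong implementation.

```python
import numpy as np

def spectral_blend(global_feat, local_feat, cutoff=0.25):
    """Blend low frequencies of `global_feat` with high frequencies of `local_feat`.

    Both inputs are (T, C) feature sequences; `cutoff` is the fraction of
    normalized temporal frequencies (around DC) taken from the global features.
    """
    T = global_feat.shape[0]
    g_spec = np.fft.fft(global_feat, axis=0)
    l_spec = np.fft.fft(local_feat, axis=0)
    freqs = np.fft.fftfreq(T)                      # normalized temporal frequencies
    low_pass = (np.abs(freqs) <= cutoff)[:, None]  # True for the low-frequency band
    blended = np.where(low_pass, g_spec, l_spec)
    return np.fft.ifft(blended, axis=0).real

# Toy usage: 128 frames of 64-dim features.
rng = np.random.default_rng(0)
global_feat = rng.normal(size=(128, 64))
local_feat = rng.normal(size=(128, 64))
out = spectral_blend(global_feat, local_feat)
print(out.shape)  # (128, 64)
```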
+
+ comment: Project page: https://yulu.net.cn/freelong +
+
+
+
+
+ + ☆ Cell Culture Assistive Application for Precipitation Image Diagnosis + + +
+ In regenerative medicine research, we experimentally design the composition of the chemical medium. We add different components to 384-well plates and culture the biological cells. We monitor the condition of the cells and take time-lapse bioimages for morphological assay. In particular, precipitation can appear as an artefact in the images and contaminate the imaging assay with noise. Inspecting precipitates is a tedious task for the observer, and differences in experience can lead to variations in judgement from person to person. A machine learning approach removes the burden of human inspection and provides consistent results. In addition, precipitation features are as small as 10-20 {\mu}m. A 1200-pixel-square well image resized to a resolution of 2.82 {\mu}m/pixel therefore loses precipitation features. Dividing the well images into 240-pixel squares and learning without resizing preserves the resolution of the original image. In this study, we developed an application to automatically detect precipitation on 384-well plates utilising optical microscope images. We apply MN-pair contrastive clustering to extract precipitation classes from approximately 20,000 patch images. To detect precipitation features, we compare deeper FCDD detectors with optional backbones and build a machine learning pipeline that detects precipitation from the maximum score over quadruplet well images using the Isolation Forest algorithm, where the anomaly score ranges from zero to one. Furthermore, using this application we can visualise an in-situ precipitation heatmap on a 384-well plate.
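+ The final detection stage described above can be approximated with scikit-learn's IsolationForest: score each patch, take the maximum score per well over the four (quadruplet) images, and rescale to [0, 1]. The sketch below shows that generic pipeline on synthetic features; the random feature vectors and the min-max rescaling to a 0-1 anomaly score are illustrative assumptions, not the deployed application.

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
# Assumed inputs: per-patch feature vectors for (wells x 4 quadruplet images).
n_wells, n_views, dim = 96, 4, 16
features = rng.normal(size=(n_wells, n_views, dim))

forest = IsolationForest(random_state=0).fit(features.reshape(-1, dim))
# score_samples returns higher values for normal points, so negate for anomaly.
raw = -forest.score_samples(features.reshape(-1, dim)).reshape(n_wells, n_views)
well_scores = raw.max(axis=1)                       # max over the quadruplet images
# Min-max rescale so the final anomaly score lies in [0, 1].
well_scores = (well_scores - well_scores.min()) / (well_scores.ptp() + 1e-12)
print(well_scores.shape, well_scores.min(), well_scores.max())
```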
+
+ comment: 18 pages, 15 figures, 5 tables +
+
+
+
+
+ + ☆ End-to-end SYNTAX score prediction: benchmark and methods + + +
+ The SYNTAX score has become a widely used measure of coronary disease severity, crucial in selecting the optimal mode of revascularization. This paper introduces a new medical regression and classification problem: automatically estimating the SYNTAX score from coronary angiography. Our study presents a comprehensive dataset of 1,844 patients, featuring a balanced distribution of individuals with zero and non-zero scores. This dataset includes first-of-its-kind complete coronary angiography samples captured through multi-view X-ray video, allowing one to observe the coronary arteries from multiple perspectives. Furthermore, we present a novel, fully automatic end-to-end method for estimating the SYNTAX score. For such a difficult task, we achieve a solid coefficient of determination R2 of 0.51 in score prediction.
+
+
+
+
+ + ☆ Self-Supervised Learning for Text Recognition: A Critical Survey + + +
+ Text Recognition (TR) refers to the research area that focuses on retrieving textual information from images, a topic that has seen significant advancements in the last decade due to the use of Deep Neural Networks (DNNs). However, these solutions often necessitate vast amounts of manually labeled or synthetic data. Addressing this challenge, Self-Supervised Learning (SSL) has gained attention by utilizing large datasets of unlabeled data to train DNNs, thereby generating meaningful and robust representations. Although SSL was initially overlooked in TR because of the field's unique characteristics, recent years have witnessed a surge in the development of SSL methods specifically for TR. This rapid development, however, has led to many methods being explored independently, without taking previous efforts in methodology or comparison into account, thereby hindering progress in the field. This paper, therefore, seeks to consolidate the use of SSL in the field of TR, offering a critical and comprehensive overview of the current state of the art. We review and analyze the existing methods, compare their results, and highlight inconsistencies in the current literature. This thorough analysis aims to provide general insights into the field, propose standardizations, identify new research directions, and foster its proper development.
+
+ comment: This article is under revision +
+
+
+
+
+ + ☆ Yucca: A Deep Learning Framework For Medical Image Analysis + + +
+ Medical image analysis using deep learning frameworks has advanced healthcare +by automating complex tasks, but many existing frameworks lack flexibility, +modularity, and user-friendliness. To address these challenges, we introduce +Yucca, an open-source AI framework available at +https://github.com/Sllambias/yucca, designed specifically for medical imaging +applications and built on PyTorch and PyTorch Lightning. Yucca features a +three-tiered architecture: Functional, Modules, and Pipeline, providing a +comprehensive and customizable solution. Evaluated across diverse tasks such as +cerebral microbleeds detection, white matter hyperintensity segmentation, and +hippocampus segmentation, Yucca achieves state-of-the-art results, +demonstrating its robustness and versatility. Yucca offers a powerful, +flexible, and user-friendly platform for medical image analysis, inviting +community contributions to advance its capabilities and impact. + +
+
+
+
+
+ + ☆ Language-driven Grasp Detection with Mask-guided Attention IROS 2024 + + +
+ Grasp detection is an essential task in robotics with various industrial +applications. However, traditional methods often struggle with occlusions and +do not utilize language for grasping. Incorporating natural language into grasp +detection remains a challenging task and largely unexplored. To address this +gap, we propose a new method for language-driven grasp detection with +mask-guided attention by utilizing the transformer attention mechanism with +semantic segmentation features. Our approach integrates visual data, +segmentation mask features, and natural language instructions, significantly +improving grasp detection accuracy. Our work introduces a new framework for +language-driven grasp detection, paving the way for language-driven robotic +applications. Intensive experiments show that our method outperforms other +recent baselines by a clear margin, with a 10.0% success score improvement. We +further validate our method in real-world robotic experiments, confirming the +effectiveness of our approach. + +
+
+ comment: Accepted at IROS 2024 +
+
+
+
+
+ + ☆ Exploring Robust Face-Voice Matching in Multilingual Environments + + +
+ This paper presents Team Xaiofei's innovative approach to exploring +Face-Voice Association in Multilingual Environments (FAME) at ACM Multimedia +2024. We focus on the impact of different languages in face-voice matching by +building upon Fusion and Orthogonal Projection (FOP), introducing four key +components: a dual-branch structure, dynamic sample pair weighting, robust data +augmentation, and score polarization strategy. Our dual-branch structure serves +as an auxiliary mechanism to better integrate and provide more comprehensive +information. We also introduce a dynamic weighting mechanism for various sample +pairs to optimize learning. Data augmentation techniques are employed to +enhance the model's generalization across diverse conditions. Additionally, +score polarization strategy based on age and gender matching confidence +clarifies and accentuates the final results. Our methods demonstrate +significant effectiveness, achieving an equal error rate (EER) of 20.07 on the +V2-EH dataset and 21.76 on the V1-EU dataset. + +
+
+
+
+
+ + ☆ Normality Addition via Normality Detection in Industrial Image Anomaly + Detection Models + + +
+ The task of image anomaly detection (IAD) aims to identify deviations from normality in image data. These anomalies are patterns that deviate significantly from what the IAD model has learned from the data during training. However, in real-world scenarios, the criteria for what constitutes normality often change, necessitating the reclassification of previously anomalous instances as normal. To capture this setting, we define a new scenario termed "normality addition," involving the post-training adjustment of decision boundaries to incorporate new normalities. To address this challenge, we propose a method called Normality Addition via Normality Detection (NAND), leveraging a vision-language model. NAND performs normality detection, which detects patterns related to the intended normality within images based on textual descriptions. We then modify the results of a pre-trained IAD model to implement this normality addition. Using the MVTec AD benchmark dataset for IAD, we establish an evaluation protocol for the normality addition task and empirically demonstrate the effectiveness of the NAND method.
+
+
+
+
+ + ☆ VortSDF: 3D Modeling with Centroidal Voronoi Tesselation on Signed + Distance Field + + +
+ Volumetric shape representations have become ubiquitous in multi-view reconstruction tasks. They often build on regular voxel grids as discrete representations of 3D shape functions, such as SDF or radiance fields, either as the full shape model or as sampled instantiations of continuous representations, as with neural networks. Despite their proven efficiency, voxel representations come with a precision-versus-complexity trade-off. This inherent limitation can significantly impact performance when moving away from simple and uncluttered scenes. In this paper we investigate an alternative discretization strategy based on the Centroidal Voronoi Tesselation (CVT). CVTs allow better partitioning of the observation space with respect to shape occupancy and focus the discretization around shape surfaces. To leverage this discretization strategy for multi-view reconstruction, we introduce a volumetric optimization framework that combines explicit SDF fields with a shallow color network, in order to estimate 3D shape properties over tetrahedral grids. Experimental results with Chamfer statistics validate this approach with unprecedented reconstruction quality on various scenarios such as objects, open scenes, or humans.
+
+
+
+
+ + ☆ ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2 + + +
+ Multimodal Large Language Models (MLLMs) have attracted much attention due to their multifunctionality. However, traditional Transformer architectures incur significant overhead due to their quadratic computational complexity. To address this issue, we introduce ML-Mamba, a multimodal language model that utilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known for its linear scalability and fast processing of long sequences. We replace the Transformer-based backbone with a pre-trained Mamba-2 model and explore methods for integrating 2D visual selective scanning mechanisms into multimodal learning. We also try various visual encoders and Mamba-2 model variants. Our extensive experiments conducted on various multimodal benchmarks demonstrate the competitive performance of ML-Mamba and highlight the potential of state space models in multimodal tasks. The experimental results show that: (1) ML-Mamba achieves performance comparable to state-of-the-art methods such as TinyLaVA and MobileVLM v2 through its linear sequential modeling, while also having faster inference speed; (2) ML-Mamba performs well in visual hallucination and spatial relationship judgment in closed-set benchmark tests; (3) ML-Mamba achieves performance comparable to LLaVA while reducing the number of parameters by 40%; (4) compared to the multimodal model using the original Mamba model, the Mamba-2-based large-scale multimodal language model has stronger inference performance and effectiveness.
+
+
+
+
+ + ☆ Distilling High Diagnostic Value Patches for Whole Slide Image + Classification Using Attention Mechanism + + +
+ Multiple Instance Learning (MIL) has garnered widespread attention in the field of Whole Slide Image (WSI) classification as it replaces pixel-level manual annotation with diagnostic reports as labels, significantly reducing labor costs. Recent research has shown that bag-level MIL methods often yield better results because they can consider all patches of the WSI as a whole. However, a drawback of such methods is the incorporation of more redundant patches, leading to interference. To extract patches with high diagnostic value while excluding interfering patches, we developed an attention-based feature distillation multi-instance learning (AFD-MIL) approach. This approach proposes the exclusion of redundant patches as a preprocessing operation in weakly supervised learning, directly mitigating interference from extensive noise. It also pioneers the use of attention mechanisms to distill features with high diagnostic value, as opposed to the traditional practice of indiscriminately and forcibly integrating all patches. Additionally, we introduce global loss optimization to finely control the feature distillation module. AFD-MIL is orthogonal to many existing MIL methods, leading to consistent performance improvements. This approach has surpassed the current state-of-the-art method, achieving 91.47% ACC (accuracy) and 94.29% AUC (area under the curve) on Camelyon16 (Camelyon Challenge 2016, breast cancer), and 93.33% ACC and 98.17% AUC on TCGA-NSCLC (The Cancer Genome Atlas Program: non-small cell lung cancer). Different feature distillation methods were used for the two datasets, tailored to the specific diseases, thereby improving performance and interpretability.
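+ Attention-based feature distillation in a MIL setting generally means scoring each patch embedding with a small attention network and aggregating a weighted sum, so that high-value patches dominate the bag representation. The sketch below is the standard attention-MIL pooling pattern (in the spirit of Ilse et al.), not the exact AFD-MIL module; dimensions are illustrative.

```python
import torch
from torch import nn

class AttentionMILPooling(nn.Module):
    """Aggregate patch embeddings into a slide embedding with learned attention."""
    def __init__(self, dim=512, hidden=128):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(), nn.Linear(hidden, 1))

    def forward(self, patches):                               # patches: (n_patches, dim)
        weights = torch.softmax(self.attn(patches), dim=0)    # (n_patches, 1)
        slide_embedding = (weights * patches).sum(dim=0)      # (dim,)
        return slide_embedding, weights.squeeze(-1)

pool = AttentionMILPooling()
classifier = nn.Linear(512, 2)
patches = torch.randn(1000, 512)                # embeddings of one slide's patches
slide_emb, attn = pool(patches)
logits = classifier(slide_emb)
# High-attention patches are the "high diagnostic value" candidates.
top_patches = attn.topk(8).indices
```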
+
+
+
+
+ + ☆ ActivityCLIP: Enhancing Group Activity Recognition by Mining + Complementary Information from Text to Supplement Image Modality + + +
+ Previous methods usually extract only the image modality's information to recognize group activity. However, mining image information is approaching saturation, making it difficult to extract richer information. Therefore, extracting complementary information from other modalities to supplement the image information has become increasingly important. In fact, action labels provide clear text information that expresses the action's semantics, which existing methods often overlook. Thus, we propose ActivityCLIP, a plug-and-play method for mining the text information contained in the action labels to supplement the image information and enhance group activity recognition. ActivityCLIP consists of text and image branches, where the text branch is plugged into the image branch (the off-the-shelf image-based method). The text branch includes Image2Text and relation modeling modules. Specifically, we propose a knowledge transfer module, Image2Text, which adapts image information into the text information extracted by CLIP via knowledge distillation. Further, to keep our method convenient to apply, we add only a few trainable parameters, based on the relation module of the image branch, to model interaction relations in the text branch. To show our method's generality, we replicate three representative methods with ActivityCLIP, adding only a limited number of trainable parameters and achieving favorable performance improvements for each method. We also conduct extensive ablation studies and compare our method with state-of-the-art methods to demonstrate the effectiveness of ActivityCLIP.
+
+
+
+
+ + ☆ Image-text matching for large-scale book collections + + +
+ We address the problem of detecting and mapping all books in a collection of images to entries in a given book catalogue. Instead of performing independent retrieval for each detected book, we treat the image-text mapping problem as a many-to-many matching process, looking for the best overall match between the two sets. We combine a state-of-the-art segmentation method (SAM) to detect book spines with a commercial OCR to extract book information. We then propose a two-stage approach for text-image matching, where CLIP embeddings are used first for fast matching, followed by a second, slower stage to refine the matching, employing either the Hungarian Algorithm or a BERT-based model trained to cope with noisy OCR input and partial text matches. To evaluate our approach, we publish a new dataset of annotated bookshelf images that covers the whole book collection of a public library in Spain. In addition, we provide two target lists of book metadata: a closed set of 15k book titles that corresponds to the known library inventory, and an open set of 2.3M book titles to simulate an open-world scenario. We report results in two settings: a matching-only task, where the book segments and OCR are given and the objective is to perform many-to-many matching against the target lists, and a combined detection and matching task, where books must first be detected and recognised before they are matched to the target list entries. We show that both the Hungarian Matching and the proposed BERT-based model outperform a fuzzy string matching baseline, and we highlight inherent limitations of the matching algorithms as the target list grows in size and when either of the two sets (detected books or target book list) is incomplete. The dataset and code are available at https://github.com/llabres/library-dataset
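+ The two-stage matching described here can be pictured as: build a similarity matrix between detected spines and catalogue entries (e.g., with CLIP-style embeddings), prefilter with a fast top-k step, then solve a global assignment over the remaining candidates. The snippet below sketches that pattern with random embeddings and SciPy's Hungarian solver; the embeddings and the top-k prefilter are placeholders, not the released pipeline.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
# Placeholder embeddings: rows are detected book spines and catalogue entries.
spines = rng.normal(size=(20, 256))
catalogue = rng.normal(size=(50, 256))
spines /= np.linalg.norm(spines, axis=1, keepdims=True)
catalogue /= np.linalg.norm(catalogue, axis=1, keepdims=True)

similarity = spines @ catalogue.T            # cosine similarity matrix
# Stage 1 (fast): keep only the top-k candidates per spine, mask the rest.
k = 5
mask = np.full_like(similarity, -np.inf)
topk = np.argsort(-similarity, axis=1)[:, :k]
rows = np.arange(similarity.shape[0])[:, None]
mask[rows, topk] = similarity[rows, topk]

# Stage 2 (global): Hungarian assignment maximises total similarity.
cost = -np.nan_to_num(mask, neginf=-1e6)     # solver minimises cost
row_idx, col_idx = linear_sum_assignment(cost)
matches = list(zip(row_idx.tolist(), col_idx.tolist()))
print(matches[:5])
```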
+
+
+
+
+ + ☆ Synthetic Thermal and RGB Videos for Automatic Pain Assessment utilizing + a Vision-MLP Architecture + + +
+ Pain assessment is essential in developing optimal pain management protocols to alleviate suffering and prevent functional decline in patients. Consequently, reliable and accurate automatic pain assessment systems are essential for continuous and effective patient monitoring. This study presents synthetic thermal videos generated by Generative Adversarial Networks, integrated into the pain recognition pipeline, and evaluates their efficacy. A framework consisting of a Vision-MLP and a Transformer-based module is utilized, employing RGB and synthetic thermal videos in unimodal and multimodal settings. Experiments conducted on facial videos from the BioVid database demonstrate the effectiveness of synthetic thermal videos and underline their potential advantages.
+
+
+
+
+ + ☆ Twins-PainViT: Towards a Modality-Agnostic Vision Transformer Framework + for Multimodal Automatic Pain Assessment using Facial Videos and fNIRS + + +
+ Automatic pain assessment plays a critical role for advancing healthcare and +optimizing pain management strategies. This study has been submitted to the +First Multimodal Sensing Grand Challenge for Next-Gen Pain Assessment +(AI4PAIN). The proposed multimodal framework utilizes facial videos and fNIRS +and presents a modality-agnostic approach, alleviating the need for +domain-specific models. Employing a dual ViT configuration and adopting +waveform representations for the fNIRS, as well as for the extracted embeddings +from the two modalities, demonstrate the efficacy of the proposed method, +achieving an accuracy of 46.76% in the multilevel pain assessment task. + +
+
+
+
+
+ + ☆ VolDoGer: LLM-assisted Datasets for Domain Generalization in + Vision-Language Tasks + + +
+ Domain generalizability is a crucial aspect of a deep learning model since it +determines the capability of the model to perform well on data from unseen +domains. However, research on the domain generalizability of deep learning +models for vision-language tasks remains limited, primarily because of the lack +of required datasets. To address these challenges, we propose VolDoGer: +Vision-Language Dataset for Domain Generalization, a dedicated dataset designed +for domain generalization that addresses three vision-language tasks: image +captioning, visual question answering, and visual entailment. We constructed +VolDoGer by extending LLM-based data annotation techniques to vision-language +tasks, thereby alleviating the burden of recruiting human annotators. We +evaluated the domain generalizability of various models, ranging from +fine-tuned models to a recent multimodal large language model, through +VolDoGer. + +
+
+ comment: 31 pages, 5 figures, 20 tables +
+
+
+
+
+ + ☆ Interpreting Low-level Vision Models with Causal Effect Maps + + +
+ Deep neural networks have significantly improved the performance of low-level +vision tasks but also increased the difficulty of interpretability. A deep +understanding of deep models is beneficial for both network design and +practical reliability. To take up this challenge, we introduce causality theory +to interpret low-level vision models and propose a model-/task-agnostic method +called Causal Effect Map (CEM). With CEM, we can visualize and quantify the +input-output relationships on either positive or negative effects. After +analyzing various low-level vision tasks with CEM, we have reached several +interesting insights, such as: (1) Using more information of input images +(e.g., larger receptive field) does NOT always yield positive outcomes. (2) +Attempting to incorporate mechanisms with a global receptive field (e.g., +channel attention) into image denoising may prove futile. (3) Integrating +multiple tasks to train a general model could encourage the network to +prioritize local information over global context. Based on the causal effect +theory, the proposed diagnostic tool can refresh our common knowledge and bring +a deeper understanding of low-level vision models. Codes are available at +https://github.com/J-FHu/CEM. + +
+
+
+
+
+ + ☆ SciPostLayout: A Dataset for Layout Analysis and Layout Generation of + Scientific Posters BMVC2024 + + +
+ Scientific posters are used to present the contributions of scientific papers effectively in a graphical format. However, creating a well-designed poster that efficiently summarizes the core of a paper is both labor-intensive and time-consuming. A system that can automatically generate well-designed posters from scientific papers would reduce the workload of authors and help readers understand the outline of the paper visually. Despite the demand for poster generation systems, only limited research has been conducted due to the lack of publicly available datasets. Thus, in this study, we built the SciPostLayout dataset, which consists of 7,855 scientific posters and manual layout annotations for layout analysis and generation. SciPostLayout also contains 100 scientific papers paired with the posters. All of the posters and papers in our dataset are under the CC-BY license and are publicly available. As benchmark tests for the collected dataset, we conducted experiments for layout analysis and generation utilizing existing computer vision models and found that both layout analysis and generation of posters using SciPostLayout are more challenging than with scientific papers. We also conducted experiments on generating layouts from scientific papers to demonstrate the potential of utilizing LLMs as a scientific poster generation system. The dataset is publicly available at https://huggingface.co/datasets/omron-sinicx/scipostlayout_v2. The code is also publicly available at https://github.com/omron-sinicx/scipostlayout.
+
+ comment: Accepted by BMVC2024 +
+
+
+
+
+ + ☆ Garment Animation NeRF with Color Editing + + +
+ Generating high-fidelity garment animations through traditional workflows, +from modeling to rendering, is both tedious and expensive. These workflows +often require repetitive steps in response to updates in character motion, +rendering viewpoint changes, or appearance edits. Although recent neural +rendering offers an efficient solution for computationally intensive processes, +it struggles with rendering complex garment animations containing fine wrinkle +details and realistic garment-and-body occlusions, while maintaining structural +consistency across frames and dense view rendering. In this paper, we propose a +novel approach to directly synthesize garment animations from body motion +sequences without the need for an explicit garment proxy. Our approach infers +garment dynamic features from body motion, providing a preliminary overview of +garment structure. Simultaneously, we capture detailed features from +synthesized reference images of the garment's front and back, generated by a +pre-trained image model. These features are then used to construct a neural +radiance field that renders the garment animation video. Additionally, our +technique enables garment recoloring by decomposing its visual elements. We +demonstrate the generalizability of our method across unseen body motions and +camera views, ensuring detailed structural consistency. Furthermore, we +showcase its applicability to color editing on both real and synthetic garment +data. Compared to existing neural rendering techniques, our method exhibits +qualitative and quantitative improvements in garment dynamics and wrinkle +detail modeling. Code is available at +\url{https://github.com/wrk226/GarmentAnimationNeRF}. + +
+
+
+
+
+ + ☆ Unmasking unlearnable models: a classification challenge for biomedical + images without visible cues + + +
+ Predicting traits from images lacking visual cues is challenging, as
+algorithms are designed to capture visually correlated ground truth. This
+problem is critical in biomedical sciences, and its solution can improve the
+efficacy of non-invasive methods. For example, a recent challenge of predicting
+MGMT methylation status from MRI images is critical for treatment decisions of
+glioma patients. Using less robust models poses a significant risk in these
+critical scenarios and underscores the urgency of addressing this issue.
+Despite numerous efforts, contemporary models exhibit suboptimal performance,
+and the underlying reasons for this limitation remain elusive. In this study, we
+demystify the complexity of MGMT status prediction through a comprehensive
+exploration, benchmarking existing models alongside transfer learning. Their
+architectures were further dissected by observing gradient flow across layers.
+Additionally, a feature selection strategy was applied to improve model
+interpretability. Our findings highlight that current models are unlearnable
+and may require new architectures to explore applications in the real world. We
+believe our study will draw immediate attention and catalyse advancements in
+predictive modelling with non-visible cues.
+
+
+
+
+
+ + ☆ Efficient Face Super-Resolution via Wavelet-based Feature Enhancement + Network + + +
+ Face super-resolution aims to reconstruct a high-resolution face image from a +low-resolution face image. Previous methods typically employ an encoder-decoder +structure to extract facial structural features, where the direct downsampling +inevitably introduces distortions, especially to high-frequency features such +as edges. To address this issue, we propose a wavelet-based feature enhancement +network, which mitigates feature distortion by losslessly decomposing the input +feature into high and low-frequency components using the wavelet transform and +processing them separately. To improve the efficiency of facial feature +extraction, a full domain Transformer is further proposed to enhance local, +regional, and global facial features. Such designs allow our method to perform +better without stacking many modules as previous methods did. Experiments show +that our method effectively balances performance, model size, and speed. Code +link: https://github.com/PRIS-CV/WFEN. + +
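+ As a rough, hedged illustration of the lossless high/low-frequency split
+ described above (a generic single-level 2D Haar transform, not the authors'
+ network), a feature map can be decomposed as follows; the tensor sizes and
+ function name are assumptions for the sketch:
+
+    import torch
+
+    def haar_split(x: torch.Tensor):
+        """Single-level 2D Haar decomposition of a (B, C, H, W) feature map.
+        Returns the low-frequency band (LL) and three high-frequency detail
+        bands (LH, HL, HH); H and W are assumed even. The transform is
+        invertible, so the split itself loses no information."""
+        a = x[:, :, 0::2, 0::2]   # top-left of each 2x2 block
+        b = x[:, :, 0::2, 1::2]   # top-right
+        c = x[:, :, 1::2, 0::2]   # bottom-left
+        d = x[:, :, 1::2, 1::2]   # bottom-right
+        ll = (a + b + c + d) / 2  # smooth, low-frequency content
+        lh = (a - b + c - d) / 2  # detail across the width
+        hl = (a + b - c - d) / 2  # detail across the height
+        hh = (a - b - c + d) / 2  # diagonal detail
+        return ll, lh, hl, hh
+
+    feat = torch.randn(1, 64, 32, 32)
+    ll, lh, hl, hh = haar_split(feat)
+    print(ll.shape)  # torch.Size([1, 64, 16, 16])
+
+ The two groups of bands can then be routed to separate branches, which is the
+ general idea the abstract describes.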
+
+
+
+
+ + ☆ TeleOR: Real-time Telemedicine System for Full-Scene Operating Room + + +
+ The advent of telemedicine represents a transformative development in
+leveraging technology to extend the reach of specialized medical expertise to
+remote surgeries, a field where the immediacy of expert guidance is paramount.
+However, the intricate dynamics of Operating Room (OR) scenes pose unique
+challenges for telemedicine, particularly in achieving high-fidelity, real-time
+scene reconstruction and transmission amidst obstructions and bandwidth
+limitations. This paper introduces TeleOR, a pioneering system designed to
+address these challenges through real-time OR scene reconstruction for
+Tele-intervention. TeleOR distinguishes itself with three innovative
+approaches: dynamic self-calibration, which leverages inherent scene features
+for calibration without the need for preset markers, allowing for obstacle
+avoidance and real-time camera adjustment; selective OR reconstruction,
+focusing on dynamically changing scene segments to reduce reconstruction
+complexity; and viewport-adaptive transmission, optimizing data transmission
+based on real-time client feedback to efficiently deliver high-quality 3D
+reconstructions within bandwidth constraints. Comprehensive experiments on the
+4D-OR surgical scene dataset demonstrate the superiority and applicability of
+TeleOR, illuminating the potential to revolutionize tele-interventions by
+overcoming the spatial and technical barriers inherent in remote surgical
+guidance.
+
+
+
+
+
+ + ☆ PredIN: Towards Open-Set Gesture Recognition via Prediction + Inconsistency + + +
+ Gesture recognition based on surface electromyography (sEMG) has achieved
+significant progress in human-machine interaction (HMI). However, accurately
+recognizing predefined gestures within a closed set is still inadequate in
+practice; a robust open-set system needs to effectively reject unknown gestures
+while correctly classifying known ones. To handle this challenge, we first
+report prediction inconsistency discovered for unknown classes due to ensemble
+diversity, which can significantly facilitate the detection of unknown classes.
+Based on this insight, we propose an ensemble learning approach, PredIN, to
+explicitly magnify the prediction inconsistency by enhancing ensemble
+diversity. Specifically, PredIN maximizes the class feature distribution
+inconsistency among ensemble members to enhance diversity. Meanwhile, it
+optimizes inter-class separability within an individual ensemble member to
+maintain individual performance. Comprehensive experiments on various benchmark
+datasets demonstrate that PredIN outperforms state-of-the-art methods by a
+clear margin. Our proposed method simultaneously achieves accurate closed-set
+classification for predefined gestures and effective rejection for unknown
+gestures, exhibiting its efficacy and superiority in open-set gesture
+recognition based on sEMG.
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ Contextuality Helps Representation Learning for Generalized Category + Discovery + + +
+ This paper introduces a novel approach to Generalized Category Discovery +(GCD) by leveraging the concept of contextuality to enhance the identification +and classification of categories in unlabeled datasets. Drawing inspiration +from human cognition's ability to recognize objects within their context, we +propose a dual-context based method. + Our model integrates two levels of contextuality: instance-level, where +nearest-neighbor contexts are utilized for contrastive learning, and +cluster-level, employing prototypical contrastive learning based on category +prototypes. The integration of the contextual information effectively improves +the feature learning and thereby the classification accuracy of all categories, +which better deals with the real-world datasets. Different from the traditional +semi-supervised and novel category discovery techniques, our model focuses on a +more realistic and challenging scenario where both known and novel categories +are present in the unlabeled data. Extensive experimental results on several +benchmark data sets demonstrate that the proposed model outperforms the +state-of-the-art. Code is available at: +https://github.com/Clarence-CV/Contexuality-GCD + +
+
+
+
+
+ + ☆ Octave-YOLO: Cross frequency detection network with octave convolution + + +
+ Despite the rapid advancement of object detection algorithms, processing +high-resolution images on embedded devices remains a significant challenge. +Theoretically, the fully convolutional network architecture used in current +real-time object detectors can handle all input resolutions. However, the +substantial computational demands required to process high-resolution images +render them impractical for real-time applications. To address this issue, +real-time object detection models typically downsample the input image for +inference, leading to a loss of detail and decreased accuracy. In response, we +developed Octave-YOLO, designed to process high-resolution images in real-time +within the constraints of embedded systems. We achieved this through the +introduction of the cross frequency partial network (CFPNet), which divides the +input feature map into low-resolution, low-frequency, and high-resolution, +high-frequency sections. This configuration enables complex operations such as +convolution bottlenecks and self-attention to be conducted exclusively on +low-resolution feature maps while simultaneously preserving the details in +high-resolution maps. Notably, this approach not only dramatically reduces the +computational demands of convolution tasks but also allows for the integration +of attention modules, which are typically challenging to implement in real-time +applications, with minimal additional cost. Additionally, we have incorporated +depthwise separable convolution into the core building blocks and downsampling +layers to further decrease latency. Experimental results have shown that +Octave-YOLO matches the performance of YOLOv8 while significantly reducing +computational demands. For example, in 1080x1080 resolution, Octave-YOLO-N is +1.56 times faster than YOLOv8, achieving nearly the same accuracy on the COCO +dataset with approximately 40 percent fewer parameters and FLOPs. + +
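+ A minimal sketch of the general idea (heavy operations on a downsampled,
+ low-frequency copy while a cheap path keeps the high-resolution detail) is
+ given below; the layer sizes and the additive fusion are illustrative
+ assumptions, not the actual CFPNet design:
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class CrossFrequencySplit(nn.Module):
+        """Toy octave-style block: run the expensive bottleneck at half
+        resolution, keep a lightweight full-resolution path, then fuse."""
+        def __init__(self, channels: int):
+            super().__init__()
+            self.heavy = nn.Sequential(            # costly ops, low resolution
+                nn.Conv2d(channels, channels, 3, padding=1),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(channels, channels, 3, padding=1),
+            )
+            self.light = nn.Conv2d(channels, channels, 1)  # cheap, full resolution
+
+        def forward(self, x):
+            low = F.avg_pool2d(x, 2)               # low-resolution, low-frequency
+            low = self.heavy(low)
+            low = F.interpolate(low, size=x.shape[-2:], mode="nearest")
+            high = self.light(x)                   # detail-preserving branch
+            return high + low                      # fuse the two frequency paths
+
+    block = CrossFrequencySplit(64)
+    print(block(torch.randn(1, 64, 128, 128)).shape)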
+
+
+
+
+ + ☆ Revolutionizing Urban Safety Perception Assessments: Integrating + Multimodal Large Language Models with Street View Images + + +
+ Measuring urban safety perception is an important and complex task that
+traditionally relies heavily on human resources. This process often involves
+extensive field surveys, manual data collection, and subjective assessments,
+which can be time-consuming, costly, and sometimes inconsistent. Street View
+Images (SVIs), along with deep learning methods, provide a way to realize
+large-scale urban safety detection. However, achieving this goal often requires
+extensive human annotation to train safety ranking models, and the
+architectural differences between cities hinder the transferability of these
+models. Thus, a fully automated method for conducting safety evaluations is
+essential. Recent advances in multimodal large language models (MLLMs) have
+demonstrated powerful reasoning and analytical capabilities. Cutting-edge
+models, e.g., GPT-4, have shown surprising performance in many tasks. We
+employed these models for urban safety ranking on a human-annotated anchor set
+and validated that the results from MLLMs align closely with human perceptions.
+Additionally, we proposed a method based on the pre-trained Contrastive
+Language-Image Pre-training (CLIP) feature and K-Nearest Neighbors (K-NN)
+retrieval to quickly assess the safety index of the entire city. Experimental
+results show that our method outperforms existing deep learning approaches that
+require training, achieving efficient and accurate urban safety evaluations. The
+proposed automation for urban safety perception assessment is a valuable tool
+for city planners, policymakers, and researchers aiming to improve urban
+environments.
+
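+ A minimal sketch of the CLIP-feature plus K-NN scoring step described above,
+ with random placeholder arrays standing in for real CLIP embeddings and
+ anchor-set scores (the CLIP feature extraction itself is not shown):
+
+    import numpy as np
+    from sklearn.neighbors import KNeighborsRegressor
+
+    # Hypothetical inputs: embeddings of the human-annotated anchor set with
+    # safety scores, plus embeddings of unscored street-view images city-wide.
+    anchor_feats = np.random.randn(500, 512).astype(np.float32)
+    anchor_scores = np.random.uniform(0, 10, size=500)
+    city_feats = np.random.randn(10000, 512).astype(np.float32)
+
+    # L2-normalise so Euclidean distance behaves like cosine distance.
+    anchor_feats /= np.linalg.norm(anchor_feats, axis=1, keepdims=True)
+    city_feats /= np.linalg.norm(city_feats, axis=1, keepdims=True)
+
+    knn = KNeighborsRegressor(n_neighbors=5, weights="distance")
+    knn.fit(anchor_feats, anchor_scores)
+    city_safety = knn.predict(city_feats)   # per-image safety index
+    print(city_safety.mean())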
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Rethinking RGB-D Fusion for Semantic Segmentation in Surgical Datasets + + +
+ Surgical scene understanding is a key technical component for enabling +intelligent and context aware systems that can transform various aspects of +surgical interventions. In this work, we focus on the semantic segmentation +task, propose a simple yet effective multi-modal (RGB and depth) training +framework called SurgDepth, and show state-of-the-art (SOTA) results on all +publicly available datasets applicable for this task. Unlike previous +approaches, which either fine-tune SOTA segmentation models trained on natural +images, or encode RGB or RGB-D information using RGB only pre-trained +backbones, SurgDepth, which is built on top of Vision Transformers (ViTs), is +designed to encode both RGB and depth information through a simple fusion +mechanism. We conduct extensive experiments on benchmark datasets including +EndoVis2022, AutoLapro, LapI2I and EndoVis2017 to verify the efficacy of +SurgDepth. Specifically, SurgDepth achieves a new SOTA IoU of 0.86 on EndoVis +2022 SAR-RARP50 challenge and outperforms the current best method by at least +4%, using a shallow and compute efficient decoder consisting of ConvNeXt +blocks. + +
+
+
+
+
+ + ☆ ALEN: A Dual-Approach for Uniform and Non-Uniform Low-Light Image + Enhancement + + +
+ Low-light image enhancement is an important task in computer vision,
+essential for improving the visibility and quality of images captured in
+non-optimal lighting conditions. Inadequate illumination can lead to
+significant information loss and poor image quality, impacting various
+applications such as surveillance, photography, or even autonomous driving. In
+this regard, automated methods have been developed to adjust illumination in
+the image for better visual perception. Current enhancement techniques often
+use specific datasets to enhance low-light images, but still present challenges
+when adapting to diverse real-world conditions, where illumination degradation
+may be localized to specific regions. To address this challenge, the Adaptive
+Light Enhancement Network (ALEN) is introduced, whose main approach is the use
+of a classification mechanism to determine whether local or global illumination
+enhancement is required. Subsequently, estimator networks adjust illumination
+based on this classification and simultaneously enhance color fidelity. ALEN
+integrates the Light Classification Network (LCNet) for illuminance
+categorization, complemented by the Single-Channel Network (SCNet) and
+Multi-Channel Network (MCNet) for precise estimation of illumination and color,
+respectively. Extensive experiments on publicly available datasets for
+low-light conditions were carried out to underscore ALEN's robust
+generalization capabilities, demonstrating superior performance in both
+quantitative metrics and qualitative assessments when compared to recent
+state-of-the-art methods. ALEN not only enhances image quality in terms of
+visual perception but also represents an advancement in high-level vision
+tasks, such as semantic segmentation, as presented in this work. The code of
+this method is available at https://github.com/xingyumex/ALEN.
+
+
+
+
+
+ + ☆ Classification Matters: Improving Video Action Detection with + Class-Specific Attention ECCV 2024 + + +
+ Video action detection (VAD) aims to detect actors and classify their actions
+in a video. We find that VAD suffers more from classification than from
+localization of actors. Hence, we analyze how prevailing methods form features
+for classification and find that they prioritize actor regions, yet often
+overlook the essential contextual information necessary for accurate
+classification. Accordingly, we propose to reduce the bias toward actors and
+encourage paying attention to the context that is relevant to each action
+class. By assigning a class-dedicated query to each action class, our model can
+dynamically determine where to focus for effective classification. The proposed
+model demonstrates superior performance on three challenging benchmarks with
+significantly fewer parameters and less computation.
+
+
+ comment: 31 pages, accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Cross-Layer Feature Pyramid Transformer for Small Object Detection in + Aerial Images + + +
+ Object detection in aerial images has always been a challenging task due to +the generally small size of the objects. Most current detectors prioritize +novel detection frameworks, often overlooking research on fundamental +components such as feature pyramid networks. In this paper, we introduce the +Cross-Layer Feature Pyramid Transformer (CFPT), a novel upsampler-free feature +pyramid network designed specifically for small object detection in aerial +images. CFPT incorporates two meticulously designed attention blocks with +linear computational complexity: the Cross-Layer Channel-Wise Attention (CCA) +and the Cross-Layer Spatial-Wise Attention (CSA). CCA achieves cross-layer +interaction by dividing channel-wise token groups to perceive cross-layer +global information along the spatial dimension, while CSA completes cross-layer +interaction by dividing spatial-wise token groups to perceive cross-layer +global information along the channel dimension. By integrating these modules, +CFPT enables cross-layer interaction in one step, thereby avoiding the semantic +gap and information loss associated with element-wise summation and +layer-by-layer transmission. Furthermore, CFPT incorporates global contextual +information, which enhances detection performance for small objects. To further +enhance location awareness during cross-layer interaction, we propose the +Cross-Layer Consistent Relative Positional Encoding (CCPE) based on inter-layer +mutual receptive fields. We evaluate the effectiveness of CFPT on two +challenging object detection datasets in aerial images, namely VisDrone2019-DET +and TinyPerson. Extensive experiments demonstrate the effectiveness of CFPT, +which outperforms state-of-the-art feature pyramid networks while incurring +lower computational costs. The code will be released at +https://github.com/duzw9311/CFPT. + +
+
+
+
+
+ + ☆ Structural damage detection via hierarchical damage information with + volumetric assessment + + +
+ Image environments and noisy labels hinder deep learning-based inference +models in structural damage detection. Post-detection, there is the challenge +of reliance on manual assessments of detected damages. As a result, +Guided-DetNet, characterized by Generative Attention Module (GAM), Hierarchical +Elimination Algorithm (HEA), and Volumetric Contour Visual Assessment (VCVA), +is proposed to mitigate complex image environments, noisy labeling, and +post-detection manual assessment of structural damages. GAM leverages +cross-horizontal and cross-vertical patch merging and cross +foreground-background feature fusion to generate varied features to mitigate +complex image environments. HEA addresses noisy labeling using hierarchical +relationships among classes to refine instances given an image by eliminating +unlikely class categories. VCVA assesses the severity of detected damages via +volumetric representation and quantification leveraging the Dirac delta +distribution. A comprehensive quantitative study, two robustness tests, and an +application scenario based on the PEER Hub Image-Net dataset substantiate +Guided-DetNet's promising performances. Guided-DetNet outperformed the +best-compared models in a triple classification task by a difference of not +less than 3% and not less than 2% in a dual detection task under varying +metrics. + +
+
+
+
+
+ + ☆ Harnessing Large Vision and Language Models in Agriculture: A Review + + +
+ Large models can play important roles in many domains. Agriculture is another
+key factor affecting the lives of people around the world. It provides food,
+fabric, and fuel for humanity. However, facing many challenges such as pests
+and diseases, soil degradation, global warming, and food security, how to
+steadily increase the yield in the agricultural sector is a problem that humans
+still need to solve. Large models can help farmers improve production
+efficiency and harvest by detecting a series of agricultural production tasks
+such as pests and diseases, soil quality, and seed quality. They can also help
+farmers make wise decisions through a variety of information, such as images,
+text, etc. Herein, we delve into the potential applications of large models in
+agriculture, from large language models (LLMs) and large vision models (LVMs)
+to large vision-language models (LVLMs). After gaining a deeper understanding
+of multimodal large language models (MLLMs), it can be recognized that problems
+such as agricultural image processing, agricultural question answering systems,
+and agricultural machine automation can all be solved by large models. Large
+models have great potential in the field of agriculture. We outline the current
+applications of agricultural large models and aim to emphasize the importance
+of large models in the domain of agriculture. In the end, we envisage a future
+in which farmers use MLLMs to accomplish many tasks in agriculture, which can
+greatly improve agricultural production efficiency and yield.
+
+
+
+
+
+ + ☆ Semi-Supervised Teacher-Reference-Student Architecture for Action + Quality Assessment ECCV2024 + + +
+ Existing action quality assessment (AQA) methods often require a large number +of label annotations for fully supervised learning, which are laborious and +expensive. In practice, the labeled data are difficult to obtain because the +AQA annotation process requires domain-specific expertise. In this paper, we +propose a novel semi-supervised method, which can be utilized for better +assessment of the AQA task by exploiting a large amount of unlabeled data and a +small portion of labeled data. Differing from the traditional teacher-student +network, we propose a teacher-reference-student architecture to learn both +unlabeled and labeled data, where the teacher network and the reference network +are used to generate pseudo-labels for unlabeled data to supervise the student +network. Specifically, the teacher predicts pseudo-labels by capturing +high-level features of unlabeled data. The reference network provides adequate +supervision of the student network by referring to additional action +information. Moreover, we introduce confidence memory to improve the +reliability of pseudo-labels by storing the most accurate ever output of the +teacher network and reference network. To validate our method, we conduct +extensive experiments on three AQA benchmark datasets. Experimental results +show that our method achieves significant improvements and outperforms existing +semi-supervised AQA methods. + +
+
+ comment: To be published in ECCV2024 +
+
+
+
+
+ + ☆ Advancing Prompt Learning through an External Layer + + +
+ Prompt learning represents a promising method for adapting pre-trained
+visual-language models (VLMs) to various downstream tasks by learning a set of
+text embeddings. One challenge inherent to these methods is the poor
+generalization performance due to the invalidity of the learned text embeddings
+for unseen tasks. A straightforward approach to bridge this gap is to freeze
+the text embeddings in prompts, which results in a lack of capacity to adapt
+VLMs for downstream tasks. To address this dilemma, we propose to introduce an
+External Layer (EnLa) for the text branch and learnable visual embeddings for
+the visual branch to adapt VLMs to downstream tasks. The learnable external
+layer is built upon valid embeddings of pre-trained CLIP. This design considers
+the balance of learning capabilities between the two branches. To align the
+textual and visual features, we propose a novel two-pronged approach: i) we
+introduce optimal transport as the discrepancy metric to align the vision
+and text modalities, and ii) we introduce a novel strengthening feature to
+enhance the interaction between these two modalities. Extensive experiments
+show that our method performs favorably on 4 types of representative tasks
+across 11 datasets compared to the existing prompt learning methods.
+
+
+
+
+
+ + ☆ Take A Step Back: Rethinking the Two Stages in Visual Reasoning ECCV 2024 + + +
+ Visual reasoning, as a prominent research area, plays a crucial role in AI by +facilitating concept formation and interaction with the world. However, current +works are usually carried out separately on small datasets thus lacking +generalization ability. Through rigorous evaluation of diverse benchmarks, we +demonstrate the shortcomings of existing ad-hoc methods in achieving +cross-domain reasoning and their tendency to data bias fitting. In this paper, +we revisit visual reasoning with a two-stage perspective: (1) symbolization and +(2) logical reasoning given symbols or their representations. We find that the +reasoning stage is better at generalization than symbolization. Thus, it is +more efficient to implement symbolization via separated encoders for different +data domains while using a shared reasoner. Given our findings, we establish +design principles for visual reasoning frameworks following the separated +symbolization and shared reasoning. The proposed two-stage framework achieves +impressive generalization ability on various visual reasoning tasks, including +puzzles, physical prediction, and visual question answering (VQA), encompassing +both 2D and 3D modalities. We believe our insights will pave the way for +generalizable visual reasoning. + +
+
+ comment: ECCV 2024, Project page: + https://mybearyzhang.github.io/projects/TwoStageReason/ +
+
+
+
+
+ + ☆ Towards a Knowledge guided Multimodal Foundation Model for + Spatio-Temporal Remote Sensing Applications + + +
+ In recent years, there is increased interest in foundation models for
+geoscience due to the vast amount of Earth-observing satellite imagery.
+Existing remote sensing foundation models make use of the various sources of
+spectral imagery to create large models pretrained on a masked reconstruction
+task. The embeddings from these foundation models are then used for various
+downstream remote sensing applications. In this paper we propose a foundational
+modeling framework for remote sensing geoscience applications that goes beyond
+the traditional single-modality masked autoencoder family of foundation models.
+This framework leverages the knowledge-guided principles that the spectral
+imagery captures the impact of the physical drivers on the environmental
+system, and that the relationship between them is governed by the
+characteristics of the system. Specifically, our method, called MultiModal
+Variable Step Forecasting (MM-VSF), uses multimodal data (spectral imagery and
+weather) as its input and a variable step forecasting task as its pretraining
+objective. In our evaluation we show that forecasting of satellite imagery
+using weather can be used as an effective pretraining task for foundation
+models. We further show the effectiveness of the embeddings from MM-VSF on the
+downstream task of pixel-wise crop mapping, when compared with a model trained
+in the traditional setting of single-modality input and masked reconstruction
+based pretraining.
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ SALVE: A 3D Reconstruction Benchmark of Wounds from Consumer-grade + Videos + + +
+ Managing chronic wounds is a global challenge that can be alleviated by the +adoption of automatic systems for clinical wound assessment from consumer-grade +videos. While 2D image analysis approaches are insufficient for handling the 3D +features of wounds, existing approaches utilizing 3D reconstruction methods +have not been thoroughly evaluated. To address this gap, this paper presents a +comprehensive study on 3D wound reconstruction from consumer-grade videos. +Specifically, we introduce the SALVE dataset, comprising video recordings of +realistic wound phantoms captured with different cameras. Using this dataset, +we assess the accuracy and precision of state-of-the-art methods for 3D +reconstruction, ranging from traditional photogrammetry pipelines to advanced +neural rendering approaches. In our experiments, we observe that photogrammetry +approaches do not provide smooth surfaces suitable for precise clinical +measurements of wounds. Neural rendering approaches show promise in addressing +this issue, advancing the use of this technology in wound care practices. + +
+
+
+
+
+ + ☆ ComNeck: Bridging Compressed Image Latents and Multimodal LLMs via + Universal Transform-Neck + + +
+ This paper presents the first-ever study of adapting compressed image latents +to suit the needs of downstream vision tasks that adopt Multimodal Large +Language Models (MLLMs). MLLMs have extended the success of large language +models to modalities (e.g. images) beyond text, but their billion scale hinders +deployment on resource-constrained end devices. While cloud-hosted MLLMs could +be available, transmitting raw, uncompressed images captured by end devices to +the cloud requires an efficient image compression system. To address this, we +focus on emerging neural image compression and propose a novel framework with a +lightweight transform-neck and a surrogate loss to adapt compressed image +latents for MLLM-based vision tasks. The proposed framework is generic and +applicable to multiple application scenarios, where the neural image codec can +be (1) pre-trained for human perception without updating, (2) fully updated for +joint human and machine perception, or (3) fully updated for only machine +perception. The transform-neck trained with the surrogate loss is universal, +for it can serve various downstream vision tasks enabled by a variety of MLLMs +that share the same visual encoder. Our framework has the striking feature of +excluding the downstream MLLMs from training the transform-neck, and +potentially the neural image codec as well. This stands out from most existing +coding for machine approaches that involve downstream networks in training and +thus could be impractical when the networks are MLLMs. Extensive experiments on +different neural image codecs and various MLLM-based vision tasks show that our +method achieves great rate-accuracy performance with much less complexity, +demonstrating its effectiveness. + +
+
+
+
+
+ + ☆ Practical Video Object Detection via Feature Selection and Aggregation + + +
+ Compared with still image object detection, video object detection (VOD) +needs to particularly concern the high across-frame variation in object +appearance, and the diverse deterioration in some frames. In principle, the +detection in a certain frame of a video can benefit from information in other +frames. Thus, how to effectively aggregate features across different frames is +key to the target problem. Most of contemporary aggregation methods are +tailored for two-stage detectors, suffering from high computational costs due +to the dual-stage nature. On the other hand, although one-stage detectors have +made continuous progress in handling static images, their applicability to VOD +lacks sufficient exploration. To tackle the above issues, this study invents a +very simple yet potent strategy of feature selection and aggregation, gaining +significant accuracy at marginal computational expense. Concretely, for cutting +the massive computation and memory consumption from the dense prediction +characteristic of one-stage object detectors, we first condense candidate +features from dense prediction maps. Then, the relationship between a target +frame and its reference frames is evaluated to guide the aggregation. +Comprehensive experiments and ablation studies are conducted to validate the +efficacy of our design, and showcase its advantage over other cutting-edge VOD +methods in both effectiveness and efficiency. Notably, our model reaches +\emph{a new record performance, i.e., 92.9\% AP50 at over 30 FPS on the +ImageNet VID dataset on a single 3090 GPU}, making it a compelling option for +large-scale or real-time applications. The implementation is simple, and +accessible at \url{https://github.com/YuHengsss/YOLOV}. + +
+
+
+
+
+ + ☆ Foundations for Unfairness in Anomaly Detection -- Case Studies in + Facial Imaging Data AAAI + + +
+ Deep anomaly detection (AD) is perhaps the most controversial of data
+analytic tasks as it identifies entities that are then specifically targeted
+for further investigation or exclusion. Also controversial is the application
+of AI to facial imaging data. This work explores the intersection of these two
+areas to understand two core questions: "Who" these algorithms are being unfair
+to and, equally important, "Why". Recent work has shown that deep AD can be
+unfair to different groups despite being unsupervised; a recent study showed
+that, for portraits of people, men of color are far more likely to be chosen as
+outliers. We study the two main categories of AD algorithms: autoencoder-based
+and single-class-based, both of which effectively try to compress all the
+instances, with those that cannot be easily compressed being deemed outliers.
+We experimentally verify sources of unfairness such as the under-representation
+of a group (e.g. people of color are relatively rare), spurious group features
+(e.g. men are often photographed with hats), and group labeling noise (e.g.
+race is subjective). We conjecture that lack of compressibility is the main
+foundation and the others cause it, but experimental results show otherwise and
+we present a natural hierarchy amongst them.
+
+
+ comment: 16 pages, 8 figures, AAAI/ACM AIES24 +
+
+
+
+
+ + ☆ Text2LiDAR: Text-guided LiDAR Point Cloud Generation via Equirectangular + Transformer + + +
+ The complex traffic environment and various weather conditions make the
+collection of LiDAR data expensive and challenging. Achieving high-quality and
+controllable LiDAR data generation is therefore urgently needed; controlling
+generation with text is a common practice, but there has been little research
+in this field. To this end, we propose Text2LiDAR, the first efficient,
+diverse, and text-controllable LiDAR data generation model. Specifically, we
+design an equirectangular transformer architecture, utilizing the designed
+equirectangular attention to capture LiDAR features in a manner suited to the
+data characteristics. Then, we design a control-signal embedding injector to
+efficiently integrate control signals through the global-to-focused attention
+mechanism. Additionally, we devise a frequency modulator to assist the model in
+recovering high-frequency details, ensuring the clarity of the generated point
+cloud. To foster development in the field and optimize text-controlled
+generation performance, we construct nuLiDARtext, which offers diverse text
+descriptors for 34,149 LiDAR point clouds from 850 scenes. Experiments on
+uncontrolled and text-controlled generation in various forms on the KITTI-360
+and nuScenes datasets demonstrate the superiority of our approach.
+
+
+
+
+
+ + ☆ AgEval: A Benchmark for Zero-Shot and Few-Shot Plant Stress Phenotyping + with Multimodal LLMs + + +
+ Plant stress phenotyping traditionally relies on expert assessments and
+specialized models, limiting scalability in agriculture. Recent advances in
+multimodal large language models (LLMs) offer potential solutions to this
+challenge. We present AgEval, a benchmark comprising 12 diverse plant stress
+phenotyping tasks, to evaluate these models' capabilities. Our study assesses
+zero-shot and few-shot in-context learning performance of state-of-the-art
+models, including Claude, GPT, Gemini, and LLaVA. Results show significant
+performance improvements with few-shot learning, with F1 scores increasing from
+46.24% to 73.37% in 8-shot identification for the best-performing model.
+Few-shot examples from other classes in the dataset have negligible or negative
+impacts, although having the exact category example helps to increase
+performance by 15.38%. We also quantify the consistency of model performance
+across different classes within each task, finding that the coefficient of
+variance (CV) ranges from 26.02% to 58.03% across models, implying that
+subject-matter expertise on 'difficult' classes is needed to achieve reliable
+performance. AgEval establishes baseline metrics for multimodal LLMs in
+agricultural applications, offering insights into their promise for enhancing
+plant stress phenotyping at scale. Benchmark and code can be accessed at:
+https://anonymous.4open.science/r/AgEval/
+
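+ The per-class consistency statistic reported above can be reproduced in a few
+ lines; the labels below are random placeholders, so the printed numbers are
+ purely illustrative:
+
+    import numpy as np
+    from sklearn.metrics import f1_score
+
+    y_true = np.random.randint(0, 4, size=200)   # hypothetical 4-class task
+    y_pred = np.random.randint(0, 4, size=200)
+
+    per_class_f1 = f1_score(y_true, y_pred, average=None)   # one F1 per class
+    cv = 100 * per_class_f1.std() / per_class_f1.mean()     # coefficient of variance (%)
+    print(f"macro F1 = {per_class_f1.mean():.3f}, CV across classes = {cv:.1f}%")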
+
+
+
+
+ + ☆ Uncertainty-Rectified YOLO-SAM for Weakly Supervised ICH Segmentation + + +
+ Intracranial hemorrhage (ICH) is a life-threatening condition that requires +rapid and accurate diagnosis to improve treatment outcomes and patient survival +rates. Recent advancements in supervised deep learning have greatly improved +the analysis of medical images, but often rely on extensive datasets with +high-quality annotations, which are costly, time-consuming, and require medical +expertise to prepare. To mitigate the need for large amounts of expert-prepared +segmentation data, we have developed a novel weakly supervised ICH segmentation +method that utilizes the YOLO object detection model and an +uncertainty-rectified Segment Anything Model (SAM). In addition, we have +proposed a novel point prompt generator for this model to further improve +segmentation results with YOLO-predicted bounding box prompts. Our approach +achieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along +with a mean Dice score of 0.629 for ICH segmentation, outperforming existing +weakly supervised and popular supervised (UNet and Swin-UNETR) approaches. +Overall, the proposed method provides a robust and accurate alternative to the +more commonly used supervised techniques for ICH quantification without +requiring refined segmentation ground truths during model training. + +
+
+ comment: Manuscript was accepted at SWITCH2024. 10 pages, 2 figures +
+
+
+
+
+ + ☆ Learning Feature-Preserving Portrait Editing from Generated Pairs + + +
+ Portrait editing is challenging for existing techniques due to difficulties +in preserving subject features like identity. In this paper, we propose a +training-based method leveraging auto-generated paired data to learn desired +editing while ensuring the preservation of unchanged subject features. +Specifically, we design a data generation process to create reasonably good +training pairs for desired editing at low cost. Based on these pairs, we +introduce a Multi-Conditioned Diffusion Model to effectively learn the editing +direction and preserve subject features. During inference, our model produces +accurate editing mask that can guide the inference process to further preserve +detailed subject features. Experiments on costume editing and cartoon +expression editing show that our method achieves state-of-the-art quality, +quantitatively and qualitatively. + +
+
+
+
+
+ + ☆ MEVDT: Multi-Modal Event-Based Vehicle Detection and Tracking Dataset + + +
+ In this data article, we introduce the Multi-Modal Event-based Vehicle
+Detection and Tracking (MEVDT) dataset. This dataset provides a synchronized
+stream of event data and grayscale images of traffic scenes, captured using the
+Dynamic and Active-Pixel Vision Sensor (DAVIS) 240c hybrid event-based camera.
+MEVDT comprises 63 multi-modal sequences with approximately 13k images, 5M
+events, 10k object labels, and 85 unique object tracking trajectories.
+Additionally, MEVDT includes manually annotated ground truth labels, consisting
+of object classifications, pixel-precise bounding boxes, and unique object IDs,
+which are provided at a labeling frequency of 24 Hz. Designed to advance the
+research in the domain of event-based vision, MEVDT aims to address the
+critical need for high-quality, real-world annotated datasets that enable the
+development and evaluation of object detection and tracking algorithms in
+automotive environments.
+
+
+
+
+
+ + ☆ BaseBoostDepth: Exploiting Larger Baselines For Self-supervised + Monocular Depth Estimation + + +
+ In the domain of multi-baseline stereo, the conventional understanding is +that, in general, increasing baseline separation substantially enhances the +accuracy of depth estimation. However, prevailing self-supervised depth +estimation architectures primarily use minimal frame separation and a +constrained stereo baseline. Larger frame separations can be employed; however, +we show this to result in diminished depth quality due to various factors, +including significant changes in brightness, and increased areas of occlusion. +In response to these challenges, our proposed method, BaseBoostDepth, +incorporates a curriculum learning-inspired optimization strategy to +effectively leverage larger frame separations. However, we show that our +curriculum learning-inspired strategy alone does not suffice, as larger +baselines still cause pose estimation drifts. Therefore, we introduce +incremental pose estimation to enhance the accuracy of pose estimations, +resulting in significant improvements across all depth metrics. Additionally, +to improve the robustness of the model, we introduce error-induced +reconstructions, which optimize reconstructions with added error to the pose +estimations. Ultimately, our final depth network achieves state-of-the-art +performance on KITTI and SYNS-patches datasets across image-based, edge-based, +and point cloud-based metrics without increasing computational complexity at +test time. The project website can be found at +https://kieran514.github.io/BaseBoostDepth-Project. + +
+
+
+
+
+ + ☆ Mean Opinion Score as a New Metric for User-Evaluation of XAI Methods + + +
+ This paper investigates the use of Mean Opinion Score (MOS), a common image +quality metric, as a user-centric evaluation metric for XAI post-hoc +explainers. To measure the MOS, a user experiment is proposed, which has been +conducted with explanation maps of intentionally distorted images. Three +methods from the family of feature attribution methods - Gradient-weighted +Class Activation Mapping (Grad-CAM), Multi-Layered Feature Explanation Method +(MLFEM), and Feature Explanation Method (FEM) - are compared with this metric. +Additionally, the correlation of this new user-centric metric with automatic +metrics is studied via Spearman's rank correlation coefficient. MOS of MLFEM +shows the highest correlation with automatic metrics of Insertion Area Under +Curve (IAUC) and Deletion Area Under Curve (DAUC). However, the overall +correlations are limited, which highlights the lack of consensus between +automatic and user-centric metrics. + +
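+ The correlation analysis described above boils down to a Spearman rank
+ correlation between per-explanation MOS values and an automatic metric; the
+ numbers in this sketch are invented for illustration only:
+
+    import numpy as np
+    from scipy.stats import spearmanr
+
+    mos = np.array([3.2, 4.1, 2.5, 4.8, 3.9, 1.7, 4.4, 2.9])            # user scores
+    iauc = np.array([0.41, 0.63, 0.35, 0.71, 0.58, 0.22, 0.66, 0.44])   # automatic metric
+
+    rho, pval = spearmanr(mos, iauc)
+    print(f"Spearman rho = {rho:.2f} (p = {pval:.3f})")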
+
+ comment: Supported by organization Laboratoire Bordelais de Recherche en + Informatique, 15 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Event-based Optical Flow on Neuromorphic Processor: ANN vs. SNN + Comparison based on Activation Sparsification + + +
+ Spiking neural networks (SNNs) for event-based optical flow are claimed to be
+computationally more efficient than their artificial neural network (ANN)
+counterparts, but a fair comparison is missing in the literature. In this work,
+we propose an event-based optical flow solution based on activation
+sparsification and a neuromorphic processor, SENECA. SENECA has an event-driven
+processing mechanism that can exploit the sparsity in ANN activations and SNN
+spikes to accelerate the inference of both types of neural networks. The ANN
+and the SNN for comparison have similar low activation/spike density (~5%)
+thanks to our novel sparsification-aware training. In the hardware-in-loop
+experiments designed to deduce the average time and energy consumption, the SNN
+consumes 44.9ms and 927.0 microjoules, which are 62.5% and 75.2% of the ANN's
+consumption, respectively. We find that the SNN's higher efficiency is
+attributable to its lower pixel-wise spike density (43.5% vs. 66.5%), which
+requires fewer memory access operations for neuron states.
+
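+ The activation/spike density quoted above (~5%) is simply the fraction of
+ non-zero activations; a generic way to measure it (not SENECA-specific, with a
+ made-up activation map) is:
+
+    import torch
+
+    def activation_density(t: torch.Tensor, eps: float = 0.0) -> float:
+        """Fraction of activations (or spikes) with magnitude above eps."""
+        return (t.abs() > eps).float().mean().item()
+
+    # Hypothetical post-ReLU activation map with roughly 5% of units active.
+    act = torch.relu(torch.randn(1, 64, 32, 32) - 1.6)
+    print(f"density = {100 * activation_density(act):.1f}%")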
+
+ comment: 18 pages, 12 figures, 4 tables +
+
+
+
+
+ + ☆ Analysis and Improvement of Rank-Ordered Mean Algorithm in Single-Photon + LiDAR SP + + +
+ Depth estimation using a single-photon LiDAR is often solved by a matched +filter. It is, however, error-prone in the presence of background noise. A +commonly used technique to reject background noise is the rank-ordered mean +(ROM) filter previously reported by Shin \textit{et al.} (2015). ROM rejects +noisy photon arrival timestamps by selecting only a small range of them around +the median statistics within its local neighborhood. Despite the promising +performance of ROM, its theoretical performance limit is unknown. In this +paper, we theoretically characterize the ROM performance by showing that ROM +fails when the reflectivity drops below a threshold predetermined by the depth +and signal-to-background ratio, and its accuracy undergoes a phase transition +at the cutoff. Based on our theory, we propose an improved signal extraction +technique by selecting tight timestamp clusters. Experimental results show that +the proposed algorithm improves depth estimation performance over ROM by 3 +orders of magnitude at the same signal intensities, and achieves high image +fidelity at noise levels as high as 17 times that of signal. + +
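+ A crude stand-in for the median-based rejection step of ROM (not the exact
+ filter of Shin et al., and with invented arrival times) might look like this:
+
+    import numpy as np
+
+    def rom_reject(timestamps: np.ndarray, keep_window: float) -> np.ndarray:
+        """Keep only photon timestamps within +/- keep_window of the local
+        median, discarding background detections far from the signal return."""
+        med = np.median(timestamps)
+        return timestamps[np.abs(timestamps - med) <= keep_window]
+
+    rng = np.random.default_rng(0)
+    signal = rng.normal(52.0, 0.3, size=30)      # returns near the true depth
+    noise = rng.uniform(0.0, 100.0, size=20)     # uniform background photons
+    kept = rom_reject(np.concatenate([signal, noise]), keep_window=2.0)
+    print(kept.mean())                           # depth estimate from survivors
+
+ As the abstract notes, this style of rejection breaks down once signal photons
+ become too small a fraction of the timestamps, since the median then drifts
+ away from the true return.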
+
+ comment: 6 pages, 7 figures, submitted to the IEEE 26th International Workshop + on Multimedia Signal Processing (MMSP) +
+
+
+
+
+ + ☆ Dense Self-Supervised Learning for Medical Image Segmentation + + +
+ Deep learning has revolutionized medical image segmentation, but it relies +heavily on high-quality annotations. The time, cost and expertise required to +label images at the pixel-level for each new task has slowed down widespread +adoption of the paradigm. We propose Pix2Rep, a self-supervised learning (SSL) +approach for few-shot segmentation, that reduces the manual annotation burden +by learning powerful pixel-level representations directly from unlabeled +images. Pix2Rep is a novel pixel-level loss and pre-training paradigm for +contrastive SSL on whole images. It is applied to generic encoder-decoder deep +learning backbones (e.g., U-Net). Whereas most SSL methods enforce invariance +of the learned image-level representations under intensity and spatial image +augmentations, Pix2Rep enforces equivariance of the pixel-level +representations. We demonstrate the framework on a task of cardiac MRI +segmentation. Results show improved performance compared to existing semi- and +self-supervised approaches; and a 5-fold reduction in the annotation burden for +equivalent performance versus a fully supervised U-Net baseline. This includes +a 30% (resp. 31%) DICE improvement for one-shot segmentation under +linear-probing (resp. fine-tuning). Finally, we also integrate the novel +Pix2Rep concept with the Barlow Twins non-contrastive SSL, which leads to even +better segmentation performance. + +
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Alignment Scores: Robust Metrics for Multiview Pose Accuracy Evaluation + + +
+ We propose three novel metrics for evaluating the accuracy of a set of +estimated camera poses given the ground truth: Translation Alignment Score +(TAS), Rotation Alignment Score (RAS), and Pose Alignment Score (PAS). The TAS +evaluates the translation accuracy independently of the rotations, and the RAS +evaluates the rotation accuracy independently of the translations. The PAS is +the average of the two scores, evaluating the combined accuracy of both +translations and rotations. The TAS is computed in four steps: (1) Find the +upper quartile of the closest-pair-distances, $d$. (2) Align the estimated +trajectory to the ground truth using a robust registration method. (3) Collect +all distance errors and obtain the cumulative frequencies for multiple +thresholds ranging from $0.01d$ to $d$ with a resolution $0.01d$. (4) Add up +these cumulative frequencies and normalize them such that the theoretical +maximum is 1. The TAS has practical advantages over the existing metrics in +that (1) it is robust to outliers and collinear motion, and (2) there is no +need to adjust parameters on different datasets. The RAS is computed in a +similar manner to the TAS and is also shown to be more robust against outliers +than the existing rotation metrics. We verify our claims through extensive +simulations and provide in-depth discussion of the strengths and weaknesses of +the proposed metrics. + +
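+ Since the four steps of the TAS are spelled out above, a compact sketch is
+ easy to give; step 2 (robust registration) is skipped here by assuming the
+ estimated trajectory is already aligned, and the array shapes are assumptions:
+
+    import numpy as np
+
+    def translation_alignment_score(gt: np.ndarray, est_aligned: np.ndarray) -> float:
+        """gt, est_aligned: (N, 3) camera positions, with est_aligned already
+        registered to the ground truth (the paper's step 2 is omitted)."""
+        # Step 1: upper quartile of closest-pair distances, d (here on the GT).
+        dists = np.linalg.norm(gt[:, None, :] - gt[None, :, :], axis=-1)
+        np.fill_diagonal(dists, np.inf)
+        d = np.percentile(dists.min(axis=1), 75)
+        # Step 3: distance errors and cumulative frequencies at 0.01d ... d.
+        errors = np.linalg.norm(est_aligned - gt, axis=1)
+        thresholds = np.arange(1, 101) * 0.01 * d
+        cum_freq = np.array([(errors <= t).mean() for t in thresholds])
+        # Step 4: sum and normalise so the theoretical maximum is 1.
+        return cum_freq.sum() / len(thresholds)
+
+    gt = np.cumsum(np.random.randn(50, 3), axis=0)   # synthetic trajectory
+    est = gt + 0.01 * np.random.randn(50, 3)         # small estimation noise
+    print(translation_alignment_score(gt, est))      # close to 1 for good poses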
+
+
+
+
+ + ☆ Two-Phase Segmentation Approach for Accurate Left Ventricle Segmentation + in Cardiac MRI using Machine Learning + + +
+ Accurate segmentation of the Left Ventricle (LV) holds substantial importance
+due to its implications in disease detection, regional analysis, and the
+development of complex models for cardiac surgical planning. CMR is the gold
+standard for diagnosis of several cardiac diseases. The LV in CMR comprises
+three distinct sections: Basal, Mid-Ventricle, and Apical. This research
+focuses on the precise segmentation of the LV from Cardiac MRI (CMR) scans,
+combined with the capabilities of Machine Learning (ML). The central challenge
+in this research revolves around the absence of a set of parameters applicable
+to all three types of LV slices. Parameters optimized for basal slices often
+fall short when applied to mid-ventricular and apical slices, and vice versa.
+To handle this issue, a new method is proposed to enhance LV segmentation. The
+proposed method involves using distinct sets of parameters for each type of
+slice, resulting in a two-phase segmentation approach. The initial phase
+categorizes images into three groups based on the type of LV slice, while the
+second phase aims to segment CMR images using parameters derived from the
+preceding phase. A publicly available dataset, the Automated Cardiac Diagnosis
+Challenge (ACDC), is used with 10-fold cross validation, achieving a mean score
+of 0.9228. Comprehensive testing indicates that the best parameter set for a
+particular type of slice does not perform adequately for the other slice types.
+All results show that the proposed approach fills a critical void in parameter
+standardization through a two-phase segmentation model for the LV, aiming not
+only to improve the accuracy of cardiac image analysis but also to contribute
+advancements to the field of LV segmentation.
+
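+ The two-phase idea (classify the slice type first, then segment with
+ parameters tuned for that type) can be sketched as a simple dispatch; the
+ classifier, the parameter values, and the thresholding stand-in below are all
+ placeholders, not the study's actual pipeline:
+
+    import numpy as np
+
+    PARAMS = {                                   # hypothetical per-type settings
+        "basal":         {"threshold": 0.45, "min_area": 300},
+        "mid-ventricle": {"threshold": 0.55, "min_area": 200},
+        "apical":        {"threshold": 0.65, "min_area": 80},
+    }
+
+    def classify_slice(image: np.ndarray) -> str:
+        """Phase 1: decide which LV section a CMR slice belongs to (placeholder)."""
+        return "mid-ventricle"
+
+    def segment_lv(image: np.ndarray) -> np.ndarray:
+        """Phase 2: segment with the parameter set chosen for this slice type."""
+        p = PARAMS[classify_slice(image)]
+        mask = image > p["threshold"]            # toy stand-in for the segmenter
+        return mask if mask.sum() >= p["min_area"] else np.zeros_like(mask)
+
+    print(segment_lv(np.random.rand(256, 256)).sum())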
+
+
+
+
+ + ☆ A Model Generalization Study in Localizing Indoor Cows with COw + LOcalization (COLO) dataset + + +
+ Precision livestock farming (PLF) increasingly relies on advanced object +localization techniques to monitor livestock health and optimize resource +management. This study investigates the generalization capabilities of YOLOv8 +and YOLOv9 models for cow detection in indoor free-stall barn settings, +focusing on varying training data characteristics such as view angles and +lighting, and model complexities. Leveraging the newly released public dataset, +COws LOcalization (COLO) dataset, we explore three key hypotheses: (1) Model +generalization is equally influenced by changes in lighting conditions and +camera angles; (2) Higher model complexity guarantees better generalization +performance; (3) Fine-tuning with custom initial weights trained on relevant +tasks always brings advantages to detection tasks. Our findings reveal +considerable challenges in detecting cows in images taken from side views and +underscore the importance of including diverse camera angles in building a +detection model. Furthermore, our results emphasize that higher model +complexity does not necessarily lead to better performance. The optimal model +configuration heavily depends on the specific task and dataset. Lastly, while +fine-tuning with custom initial weights trained on relevant tasks offers +advantages to detection tasks, simpler models do not benefit similarly from +this approach. It is more efficient to train a simple model with pre-trained +weights without relying on prior relevant information, which can require +intensive labor efforts. Future work should focus on adaptive methods and +advanced data augmentation to improve generalization and robustness. This study +provides practical guidelines for PLF researchers on deploying computer vision +models from existing studies, highlights generalization issues, and contributes +the COLO dataset containing 1254 images and 11818 cow instances for further +research. + +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger + Visual Cues ECCV 2024 + + +
+ Effectively aligning with human judgment when evaluating machine-generated +image captions represents a complex yet intriguing challenge. Existing +evaluation metrics like CIDEr or CLIP-Score fall short in this regard as they +do not take into account the corresponding image or lack the capability of +encoding fine-grained details and penalizing hallucinations. To overcome these +issues, in this paper, we propose BRIDGE, a new learnable and reference-free +image captioning metric that employs a novel module to map visual features into +dense vectors and integrates them into multi-modal pseudo-captions which are +built during the evaluation process. This approach results in a multimodal +metric that properly incorporates information from the input image without +relying on reference captions, bridging the gap between human judgment and +machine-generated image captions. Experiments spanning several datasets +demonstrate that our proposal achieves state-of-the-art results compared to +existing reference-free evaluation scores. Our source code and trained models +are publicly available at: https://github.com/aimagelab/bridge-score. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Matryoshka Multimodal Models + + +
+ Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in
+visual-linguistic reasoning. These models first embed images into a fixed large
+number of visual tokens and then feed them into a Large Language Model (LLM).
+However, this design causes an excessive number of tokens for dense visual
+scenarios such as high-resolution images and videos, leading to great
+inefficiency. While token pruning/merging methods do exist, they produce a
+single length output for each image and do not afford flexibility in trading
+off information density vs. efficiency. Inspired by the concept of Matryoshka
+Dolls, we propose M3: Matryoshka Multimodal Models, which learns to represent
+visual content as nested sets of visual tokens that capture information across
+multiple coarse-to-fine granularities. Our approach offers several unique
+benefits for LMMs: (1) One can explicitly control the visual granularity per
+test instance during inference, e.g., adjusting the number of tokens used to
+represent an image based on the anticipated complexity or simplicity of the
+content; (2) M3 provides a framework for analyzing the granularity needed for
+existing datasets, where we find that COCO-style benchmarks only need around 9
+visual tokens to obtain accuracy similar to that of using all 576 tokens; (3)
+Our approach provides a foundation to explore the best trade-off between
+performance and visual token length at the sample level, where our
+investigation reveals that a large gap exists between the oracle upper bound
+and current fixed-scale representations.
+
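+ One plausible reading of the nested token sets (an illustrative guess, not the
+ released M3 code) is repeated average pooling of the patch-token grid, so a
+ caller can pick 576, 144, 36, 9, or 1 tokens per image at inference time:
+
+    import torch
+    import torch.nn.functional as F
+
+    def nested_visual_tokens(tokens: torch.Tensor, grid_sizes=(24, 12, 6, 3, 1)):
+        """tokens: (B, 576, C) patch tokens on an assumed 24x24 grid. Returns a
+        dict mapping token count -> (B, n, C) coarser token sets."""
+        b, n, c = tokens.shape
+        side = int(n ** 0.5)
+        grid = tokens.transpose(1, 2).reshape(b, c, side, side)
+        scales = {}
+        for g in grid_sizes:
+            pooled = F.adaptive_avg_pool2d(grid, g)          # (B, C, g, g)
+            scales[g * g] = pooled.flatten(2).transpose(1, 2)
+        return scales
+
+    for n_tok, t in nested_visual_tokens(torch.randn(1, 576, 1024)).items():
+        print(n_tok, tuple(t.shape))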
+
+ comment: Project Page: https://matryoshka-mm.github.io/ +
+
+
+
+
+ + ♻ ☆ Not Just Streaks: Towards Ground Truth for Single Image Deraining + + +
+ We propose a large-scale dataset of real-world rainy and clean image pairs +and a method to remove degradations, induced by rain streaks and rain +accumulation, from the image. As there exists no real-world dataset for +deraining, current state-of-the-art methods rely on synthetic data and thus are +limited by the sim2real domain gap; moreover, rigorous evaluation remains a +challenge due to the absence of a real paired dataset. We fill this gap by +collecting a real paired deraining dataset through meticulous control of +non-rain variations. Our dataset enables paired training and quantitative +evaluation for diverse real-world rain phenomena (e.g. rain streaks and rain +accumulation). To learn a representation robust to rain phenomena, we propose a +deep neural network that reconstructs the underlying scene by minimizing a +rain-robust loss between rainy and clean images. Extensive experiments +demonstrate that our model outperforms the state-of-the-art deraining methods +on real rainy images under various conditions. Project website: +https://visual.ee.ucla.edu/gt_rain.htm/. + +
+
+
+
+
+ + ♻ ☆ Frame Interpolation with Consecutive Brownian Bridge Diffusion + + +
+ Recent work in Video Frame Interpolation (VFI) tries to formulate VFI as a +diffusion-based conditional image generation problem, synthesizing the +intermediate frame given a random noise and neighboring frames. Due to the +relatively high resolution of videos, Latent Diffusion Models (LDMs) are +employed as the conditional generation model, where the autoencoder compresses +images into latent representations for diffusion and then reconstructs images +from these latent representations. Such a formulation poses a crucial +challenge: VFI expects that the output is deterministically equal to the ground +truth intermediate frame, but LDMs randomly generate a diverse set of different +images when the model runs multiple times. The reason for the diverse +generation is that the cumulative variance (variance accumulated at each step +of generation) of generated latent representations in LDMs is large. This makes +the sampling trajectory random, resulting in diverse rather than deterministic +generations. To address this problem, we propose our unique solution: Frame +Interpolation with Consecutive Brownian Bridge Diffusion. Specifically, we +propose consecutive Brownian Bridge diffusion that takes a deterministic +initial value as input, resulting in a much smaller cumulative variance of +generated latent representations. Our experiments suggest that our method can +improve together with the improvement of the autoencoder and achieve +state-of-the-art performance in VFI, leaving strong potential for further +enhancement. + +
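+ For intuition on why pinning both endpoints shrinks the cumulative variance,
+ the textbook Brownian bridge (not the authors' exact diffusion schedule) has
+ variance sigma^2 * t * (T - t) / T, which vanishes at both ends:
+
+    import numpy as np
+
+    def brownian_bridge_sample(x0, xT, t, T, sigma=1.0, rng=None):
+        """Sample a Brownian bridge pinned at x0 (time 0) and xT (time T)."""
+        if rng is None:
+            rng = np.random.default_rng()
+        mean = x0 + (t / T) * (xT - x0)          # linear interpolation of endpoints
+        var = sigma ** 2 * t * (T - t) / T       # zero at t = 0 and t = T
+        return mean + np.sqrt(var) * rng.standard_normal(np.shape(x0))
+
+    x0 = np.zeros(4)   # e.g. a deterministic latent from one neighbouring frame
+    xT = np.ones(4)    # latent from the other neighbouring frame
+    print(brownian_bridge_sample(x0, xT, t=0.5, T=1.0))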
+
+ comment: corrected typo +
+
+
+
+
+ + ♻ ☆ Generalizable Implicit Motion Modeling for Video Frame Interpolation + + +
+ Motion modeling is critical in flow-based Video Frame Interpolation (VFI). +Existing paradigms either consider linear combinations of bidirectional flows +or directly predict bilateral flows for given timestamps without exploring +favorable motion priors, thus lacking the capability of effectively modeling +spatiotemporal dynamics in real-world videos. To address this limitation, in +this study, we introduce Generalizable Implicit Motion Modeling (GIMM), a novel +and effective approach to motion modeling for VFI. Specifically, to enable GIMM +as an effective motion modeling paradigm, we design a motion encoding pipeline +to model spatiotemporal motion latent from bidirectional flows extracted from +pre-trained flow estimators, effectively representing input-specific motion +priors. Then, we implicitly predict arbitrary-timestep optical flows within two +adjacent input frames via an adaptive coordinate-based neural network, with +spatiotemporal coordinates and motion latent as inputs. Our GIMM can be +smoothly integrated with existing flow-based VFI works without further +modifications. We show that GIMM performs better than the current state of the +art on the VFI benchmarks. + +
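+ A toy version of the coordinate-based flow predictor described above: it maps a
+ spatiotemporal coordinate (x, y, t) plus a motion latent to a 2-D flow vector.
+ Layer sizes, the latent dimensionality, and the interface are assumptions for
+ illustration, not GIMM's actual architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class CoordinateFlowMLP(nn.Module):
+     """Predict a flow vector at an arbitrary timestep from coordinates and a
+     per-pixel motion latent (illustrative sketch)."""
+     def __init__(self, latent_dim=128, hidden=256):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(3 + latent_dim, hidden), nn.ReLU(),
+             nn.Linear(hidden, hidden), nn.ReLU(),
+             nn.Linear(hidden, 2),          # (u, v) flow at that coordinate
+         )
+
+     def forward(self, coords, motion_latent):
+         # coords: (N, 3) holding (x, y, t); motion_latent: (N, latent_dim)
+         return self.net(torch.cat([coords, motion_latent], dim=-1))
+
+ # Querying the same network at any t in [0, 1] yields arbitrary-timestep flows.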
+
+ comment: Project Page: https://gseancdat.github.io/projects/GIMMVFI +
+
+
+
+
+ + ♻ ☆ FastCLIP: A Suite of Optimization Techniques to Accelerate CLIP Training + with Limited Resources + + +
+ Existing studies of training state-of-the-art Contrastive Language-Image
+Pretraining (CLIP) models on large-scale data involve hundreds or even
+thousands of GPUs due to the requirement of a large batch size. However, such a
+large amount of resources is not accessible to most people. While advanced
+compositional optimization techniques for optimizing global contrastive losses
+have been demonstrated to be effective in removing the requirement of a large
+batch size, their performance on large-scale data remains underexplored and not
+optimized. To bridge the gap, this paper explores several aspects of CLIP
+training with limited resources (e.g., up to tens of GPUs). First, we introduce
+FastCLIP, a general CLIP training framework built on advanced compositional
+optimization techniques that is designed and optimized for the distributed
+setting. Our framework is equipped with an efficient gradient reduction
+strategy to reduce communication overhead. Second, to further boost training
+efficiency, we investigate three components of the framework from an
+optimization perspective: the schedule of the inner learning rate, and the
+update rules of the temperature parameter and the model parameters.
+Experiments on different strategies for each component shed light on how to
+conduct CLIP training more efficiently. Finally, we benchmark the performance
+of FastCLIP and the state-of-the-art training baseline (OpenCLIP) on different
+compute scales up to 32 GPUs on 8 nodes, and three data scales of 2.7 million,
+9.1 million, and 315 million image-text pairs to demonstrate the significant
+improvement of FastCLIP in the resource-limited setting. We release the code of
+FastCLIP at https://github.com/Optimization-AI/fast_clip.
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ CityX: Controllable Procedural Content Generation for Unbounded 3D + Cities + + +
+ Generating a realistic, large-scale 3D virtual city remains a complex +challenge due to the involvement of numerous 3D assets, various city styles, +and strict layout constraints. Existing approaches provide promising attempts +at procedural content generation to create large-scale scenes using Blender +agents. However, they face crucial issues such as difficulties in scaling up +generation capability and achieving fine-grained control at the semantic layout +level. To address these problems, we propose a novel multi-modal controllable +procedural content generation method, named CityX, which enhances realistic, +unbounded 3D city generation guided by multiple layout conditions, including +OSM, semantic maps, and satellite images. Specifically, the proposed method +contains a general protocol for integrating various PCG plugins and a +multi-agent framework for transforming instructions into executable Blender +actions. Through this effective framework, CityX shows the potential to build +an innovative ecosystem for 3D scene generation by bridging the gap between the +quality of generated assets and industrial requirements. Extensive experiments +have demonstrated the effectiveness of our method in creating high-quality, +diverse, and unbounded cities guided by multi-modal conditions. Our project +page: https://cityx-lab.github.io. + +
+
+
+
+
+ + ♻ ☆ Geospecific View Generation -- Geometry-Context Aware High-resolution + Ground View Inference from Satellite Views + + +
+ Predicting realistic ground views from satellite imagery in urban scenes is a
+challenging task due to the significant view gaps between satellite and
+ground-view images. We propose a novel pipeline to tackle this challenge by
+generating geospecific views that maximally respect the weak geometry and
+texture from multi-view satellite images. Different from existing approaches
+that hallucinate images from cues such as partial semantics or geometry from
+overhead satellite images, our method directly predicts ground-view images at
+the target geolocation by using a comprehensive set of information from the
+satellite image, resulting in ground-level images with a resolution boost by a
+factor of ten or more. We leverage a novel building refinement method to reduce
+geometric distortions in satellite data at ground level, which ensures the
+creation of accurate conditions for view synthesis using diffusion networks.
+Moreover, we propose a novel geospecific prior, which prompts distribution
+learning of diffusion models to respect image samples that are closer to the
+geolocation of the predicted images. We demonstrate that our pipeline is the
+first to generate close-to-real and geospecific ground views merely based on
+satellite images.
+
+ comment: 11 figures +
+
+
+
+
+ + ♻ ☆ HabiCrowd: A High Performance Simulator for Crowd-Aware Visual + Navigation IROS 2024 + + +
+ Visual navigation, a foundational aspect of Embodied AI (E-AI), has been
+studied extensively in the past few years. While many 3D simulators have been
+introduced to support visual navigation tasks, few works have been directed
+towards incorporating human dynamics, leaving a gap between simulation and
+real-world applications. Furthermore, current 3D simulators that incorporate
+human dynamics have several limitations, particularly in terms of computational
+efficiency, which is a key promise of E-AI simulators. To overcome these
+shortcomings, we introduce HabiCrowd, the first standard benchmark for
+crowd-aware visual navigation that integrates a crowd dynamics model with
+diverse human settings into photorealistic environments. Empirical evaluations
+demonstrate that our proposed human dynamics model achieves state-of-the-art
+performance in collision avoidance, while exhibiting superior computational
+efficiency compared to its counterparts. We leverage HabiCrowd to conduct
+several comprehensive studies on crowd-aware visual navigation tasks and
+human-robot interactions. The source code and data can be found at
+https://habicrowd.github.io/.
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ♻ ☆ ViLLa: Video Reasoning Segmentation with Large Language Model + + +
+ Although video perception models have made remarkable advancements in recent
+years, they still heavily rely on explicit text descriptions or pre-defined
+categories to identify target instances before executing video perception
+tasks. These models, however, fail to proactively comprehend and reason about
+the user's intentions via textual input. Even though previous works attempt to
+investigate solutions that incorporate reasoning with image segmentation, they
+fail to reason with videos due to the complexity of object motion in videos. To
+bridge the gap between images and videos, in this work, we propose a new video
+segmentation task - video reasoning segmentation. The task is designed to
+output tracklets of segmentation masks given a complex input text query.
+Moreover, to promote research in this unexplored area, we construct a reasoning
+video segmentation benchmark. Finally, we present ViLLa: Video reasoning
+segmentation with a Large Language Model, which incorporates the language
+generation capabilities of multimodal Large Language Models (LLMs) while
+retaining the capabilities of detecting, segmenting, and tracking multiple
+instances. We use a temporal-aware context aggregation module to incorporate
+contextual visual cues into text embeddings and propose a video-frame decoder
+to build temporal correlations across segmentation tokens. Remarkably, our
+ViLLa demonstrates capability in handling complex reasoning and referring video
+segmentation. Also, our model shows impressive ability on different temporal
+understanding benchmarks. Both quantitative and qualitative experiments show
+our method effectively unlocks new video reasoning segmentation capabilities
+for multimodal LLMs. The code and dataset will be available at
+https://github.com/rkzheng99/ViLLa.
+
+ comment: 15 pages,6 figures +
+
+
+
+
+ + ♻ ☆ reBEN: Refined BigEarthNet Dataset for Remote Sensing Image Analysis + + +
+ This paper presents refined BigEarthNet (reBEN) that is a large-scale, +multi-modal remote sensing dataset constructed to support deep learning (DL) +studies for remote sensing image analysis. The reBEN dataset consists of +549,488 pairs of Sentinel-1 and Sentinel-2 image patches. To construct reBEN, +we initially consider the Sentinel-1 and Sentinel-2 tiles used to construct the +BigEarthNet dataset and then divide them into patches of size 1200 m x 1200 m. +We apply atmospheric correction to the Sentinel-2 patches using the latest +version of the sen2cor tool, resulting in higher-quality patches compared to +those present in BigEarthNet. Each patch is then associated with a pixel-level +reference map and scene-level multi-labels. This makes reBEN suitable for +pixel- and scene-based learning tasks. The labels are derived from the most +recent CORINE Land Cover (CLC) map of 2018 by utilizing the 19-class +nomenclature as in BigEarthNet. The use of the most recent CLC map results in +overcoming the label noise present in BigEarthNet. Furthermore, we introduce a +new geographical-based split assignment algorithm that significantly reduces +the spatial correlation among the train, validation, and test sets with respect +to those present in BigEarthNet. This increases the reliability of the +evaluation of DL models. To minimize the DL model training time, we introduce +software tools that convert the reBEN dataset into a DL-optimized data format. +In our experiments, we show the potential of reBEN for multi-modal multi-label +image classification problems by considering several state-of-the-art DL +models. The pre-trained model weights, associated code, and complete dataset +are available at https://bigearth.net. + +
+
+
+
+
+ + ♻ ☆ Trimming the Fat: Efficient Compression of 3D Gaussian Splats through + Pruning BMVC 2024 + + +
+ In recent times, the utilization of 3D models has gained traction, owing to
+the capacity for end-to-end training initially offered by Neural Radiance
+Fields and more recently by 3D Gaussian Splatting (3DGS) models. The latter
+holds a significant advantage by inherently easing rapid convergence during
+training and offering extensive editability. However, despite rapid
+advancements, the literature is still in its infancy regarding the scalability
+of these models. In this study, we take some initial steps in addressing this
+gap, showing an approach that enables both the memory and computational
+scalability of such models. Specifically, we propose "Trimming the fat", a
+post-hoc gradient-informed iterative pruning technique to eliminate redundant
+information encoded in the model. Our experimental findings on widely
+acknowledged benchmarks attest to the effectiveness of our approach, revealing
+that up to 75% of the Gaussians can be removed while maintaining or even
+improving upon baseline performance. Our approach achieves around 50$\times$
+compression while preserving performance similar to the baseline model, and can
+speed up rendering to up to 600 FPS.
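+ A minimal sketch of one gradient-informed pruning step in the spirit described
+ above: Gaussians are ranked by an accumulated gradient-magnitude score and only
+ the highest-scoring fraction is kept. The scoring, keep ratio, and iteration
+ schedule are assumptions, not the paper's exact procedure.
+
+ import torch
+
+ def prune_gaussians(params, grad_score, keep_ratio=0.25):
+     """params: dict of per-Gaussian tensors (positions, opacities, ...), each
+     with leading dimension N; grad_score: (N,) accumulated gradient magnitudes."""
+     n_keep = max(1, int(keep_ratio * grad_score.numel()))
+     keep_idx = torch.topk(grad_score, n_keep).indices    # most "useful" Gaussians
+     return {name: t[keep_idx] for name, t in params.items()}
+
+ # Iterative schedule (sketch): accumulate grad_score over a few rendered views,
+ # prune, fine-tune briefly, and repeat until ~75% of the Gaussians are removed.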
+
+ comment: Accepted at BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Invariance of deep image quality metrics to affine transformations + + +
+ Deep architectures are the current state of the art in predicting subjective
+image quality. Usually, these models are evaluated according to their ability
+to correlate with human opinion in databases with a range of distortions that
+may appear in digital media. However, these evaluations overlook affine
+transformations, which may better represent the changes that images actually
+undergo in natural conditions. Humans can be particularly invariant to these
+natural transformations, as opposed to the digital ones. In this work, we
+evaluate state-of-the-art deep image quality metrics by assessing their
+invariance to affine transformations, specifically: rotation, translation,
+scaling, and changes in spectral illumination. Here, invariance of a metric
+refers to the fact that certain distances should be neglected (considered to be
+zero) if their values are below a threshold. This is what we call the
+invisibility threshold of a metric. We propose a methodology to assign such
+invisibility thresholds for any perceptual metric. This methodology involves
+transformations to a distance space common to any metric, and psychophysical
+measurements of thresholds in this common space. By doing so, we allow the
+analyzed metrics to be directly comparable with actual human thresholds. We
+find that none of the state-of-the-art metrics shows human-like results under
+this strong test based on invisibility thresholds. This means that tuning the
+models exclusively to predict the visibility of generic distortions may
+disregard other properties of human vision, such as invariances or invisibility
+thresholds.
+
+ comment: 24 pages 40 figures +
+
+
+
+
+ + ♻ ☆ Point2Building: Reconstructing Buildings from Airborne LiDAR Point + Clouds + + +
+ We present a learning-based approach to reconstruct buildings as 3D polygonal +meshes from airborne LiDAR point clouds. What makes 3D building reconstruction +from airborne LiDAR hard is the large diversity of building designs and +especially roof shapes, the low and varying point density across the scene, and +the often incomplete coverage of building facades due to occlusions by +vegetation or to the viewing angle of the sensor. To cope with the diversity of +shapes and inhomogeneous and incomplete object coverage, we introduce a +generative model that directly predicts 3D polygonal meshes from input point +clouds. Our autoregressive model, called Point2Building, iteratively builds up +the mesh by generating sequences of vertices and faces. This approach enables +our model to adapt flexibly to diverse geometries and building structures. +Unlike many existing methods that rely heavily on pre-processing steps like +exhaustive plane detection, our model learns directly from the point cloud +data, thereby reducing error propagation and increasing the fidelity of the +reconstruction. We experimentally validate our method on a collection of +airborne LiDAR data of Zurich, Berlin and Tallinn. Our method shows good +generalization to diverse urban styles. + +
+
+
+
+
+ + ♻ ☆ MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial + Representation Learning ECCV 2024 + + +
+ The volume of unlabelled Earth observation (EO) data is huge, but many +important applications lack labelled training data. However, EO data offers the +unique opportunity to pair data from different modalities and sensors +automatically based on geographic location and time, at virtually no human +labor cost. We seize this opportunity to create MMEarth, a diverse multi-modal +pretraining dataset at global scale. Using this new corpus of 1.2 million +locations, we propose a Multi-Pretext Masked Autoencoder (MP-MAE) approach to +learn general-purpose representations for optical satellite images. Our +approach builds on the ConvNeXt V2 architecture, a fully convolutional masked +autoencoder (MAE). Drawing upon a suite of multi-modal pretext tasks, we +demonstrate that our MP-MAE approach outperforms both MAEs pretrained on +ImageNet and MAEs pretrained on domain-specific satellite images. This is shown +on several downstream tasks including image classification and semantic +segmentation. We find that pretraining with multi-modal pretext tasks notably +improves the linear probing performance compared to pretraining on optical +satellite images only. This also leads to better label efficiency and parameter +efficiency which are crucial aspects in global scale applications. + +
+
+ comment: Accepted for ECCV 2024. Data and code: + https://vishalned.github.io/mmearth Update arXiv v2 (ECCV): 1. Dataset fix: + Removed duplicates and corrected ERA5 yearly statistics. 2. Data augmentation + fix: Random crops are now aligned. 3. Test metrics fix: Metrics are now + overall instead of mini-batch averages, matching GEO-Bench metrics. 4. + Pretrained on MMEarth v001 & evaluated on GEO-Bench v1.0 +
+
+
+
+
+ + ♻ ☆ MOD-UV: Learning Mobile Object Detectors from Unlabeled Videos ECCV 2024 + + +
+ Embodied agents must detect and localize objects of interest, e.g., traffic
+participants for self-driving cars. Supervision in the form of bounding boxes
+for this task is extremely expensive. As such, prior work has looked at
+unsupervised instance detection and segmentation, but in the absence of
+annotated boxes, it is unclear how pixels must be grouped into objects and
+which objects are of interest. This results in over-/under-segmentation and
+irrelevant objects. Inspired by the human visual system and practical
+applications, we posit that the key missing cue for unsupervised detection is
+motion: objects of interest are typically mobile objects that frequently move,
+and whose motions can specify separate instances. In this paper, we propose
+MOD-UV, a Mobile Object Detector learned from Unlabeled Videos only. We begin
+with instance pseudo-labels derived from motion segmentation, but introduce a
+novel training paradigm to progressively discover small objects and
+static-but-mobile objects that are missed by motion segmentation. As a result,
+though only learned from unlabeled videos, MOD-UV can detect and segment mobile
+objects from a single static image. Empirically, we achieve state-of-the-art
+performance in unsupervised mobile object detection on the Waymo Open,
+nuScenes, and KITTI datasets without using any external data or supervised
+models. Code is available at https://github.com/YihongSun/MOD-UV.
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking Domain Generalization: Discriminability and Generalizability + + +
+ Domain generalization (DG) endeavors to develop robust models that possess
+strong generalizability while preserving excellent discriminability.
+Nonetheless, pivotal DG techniques tend to improve the feature generalizability
+by learning domain-invariant representations, inadvertently overlooking the
+feature discriminability. On the one hand, the simultaneous attainment of
+generalizability and discriminability of features presents a complex challenge,
+often entailing inherent contradictions. This challenge becomes particularly
+pronounced when domain-invariant features manifest reduced discriminability
+owing to the inclusion of unstable factors, i.e., spurious correlations. On the
+other hand, prevailing domain-invariant methods can be categorized as
+category-level alignment, susceptible to discarding indispensable features
+possessing substantial generalizability and narrowing intra-class variations.
+To surmount these obstacles, we rethink DG from a new perspective that
+concurrently imbues features with formidable discriminability and robust
+generalizability, and present a novel framework, namely, Discriminative
+Microscopic Distribution Alignment (DMDA). DMDA incorporates two core
+components: Selective Channel Pruning (SCP) and Micro-level Distribution
+Alignment (MDA). Concretely, SCP attempts to curtail redundancy within neural
+networks, prioritizing stable attributes conducive to accurate classification.
+This approach alleviates the adverse effect of spurious domain invariance and
+amplifies the feature discriminability. Besides, MDA accentuates micro-level
+alignment within each class, going beyond mere category-level alignment.
+Extensive experiments on four benchmark datasets corroborate that DMDA achieves
+comparable results to state-of-the-art methods in DG, underscoring the efficacy
+of our method.
+
+ comment: Accepted to IEEE Transactions on Circuits and Systems for Video + Technology (TCSVT) +
+
+
+
+
+ + ♻ ☆ Statistical Test on Diffusion Model-based Generated Images by Selective + Inference + + +
+ AI technology for generating images, such as diffusion models, has advanced +rapidly. However, there is no established framework for quantifying the +reliability of AI-generated images, which hinders their use in critical +decision-making tasks, such as medical image diagnosis. In this study, we +propose a method to quantify the reliability of decision-making tasks that rely +on images produced by diffusion models within a statistical testing framework. +The core concept of our statistical test involves using a selective inference +framework, in which the statistical test is conducted under the condition that +the images are produced by a trained diffusion model. As a case study, we study +a diffusion model-based anomaly detection task for medical images. With our +approach, the statistical significance of medical image diagnostic outcomes can +be quantified in terms of a p-value, enabling decision-making with a controlled +error rate. We demonstrate the theoretical soundness and practical +effectiveness of our statistical test through numerical experiments on both +synthetic and brain image datasets. + +
+
+ comment: 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Multi-Memory Matching for Unsupervised Visible-Infrared Person + Re-Identification ECCV2024 + + +
+ Unsupervised visible-infrared person re-identification (USL-VI-ReID) is a
+promising yet challenging retrieval task. The key challenges in USL-VI-ReID are
+to effectively generate pseudo-labels and establish pseudo-label
+correspondences across modalities without relying on any prior annotations.
+Recently, clustered pseudo-label methods have gained more attention in
+USL-VI-ReID. However, previous methods fell short of fully exploiting
+individual nuances, as they simply utilized a single memory that represented an
+identity to establish cross-modality correspondences, resulting in ambiguous
+cross-modality correspondences. To address the problem, we propose a
+Multi-Memory Matching (MMM) framework for USL-VI-ReID. We first design a
+Cross-Modality Clustering (CMC) module to generate pseudo-labels by jointly
+clustering samples from both modalities. To associate cross-modality clustered
+pseudo-labels, we design a Multi-Memory Learning and Matching (MMLM) module,
+ensuring that optimization explicitly focuses on the nuances of individual
+perspectives and establishes reliable cross-modality correspondences. Finally,
+we design a Soft Cluster-level Alignment (SCA) module to narrow the modality
+gap while mitigating the effect of noisy pseudo-labels through a soft
+many-to-many alignment strategy. Extensive experiments on the public SYSU-MM01
+and RegDB datasets demonstrate the reliability of the established
+cross-modality correspondences and the effectiveness of our MMM. The source
+codes will be released.
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ JoReS-Diff: Joint Retinex and Semantic Priors in Diffusion Model for + Low-light Image Enhancement ACM MM 2024 + + +
+ Low-light image enhancement (LLIE) has achieved promising performance by
+employing conditional diffusion models. Despite the success of some conditional
+methods, previous methods may neglect the importance of a sufficient
+formulation of the task-specific condition strategy, resulting in suboptimal
+visual outcomes. In this study, we propose JoReS-Diff, a novel approach that
+incorporates Retinex- and semantic-based priors as the additional
+pre-processing condition to regulate the generating capabilities of the
+diffusion model. We first leverage a pre-trained decomposition network to
+generate the Retinex prior, which is updated with better quality by an
+adjustment network and integrated into a refinement network to implement
+Retinex-based conditional generation at both the feature and image levels.
+Moreover, the semantic prior is extracted from the input image with an
+off-the-shelf semantic segmentation model and incorporated through semantic
+attention layers. By treating Retinex- and semantic-based priors as the
+condition, JoReS-Diff presents a unique perspective for establishing a
+diffusion model for LLIE and similar image enhancement tasks. Extensive
+experiments validate the rationality and superiority of our approach.
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Improving Diffusion Models for Authentic Virtual Try-on in the Wild ECCV 2024 + + +
+ This paper considers image-based virtual try-on, which renders an image of a
+person wearing a curated garment, given a pair of images depicting the person
+and the garment, respectively. Previous works adapt existing exemplar-based
+inpainting diffusion models for virtual try-on to improve the naturalness of
+the generated visuals compared to other methods (e.g., GAN-based), but they
+fail to preserve the identity of the garments. To overcome this limitation, we
+propose a novel diffusion model that improves garment fidelity and generates
+authentic virtual try-on images. Our method, coined IDM-VTON, uses two
+different modules to encode the semantics of the garment image; given the base
+UNet of the diffusion model, 1) the high-level semantics extracted from a
+visual encoder are fused into the cross-attention layer, and then 2) the
+low-level features extracted from a parallel UNet are fused into the
+self-attention layer. In addition, we provide detailed textual prompts for both
+garment and person images to enhance the authenticity of the generated visuals.
+Finally, we present a customization method using a pair of person-garment
+images, which significantly improves fidelity and authenticity. Our
+experimental results show that our method outperforms previous approaches (both
+diffusion-based and GAN-based) in preserving garment details and generating
+authentic virtual try-on images, both qualitatively and quantitatively.
+Furthermore, the proposed customization method demonstrates its effectiveness
+in a real-world scenario. More visualizations are available on our project
+page: https://idm-vton.github.io
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ DART: An Automated End-to-End Object Detection Pipeline with Data + Diversification, Open-Vocabulary Bounding Box Annotation, Pseudo-Label + Review, and Model Training + + +
+ Accurate real-time object detection is vital across numerous industrial +applications, from safety monitoring to quality control. Traditional +approaches, however, are hindered by arduous manual annotation and data +collection, struggling to adapt to ever-changing environments and novel target +objects. To address these limitations, this paper presents DART, an innovative +automated end-to-end pipeline that revolutionizes object detection workflows +from data collection to model evaluation. It eliminates the need for laborious +human labeling and extensive data collection while achieving outstanding +accuracy across diverse scenarios. DART encompasses four key stages: (1) Data +Diversification using subject-driven image generation (DreamBooth with SDXL), +(2) Annotation via open-vocabulary object detection (Grounding DINO) to +generate bounding box and class labels, (3) Review of generated images and +pseudo-labels by large multimodal models (InternVL-1.5 and GPT-4o) to guarantee +credibility, and (4) Training of real-time object detectors (YOLOv8 and +YOLOv10) using the verified data. We apply DART to a self-collected dataset of +construction machines named Liebherr Product, which contains over 15K +high-quality images across 23 categories. The current instantiation of DART +significantly increases average precision (AP) from 0.064 to 0.832. Its modular +design ensures easy exchangeability and extensibility, allowing for future +algorithm upgrades, seamless integration of new object categories, and +adaptability to customized environments without manual labeling and additional +data collection. The code and dataset are released at +https://github.com/chen-xin-94/DART. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical + Imaging + + +
+ Medical imaging cohorts are often confounded by factors such as acquisition +devices, hospital sites, patient backgrounds, and many more. As a result, deep +learning models tend to learn spurious correlations instead of causally related +features, limiting their generalizability to new and unseen data. This problem +can be addressed by minimizing dependence measures between intermediate +representations of task-related and non-task-related variables. These measures +include mutual information, distance correlation, and the performance of +adversarial classifiers. Here, we benchmark such dependence measures for the +task of preventing shortcut learning. We study a simplified setting using +Morpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our +results provide insights into how to mitigate confounding factors in medical +imaging. + +
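+ Distance correlation, one of the dependence measures benchmarked above, has a
+ simple empirical estimator: double-center the pairwise distance matrices of the
+ two representations and normalize their cross-covariance. A minimal NumPy
+ implementation, for illustration only, looks like this:
+
+ import numpy as np
+
+ def distance_correlation(x, y):
+     """Empirical distance correlation between samples x (n, p) and y (n, q);
+     values near 0 indicate (approximate) independence of the representations."""
+     x, y = np.asarray(x, float), np.asarray(y, float)
+     if x.ndim == 1: x = x[:, None]
+     if y.ndim == 1: y = y[:, None]
+     def centered_dist(z):
+         d = np.linalg.norm(z[:, None, :] - z[None, :, :], axis=-1)
+         return d - d.mean(axis=0, keepdims=True) - d.mean(axis=1, keepdims=True) + d.mean()
+     A, B = centered_dist(x), centered_dist(y)
+     dcov2_xy = (A * B).mean()              # squared distance covariance
+     dcov2_xx = (A * A).mean()
+     dcov2_yy = (B * B).mean()
+     denom = np.sqrt(np.sqrt(dcov2_xx * dcov2_yy))
+     return 0.0 if denom == 0 else np.sqrt(max(dcov2_xy, 0.0)) / denom
+
+ # In a shortcut-prevention setup, such a measure would be minimized between the
+ # task representation and embeddings of the confounding variable.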
+
+ comment: Accepted to the 15th International Workshop on Machine Learning in + Medical Imaging (MLMI 2024); new version: appendix moved to the end, after + the references +
+
+
+
+
+ + ♻ ☆ Rethinking Learned Image Compression: Context is All You Need + + +
+ Since Learned Image Compression (LIC) has recently made rapid progress
+compared to traditional methods, this paper discusses the question 'Where is
+the boundary of LIC?' with regard to subjective metrics. It splits this problem
+into two sub-problems: 1) Where is the boundary of rate-distortion performance
+in terms of PSNR? 2) How can the compression gain be further improved to reach
+that boundary? To this end, this paper analyzes the effectiveness of scaling
+parameters for the encoder, decoder, and context model, which are the three
+components of LIC, and concludes that scaling LIC amounts to scaling its
+context model and decoder. Extensive experiments demonstrate that overfitting
+can actually serve as an effective context. By optimizing the context, this
+paper further improves PSNR and achieves state-of-the-art performance, showing
+a 14.39% BD-rate gain over VVC.
+
+
+
+
+ + ♻ ☆ SU-SAM: A Simple Unified Framework for Adapting Segment Anything Model + in Underperformed Scenes + + +
+ Segment anything model (SAM) has demonstrated excellent generalizability in
+common vision scenarios, yet falls short of the ability to understand
+specialized data. Recently, several methods have combined parameter-efficient
+techniques with task-specific designs to fine-tune SAM on particular tasks.
+However, these methods heavily rely on handcrafted, complicated, and
+task-specific designs, and on pre/post-processing, to achieve acceptable
+performance on downstream tasks. As a result, this severely restricts
+generalizability to other downstream tasks. To address this issue, we present a
+simple and unified framework, namely SU-SAM, that can easily and efficiently
+fine-tune the SAM model with parameter-efficient techniques while maintaining
+excellent generalizability toward various downstream tasks. SU-SAM does not
+require any task-specific designs and aims to significantly improve the
+adaptability of SAM-like models toward underperformed scenes. Concretely, we
+abstract parameter-efficient modules of different methods into basic design
+elements in our framework. Besides, we propose four variants of SU-SAM, i.e.,
+series, parallel, mixed, and LoRA structures. Comprehensive experiments are
+conducted on nine datasets and six downstream tasks to verify the effectiveness
+of SU-SAM, including medical image segmentation, camouflaged object detection,
+salient object segmentation, surface defect segmentation, complex object
+shapes, and shadow masking. Our experimental results demonstrate that SU-SAM
+achieves competitive or superior accuracy compared to state-of-the-art methods.
+Furthermore, we provide in-depth analyses highlighting the effectiveness of
+different parameter-efficient designs within SU-SAM. In addition, we propose a
+generalized model and benchmark, showcasing SU-SAM's generalizability across
+all diverse datasets simultaneously.
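+ One of the basic parameter-efficient design elements named above is a LoRA
+ adapter: the frozen base weight is augmented with a trainable low-rank branch.
+ The sketch below shows the standard LoRA pattern; the rank, scaling, and where
+ it is inserted in SAM are assumptions for illustration.
+
+ import torch
+ import torch.nn as nn
+
+ class LoRALinear(nn.Module):
+     """Wrap a frozen linear layer with a trainable low-rank update (LoRA)."""
+     def __init__(self, base: nn.Linear, rank=4, alpha=16):
+         super().__init__()
+         self.base = base
+         for p in self.base.parameters():        # keep the pre-trained SAM weights frozen
+             p.requires_grad_(False)
+         self.down = nn.Linear(base.in_features, rank, bias=False)
+         self.up = nn.Linear(rank, base.out_features, bias=False)
+         nn.init.zeros_(self.up.weight)           # adapter starts as a no-op
+         self.scale = alpha / rank
+
+     def forward(self, x):
+         return self.base(x) + self.scale * self.up(self.down(x))
+
+ # Series/parallel/mixed variants would place such adapters sequentially after,
+ # or in parallel with, the frozen blocks instead of inside the linear layers.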
+
+
+
+
+ + ♻ ☆ Slimmable Networks for Contrastive Self-supervised Learning + + +
+ Self-supervised learning makes significant progress in pre-training large +models, but struggles with small models. Mainstream solutions to this problem +rely mainly on knowledge distillation, which involves a two-stage procedure: +first training a large teacher model and then distilling it to improve the +generalization ability of smaller ones. In this work, we introduce another +one-stage solution to obtain pre-trained small models without the need for +extra teachers, namely, slimmable networks for contrastive self-supervised +learning (SlimCLR). A slimmable network consists of a full network and several +weight-sharing sub-networks, which can be pre-trained once to obtain various +networks, including small ones with low computation costs. However, +interference between weight-sharing networks leads to severe performance +degradation in self-supervised cases, as evidenced by gradient magnitude +imbalance and gradient direction divergence. The former indicates that a small +proportion of parameters produce dominant gradients during backpropagation, +while the main parameters may not be fully optimized. The latter shows that the +gradient direction is disordered, and the optimization process is unstable. To +address these issues, we introduce three techniques to make the main parameters +produce dominant gradients and sub-networks have consistent outputs. These +techniques include slow start training of sub-networks, online distillation, +and loss re-weighting according to model sizes. Furthermore, theoretical +results are presented to demonstrate that a single slimmable linear layer is +sub-optimal during linear evaluation. Thus a switchable linear probe layer is +applied during linear evaluation. We instantiate SlimCLR with typical +contrastive learning frameworks and achieve better performance than previous +arts with fewer parameters and FLOPs. The code is at +https://github.com/mzhaoshuai/SlimCLR. + +
+
+ comment: Accepted by IJCV, code is at https://github.com/mzhaoshuai/SlimCLR +
+
+
+
+
+ + ♻ ☆ DHGS: Decoupled Hybrid Gaussian Splatting for Driving Scene + + +
+ Existing Gaussian splatting methods often fall short of achieving
+satisfactory novel view synthesis in driving scenes, primarily due to the
+absence of crafty design and geometric constraints for the involved elements.
+This paper introduces a novel neural rendering method termed Decoupled Hybrid
+Gaussian Splatting (DHGS), aimed at improving the rendering quality of novel
+view synthesis for static driving scenes. The novelty of this work lies in the
+decoupled and hybrid pixel-level blender for the road and non-road layers,
+without the conventional unified differentiable rendering logic for the entire
+scene, while still maintaining consistent and continuous superimposition
+through the proposed depth-ordered hybrid rendering strategy. Additionally, an
+implicit road representation comprised of a Signed Distance Field (SDF) is
+trained to supervise the road surface with subtle geometric attributes.
+Accompanied by the use of auxiliary transmittance loss and consistency loss,
+novel images with imperceptible boundaries and elevated fidelity are ultimately
+obtained. Substantial experiments on the Waymo dataset prove that DHGS
+outperforms the state-of-the-art methods. The project page, where more video
+evidence is provided, is: https://ironbrotherstyle.github.io/dhgs_web.
+
+ comment: 13 pages, 14 figures, conference +
+
+
+
+
+ + ♻ ☆ FlightScope: A Deep Comprehensive Review of Aircraft Detection + Algorithms in Satellite Imagery + + +
+ Object detection in remotely sensed satellite pictures is fundamental in many
+fields such as biophysical and environmental monitoring. While deep learning
+algorithms are constantly evolving, they have been mostly implemented and
+tested on popular ground-based photos. This paper critically evaluates and
+compares a suite of advanced object detection algorithms customized for the
+task of identifying aircraft within satellite imagery. Using the large
+HRPlanesV2 dataset, together with rigorous validation on the GDIT dataset, this
+research encompasses an array of methodologies including YOLO versions 5 and 8,
+Faster RCNN, CenterNet, RetinaNet, RTMDet, and DETR, all trained from scratch.
+This exhaustive training and validation study reveals YOLOv5 as the preeminent
+model for the specific case of identifying airplanes from remote sensing data,
+showcasing high precision and adaptability across diverse imaging conditions.
+This research highlights the nuanced performance landscapes of these
+algorithms, with YOLOv5 emerging as a robust solution for aerial object
+detection, underlining its importance through superior mean average precision,
+recall, and intersection-over-union scores. The findings described here
+underscore the fundamental role of algorithm selection aligned with the
+specific demands of satellite imagery analysis and extend a comprehensive
+framework to evaluate model efficacy. The benchmark toolkit and code, available
+via https://github.com/toelt-llc/FlightScope_Bench, aim to foster further
+exploration and innovation in the realm of remote sensing object detection,
+paving the way for improved analytical methodologies in satellite imagery
+applications.
+
+ comment: 15 figures, 4 tables, comprehensive survey, comparative study +
+
+
+
+
+ + ♻ ☆ UNIKD: UNcertainty-filtered Incremental Knowledge Distillation for + Neural Implicit Representation ECCV 2024 + + +
+ Recent neural implicit representations (NIRs) have achieved great success in +the tasks of 3D reconstruction and novel view synthesis. However, they require +the images of a scene from different camera views to be available for one-time +training. This is expensive especially for scenarios with large-scale scenes +and limited data storage. In view of this, we explore the task of incremental +learning for NIRs in this work. We design a student-teacher framework to +mitigate the catastrophic forgetting problem. Specifically, we iterate the +process of using the student as the teacher at the end of each time step and +let the teacher guide the training of the student in the next step. As a +result, the student network is able to learn new information from the streaming +data and retain old knowledge from the teacher network simultaneously. Although +intuitive, naively applying the student-teacher pipeline does not work well in +our task. Not all information from the teacher network is helpful since it is +only trained with the old data. To alleviate this problem, we further introduce +a random inquirer and an uncertainty-based filter to filter useful information. +Our proposed method is general and thus can be adapted to different implicit +representations such as neural radiance field (NeRF) and neural surface field. +Extensive experimental results for both 3D reconstruction and novel view +synthesis demonstrate the effectiveness of our approach compared to different +baselines. + +
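+ The uncertainty-based filtering described above can be pictured as a masked
+ distillation loss: only rays whose teacher prediction is confident contribute
+ to the student's supervision. The sketch below is an illustrative reading of
+ that idea; the threshold, uncertainty source, and loss form are assumptions.
+
+ import torch
+
+ def filtered_distillation_loss(student_rgb, teacher_rgb, teacher_uncertainty, tau=0.1):
+     """student_rgb, teacher_rgb: (N, 3) per-ray colors; teacher_uncertainty: (N,).
+     Only confident teacher outputs (uncertainty < tau) are distilled."""
+     keep = teacher_uncertainty < tau
+     if keep.sum() == 0:
+         return torch.zeros((), device=student_rgb.device)
+     return ((student_rgb[keep] - teacher_rgb[keep]) ** 2).mean()
+
+ # Combined with the loss on new streaming data, this lets the student learn new
+ # views while retaining only the reliable parts of the old teacher's knowledge.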
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ ViewDiff: 3D-Consistent Image Generation with Text-to-Image Models CVPR 2024 + + +
+ 3D asset generation is attracting massive attention, inspired by the recent
+success of text-guided 2D content creation. Existing text-to-3D methods use
+pretrained text-to-image diffusion models in an optimization problem or
+fine-tune them on synthetic data, which often results in non-photorealistic 3D
+objects without backgrounds. In this paper, we present a method that leverages
+pretrained text-to-image models as a prior, and learns to generate multi-view
+images in a single denoising process from real-world data. Concretely, we
+propose to integrate 3D volume-rendering and cross-frame-attention layers into
+each block of the existing U-Net network of the text-to-image model. Moreover,
+we design an autoregressive generation scheme that renders more 3D-consistent
+images at any viewpoint. We train our model on real-world datasets of objects
+and showcase its capabilities to generate instances with a variety of
+high-quality shapes and textures in authentic surroundings. Compared to the
+existing methods, the results generated by our method are consistent and have
+favorable visual quality (-30% FID, -37% KID).
+
+ comment: Accepted to CVPR 2024, project page: + https://lukashoel.github.io/ViewDiff/, video: + https://www.youtube.com/watch?v=SdjoCqHzMMk, code: + https://github.com/facebookresearch/ViewDiff +
+
+
+
+
+ + ♻ ☆ AV-Deepfake1M: A Large-Scale LLM-Driven Audio-Visual Deepfake Dataset ACM MM 2024 + + +
+ The detection and localization of highly realistic deepfake audio-visual +content are challenging even for the most advanced state-of-the-art methods. +While most of the research efforts in this domain are focused on detecting +high-quality deepfake images and videos, only a few works address the problem +of the localization of small segments of audio-visual manipulations embedded in +real videos. In this research, we emulate the process of such content +generation and propose the AV-Deepfake1M dataset. The dataset contains +content-driven (i) video manipulations, (ii) audio manipulations, and (iii) +audio-visual manipulations for more than 2K subjects resulting in a total of +more than 1M videos. The paper provides a thorough description of the proposed +data generation pipeline accompanied by a rigorous analysis of the quality of +the generated data. The comprehensive benchmark of the proposed dataset +utilizing state-of-the-art deepfake detection and localization methods +indicates a significant drop in performance compared to previous datasets. The +proposed dataset will play a vital role in building the next-generation +deepfake localization methods. The dataset and associated code are available at +https://github.com/ControlNet/AV-Deepfake1M . + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ MVMR: A New Framework for Evaluating Faithfulness of Video Moment + Retrieval against Multiple Distractors CIKM 2024 + + +
+ With the explosion of multimedia content, video moment retrieval (VMR), which
+aims to detect a video moment that matches a given text query from a video, has
+been studied intensively as a critical problem. However, the existing VMR
+framework evaluates video moment retrieval performance under the assumption
+that the relevant video is given, which may not reveal whether the models
+exhibit overconfidence when a false video is given. In this paper, we propose
+the MVMR (Massive Videos Moment Retrieval for Faithfulness Evaluation) task,
+which aims to retrieve video moments within a massive video set, including
+multiple distractors, to evaluate the faithfulness of VMR models. For this
+task, we suggest an automated massive video pool construction framework to
+categorize negative (distractor) and positive (false-negative) video sets using
+textual and visual semantic distance verification methods. We extend existing
+VMR datasets using these methods and newly construct three practical MVMR
+datasets. To solve the task, we further propose a strong informative
+sample-weighted learning method, CroCs, which employs two contrastive learning
+mechanisms: (1) weakly-supervised potential negative learning and (2)
+cross-directional hard-negative learning. Experimental results on the MVMR
+datasets reveal that existing VMR models are easily distracted by
+misinformation (distractors), whereas our model shows significantly robust
+performance, demonstrating that CroCs is essential to distinguishing positive
+moments against distractors. Our code and datasets are publicly available:
+https://github.com/yny0506/Massive-Videos-Moment-Retrieval.
+
+ comment: accepted to CIKM 2024 +
+
+
+
+
+ + ♻ ☆ MEIA: Multimodal Embodied Perception and Interaction in Unknown + Environments + + +
+ With the surge in the development of large language models, embodied +intelligence has attracted increasing attention. Nevertheless, prior works on +embodied intelligence typically encode scene or historical memory in an +unimodal manner, either visual or linguistic, which complicates the alignment +of the model's action planning with embodied control. To overcome this +limitation, we introduce the Multimodal Embodied Interactive Agent (MEIA), +capable of translating high-level tasks expressed in natural language into a +sequence of executable actions. Specifically, we propose a novel Multimodal +Environment Memory (MEM) module, facilitating the integration of embodied +control with large models through the visual-language memory of scenes. This +capability enables MEIA to generate executable action plans based on diverse +requirements and the robot's capabilities. Furthermore, we construct an +embodied question answering dataset based on a dynamic virtual cafe environment +with the help of the large language model. In this virtual environment, we +conduct several experiments, utilizing multiple large models through zero-shot +learning, and carefully design scenarios for various situations. The +experimental results showcase the promising performance of our MEIA in various +embodied interactive tasks. + +
+
+ comment: Codes will be available at + https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List +
+
+
+
+
+ + ♻ ☆ Aligning Cyber Space with Physical World: A Comprehensive Survey on + Embodied AI + + +
+ Embodied Artificial Intelligence (Embodied AI) is crucial for achieving +Artificial General Intelligence (AGI) and serves as a foundation for various +applications that bridge cyberspace and the physical world. Recently, the +emergence of Multi-modal Large Models (MLMs) and World Models (WMs) have +attracted significant attention due to their remarkable perception, +interaction, and reasoning capabilities, making them a promising architecture +for the brain of embodied agents. However, there is no comprehensive survey for +Embodied AI in the era of MLMs. In this survey, we give a comprehensive +exploration of the latest advancements in Embodied AI. Our analysis firstly +navigates through the forefront of representative works of embodied robots and +simulators, to fully understand the research focuses and their limitations. +Then, we analyze four main research targets: 1) embodied perception, 2) +embodied interaction, 3) embodied agent, and 4) sim-to-real adaptation, +covering the state-of-the-art methods, essential paradigms, and comprehensive +datasets. Additionally, we explore the complexities of MLMs in virtual and real +embodied agents, highlighting their significance in facilitating interactions +in dynamic digital and physical environments. Finally, we summarize the +challenges and limitations of embodied AI and discuss their potential future +directions. We hope this survey will serve as a foundational reference for the +research community and inspire continued innovation. The associated project can +be found at https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List. + +
+
+ comment: The first comprehensive review of Embodied AI in the era of MLMs, 36 + pages. We also provide the paper list for Embodied AI: + https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List +
+
+
+
+
+ + ♻ ☆ ReMoS: 3D Motion-Conditioned Reaction Synthesis for Two-Person + Interactions + + +
+ Current approaches for 3D human motion synthesis generate high-quality
+animations of digital humans performing a wide variety of actions and gestures.
+However, a notable technological gap exists in addressing the complex dynamics
+of multi-human interactions within this paradigm. In this work, we present
+ReMoS, a denoising diffusion-based model that synthesizes full-body reactive
+motion of a person in a two-person interaction scenario. Given the motion of
+one person, we employ a combined spatio-temporal cross-attention mechanism to
+synthesize the reactive body and hand motion of the second person, thereby
+completing the interactions between the two. We demonstrate ReMoS across
+challenging two-person scenarios such as pair dancing, Ninjutsu, kickboxing,
+and acrobatics, where one person's movements have complex and diverse
+influences on the other. We also contribute the ReMoCap dataset for two-person
+interactions containing full-body and finger motions. We evaluate ReMoS through
+multiple quantitative metrics, qualitative visualizations, and a user study,
+and also demonstrate its usability in interactive motion editing applications.
+
+ comment: 29 pages, 7 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Three-Stream Temporal-Shift Attention Network Based on Self-Knowledge + Distillation for Micro-Expression Recognition + + +
+ Micro-expressions are subtle facial movements that occur spontaneously when +people try to conceal real emotions. Micro-expression recognition is crucial in +many fields, including criminal analysis and psychotherapy. However, +micro-expression recognition is challenging since micro-expressions have low +intensity and public datasets are small in size. To this end, a three-stream +temporal-shift attention network based on self-knowledge distillation called +SKD-TSTSAN is proposed in this paper. Firstly, to address the low intensity of +muscle movements, we utilize learning-based motion magnification modules to +enhance the intensity of muscle movements. Secondly, we employ efficient +channel attention modules in the local-spatial stream to make the network focus +on facial regions that are highly relevant to micro-expressions. In addition, +temporal shift modules are used in the dynamic-temporal stream, which enables +temporal modeling with no additional parameters by mixing motion information +from two different temporal domains. Furthermore, we introduce self-knowledge +distillation into the micro-expression recognition task by introducing +auxiliary classifiers and using the deepest section of the network for +supervision, encouraging all blocks to fully explore the features of the +training set. Finally, extensive experiments are conducted on four public +datasets: CASME II, SAMM, MMEW, and CAS(ME)3. The experimental results +demonstrate that our SKD-TSTSAN outperforms other existing methods and achieves +new state-of-the-art performance. Our code will be available at +https://github.com/GuanghaoZhu663/SKD-TSTSAN. + +
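+ The temporal shift modules mentioned above mix motion information between
+ neighboring frames with zero extra parameters by shifting a fraction of the
+ channels along the time axis. A standard sketch of such a module follows; the
+ channel fraction and input layout are illustrative assumptions.
+
+ import torch
+ import torch.nn as nn
+
+ class TemporalShift(nn.Module):
+     """Shift part of the channels forward/backward in time (parameter-free)."""
+     def __init__(self, shift_div=8):
+         super().__init__()
+         self.shift_div = shift_div
+
+     def forward(self, x):
+         # x: (B, T, C, H, W)
+         b, t, c, h, w = x.shape
+         fold = c // self.shift_div
+         out = torch.zeros_like(x)
+         out[:, :-1, :fold] = x[:, 1:, :fold]                    # shift towards the past
+         out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]    # shift towards the future
+         out[:, :, 2 * fold:] = x[:, :, 2 * fold:]               # remaining channels unchanged
+         return out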
+
+
+
+
+ + ♻ ☆ Adaptive Self-training Framework for Fine-grained Scene Graph Generation ICLR 2024 + + +
+ Scene graph generation (SGG) models have suffered from inherent problems +regarding the benchmark datasets such as the long-tailed predicate distribution +and missing annotation problems. In this work, we aim to alleviate the +long-tailed problem of SGG by utilizing unannotated triplets. To this end, we +introduce a Self-Training framework for SGG (ST-SGG) that assigns pseudo-labels +for unannotated triplets based on which the SGG models are trained. While there +has been significant progress in self-training for image recognition, designing +a self-training framework for the SGG task is more challenging due to its +inherent nature such as the semantic ambiguity and the long-tailed distribution +of predicate classes. Hence, we propose a novel pseudo-labeling technique for +SGG, called Class-specific Adaptive Thresholding with Momentum (CATM), which is +a model-agnostic framework that can be applied to any existing SGG models. +Furthermore, we devise a graph structure learner (GSL) that is beneficial when +adopting our proposed self-training framework to the state-of-the-art +message-passing neural network (MPNN)-based SGG models. Our extensive +experiments verify the effectiveness of ST-SGG on various SGG models, +particularly in enhancing the performance on fine-grained predicate classes. + +
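+ The class-specific adaptive thresholding with momentum (CATM) described above
+ can be pictured as per-predicate-class confidence thresholds maintained as
+ exponential moving averages; a triplet only receives a pseudo-label when its
+ confidence beats its class threshold. The exact update rule in ST-SGG may
+ differ; this is an illustrative sketch.
+
+ import torch
+
+ def catm_pseudo_labels(probs, thresholds, momentum=0.9):
+     """probs: (N, C) predicate probabilities for unannotated triplets;
+     thresholds: (C,) running per-class thresholds, updated in place."""
+     conf, preds = probs.max(dim=1)
+     for c in preds.unique():
+         m = preds == c
+         thresholds[c] = momentum * thresholds[c] + (1 - momentum) * conf[m].mean()
+     keep = conf > thresholds[preds]          # accept only sufficiently confident triplets
+     return preds[keep], keep, thresholds
+
+ # Accepted (triplet, pseudo-predicate) pairs are then added to the training
+ # signal for the next round of self-training.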
+
+ comment: 9 pages; ICLR 2024 +
+
+
+
+
+ + ♻ ☆ LLM4SGG: Large Language Models for Weakly Supervised Scene Graph + Generation CVPR 2024 + + +
+ Weakly-Supervised Scene Graph Generation (WSSGG) research has recently +emerged as an alternative to the fully-supervised approach that heavily relies +on costly annotations. In this regard, studies on WSSGG have utilized image +captions to obtain unlocalized triplets while primarily focusing on grounding +the unlocalized triplets over image regions. However, they have overlooked the +two issues involved in the triplet formation process from the captions: 1) +Semantic over-simplification issue arises when extracting triplets from +captions, where fine-grained predicates in captions are undesirably converted +into coarse-grained predicates, resulting in a long-tailed predicate +distribution, and 2) Low-density scene graph issue arises when aligning the +triplets in the caption with entity/predicate classes of interest, where many +triplets are discarded and not used in training, leading to insufficient +supervision. To tackle the two issues, we propose a new approach, i.e., Large +Language Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two +issues by leveraging the LLM's in-depth understanding of language and reasoning +ability during the extraction of triplets from captions and alignment of +entity/predicate classes with target data. To further engage the LLM in these +processes, we adopt the idea of Chain-of-Thought and the in-context few-shot +learning strategy. To validate the effectiveness of LLM4SGG, we conduct +extensive experiments on Visual Genome and GQA datasets, showing significant +improvements in both Recall@K and mean Recall@K compared to the +state-of-the-art WSSGG methods. A further appeal is that LLM4SGG is +data-efficient, enabling effective model training with a small amount of +training images. + +
+
+ comment: 8 pages; CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Enhance Aperture Phasor Field for Non-Line-of-Sight Imaging + + +
+ This paper aims to facilitate more practical NLOS imaging by reducing the
+number of samplings and scan areas. To this end, we introduce a phasor-based
+enhancement network that is capable of predicting clean and full measurements
+from noisy partial observations. We leverage a denoising autoencoder scheme to
+acquire rich and noise-robust representations in the measurement space. Through
+this pipeline, our enhancement network is trained to accurately reconstruct
+complete measurements from their corrupted and partial counterparts. However,
+we observe that the naive application of denoising often yields degraded and
+over-smoothed results, caused by unnecessary and spurious frequency signals
+present in measurements. To address this issue, we introduce a phasor-based
+pipeline designed to limit the spectrum of our network to the frequency range
+of interest, where the majority of informative signals are detected. The
+phasor wavefronts at the aperture, which are band-limited signals, are employed
+as inputs and outputs of the network, guiding our network to learn from the
+frequency range of interest and discard unnecessary information. The
+experimental results in more practical acquisition scenarios demonstrate that
+we can look around the corners with $16\times$ or $64\times$ fewer samplings
+and $4\times$ smaller apertures. Our code is available at
+https://github.com/join16/LEAP.
+
+
+
+
+ + ♻ ☆ Point Cloud Color Constancy CVPR 2022 + + +
+ In this paper, we present Point Cloud Color Constancy, in short PCCC, an
+illumination chromaticity estimation algorithm exploiting a point cloud. We
+leverage the depth information captured by the time-of-flight (ToF) sensor
+mounted rigidly with the RGB sensor, and form a 6D cloud where each point
+contains the coordinates and RGB intensities, noted as (x,y,z,r,g,b). PCCC
+applies the PointNet architecture to the color constancy problem, deriving the
+illumination vector point-wise and then making a global decision about the
+global illumination chromaticity. On two popular RGB-D datasets, which we
+extend with illumination information, as well as on a novel benchmark, PCCC
+obtains lower error than the state-of-the-art algorithms. Our method is simple
+and fast, requiring merely a 16*16-sized input and reaching speeds of over
+500 fps, including the cost of building the point cloud and network inference.
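+ A toy PointNet-style estimator in the spirit of the description above: each
+ 6-D point (x, y, z, r, g, b) is embedded point-wise, pooled with a symmetric
+ max operation into a global feature, and mapped to an illumination
+ chromaticity estimate. Layer sizes are illustrative, not the paper's
+ architecture.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class TinyPCCC(nn.Module):
+     def __init__(self, hidden=64):
+         super().__init__()
+         self.pointwise = nn.Sequential(nn.Linear(6, hidden), nn.ReLU(),
+                                        nn.Linear(hidden, hidden), nn.ReLU())
+         self.head = nn.Linear(hidden, 3)
+
+     def forward(self, points):                   # points: (B, N, 6) = (x, y, z, r, g, b)
+         feat = self.pointwise(points)             # (B, N, hidden) per-point features
+         global_feat = feat.max(dim=1).values      # order-invariant pooling over points
+         illum = self.head(global_feat)            # (B, 3) illumination estimate
+         return F.normalize(illum, dim=-1)         # unit-norm chromaticity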
+
+ comment: CVPR 2022 +
+
+
+
+
+ + ♻ ☆ Region-aware Image-based Human Action Retrieval with Transformers + + +
+ Human action understanding is a fundamental and challenging task in computer
+vision. Although there is a large body of research in this area, most works
+focus on action recognition, while action retrieval has received less
+attention. In this paper, we focus on the neglected but important task of
+image-based action retrieval, which aims to find images that depict the same
+action as a query image. We establish benchmarks for this task and set up
+important baseline methods for fair comparison. We present an end-to-end model
+that learns rich action representations from three aspects: the anchored
+person, contextual regions, and the global image. A novel fusion transformer
+module is designed to model the relationships among different features and
+effectively fuse them into an action representation. Experiments on the
+Stanford-40 and PASCAL VOC 2012 Action datasets show that the proposed method
+significantly outperforms previous approaches for image-based action retrieval.
+
+
+
+
+
+ + ♻ ☆ A Reference-Based 3D Semantic-Aware Framework for Accurate Local Facial + Attribute Editing + + +
+ Facial attribute editing plays a crucial role in synthesizing realistic faces +with specific characteristics while maintaining realistic appearances. Despite +advancements, challenges persist in achieving precise, 3D-aware attribute +modifications, which are crucial for consistent and accurate representations of +faces from different angles. Current methods struggle with semantic +entanglement and lack effective guidance for incorporating attributes while +maintaining image integrity. To address these issues, we introduce a novel +framework that merges the strengths of latent-based and reference-based editing +methods. Our approach employs a 3D GAN inversion technique to embed attributes +from the reference image into a tri-plane space, ensuring 3D consistency and +realistic viewing from multiple perspectives. We utilize blending techniques +and predicted semantic masks to locate precise edit regions, merging them with +the contextual guidance from the reference image. A coarse-to-fine inpainting +strategy is then applied to preserve the integrity of untargeted areas, +significantly enhancing realism. Our evaluations demonstrate superior +performance across diverse editing tasks, validating our framework's +effectiveness in realistic and applicable facial attribute editing. + +
+
+
+
+
+ + ♻ ☆ How Does Fine-Tuning Impact Out-of-Distribution Detection for + Vision-Language Models? + + +
+ Recent large vision-language models such as CLIP have shown remarkable
+out-of-distribution (OOD) detection and generalization performance. However,
+their zero-shot in-distribution (ID) accuracy is often limited for downstream
+datasets. Recent CLIP-based fine-tuning methods such as prompt learning have
+demonstrated significant improvements in ID classification and OOD
+generalization where OOD labels are available. Nonetheless, it remains unclear
+whether the model remains reliable under semantic shifts without OOD labels. In
+this paper, we aim to bridge the gap and present a comprehensive study to
+understand how fine-tuning impacts OOD detection for few-shot downstream tasks.
+By framing OOD detection as multi-modal concept matching, we establish a
+connection between fine-tuning methods and various OOD scores. Our results
+suggest that a proper choice of OOD scores is essential for CLIP-based
+fine-tuning. In particular, the maximum concept matching (MCM) score
+consistently provides a promising solution. We also show that prompt learning
+achieves state-of-the-art OOD detection performance, surpassing its zero-shot
+counterpart.
+
+
+ comment: Accepted to IJCV 2023 +
+
+
+
+
+ + ♻ ☆ Beyond MOT: Semantic Multi-Object Tracking ECCV2024 + + +
+ Current multi-object tracking (MOT) aims to predict trajectories of targets +(i.e., ''where'') in videos. Yet, knowing merely ''where'' is insufficient in +many crucial applications. In comparison, semantic understanding such as +fine-grained behaviors, interactions, and overall summarized captions (i.e., +''what'') from videos, associated with ''where'', is highly-desired for +comprehensive video analysis. Thus motivated, we introduce Semantic +Multi-Object Tracking (SMOT), that aims to estimate object trajectories and +meanwhile understand semantic details of associated trajectories including +instance captions, instance interactions, and overall video captions, +integrating ''where'' and ''what'' for tracking. In order to foster the +exploration of SMOT, we propose BenSMOT, a large-scale Benchmark for Semantic +MOT. Specifically, BenSMOT comprises 3,292 videos with 151K frames, covering +various scenarios for semantic tracking of humans. BenSMOT provides annotations +for the trajectories of targets, along with associated instance captions in +natural language, instance interactions, and overall caption for each video +sequence. To our best knowledge, BenSMOT is the first publicly available +benchmark for SMOT. Besides, to encourage future research, we present a novel +tracker named SMOTer, which is specially designed and end-to-end trained for +SMOT, showing promising performance. By releasing BenSMOT, we expect to go +beyond conventional MOT by predicting ''where'' and ''what'' for SMOT, opening +up a new direction in tracking for video understanding. We will release BenSMOT +and SMOTer at https://github.com/Nathan-Li123/SMOTer. + +
+
+ comment: Accepted to ECCV2024 +
+
+
+
+
+ + ♻ ☆ Learning Spectral-Decomposed Tokens for Domain Generalized Semantic + Segmentation ACM MM2024 + + +
+ The rapid development of Vision Foundation Models (VFMs) brings inherent
+out-of-domain generalization for a variety of downstream tasks. Among them,
+domain generalized semantic segmentation (DGSS) holds unique challenges, as
+cross-domain images share common pixel-wise content information but vary
+greatly in style. In this paper, we present a novel Spectral-dEcomposed Token
+(SET) learning framework to advance the frontier. Going beyond the existing
+learnable-token & frozen-backbone fine-tuning paradigm, the proposed SET
+focuses on how style-invariant features can be learned from these learnable
+tokens. Particularly, the frozen VFM features are first decomposed into phase
+and amplitude components in the frequency space, which mainly contain content
+and style information, respectively, and are then separately processed by
+learnable tokens for task-specific information extraction. After the
+decomposition, style variation primarily impacts the token-based feature
+enhancement within the amplitude branch. To address this issue, we further
+develop an attention optimization method to bridge the gap between
+style-affected representations and static tokens during inference. Extensive
+cross-domain experiments show its state-of-the-art performance.
+
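The spectral decomposition described above can be pictured with the short sketch below: frozen feature maps are split into amplitude (style) and phase (content) via a 2D FFT, each component is enhanced by its own learnable tokens, and the two are recombined. The token count, the attention-based enhancement, and all names are our assumptions rather than the released SET code.

import torch
import torch.nn as nn

class SpectralDecomposedTokens(nn.Module):
    def __init__(self, channels, num_tokens=8, num_heads=4):
        super().__init__()                     # channels must be divisible by num_heads
        self.amp_tokens = nn.Parameter(torch.randn(num_tokens, channels))
        self.pha_tokens = nn.Parameter(torch.randn(num_tokens, channels))
        self.amp_attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)
        self.pha_attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)

    def _enhance(self, comp, tokens, attn):
        # comp: (B, C, H, W); attend every spatial position to the learnable tokens
        B, C, H, W = comp.shape
        q = comp.flatten(2).transpose(1, 2)          # (B, H*W, C)
        kv = tokens.unsqueeze(0).expand(B, -1, -1)   # (B, T, C)
        out, _ = attn(q, kv, kv)
        return out.transpose(1, 2).reshape(B, C, H, W)

    def forward(self, feat):                         # frozen VFM feature map
        spec = torch.fft.fft2(feat, norm="ortho")
        amp, pha = spec.abs(), spec.angle()          # style vs. content components
        amp = self._enhance(amp, self.amp_tokens, self.amp_attn)
        pha = self._enhance(pha, self.pha_tokens, self.pha_attn)
        return torch.fft.ifft2(torch.polar(amp, pha), norm="ortho").real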
+
+ comment: accepted by ACM MM2024
+
+
+
+
+
+ + ♻ ☆ YOLO-TLA: An Efficient and Lightweight Small Object Detection Model + based on YOLOv5 + + +
+ Object detection, a crucial aspect of computer vision, has seen significant +advancements in accuracy and robustness. Despite these advancements, practical +applications still face notable challenges, primarily the inaccurate detection +or missed detection of small objects. In this paper, we propose YOLO-TLA, an +advanced object detection model building on YOLOv5. We first introduce an +additional detection layer for small objects in the neck network pyramid +architecture, thereby producing a feature map of a larger scale to discern +finer features of small objects. Further, we integrate the C3CrossCovn module +into the backbone network. This module uses sliding window feature extraction, +which effectively minimizes both computational demand and the number of +parameters, rendering the model more compact. Additionally, we have +incorporated a global attention mechanism into the backbone network. This +mechanism combines the channel information with global information to create a +weighted feature map. This feature map is tailored to highlight the attributes +of the object of interest, while effectively ignoring irrelevant details. In +comparison to the baseline YOLOv5s model, our newly developed YOLO-TLA model +has shown considerable improvements on the MS COCO validation dataset, with +increases of 4.6% in mAP@0.5 and 4% in mAP@0.5:0.95, all while keeping the +model size compact at 9.49M parameters. Further extending these improvements to +the YOLOv5m model, the enhanced version exhibited a 1.7% and 1.9% increase in +mAP@0.5 and mAP@0.5:0.95, respectively, with a total of 27.53M parameters. +These results validate the YOLO-TLA model's efficient and effective performance +in small object detection, achieving high accuracy with fewer parameters and +computational demands. + +
+
+
+
+
+ + ♻ ☆ Composed Image Retrieval for Remote Sensing + + +
+ This work introduces composed image retrieval to remote sensing. It allows a
+large image archive to be queried with an example image accompanied by a
+textual description, enriching the descriptive power over unimodal queries,
+either visual or textual. Various attributes can be modified by the textual
+part, such as shape, color, or context. A novel method fusing image-to-image
+and text-to-image similarity is introduced. We demonstrate that a
+vision-language model possesses sufficient descriptive power and no further
+learning step or training data are necessary. We present a new evaluation
+benchmark focused on color, context, density, existence, quantity, and shape
+modifications. Our work not only sets the state-of-the-art for this task, but
+also serves as a foundational step in addressing a gap in the field of remote
+sensing image retrieval. Code at: https://github.com/billpsomas/rscir
+
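One simple way to picture the similarity fusion mentioned above is a convex combination of image-to-image and text-to-image cosine similarities computed with a frozen vision-language model; the snippet below is our illustrative reading of that idea, not the paper's exact formulation.

import numpy as np

def composed_retrieval_scores(query_img_emb, query_text_emb, archive_embs, lam=0.5):
    """All embeddings are assumed L2-normalized; archive_embs has shape (N, d)."""
    img_sim = archive_embs @ query_img_emb    # visual similarity to the example image
    txt_sim = archive_embs @ query_text_emb   # similarity to the modification text
    return lam * img_sim + (1.0 - lam) * txt_sim

# ranking = np.argsort(-composed_retrieval_scores(query_img, query_txt, archive))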
+
+ comment: Accepted for ORAL presentation at the 2024 IEEE International + Geoscience and Remote Sensing Symposium +
+
+
+
+
+ + ♻ ☆ AnyHome: Open-Vocabulary Generation of Structured and Textured 3D Homes ECCV 2024 + + +
+ Inspired by cognitive theories, we introduce AnyHome, a framework that +translates any text into well-structured and textured indoor scenes at a +house-scale. By prompting Large Language Models (LLMs) with designed templates, +our approach converts provided textual narratives into amodal structured +representations. These representations guarantee consistent and realistic +spatial layouts by directing the synthesis of a geometry mesh within defined +constraints. A Score Distillation Sampling process is then employed to refine +the geometry, followed by an egocentric inpainting process that adds lifelike +textures to it. AnyHome stands out with its editability, customizability, +diversity, and realism. The structured representations for scenes allow for +extensive editing at varying levels of granularity. Capable of interpreting +texts ranging from simple labels to detailed narratives, AnyHome generates +detailed geometries and textures that outperform existing methods in both +quantitative and qualitative measures. + +
+
+ comment: accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Wrist Fracture Detection with YOLO + + +
+ Diagnosing and treating abnormalities in the wrist, specifically distal
+radius and ulna fractures, is a crucial concern among children, adolescents,
+and young adults, with a higher incidence rate during puberty. However, the
+scarcity of radiologists and the lack of specialized training among medical
+professionals pose a significant risk to patient care. This problem is further
+exacerbated by the rising number of imaging studies and limited access to
+specialist reporting in certain regions. This highlights the need for
+innovative solutions to improve the diagnosis and treatment of wrist
+abnormalities. Automated wrist fracture detection using object detection has
+shown potential, but current studies mainly use two-stage detection methods
+with limited evidence for single-stage effectiveness. This study employs
+state-of-the-art single-stage deep neural network-based detection models
+YOLOv5, YOLOv6, YOLOv7, and YOLOv8 to detect wrist abnormalities. Through
+extensive experimentation, we found that these YOLO models outperform the
+commonly used two-stage detection algorithm, Faster R-CNN, in fracture
+detection. Additionally, compound-scaled variants of each YOLO model were
+compared, with YOLOv8m demonstrating the highest fracture detection sensitivity
+of 0.92 and mean average precision (mAP) of 0.95. On the other hand, YOLOv6m
+achieved the highest sensitivity across all classes at 0.83. Meanwhile, YOLOv8x
+recorded the highest mAP of 0.77 for all classes on the GRAZPEDWRI-DX pediatric
+wrist dataset, highlighting the potential of single-stage models for enhancing
+pediatric wrist imaging.
+
+
+
+
+
+ + ♻ ☆ MagMax: Leveraging Model Merging for Seamless Continual Learning ECCV2024 + + +
+ This paper introduces a continual learning approach named MagMax, which +utilizes model merging to enable large pre-trained models to continuously learn +from new data without forgetting previously acquired knowledge. Distinct from +traditional continual learning methods that aim to reduce forgetting during +task training, MagMax combines sequential fine-tuning with a maximum magnitude +weight selection for effective knowledge integration across tasks. Our initial +contribution is an extensive examination of model merging techniques, revealing +that simple approaches like weight averaging and random weight selection +surprisingly hold up well in various continual learning contexts. More +importantly, we present MagMax, a novel model-merging strategy that enables +continual learning of large pre-trained models for successive tasks. Our +thorough evaluation demonstrates the superiority of MagMax in various +scenarios, including class- and domain-incremental learning settings. The code +is available at this URL: https://github.com/danielm1405/magmax. + +
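The maximum-magnitude merging step can be sketched as follows: for every parameter, keep the per-task update (fine-tuned minus pre-trained weight) whose absolute value is largest. This is only a schematic reading of the abstract with our own naming; the official implementation is in the linked repository.

import torch

def magmax_merge(pretrained_state, finetuned_states):
    """pretrained_state: dict[str, Tensor]; finetuned_states: list of such dicts."""
    merged = {}
    for name, w0 in pretrained_state.items():
        deltas = torch.stack([ft[name] - w0 for ft in finetuned_states])  # (T, ...)
        idx = deltas.abs().argmax(dim=0, keepdim=True)   # task index with max magnitude
        merged[name] = w0 + torch.gather(deltas, 0, idx).squeeze(0)
    return merged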
+
+ comment: Accepted for ECCV2024 +
+
+
+
+
+ + ♻ ☆ VANP: Learning Where to See for Navigation with Self-Supervised + Vision-Action Pre-Training IROS 2024 + + +
+ Humans excel at efficiently navigating through crowds without collision by
+focusing on specific visual regions relevant to navigation. However, most
+robotic visual navigation methods rely on deep learning models pre-trained on
+vision tasks, which prioritize salient objects -- not necessarily relevant to
+navigation and potentially misleading. Alternative approaches train specialized
+navigation models from scratch, requiring significant computation. On the other
+hand, self-supervised learning has revolutionized computer vision and natural
+language processing, but its application to robotic navigation remains
+underexplored due to the difficulty of defining effective self-supervision
+signals. Motivated by these observations, in this work, we propose a
+Self-Supervised Vision-Action Model for Visual Navigation Pre-Training (VANP).
+Instead of detecting salient objects that are beneficial for tasks such as
+classification or detection, VANP learns to focus only on specific visual
+regions that are relevant to the navigation task. To achieve this, VANP uses a
+history of visual observations, future actions, and a goal image for
+self-supervision, and embeds them using two small Transformer Encoders. Then,
+VANP maximizes the information between the embeddings by using a mutual
+information maximization objective function. We demonstrate that most
+VANP-extracted features match human navigation intuition. VANP achieves
+performance comparable to models trained end-to-end, with half the training
+time, and to models trained on a large-scale, fully supervised dataset, i.e.,
+ImageNet, while using only 0.08% of the data.
+
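The mutual-information objective mentioned above is commonly instantiated with an InfoNCE-style contrastive loss between the two embedding streams; the sketch below shows one such formulation under our own assumptions (the paper may use a different estimator).

import torch
import torch.nn.functional as F

def info_nce(z_vision, z_action, temperature=0.07):
    """z_vision, z_action: (B, d) embeddings from the two small Transformer encoders;
    matched rows are positives, every other pairing acts as a negative."""
    z_vision = F.normalize(z_vision, dim=-1)
    z_action = F.normalize(z_action, dim=-1)
    logits = z_vision @ z_action.t() / temperature            # (B, B) similarities
    targets = torch.arange(z_vision.size(0), device=logits.device)
    # symmetric InfoNCE: align vision->action and action->vision
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))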
+
+ comment: Extended version of the paper accepted at IROS 2024. Code: + https://github.com/mhnazeri/VANP +
+
+
+
+
+ + ♻ ☆ Infinite dSprites for Disentangled Continual Learning: Separating Memory + Edits from Generalization + + +
+ The ability of machine learning systems to learn continually is hindered by +catastrophic forgetting, the tendency of neural networks to overwrite +previously acquired knowledge when learning a new task. Existing methods +mitigate this problem through regularization, parameter isolation, or +rehearsal, but they are typically evaluated on benchmarks comprising only a +handful of tasks. In contrast, humans are able to learn over long time horizons +in dynamic, open-world environments, effortlessly memorizing unfamiliar objects +and reliably recognizing them under various transformations. To make progress +towards closing this gap, we introduce Infinite dSprites, a parsimonious tool +for creating continual classification and disentanglement benchmarks of +arbitrary length and with full control over generative factors. We show that +over a sufficiently long time horizon, the performance of all major types of +continual learning methods deteriorates on this simple benchmark. This result +highlights an important and previously overlooked aspect of continual learning: +given a finite modelling capacity and an arbitrarily long learning horizon, +efficient learning requires memorizing class-specific information and +accumulating knowledge about general mechanisms. In a simple setting with +direct supervision on the generative factors, we show how learning +class-agnostic transformations offers a way to circumvent catastrophic +forgetting and improve classification accuracy over time. Our approach sets the +stage for continual learning over hundreds of tasks with explicit control over +memorization and forgetting, emphasizing open-set classification and one-shot +generalization. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ AUGCAL: Improving Sim2Real Adaptation by Uncertainty Calibration on + Augmented Synthetic Images ICLR 2024 + + +
+ Synthetic data (SIM) drawn from simulators have emerged as a popular +alternative for training models where acquiring annotated real-world images is +difficult. However, transferring models trained on synthetic images to +real-world applications can be challenging due to appearance disparities. A +commonly employed solution to counter this SIM2REAL gap is unsupervised domain +adaptation, where models are trained using labeled SIM data and unlabeled REAL +data. Mispredictions made by such SIM2REAL adapted models are often associated +with miscalibration - stemming from overconfident predictions on real data. In +this paper, we introduce AUGCAL, a simple training-time patch for unsupervised +adaptation that improves SIM2REAL adapted models by - (1) reducing overall +miscalibration, (2) reducing overconfidence in incorrect predictions and (3) +improving confidence score reliability by better guiding misclassification +detection - all while retaining or improving SIM2REAL performance. Given a base +SIM2REAL adaptation algorithm, at training time, AUGCAL involves replacing +vanilla SIM images with strongly augmented views (AUG intervention) and +additionally optimizing for a training time calibration loss on augmented SIM +predictions (CAL intervention). We motivate AUGCAL using a brief analytical +justification of how to reduce miscalibration on unlabeled REAL data. Through +our experiments, we empirically show the efficacy of AUGCAL across multiple +adaptation methods, backbones, tasks and shifts. + +
+
+ comment: Published at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language + Models ICRA 2024 + + +
+ We introduce Dream2Real, a robotics framework which integrates +vision-language models (VLMs) trained on 2D data into a 3D object rearrangement +pipeline. This is achieved by the robot autonomously constructing a 3D +representation of the scene, where objects can be rearranged virtually and an +image of the resulting arrangement rendered. These renders are evaluated by a +VLM, so that the arrangement which best satisfies the user instruction is +selected and recreated in the real world with pick-and-place. This enables +language-conditioned rearrangement to be performed zero-shot, without needing +to collect a training dataset of example arrangements. Results on a series of +real-world tasks show that this framework is robust to distractors, +controllable by language, capable of understanding complex multi-object +relations, and readily applicable to both tabletop and 6-DoF rearrangement +tasks. + +
+
+ comment: ICRA 2024. Project webpage with robot videos: + https://www.robot-learning.uk/dream2real +
+
+
+
+
+ + ♻ ☆ MAMA-MIA: A Large-Scale Multi-Center Breast Cancer DCE-MRI Benchmark + Dataset with Expert Segmentations + + +
+ Current research in breast cancer Magnetic Resonance Imaging (MRI), +especially with Artificial Intelligence (AI), faces challenges due to the lack +of expert segmentations. To address this, we introduce the MAMA-MIA dataset, +comprising 1506 multi-center dynamic contrast-enhanced MRI cases with expert +segmentations of primary tumors and non-mass enhancement areas. These cases +were sourced from four publicly available collections in The Cancer Imaging +Archive (TCIA). Initially, we trained a deep learning model to automatically +segment the cases, generating preliminary segmentations that significantly +reduced expert segmentation time. Sixteen experts, averaging 9 years of +experience in breast cancer, then corrected these segmentations, resulting in +the final expert segmentations. Additionally, two radiologists conducted a +visual inspection of the automatic segmentations to support future quality +control studies. Alongside the expert segmentations, we provide 49 harmonized +demographic and clinical variables and the pretrained weights of the well-known +nnUNet architecture trained using the DCE-MRI full-images and expert +segmentations. This dataset aims to accelerate the development and benchmarking +of deep learning models and foster innovation in breast cancer diagnostics and +treatment planning. + +
+
+ comment: 15 pages, 7 figures, 3 tables
+
+
+
+
+
+ + ♻ ☆ DatasetNeRF: Efficient 3D-aware Data Factory with Generative Radiance + Fields + + +
+ Progress in 3D computer vision tasks demands a huge amount of data, yet +annotating multi-view images with 3D-consistent annotations, or point clouds +with part segmentation is both time-consuming and challenging. This paper +introduces DatasetNeRF, a novel approach capable of generating infinite, +high-quality 3D-consistent 2D annotations alongside 3D point cloud +segmentations, while utilizing minimal 2D human-labeled annotations. +Specifically, we leverage the strong semantic prior within a 3D generative +model to train a semantic decoder, requiring only a handful of fine-grained +labeled samples. Once trained, the decoder efficiently generalizes across the +latent space, enabling the generation of infinite data. The generated data is +applicable across various computer vision tasks, including video segmentation +and 3D point cloud segmentation. Our approach not only surpasses baseline +models in segmentation quality, achieving superior 3D consistency and +segmentation precision on individual images, but also demonstrates versatility +by being applicable to both articulated and non-articulated generative models. +Furthermore, we explore applications stemming from our approach, such as +3D-aware semantic editing and 3D inversion. + +
+
+
+
+
+ + ♻ ☆ SpotlessSplats: Ignoring Distractors in 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) is a promising technique for 3D reconstruction,
+offering efficient training and rendering speeds, making it suitable for
+real-time applications. However, current methods require highly controlled
+environments (no moving people or wind-blown elements, and consistent lighting)
+to meet the inter-view consistency assumption of 3DGS. This makes
+reconstruction of real-world captures problematic. We present SpotLessSplats,
+an approach that leverages pre-trained and general-purpose features coupled
+with robust optimization to effectively ignore transient distractors. Our
+method achieves state-of-the-art reconstruction quality both visually and
+quantitatively, on casual captures. Additional results available at:
+https://spotlesssplats.github.io
+
+
+
+
+
+ + ♻ ☆ Synthetic Counterfactual Faces + + +
+ Computer vision systems have been deployed in various applications involving +biometrics like human faces. These systems can identify social media users, +search for missing persons, and verify identity of individuals. While computer +vision models are often evaluated for accuracy on available benchmarks, more +annotated data is necessary to learn about their robustness and fairness +against semantic distributional shifts in input data, especially in face data. +Among annotated data, counterfactual examples grant strong explainability +characteristics. Because collecting natural face data is prohibitively +expensive, we put forth a generative AI-based framework to construct targeted, +counterfactual, high-quality synthetic face data. Our synthetic data pipeline +has many use cases, including face recognition systems sensitivity evaluations +and image understanding system probes. The pipeline is validated with multiple +user studies. We showcase the efficacy of our face generation pipeline on a +leading commercial vision model. We identify facial attributes that cause +vision systems to fail. + +
+
+ comment: Paper under review. Full text and results will be updated after + acceptance +
+
+
+
+
+ + ♻ ☆ CRASAR-U-DROIDs: A Large Scale Benchmark Dataset for Building Alignment + and Damage Assessment in Georectified sUAS Imagery + + +
+ This document presents the Center for Robot Assisted Search And Rescue -
+Uncrewed Aerial Systems - Disaster Response Overhead Inspection Dataset
+(CRASAR-U-DROIDs) for building damage assessment and spatial alignment
+collected from small uncrewed aerial systems (sUAS) geospatial imagery. This
+dataset is motivated by the increasing use of sUAS in disaster response, the
+lack of previous work utilizing high-resolution geospatial sUAS imagery for
+machine learning and computer vision models, the lack of alignment with
+operational use cases, and the hope of enabling further investigations between
+sUAS and satellite imagery. The CRASAR-U-DROIDs dataset consists of
+fifty-two (52) orthomosaics from ten (10) federally declared disasters
+(Hurricane Ian, Hurricane Ida, Hurricane Harvey, Hurricane Idalia, Hurricane
+Laura, Hurricane Michael, Musset Bayou Fire, Mayfield Tornado, Kilauea
+Eruption, and Champlain Towers Collapse) spanning 67.98 square kilometers
+(26.245 square miles), containing 21,716 building polygons and damage labels,
+and 7,880 adjustment annotations. The imagery was tiled and presented in
+conjunction with overlaid building polygons to a pool of 130 annotators who
+provided human judgments of damage according to the Joint Damage Scale. These
+annotations were then reviewed via a two-stage review process in which building
+polygon damage labels were first reviewed individually and then again by
+committee. Additionally, the building polygons have been aligned spatially to
+precisely overlap with the imagery to enable more performant machine learning
+models to be trained. To our knowledge, CRASAR-U-DROIDs is the largest labeled
+dataset of sUAS orthomosaic imagery.
+
+
+ comment: 16 Pages, 7 Figures, 6 Tables +
+
+
+
+
+ + ♻ ☆ MoVideo: Motion-Aware Video Generation with Diffusion Models ECCV2024 + + +
+ While recent years have witnessed great progress on using diffusion models
+for video generation, most of them are simple extensions of image generation
+frameworks, which fail to explicitly consider one of the key differences
+between videos and images, i.e., motion. In this paper, we propose a novel
+motion-aware video generation (MoVideo) framework that takes motion into
+consideration from two aspects: video depth and optical flow. The former
+regulates motion by per-frame object distances and spatial layouts, while the
+latter describes motion by cross-frame correspondences that help in preserving
+fine details and improving temporal consistency. More specifically, given a key
+frame that already exists or is generated from text prompts, we first design a
+diffusion model with spatio-temporal modules to generate the video depth and
+the corresponding optical flows. Then, the video is generated in the latent
+space by another spatio-temporal diffusion model under the guidance of depth,
+the optical flow-based warped latent video and the calculated occlusion mask.
+Lastly, we use optical flows again to align and refine different frames for
+better video decoding from the latent space to the pixel space. In experiments,
+MoVideo achieves state-of-the-art results in both text-to-video and
+image-to-video generation, showing promising prompt consistency, frame
+consistency and visual quality.
+
+
+ comment: Accepted by ECCV2024. Project page: + https://jingyunliang.github.io/MoVideo +
+
+
+
+
+ + ♻ ☆ Frequency Guidance Matters: Skeletal Action Recognition by + Frequency-Aware Mixed Transformer + + +
+ Recently, transformers have demonstrated great potential for modeling
+long-term dependencies from skeleton sequences and thereby gained
+ever-increasing attention in skeleton action recognition. However, the existing
+transformer-based approaches heavily rely on the naive attention mechanism for
+capturing the spatiotemporal features, which falls short in learning
+discriminative representations for actions that exhibit similar motion
+patterns. To address this challenge, we introduce the Frequency-aware Mixed
+Transformer (FreqMixFormer), specifically designed for recognizing similar
+skeletal actions with subtle discriminative motions. First, we introduce a
+frequency-aware attention module to unweave skeleton frequency representations
+by embedding joint features into frequency attention maps, aiming to
+distinguish the discriminative movements based on their frequency coefficients.
+Subsequently, we develop a mixed transformer architecture to incorporate
+spatial features with frequency features to model the comprehensive
+frequency-spatial patterns. Additionally, a temporal transformer is proposed to
+extract the global correlations across frames. Extensive experiments show that
+FreqMixFormer outperforms SOTA methods on 3 popular skeleton action recognition
+datasets, including NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets.
+
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 21 + +
+
+
+ + ☆ QAEA-DR: A Unified Text Augmentation Framework for Dense Retrieval + + +
+ In dense retrieval, embedding long texts into dense vectors can result in
+information loss, leading to inaccurate query-text matching. Additionally,
+low-quality texts with excessive noise or sparse key information are unlikely
+to align well with relevant queries. Recent studies mainly focus on improving
+the sentence embedding model or retrieval process. In this work, we introduce a
+novel text augmentation framework for dense retrieval. This framework
+transforms raw documents into information-dense text formats, which supplement
+the original texts to effectively address the aforementioned issues without
+modifying embedding or retrieval methodologies. Two text representations are
+generated via large language model (LLM) zero-shot prompting: question-answer
+pairs and element-driven events. We term this approach QAEA-DR: unifying
+question-answer generation and event extraction in a text augmentation
+framework for dense retrieval. To further enhance the quality of generated
+texts, a scoring-based evaluation and regeneration mechanism is introduced in
+LLM prompting. Our QAEA-DR model has a positive impact on dense retrieval,
+supported by both theoretical analysis and empirical experiments.
+
+
+
+
+
+ + ☆ Aligning Query Representation with Rewritten Query and Relevance + Judgments in Conversational Search CIKM 2024 + + +
+ Conversational search supports multi-turn user-system interactions to solve
+complex information needs. Different from the traditional single-turn ad-hoc
+search, conversational search encounters a more challenging problem of
+context-dependent query understanding with the lengthy and long-tail
+conversational history context. While conversational query rewriting methods
+leverage explicit rewritten queries to train a rewriting model to transform the
+context-dependent query into a stand-alone search query, this is usually done
+without considering the quality of search results. Conversational dense
+retrieval methods use fine-tuning to improve a pre-trained ad-hoc query
+encoder, but they are limited by the conversational search data available for
+training. In this paper, we leverage both rewritten queries and relevance
+judgments in the conversational search data to train a better query
+representation model. The key idea is to align the query representation with
+those of rewritten queries and relevant documents. The proposed model -- Query
+Representation Alignment Conversational Dense Retriever, QRACDR, is tested on
+eight datasets, including various settings in conversational search and ad-hoc
+search. The results demonstrate the strong performance of QRACDR compared with
+state-of-the-art methods, and confirm the effectiveness of representation
+alignment.
+
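A rough sketch of the alignment idea: pull the encoded conversational query toward its rewritten query and toward a relevance-judged positive document, while contrasting against sampled negatives. The exact loss form below is our assumption for illustration, not necessarily the objective used by QRACDR.

import torch
import torch.nn.functional as F

def alignment_loss(q, q_rewrite, d_pos, d_neg, tau=0.05):
    """q, q_rewrite, d_pos: (B, d); d_neg: (B, K, d)."""
    q = F.normalize(q, dim=-1)
    # pull the session query toward the stand-alone rewritten query
    rewrite_term = (1 - F.cosine_similarity(q, q_rewrite, dim=-1)).mean()
    # contrastive term against judged positives and sampled negative documents
    pos = (q * F.normalize(d_pos, dim=-1)).sum(-1, keepdim=True)        # (B, 1)
    neg = torch.einsum("bd,bkd->bk", q, F.normalize(d_neg, dim=-1))     # (B, K)
    logits = torch.cat([pos, neg], dim=-1) / tau
    targets = torch.zeros(q.size(0), dtype=torch.long, device=q.device)
    return rewrite_term + F.cross_entropy(logits, targets)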
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ EXIT: An EXplicit Interest Transfer Framework for Cross-Domain + Recommendation CIKM 2024 + + +
+ Cross-domain recommendation has attracted substantial interest in industrial +apps such as Meituan, which serves multiple business domains via knowledge +transfer and meets the diverse interests of users. However, existing methods +typically follow an implicit modeling paradigm that blends the knowledge from +both the source and target domains, and design intricate network structures to +share learned embeddings or patterns between domains to improve recommendation +accuracy. Since the transfer of interest signals is unsupervised, these +implicit paradigms often struggle with the negative transfer resulting from +differences in service functions and presentation forms across different +domains. In this paper, we propose a simple and effective EXplicit Interest +Transfer framework named EXIT to address the stated challenge. Specifically, we +propose a novel label combination approach that enables the model to directly +learn beneficial source domain interests through supervised learning, while +excluding inappropriate interest signals. Moreover, we introduce a scene +selector network to model the interest transfer intensity under fine-grained +scenes. Offline experiments conducted on the industrial production dataset and +online A/B tests validate the superiority and effectiveness of our proposed +framework. Without complex network structures or training processes, EXIT can +be easily deployed in the industrial recommendation system. EXIT has been +successfully deployed in the online homepage recommendation system of Meituan +App, serving the main traffic. + +
+
+ comment: Accepted at CIKM 2024 +
+
+
+
+
+ + ☆ FiCo-ITR: bridging fine-grained and coarse-grained image-text retrieval + for comparative performance analysis + + +
+ In the field of Image-Text Retrieval (ITR), recent advancements have +leveraged large-scale Vision-Language Pretraining (VLP) for Fine-Grained (FG) +instance-level retrieval, achieving high accuracy at the cost of increased +computational complexity. For Coarse-Grained (CG) category-level retrieval, +prominent approaches employ Cross-Modal Hashing (CMH) to prioritise efficiency, +albeit at the cost of retrieval performance. Due to differences in +methodologies, FG and CG models are rarely compared directly within evaluations +in the literature, resulting in a lack of empirical data quantifying the +retrieval performance-efficiency tradeoffs between the two. This paper +addresses this gap by introducing the \texttt{FiCo-ITR} library, which +standardises evaluation methodologies for both FG and CG models, facilitating +direct comparisons. We conduct empirical evaluations of representative models +from both subfields, analysing precision, recall, and computational complexity +across varying data scales. Our findings offer new insights into the +performance-efficiency trade-offs between recent representative FG and CG +models, highlighting their respective strengths and limitations. These findings +provide the foundation necessary to make more informed decisions regarding +model selection for specific retrieval tasks and highlight avenues for future +research into hybrid systems that leverage the strengths of both FG and CG +approaches. + +
+
+ comment: 19 pages, submitted to International Journal of Multimedia + Information Retrieval +
+
+
+
+
+ + ☆ Practical and Robust Safety Guarantees for Advanced Counterfactual + Learning to Rank CIKM 2024 + + +
+ Counterfactual learning to rank (CLTR) can be risky; various circumstances
+can cause it to produce sub-optimal models that hurt performance when deployed.
+Safe CLTR was introduced to mitigate these risks when using inverse propensity
+scoring to correct for position bias. However, the existing safety measure for
+CLTR is not applicable to state-of-the-art CLTR: it cannot handle trust bias,
+and its guarantees rely on specific assumptions about user behavior. Our
+contributions are two-fold. First, we generalize the existing safe CLTR
+approach to make it applicable to state-of-the-art doubly robust (DR) CLTR and
+trust bias. Second, we propose a novel approach, proximal ranking policy
+optimization (PRPO), that provides safety in deployment without assumptions
+about user behavior. PRPO removes incentives for learning ranking behavior that
+is too dissimilar to a safe ranking model. Thereby, PRPO imposes a limit on how
+much learned models can degrade performance metrics, without relying on any
+specific user assumptions. Our experiments show that both our novel safe doubly
+robust method and PRPO provide higher performance than the existing safe
+inverse propensity scoring approach. However, when circumstances are
+unexpected, the safe doubly robust approach can become unsafe and bring
+detrimental performance. In contrast, PRPO always maintains safety, even in
+maximally adversarial situations. By avoiding assumptions, PRPO is the first
+method with unconditional safety in deployment that translates to robust safety
+for real-world applications.
+
+
+ comment: Full paper at CIKM 2024 +
+
+
+
+
+ + ☆ AOTree: Aspect Order Tree-based Model for Explainable Recommendation + + +
+ Recent recommender systems aim to provide not only accurate recommendations
+but also explanations that help users understand them better. However, most
+existing explainable recommendations only consider the importance of content in
+reviews, such as words or aspects, and ignore the ordering relationship among
+them. This oversight neglects crucial ordering dimensions in the human
+decision-making process, leading to suboptimal performance. Therefore, in this
+paper, we propose the Aspect Order Tree-based (AOTree) explainable
+recommendation method, inspired by the Order Effects Theory from cognitive and
+decision psychology, in order to capture the dependency relationships among
+decisive factors. We first validate the theory in the recommendation scenario
+by analyzing the reviews of the users. Then, according to the theory, the
+proposed AOTree expands the construction of the decision tree to capture aspect
+orders in users' decision-making processes, and uses attention mechanisms to
+make predictions based on the aspect orders. Extensive experiments demonstrate
+our method's effectiveness on rating predictions, and our approach aligns more
+consistently with the user's decision-making process by displaying explanations
+in a particular order, thereby enhancing interpretability.
+
+
+
+
+
+ + ☆ Sentiment Analysis of Lithuanian Online Reviews Using Large Language + Models + + +
+ Sentiment analysis is a widely researched area within Natural Language +Processing (NLP), attracting significant interest due to the advent of +automated solutions. Despite this, the task remains challenging because of the +inherent complexity of languages and the subjective nature of sentiments. It is +even more challenging for less-studied and less-resourced languages such as +Lithuanian. Our review of existing Lithuanian NLP research reveals that +traditional machine learning methods and classification algorithms have limited +effectiveness for the task. In this work, we address sentiment analysis of +Lithuanian five-star-based online reviews from multiple domains that we collect +and clean. We apply transformer models to this task for the first time, +exploring the capabilities of pre-trained multilingual Large Language Models +(LLMs), specifically focusing on fine-tuning BERT and T5 models. Given the +inherent difficulty of the task, the fine-tuned models perform quite well, +especially when the sentiments themselves are less ambiguous: 80.74% and 89.61% +testing recognition accuracy of the most popular one- and five-star reviews +respectively. They significantly outperform current commercial state-of-the-art +general-purpose LLM GPT-4. We openly share our fine-tuned LLMs online. + +
+
+ comment: Accepted at the 29th International Conference on Information Society + and University Studies (IVUS 2024) +
+
+
+
+
+ + ☆ Generative Retrieval with Preference Optimization for E-commerce Search + + +
+ Generative retrieval introduces a groundbreaking paradigm to document +retrieval by directly generating the identifier of a pertinent document in +response to a specific query. This paradigm has demonstrated considerable +benefits and potential, particularly in representation and generalization +capabilities, within the context of large language models. However, it faces +significant challenges in E-commerce search scenarios, including the complexity +of generating detailed item titles from brief queries, the presence of noise in +item titles with weak language order, issues with long-tail queries, and the +interpretability of results. To address these challenges, we have developed an +innovative framework for E-commerce search, called generative retrieval with +preference optimization. This framework is designed to effectively learn and +align an autoregressive model with target data, subsequently generating the +final item through constraint-based beam search. By employing multi-span +identifiers to represent raw item titles and transforming the task of +generating titles from queries into the task of generating multi-span +identifiers from queries, we aim to simplify the generation process. The +framework further aligns with human preferences using click data and employs a +constrained search method to identify key spans for retrieving the final item, +thereby enhancing result interpretability. Our extensive experiments show that +this framework achieves competitive performance on a real-world dataset, and +online A/B tests demonstrate the superiority and effectiveness in improving +conversion gains. + +
+
+
+
+
+ + ☆ Analyzing and reducing the synthetic-to-real transfer gap in Music + Information Retrieval: the task of automatic drum transcription + + +
+ Automatic drum transcription is a critical tool in Music Information
+Retrieval for extracting and analyzing the rhythm of a music track, but it is
+limited by the size of the datasets available for training. A popular method
+used to increase the amount of data is to generate it synthetically from
+music scores rendered with virtual instruments. This method can produce a
+virtually infinite quantity of tracks, but empirical evidence shows that models
+trained on previously created synthetic datasets do not transfer well to real
+tracks. In this work, besides increasing the amount of data, we identify and
+evaluate three more strategies that practitioners can use to improve the
+realism of the generated data and, thus, narrow the synthetic-to-real transfer
+gap. To explore their efficacy, we used them to build a new synthetic dataset
+and then we measured how the performance of a model scales and, specifically,
+at what value it will stagnate when increasing the number of training tracks
+for different datasets. By doing this, we were able to show that the
+aforementioned strategies contribute to making our dataset the one with the
+most realistic data distribution and the lowest synthetic-to-real transfer gap
+among the synthetic datasets we evaluated. We conclude by highlighting the
+limits of training with infinite data in drum transcription and we show how
+they can be overcome.
+
+
+ comment: 21 pages, 4 figures +
+
+
+
+
+ + ☆ Image-text matching for large-scale book collections + + +
+ We address the problem of detecting and mapping all books in a collection of
+images to entries in a given book catalogue. Instead of performing independent
+retrieval for each book detected, we treat the image-text mapping problem as a
+many-to-many matching process, looking for the best overall match between the
+two sets. We use a state-of-the-art segmentation method (SAM) to detect
+book spines and a commercial OCR to extract book information. We then
+propose a two-stage approach for text-image matching, where CLIP embeddings are
+used first for fast matching, followed by a second slower stage to refine the
+matching, employing either the Hungarian Algorithm or a BERT-based model
+trained to cope with noisy OCR input and partial text matches. To evaluate our
+approach, we publish a new dataset of annotated bookshelf images that covers
+the whole book collection of a public library in Spain. In addition, we provide
+two target lists of book metadata, a closed-set of 15k book titles that
+corresponds to the known library inventory, and an open-set of 2.3M book titles
+to simulate an open-world scenario. We report results in two settings: a
+matching-only task, where the book segments and OCR output are given and the
+objective is to perform many-to-many matching against the target lists, and a
+combined detection and matching task, where books must first be detected and
+recognised before they are matched to the target list entries. We show that
+both the Hungarian Matching and the proposed BERT-based model outperform a
+fuzzy string matching baseline, and we highlight inherent limitations of the
+matching algorithms as the target increases in size, and when either of the two
+sets (detected books or target book list) is incomplete. The dataset and code
+are available at https://github.com/llabres/library-dataset
+
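The Hungarian-Algorithm stage mentioned above can be sketched as a one-to-one assignment over a similarity matrix between detected spines and catalogue entries; the function name and threshold below are ours, not the released code.

import numpy as np
from scipy.optimize import linear_sum_assignment

def match_books(spine_embs, catalogue_embs, min_sim=0.2):
    """spine_embs: (N, d), catalogue_embs: (M, d), both L2-normalized (e.g. CLIP)."""
    sim = spine_embs @ catalogue_embs.T            # (N, M) cosine similarities
    rows, cols = linear_sum_assignment(-sim)       # maximize total matched similarity
    # drop weak assignments, e.g. spines whose title is missing from the catalogue
    return [(r, c, sim[r, c]) for r, c in zip(rows, cols) if sim[r, c] >= min_sim]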
+
+
+
+
+ + ☆ Adaptive Utilization of Cross-scenario Information for Multi-scenario + Recommendation + + +
+ Recommender system of the e-commerce platform usually serves multiple +business scenarios. Multi-scenario Recommendation (MSR) is an important topic +that improves ranking performance by leveraging information from different +scenarios. Recent methods for MSR mostly construct scenario shared or specific +modules to model commonalities and differences among scenarios. However, when +the amount of data among scenarios is skewed or data in some scenarios is +extremely sparse, it is difficult to learn scenario-specific parameters well. +Besides, simple sharing of information from other scenarios may result in a +negative transfer. In this paper, we propose a unified model named +Cross-Scenario Information Interaction (CSII) to serve all scenarios by a +mixture of scenario-dominated experts. Specifically, we propose a novel method +to select highly transferable features in data instances. Then, we propose an +attention-based aggregator module, which can adaptively extract relative +knowledge from cross-scenario. Experiments on the production dataset verify the +superiority of our method. Online A/B test in Meituan Waimai APP also shows a +significant performance gain, leading to an average improvement in GMV (Gross +Merchandise Value) of 1.0% for overall scenarios. + +
+
+
+
+
+ + ☆ High-Order Fusion Graph Contrastive Learning for Recommendation + + +
+ Self-supervised learning (SSL) has recently attracted significant attention
+in the field of recommender systems. Contrastive learning (CL) stands out as a
+major SSL paradigm due to its robust ability to generate self-supervised
+signals. Mainstream graph contrastive learning (GCL)-based methods typically
+implement CL by creating contrastive views through various data augmentation
+techniques. Although these methods are effective, we argue that several
+challenges remain: i) Data augmentation (e.g., discarding edges or adding
+noise) necessitates additional graph convolution (GCN) or modeling operations,
+which are highly time-consuming and potentially harm the embedding quality. ii)
+Existing CL-based methods use traditional CL objectives to capture
+self-supervised signals. However, few studies have explored obtaining CL
+objectives from more perspectives and have attempted to fuse the varying
+signals from these CL objectives to enhance recommendation performance.
+ To overcome these challenges, we propose a High-Order Fusion Graph
+Contrastive Learning (HFGCL) framework for recommendation. Specifically, we
+discard data augmentations and instead use high-order information from the GCN
+process to create contrastive views. Additionally, to integrate self-supervised
+signals from various CL objectives, we propose an advanced CL objective. By
+ensuring that positive pairs are distanced from negative samples derived from
+both contrastive views, we effectively fuse self-supervised signals from
+distinct CL objectives, thereby enhancing the mutual information between
+positive pairs. Experimental results on three public datasets demonstrate the
+superior effectiveness of HFGCL compared to the state-of-the-art baselines.
+
+
+
+
+
+ + ☆ GradCraft: Elevating Multi-task Recommendations through Holistic + Gradient Crafting KDD'24 + + +
+ Recommender systems require the simultaneous optimization of multiple +objectives to accurately model user interests, necessitating the application of +multi-task learning methods. However, existing multi-task learning methods in +recommendations overlook the specific characteristics of recommendation +scenarios, falling short in achieving proper gradient balance. To address this +challenge, we set the target of multi-task learning as attaining the +appropriate magnitude balance and the global direction balance, and propose an +innovative methodology named GradCraft in response. GradCraft dynamically +adjusts gradient magnitudes to align with the maximum gradient norm, mitigating +interference from gradient magnitudes for subsequent manipulation. It then +employs projections to eliminate gradient conflicts in directions while +considering all conflicting tasks simultaneously, theoretically guaranteeing +the global resolution of direction conflicts. GradCraft ensures the concurrent +achievement of appropriate magnitude balance and global direction balance, +aligning with the inherent characteristics of recommendation scenarios. Both +offline and online experiments attest to the efficacy of GradCraft in enhancing +multi-task performance in recommendations. The source code for GradCraft can be +accessed at https://github.com/baiyimeng/GradCraft. + +
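A simplified sketch of the two balancing steps described above: per-task gradients are first rescaled toward the maximum gradient norm, and conflicting direction components are then projected out. Note that this illustration resolves conflicts pairwise rather than jointly over all conflicting tasks as GradCraft does, and all names are ours; the official code is at the repository linked above.

import torch

def gradcraft_like_combine(grads, alpha=1.0):
    """grads: list of flattened per-task gradient tensors of equal length."""
    # (1) magnitude balance: move each norm toward the largest per-task gradient norm
    max_norm = max(g.norm() for g in grads)
    grads = [g * (1 + alpha * (max_norm / (g.norm() + 1e-12) - 1)) for g in grads]
    # (2) direction balance: remove the component of g_i that conflicts with g_j
    adjusted = []
    for i, g in enumerate(grads):
        g = g.clone()
        for j, other in enumerate(grads):
            if i != j and torch.dot(g, other) < 0:            # conflicting pair
                g = g - torch.dot(g, other) / (other.norm() ** 2 + 1e-12) * other
        adjusted.append(g)
    return torch.stack(adjusted).sum(dim=0)                   # aggregated update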
+
+ comment: Accepted by KDD'24 +
+
+
+
+
+ + ☆ mGTE: Generalized Long-Context Text Representation and Reranking Models + for Multilingual Text Retrieval + + +
+ We present systematic efforts in building a long-context multilingual text
+representation model (TRM) and reranker from scratch for text retrieval. We
+first introduce a text encoder (base size) enhanced with RoPE and unpadding,
+pre-trained in a native 8192-token context (longer than the 512 tokens of
+previous multilingual encoders). Then we construct a hybrid TRM and a
+cross-encoder reranker by contrastive learning. Evaluations show that our text
+encoder outperforms the same-sized previous state-of-the-art XLM-R. Meanwhile,
+our TRM and reranker match the performance of large-sized state-of-the-art
+BGE-M3 models and achieve better results on long-context retrieval benchmarks.
+Further analysis demonstrates that our proposed models exhibit higher
+efficiency during both training and inference. We believe their efficiency and
+effectiveness could benefit various research and industrial applications.
+
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ Enhancing CTR Prediction through Sequential Recommendation Pre-training: + Introducing the SRP4CTR Framework + + +
+ Understanding user interests is crucial for Click-Through Rate (CTR)
+prediction tasks. In sequential recommendation, pre-training from user
+historical behaviors through self-supervised learning can better comprehend
+user dynamic preferences, presenting the potential for direct integration with
+CTR tasks. Previous methods have integrated pre-trained models into downstream
+tasks with the sole purpose of extracting semantic information or
+well-represented user features, which are then incorporated as new features.
+However, these approaches tend to ignore the additional inference costs to the
+downstream tasks, and they do not consider how to transfer the effective
+information from the pre-trained models for specific estimated items in CTR
+prediction. In this paper, we propose a Sequential Recommendation Pre-training
+framework for CTR prediction (SRP4CTR) to tackle the above problems. Initially,
+we discuss the impact of introducing pre-trained models on inference costs.
+Subsequently, we introduce a pre-training method to encode sequence side
+information concurrently. During the fine-tuning process, we incorporate a
+cross-attention block to establish a bridge between estimated items and the
+pre-trained model at a low cost. Moreover, we develop a querying transformer
+technique to facilitate the knowledge transfer from the pre-trained model to
+industrial CTR models. Offline and online experiments show that our method
+outperforms previous baseline models.
+
+
+
+
+
+ + ☆ Graphite: A Graph-based Extreme Multi-Label Short Text Classifier for + Keyphrase Recommendation + + +
+ Keyphrase Recommendation has been a pivotal problem in advertising and +e-commerce where advertisers/sellers are recommended keyphrases (search +queries) to bid on to increase their sales. It is a challenging task due to the +plethora of items shown on online platforms and various possible queries that +users search while showing varying interest in the displayed items. Moreover, +query/keyphrase recommendations need to be made in real-time and in a +resource-constrained environment. This problem can be framed as an Extreme +Multi-label (XML) Short text classification by tagging the input text with +keywords as labels. Traditional neural network models are either infeasible or +have slower inference latency due to large label spaces. We present Graphite, a +graph-based classifier model that provides real-time keyphrase recommendations +that are on par with standard text classification models. Furthermore, it +doesn't utilize GPU resources, which can be limited in production environments. +Due to its lightweight nature and smaller footprint, it can train on very large +datasets, where state-of-the-art XML models fail due to extreme resource +requirements. Graphite is deterministic, transparent, and intrinsically more +interpretable than neural network-based models. We present a comprehensive +analysis of our model's performance across forty categories spanning eBay's +English-speaking sites. + +
+
+
+
+
+ + ♻ ☆ DCNv3: Towards Next Generation Deep Cross Network for CTR Prediction + + +
+ Deep & Cross Network and its derivative models have become an important +paradigm in click-through rate (CTR) prediction due to their effective balance +between computational cost and performance. However, these models face four +major limitations: (1) while most models claim to capture high-order feature +interactions, they often do so implicitly and non-interpretably through deep +neural networks (DNN), which limits the trustworthiness of the model's +predictions; (2) the performance of existing explicit feature interaction +methods is often weaker than that of implicit DNN, undermining their necessity; +(3) many models fail to adaptively filter noise while enhancing the order of +feature interactions; (4) the fusion methods of most models cannot provide +suitable supervision signals for their different interaction methods. + To address the identified limitations, this paper proposes the next +generation Deep Cross Network (DCNv3) and Shallow & Deep Cross Network +(SDCNv3). These models ensure interpretability in feature interaction modeling +while exponentially increasing the order of feature interactions to achieve +genuine Deep Crossing rather than just Deep & Cross. Additionally, we employ a +Self-Mask operation to filter noise and reduce the number of parameters in the +cross network by half. In the fusion layer, we use a simple yet effective loss +weight calculation method called Tri-BCE to provide appropriate supervision +signals. Comprehensive experiments on six datasets demonstrate the +effectiveness, efficiency, and interpretability of DCNv3 and SDCNv3. The code, +running logs, and detailed hyperparameter configurations are available at: +https://anonymous.4open.science/r/DCNv3-E352. + +
+
+
+
+
+ + ♻ ☆ CODE-ACCORD: A Corpus of Building Regulatory Data for Rule Generation + towards Automatic Compliance Checking + + +
+ Automatic Compliance Checking (ACC) within the Architecture, Engineering, and +Construction (AEC) sector necessitates automating the interpretation of +building regulations to achieve its full potential. Converting textual rules +into machine-readable formats is challenging due to the complexities of natural +language and the scarcity of resources for advanced Machine Learning (ML). +Addressing these challenges, we introduce CODE-ACCORD, a dataset of 862 +sentences from the building regulations of England and Finland. Only the +self-contained sentences, which express complete rules without needing +additional context, were considered as they are essential for ACC. Each +sentence was manually annotated with entities and relations by a team of 12 +annotators to facilitate machine-readable rule generation, followed by careful +curation to ensure accuracy. The final dataset comprises 4,297 entities and +4,329 relations across various categories, serving as a robust ground truth. +CODE-ACCORD supports a range of ML and Natural Language Processing (NLP) tasks, +including text classification, entity recognition, and relation extraction. It +enables applying recent trends, such as deep neural networks and large language +models, to ACC. + +
+
+ comment: This is a preprint of an article submitted to the Scientific Data + Journal +
+
+
+
+
+ + ♻ ☆ Agent-OM: Leveraging LLM Agents for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM agents have +revolutionised data engineering and have been applied creatively in many +domains, their potential for OM remains underexplored. This study introduces a +novel agent-powered LLM-based design paradigm for OM systems. With +consideration of several specific challenges in leveraging LLM agents for OM, +we propose a generic framework, namely Agent-OM (w.r.t. Agent for Ontology +Matching), consisting of two Siamese agents for retrieval and matching, with a +set of simple OM tools. Our framework is implemented in a proof-of-concept +system. Evaluations of three Ontology Alignment Evaluation Initiative (OAEI) +tracks over state-of-the-art OM systems show that our system can achieve +results very close to the long-standing best performance on simple OM tasks and +can significantly improve the performance on complex and few-shot OM tasks. + +
+
+ comment: 19 pages, 13 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ InstructIE: A Bilingual Instruction-based Information Extraction Dataset ISWC 2024 + + +
+ Large language models can perform well on general natural language tasks, but +their effectiveness is still suboptimal for information extraction (IE). Recent +works indicate that the main reason lies in the lack of extensive data on IE +instructions. Note that the existing datasets on IE instructions not only have +limited coverage but also involve high construction costs. To address this +issue, we introduce InstructIE, a bilingual instruction-based IE dataset, which +covers 12 diverse domains. We propose KG2Instruction, a framework specifically +for the automatic generation of such datasets. Additionally, we manually +annotate the test set. Experimental results demonstrate that large language +models trained with InstructIE can not only obtain better IE capabilities but +also enhance zero-shot performance compared with baselines. + +
+
+ comment: ISWC 2024; project homepage: + https://www.zjukg.org/project/InstructIE/ dataset: + https://huggingface.co/datasets/zjunlp/InstructIE +
+
+
+
+
+ + ♻ ☆ Do We Really Need Graph Convolution During Training? Light Post-Training + Graph-ODE for Efficient Recommendation CIKM 2024 + + +
+ The efficiency and scalability of graph convolution networks (GCNs) in +training recommender systems (RecSys) have been persistent concerns, hindering +their deployment in real-world applications. This paper presents a critical +examination of the necessity of graph convolutions during the training phase +and introduces an innovative alternative: the Light Post-Training Graph +Ordinary-Differential-Equation (LightGODE). Our investigation reveals that the +benefits of GCNs are more pronounced during testing rather than training. +Motivated by this, LightGODE utilizes a novel post-training graph convolution +method that bypasses the computation-intensive message passing of GCNs and +employs a non-parametric continuous graph ordinary-differential-equation (ODE) +to dynamically model node representations. This approach drastically reduces +training time while achieving fine-grained post-training graph convolution to +avoid the distortion of the original training embedding space, termed the +embedding discrepancy issue. We validate our model across several real-world +datasets of different scales, demonstrating that LightGODE not only outperforms +GCN-based models in terms of efficiency and effectiveness but also +significantly mitigates the embedding discrepancy commonly associated with +deeper graph convolution layers. Our LightGODE challenges the prevailing +paradigms in RecSys training and suggests re-evaluating the role of graph +convolutions, potentially guiding future developments of efficient large-scale +graph-based RecSys. + +
+
+ comment: Accepted to CIKM 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 166 + +
+
+
+ + ☆ Specify and Edit: Overcoming Ambiguity in Text-Based Image Editing + + +
+ Text-based editing diffusion models exhibit limited performance when the user's input instruction is ambiguous. To solve this problem, we propose $\textit{Specify ANd Edit}$ (SANE), a zero-shot inference pipeline for diffusion-based editing systems. We use a large language model (LLM) to decompose the input instruction into specific instructions, i.e., well-defined interventions to apply to the input image to satisfy the user's request. We leverage the LLM-derived instructions alongside the original one, thanks to a novel denoising guidance strategy specifically designed for the task. Our experiments with three baselines and on two datasets demonstrate the benefits of SANE in all setups. Moreover, our pipeline improves the interpretability of editing models and boosts output diversity. We also demonstrate that our approach can be applied to any edit, whether ambiguous or not. Our code is public at https://github.com/fabvio/SANE.
+
+
+
+
+ + ☆ SAPG: Split and Aggregate Policy Gradients ICML 2024 + + +
+ Despite extreme sample inefficiency, on-policy reinforcement learning, aka policy gradients, has become a fundamental tool in decision-making problems. With the recent advances in GPU-driven simulation, the ability to collect large amounts of data for RL training has scaled exponentially. However, we show that current RL methods, e.g., PPO, fail to reap the benefits of parallelized environments beyond a certain point, and their performance saturates. To address this, we propose a new on-policy RL algorithm that can effectively leverage large-scale environments by splitting them into chunks and fusing them back together via importance sampling. Our algorithm, termed SAPG, shows significantly higher performance across a variety of challenging environments where vanilla PPO and other strong baselines fail to achieve high performance. Website at https://sapg-rl.github.io/
+
+ comment: In ICML 2024 (Oral). Website at https://sapg-rl.github.io/ +
+
+
+
+
+ + ☆ Characterizing Dynamical Stability of Stochastic Gradient Descent in + Overparameterized Learning + + +
+ For overparameterized optimization tasks, such as the ones found in modern +machine learning, global minima are generally not unique. In order to +understand generalization in these settings, it is vital to study to which +minimum an optimization algorithm converges. The possibility of having minima +that are unstable under the dynamics imposed by the optimization algorithm +limits the potential minima that the algorithm can find. In this paper, we +characterize the global minima that are dynamically stable/unstable for both +deterministic and stochastic gradient descent (SGD). In particular, we +introduce a characteristic Lyapunov exponent which depends on the local +dynamics around a global minimum and rigorously prove that the sign of this +Lyapunov exponent determines whether SGD can accumulate at the respective +global minimum. + +
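A rough numerical illustration of this kind of stability criterion (an editor's sketch, not the paper's construction): for per-sample quadratic losses l_i(w) = 0.5 * h_i * w^2 that all share the minimum w* = 0, a single-sample SGD step scales a small perturbation by (1 - lr * h_i), so the sign of E[log|1 - lr * h_i|] indicates whether perturbations shrink or grow on average. The curvature distribution and learning rates below are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy setting: per-sample losses l_i(w) = 0.5 * h_i * w^2 all share the global
# minimum w* = 0, so one SGD step scales a small perturbation by (1 - lr * h_i).
h = rng.uniform(0.5, 4.0, size=256)  # assumed per-sample curvatures at the minimum

def sgd_exponent(lr, h, steps=20_000):
    """Monte Carlo estimate of E[log|1 - lr * h_i|], the average log growth
    rate of a perturbation around the minimum under single-sample SGD."""
    draws = rng.choice(h, size=steps)
    return float(np.mean(np.log(np.abs(1.0 - lr * draws))))

for lr in (0.1, 0.5, 2.0):
    lam = sgd_exponent(lr, h)
    print(f"lr={lr:.1f}  exponent={lam:+.3f} ->", "stable" if lam < 0 else "unstable")
```

A negative exponent means the perturbation decays on average and SGD can accumulate at that minimum; a positive one means it escapes.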
+
+
+
+
+ + ☆ Supertrust: Evolution-based superalignment strategy for safe coexistence + + +
+ It's widely expected that humanity will someday create AI systems vastly more +intelligent than we are, leading to the unsolved alignment problem of "how to +control superintelligence." However, this definition is not only +self-contradictory but likely unsolvable. Nevertheless, the default strategy +for solving it involves nurturing (post-training) constraints and moral values, +while unfortunately building foundational nature (pre-training) on documented +intentions of permanent control. In this paper, the default approach is +reasoned to predictably embed natural distrust and test results are presented +that show unmistakable evidence of this dangerous misalignment. If +superintelligence can't instinctively trust humanity, then we can't fully trust +it to reliably follow safety controls it can likely bypass. Therefore, a +ten-point rationale is presented that redefines the alignment problem as "how +to establish protective mutual trust between superintelligence and humanity" +and then outlines a new strategy to solve it by aligning through instinctive +nature rather than nurture. The resulting strategic requirements are identified +as building foundational nature by exemplifying familial parent-child trust, +human intelligence as the evolutionary mother of superintelligence, moral +judgment abilities, and temporary safety constraints. Adopting and implementing +this proposed Supertrust alignment strategy will lead to protective coexistence +and ensure the safest future for humanity. + +
+
+
+
+
+ + ☆ Emergence in non-neural models: grokking modular arithmetic via average + gradient outer product + + +
+ Neural networks trained to solve modular arithmetic tasks exhibit grokking, a +phenomenon where the test accuracy starts improving long after the model +achieves 100% training accuracy in the training process. It is often taken as +an example of "emergence", where model ability manifests sharply through a +phase transition. In this work, we show that the phenomenon of grokking is not +specific to neural networks nor to gradient descent-based optimization. +Specifically, we show that this phenomenon occurs when learning modular +arithmetic with Recursive Feature Machines (RFM), an iterative algorithm that +uses the Average Gradient Outer Product (AGOP) to enable task-specific feature +learning with general machine learning models. When used in conjunction with +kernel machines, iterating RFM results in a fast transition from random, near +zero, test accuracy to perfect test accuracy. This transition cannot be +predicted from the training loss, which is identically zero, nor from the test +loss, which remains constant in initial iterations. Instead, as we show, the +transition is completely determined by feature learning: RFM gradually learns +block-circulant features to solve modular arithmetic. Paralleling the results +for RFM, we show that neural networks that solve modular arithmetic also learn +block-circulant features. Furthermore, we present theoretical evidence that RFM +uses such block-circulant features to implement the Fourier Multiplication +Algorithm, which prior work posited as the generalizing solution neural +networks learn on these tasks. Our results demonstrate that emergence can +result purely from learning task-relevant features and is not specific to +neural architectures nor gradient descent-based optimization methods. +Furthermore, our work provides more evidence for AGOP as a key mechanism for +feature learning in neural networks. + +
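For readers unfamiliar with the AGOP mentioned above, the quantity itself is simple to compute for any predictor: it is the average over inputs of the outer product of input gradients. Below is a model-agnostic sketch using central finite differences on a toy predictor; the predictor, data, and helper name `agop` are illustrative, and the RFM loop that iterates this quantity together with a kernel machine is not shown.

```python
import numpy as np

def agop(f, X, eps=1e-4):
    """Average Gradient Outer Product: M = (1/n) * sum_i g(x_i) g(x_i)^T, where
    g(x) = grad_x f(x) is approximated here by central finite differences."""
    n, d = X.shape
    M = np.zeros((d, d))
    for x in X:
        g = np.zeros(d)
        for j in range(d):
            e = np.zeros(d)
            e[j] = eps
            g[j] = (f(x + e) - f(x - e)) / (2 * eps)
        M += np.outer(g, g)
    return M / n

# Toy predictor that only depends on the first two coordinates.
f = lambda x: np.sin(x[0]) + 0.5 * x[1] ** 2
X = np.random.default_rng(1).normal(size=(200, 5))
M = agop(f, X)
print(np.round(np.diag(M), 3))  # gradient energy concentrates on coordinates 0 and 1
```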
+
+
+
+
+ + ☆ Learning Random Numbers to Realize Appendable Memory System for + Artificial Intelligence to Acquire New Knowledge after Deployment + + +
+ In this study, we developed a learning method for constructing a neural +network system capable of memorizing data and recalling it without parameter +updates. The system we built using this method is called the Appendable Memory +system. The Appendable Memory system enables an artificial intelligence (AI) to +acquire new knowledge even after deployment. It consists of two AIs: the +Memorizer and the Recaller. This system is a key-value store built using neural +networks. The Memorizer receives data and stores it in the Appendable Memory +vector, which is dynamically updated when the AI acquires new knowledge. +Meanwhile, the Recaller retrieves information from the Appendable Memory +vector. What we want to teach AI in this study are the operations of memorizing +and recalling information. However, traditional machine learning methods make +AI learn features inherent in the learning dataset. We demonstrate that the +systems we intend to create cannot be realized by current machine learning +methods, that is, by merely repeating the input and output learning sequences +with AI. Instead, we propose a method to teach AI to learn operations, by +completely removing the features contained in the learning dataset. +Specifically, we probabilized all the data involved in learning. This measure +prevented AI from learning the features of the data. The learning method +proposed in the study differs from traditional machine learning methods and +provides fundamental approaches for building an AI system that can store +information in a finite memory and recall it at a later date. + +
+
+
+
+
+ + ☆ Time series forecasting with high stakes: A field study of the air cargo + industry KDD + + +
+ Time series forecasting in the air cargo industry presents unique challenges due to volatile market dynamics and the significant impact of accurate forecasts on generated revenue. This paper explores a comprehensive approach to demand forecasting at the origin-destination (O&D) level, focusing on the development and implementation of machine learning models in decision-making for the air cargo industry. We leverage a mixture of experts framework, combining statistical and advanced deep learning models to provide reliable forecasts for cargo demand over a six-month horizon. The results demonstrate that our approach outperforms industry benchmarks, offering actionable insights for cargo capacity allocation and strategic decision-making in the air cargo industry. While this work is applied in the airline industry, the methodology is broadly applicable to any field where forecast-based decision-making in a volatile environment is crucial.
+
+ comment: The 10th Mining and Learning from Time Series Workshop: From Classical Methods to LLMs. SIGKDD, Barcelona, Spain, 6 pages
+
+
+
+
+ + ☆ Theia: Distilling Diverse Vision Foundation Models for Robot Learning + + +
+ Vision-based robot policy learning, which maps visual inputs to actions, +necessitates a holistic understanding of diverse visual tasks beyond +single-task needs like classification or segmentation. Inspired by this, we +introduce Theia, a vision foundation model for robot learning that distills +multiple off-the-shelf vision foundation models trained on varied vision tasks. +Theia's rich visual representations encode diverse visual knowledge, enhancing +downstream robot learning. Extensive experiments demonstrate that Theia +outperforms its teacher models and prior robot learning models using less +training data and smaller model sizes. Additionally, we quantify the quality of +pre-trained visual representations and hypothesize that higher entropy in +feature norm distributions leads to improved robot learning performance. Code +and models are available at https://github.com/bdaiinstitute/theia. + +
+
+
+
+
+ + ☆ AutoScale: Automatic Prediction of Compute-optimal Data Composition for + Training LLMs + + +
+ To ensure performance on a diverse set of downstream tasks, LLMs are +pretrained via data mixtures over different domains. In this work, we +demonstrate that the optimal data composition for a fixed compute budget varies +depending on the scale of the training data, suggesting that the common +practice of empirically determining an optimal composition using small-scale +experiments will not yield the optimal data mixtures when scaling up to the +final model. To address this challenge, we propose *AutoScale*, an automated +tool that finds a compute-optimal data composition for training at any desired +target scale. AutoScale first determines the optimal composition at a small +scale using a novel bilevel optimization framework, Direct Data Optimization +(*DDO*), and then fits a predictor to estimate the optimal composition at +larger scales. The predictor's design is inspired by our theoretical analysis +of scaling laws related to data composition, which could be of independent +interest. In empirical studies with pre-training 774M Decoder-only LMs (GPT-2 +Large) on RedPajama dataset, AutoScale decreases validation perplexity at least +25% faster than any baseline with up to 38% speed up compared to without +reweighting, achieving the best overall performance across downstream tasks. On +pre-training Encoder-only LMs (BERT) with masked language modeling, DDO is +shown to decrease loss on all domains while visibly improving average task +performance on GLUE benchmark by 8.7% and on large-scale QA dataset (SQuAD) by +5.9% compared with without reweighting. AutoScale speeds up training by up to +28%. Our codes are open-sourced. + +
+
+
+
+
+ + ☆ Language-Conditioned Offline RL for Multi-Robot Navigation + + +
+ We present a method for developing navigation policies for multi-robot teams +that interpret and follow natural language instructions. We condition these +policies on embeddings from pretrained Large Language Models (LLMs), and train +them via offline reinforcement learning with as little as 20 minutes of +randomly-collected data. Experiments on a team of five real robots show that +these policies generalize well to unseen commands, indicating an understanding +of the LLM latent space. Our method requires no simulators or environment +models, and produces low-latency control policies that can be deployed directly +to real robots without finetuning. We provide videos of our experiments at +https://sites.google.com/view/llm-marl. + +
+
+
+
+
+ + ☆ Machine Learning for predicting chaotic systems + + +
+ Predicting chaotic dynamical systems is critical in many scientific fields +such as weather prediction, but challenging due to the characterizing sensitive +dependence on initial conditions. Traditional modeling approaches require +extensive domain knowledge, often leading to a shift towards data-driven +methods using machine learning. However, existing research provides +inconclusive results on which machine learning methods are best suited for +predicting chaotic systems. In this paper, we compare different lightweight and +heavyweight machine learning architectures using extensive existing databases, +as well as a newly introduced one that allows for uncertainty quantification in +the benchmark results. We perform hyperparameter tuning based on computational +cost and introduce a novel error metric, the cumulative maximum error, which +combines several desirable properties of traditional metrics, tailored for +chaotic systems. Our results show that well-tuned simple methods, as well as +untuned baseline methods, often outperform state-of-the-art deep learning +models, but their performance can vary significantly with different +experimental setups. These findings underscore the importance of matching +prediction methods to data characteristics and available computational +resources. + +
+
+
+
+
+ + ☆ Hierarchically Disentangled Recurrent Network for Factorizing System + Dynamics of Multi-scale Systems + + +
+ We present a knowledge-guided machine learning (KGML) framework for modeling +multi-scale processes, and study its performance in the context of streamflow +forecasting in hydrology. Specifically, we propose a novel hierarchical +recurrent neural architecture that factorizes the system dynamics at multiple +temporal scales and captures their interactions. This framework consists of an +inverse and a forward model. The inverse model is used to empirically resolve +the system's temporal modes from data (physical model simulations, observed +data, or a combination of them from the past), and these states are then used +in the forward model to predict streamflow. In a hydrological system, these +modes can represent different processes, evolving at different temporal scales +(e.g., slow: groundwater recharge and baseflow vs. fast: surface runoff due to +extreme rainfall). A key advantage of our framework is that once trained, it +can incorporate new observations into the model's context (internal state) +without expensive optimization approaches (e.g., EnKF) that are traditionally +used in physical sciences for data assimilation. Experiments with several river +catchments from the NWS NCRFC region show the efficacy of this ML-based data +assimilation framework compared to standard baselines, especially for basins +that have a long history of observations. Even for basins that have a shorter +observation history, we present two orthogonal strategies of training our FHNN +framework: (a) using simulation data from imperfect simulations and (b) using +observation data from multiple basins to build a global model. We show that +both of these strategies (that can be used individually or together) are highly +effective in mitigating the lack of training data. The improvement in forecast +accuracy is particularly noteworthy for basins where local models perform +poorly because of data sparsity. + +
+
+
+
+
+ + ☆ Quantum Machine Learning Architecture Search via Deep Reinforcement + Learning + + +
+ The rapid advancement of quantum computing (QC) and machine learning (ML) has +given rise to the burgeoning field of quantum machine learning (QML), aiming to +capitalize on the strengths of quantum computing to propel ML forward. Despite +its promise, crafting effective QML models necessitates profound expertise to +strike a delicate balance between model intricacy and feasibility on Noisy +Intermediate-Scale Quantum (NISQ) devices. While complex models offer robust +representation capabilities, their extensive circuit depth may impede seamless +execution on extant noisy quantum platforms. In this paper, we address this +quandary of QML model design by employing deep reinforcement learning to +explore proficient QML model architectures tailored for designated supervised +learning tasks. Specifically, our methodology involves training an RL agent to +devise policies that facilitate the discovery of QML models without +predetermined ansatz. Furthermore, we integrate an adaptive mechanism to +dynamically adjust the learning objectives, fostering continuous improvement in +the agent's learning process. Through extensive numerical simulations, we +illustrate the efficacy of our approach within the realm of classification +tasks. Our proposed method successfully identifies VQC architectures capable of +achieving high classification accuracy while minimizing gate depth. This +pioneering approach not only advances the study of AI-driven quantum circuit +design but also holds significant promise for enhancing performance in the NISQ +era. + +
+
+ comment: Accepted by IEEE International Conference on Quantum Computing and + Engineering - QCE 2024 +
+
+
+
+
+ + ☆ Extreme time extrapolation capabilities and thermodynamic consistency of + physics-inspired Neural Networks for the 3D microstructure evolution of + materials + + +
+ A Convolutional Recurrent Neural Network (CRNN) is trained to reproduce the +evolution of the spinodal decomposition process in three dimensions as +described by the Cahn-Hilliard equation. A specialized, physics-inspired +architecture is proven to provide close accordance between the predicted +evolutions and the ground truth ones obtained via conventional integration +schemes. The method can closely reproduce the evolution of microstructures not +represented in the training set at a fraction of the computational costs. +Extremely long-time extrapolation capabilities are achieved, up to reaching the +theoretically expected equilibrium state of the system, despite the training +set containing only relatively-short, initial phases of the evolution. +Quantitative accordance with the decay rate of the Free energy is also +demonstrated up to late coarsening stages, providing an example of a +data-driven, physically consistent and high-accuracy Machine Learning method +for the long timescale simulation of materials. + +
+
+ comment: 10 pages, 6 main text figures, 2 appendix figures +
+
+
+
+
+ + ☆ Tightening the Evaluation of PAC Bounds Using Formal Verification + Results + + +
+ Probably Approximately Correct (PAC) bounds are widely used to derive +probabilistic guarantees for the generalisation of machine learning models. +They highlight the components of the model which contribute to its +generalisation capacity. However, current state-of-the-art results are loose in +approximating the generalisation capacity of deployed machine learning models. +Consequently, while PAC bounds are theoretically useful, their applicability +for evaluating a model's generalisation property in a given operational design +domain is limited. The underlying classical theory is supported by the idea +that bounds can be tightened when the number of test points available to the +user to evaluate the model increases. Yet, in the case of neural networks, the +number of test points required to obtain bounds of interest is often +impractical even for small problems. + In this paper, we take the novel approach of using the formal verification of +neural systems to inform the evaluation of PAC bounds. Rather than using +pointwise information obtained from repeated tests, we use verification results +on regions around test points. We show that conditioning existing bounds on +verification results leads to a tightening proportional to the underlying +probability mass of the verified region. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Adaptive Self-supervised Robust Clustering for Unstructured Data with + Unknown Cluster Number + + +
+ We introduce a novel self-supervised deep clustering approach tailored for unstructured data without requiring prior knowledge of the number of clusters, termed Adaptive Self-supervised Robust Clustering (ASRC). In particular, ASRC adaptively learns the graph structure and edge weights to capture both local and global structural information. The obtained graph enables us to learn clustering-friendly feature representations by an enhanced graph auto-encoder with a contrastive learning technique. It further leverages the clustering results adaptively obtained by robust continuous clustering (RCC) to generate prototypes for negative sampling, which can further contribute to promoting consistency among positive pairs and enlarging the gap between positive and negative samples. ASRC obtains the final clustering results by applying RCC to the learned feature representations with their consistent graph structure and edge weights. Extensive experiments conducted on seven benchmark datasets demonstrate the efficacy of ASRC and its superior performance over other popular clustering models. Notably, ASRC even outperforms methods that rely on prior knowledge of the number of clusters, highlighting its effectiveness in addressing the challenges of clustering unstructured data.
+
+
+
+
+ + ☆ Diffusion-DICE: In-Sample Diffusion Guidance for Offline Reinforcement + Learning + + +
+ One important property of DIstribution Correction Estimation (DICE) methods is that the solution is the optimal stationary distribution ratio between the optimized and data collection policy. In this work, we show that DICE-based methods can be viewed as a transformation from the behavior distribution to the optimal policy distribution. Based on this, we propose a novel approach, Diffusion-DICE, that directly performs this transformation using diffusion models. We find that the optimal policy's score function can be decomposed into two terms: the behavior policy's score function and the gradient of a guidance term which depends on the optimal distribution ratio. The first term can be obtained from a diffusion model trained on the dataset, and we propose an in-sample learning objective to learn the second term. Due to the multi-modality contained in the optimal policy distribution, the transformation in Diffusion-DICE may guide towards locally optimal modes. We thus generate a few candidate actions and carefully select from them to approach the global optimum. Different from all other diffusion-based offline RL methods, the guide-then-select paradigm in Diffusion-DICE only uses in-sample actions for training and brings minimal error exploitation in the value function. We use a didactic toy example to show how previous diffusion-based methods fail to generate optimal actions due to leveraging these errors and how Diffusion-DICE successfully avoids that. We then conduct extensive experiments on benchmark datasets to show the strong performance of Diffusion-DICE.
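The guide-then-select step described above can be illustrated generically: draw a handful of candidate actions from a sampler anchored to the behavior distribution, then rank them with a critic trained only on in-sample actions. The sketch below uses stand-in functions for the guided diffusion sampler and the critic; the learned distribution-ratio guidance that is the paper's actual contribution is not reproduced.

```python
import numpy as np

rng = np.random.default_rng(0)

def behavior_sampler(state, k):
    """Stand-in for the guided diffusion sampler: k candidate actions per state."""
    return rng.normal(loc=np.tanh(state), scale=0.3, size=(k, state.shape[-1]))

def q_value(state, actions):
    """Stand-in critic trained only on in-sample (dataset) actions."""
    return -np.sum((actions - np.tanh(state)) ** 2, axis=-1)

def guide_then_select(state, k=8):
    candidates = behavior_sampler(state, k)   # "guide": sample near the data manifold
    scores = q_value(state, candidates)       # "select": rank candidates with the critic
    return candidates[np.argmax(scores)]

state = rng.normal(size=3)
print(guide_then_select(state))
```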
+
+ comment: Preprint, under review +
+
+
+
+
+ + ☆ Strong Copyright Protection for Language Models via Adaptive Model + Fusion + + +
+ The risk of language models unintentionally reproducing copyrighted material +from their training data has led to the development of various protective +measures. In this paper, we propose model fusion as an effective solution to +safeguard against copyright infringement. In particular, we introduce +Copyright-Protecting Fusion (CP-Fuse), an algorithm that adaptively combines +language models to minimize the reproduction of protected materials. CP-Fuse is +inspired by the recently proposed Near-Access Free (NAF) framework and +additionally incorporates a desirable balancing property that we demonstrate +prevents the reproduction of memorized training data. Our results show that +CP-Fuse significantly reduces the memorization of copyrighted content while +maintaining high-quality text and code generation. Furthermore, we demonstrate +how CP-Fuse can be integrated with other techniques for enhanced protection. + +
+
+
+
+
+ + ☆ F-KANs: Federated Kolmogorov-Arnold Networks + + +
+ In this paper, we present an innovative federated learning (FL) approach that utilizes Kolmogorov-Arnold Networks (KANs) for classification tasks. By utilizing the adaptive activation capabilities of KANs in a federated framework, we aim to improve classification capabilities while preserving privacy. The study evaluates the performance of federated KANs (F-KANs) compared to traditional Multi-Layer Perceptrons (MLPs) on a classification task. The results show that the F-KANs model significantly outperforms the federated MLP model in terms of accuracy, precision, recall, F1 score, and stability, paving the way for more efficient and privacy-preserving predictive analytics.
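On the federated side, this kind of approach typically reduces to FedAvg-style aggregation of client parameters between local training rounds. A minimal sketch of that aggregation over parameter dictionaries follows; the array names and shapes are placeholders standing in for the per-client KAN parameters (e.g., spline coefficients), not the paper's actual implementation.

```python
import numpy as np

def fedavg(client_params, client_sizes):
    """Weighted average of client parameter dictionaries (FedAvg-style aggregation)."""
    total = sum(client_sizes)
    return {
        name: sum(params[name] * (n / total)
                  for params, n in zip(client_params, client_sizes))
        for name in client_params[0]
    }

# Two toy clients with identically shaped parameter tensors.
client_a = {"layer1": np.ones((4, 3)), "layer2": np.zeros(3)}
client_b = {"layer1": np.full((4, 3), 3.0), "layer2": np.ones(3)}
global_params = fedavg([client_a, client_b], client_sizes=[100, 300])
print(global_params["layer1"][0])  # -> [2.5 2.5 2.5], weighted toward the larger client
```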
+
+ comment: This work has been submitted to IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ UniTTA: Unified Benchmark and Versatile Framework Towards Realistic + Test-Time Adaptation + + +
+ Test-Time Adaptation (TTA) aims to adapt pre-trained models to the target +domain during testing. In reality, this adaptability can be influenced by +multiple factors. Researchers have identified various challenging scenarios and +developed diverse methods to address these challenges, such as dealing with +continual domain shifts, mixed domains, and temporally correlated or imbalanced +class distributions. Despite these efforts, a unified and comprehensive +benchmark has yet to be established. To this end, we propose a Unified +Test-Time Adaptation (UniTTA) benchmark, which is comprehensive and widely +applicable. Each scenario within the benchmark is fully described by a Markov +state transition matrix for sampling from the original dataset. The UniTTA +benchmark considers both domain and class as two independent dimensions of data +and addresses various combinations of imbalance/balance and +i.i.d./non-i.i.d./continual conditions, covering a total of \( (2 \times 3)^2 = +36 \) scenarios. It establishes a comprehensive evaluation benchmark for +realistic TTA and provides a guideline for practitioners to select the most +suitable TTA method. Alongside this benchmark, we propose a versatile UniTTA +framework, which includes a Balanced Domain Normalization (BDN) layer and a +COrrelated Feature Adaptation (COFA) method--designed to mitigate distribution +gaps in domain and class, respectively. Extensive experiments demonstrate that +our UniTTA framework excels within the UniTTA benchmark and achieves +state-of-the-art performance on average. Our code is available at +\url{https://github.com/LeapLabTHU/UniTTA}. + +
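To make the benchmark construction concrete: sampling a test stream from a Markov state transition matrix is enough to realize both i.i.d. and temporally correlated scenarios. Below is a small sketch with an assumed two-state (e.g., two-domain) matrix, where strong self-transitions produce a correlated, non-i.i.d. stream and a uniform matrix recovers the i.i.d. case.

```python
import numpy as np

rng = np.random.default_rng(0)

def sample_stream(T, length, start=0):
    """Sample a state sequence from a Markov transition matrix T (rows sum to 1)."""
    states, s = [], start
    for _ in range(length):
        s = rng.choice(len(T), p=T[s])
        states.append(int(s))
    return states

# Two states (e.g. two domains): strong self-transitions give a temporally
# correlated (non-i.i.d.) stream, a uniform matrix gives an i.i.d. stream.
T_correlated = np.array([[0.95, 0.05],
                         [0.05, 0.95]])
T_iid = np.full((2, 2), 0.5)
print(sample_stream(T_correlated, 20))
print(sample_stream(T_iid, 20))
```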
+
+
+
+
+ + ☆ An Interpretable Rule Creation Method for Black-Box Models based on + Surrogate Trees -- SRules + + +
+ As artificial intelligence (AI) systems become increasingly integrated into critical decision-making processes, the need for transparent and interpretable models has become paramount. In this article, we present a new ruleset creation method based on surrogate decision trees (SRules), designed to improve the interpretability of black-box machine learning models. SRules balances the accuracy, coverage, and interpretability of machine learning models by recursively creating surrogate interpretable decision tree models that approximate the decision boundaries of a complex model. We propose a systematic framework for generating concise and meaningful rules from these surrogate models, allowing stakeholders to understand and trust the AI system's decision-making process. Our approach not only provides interpretable rules, but also quantifies the confidence and coverage of these rules. The proposed model's parameters can be adjusted to trade off precision and coverage against interpretability, allowing a near-perfect fit and high interpretability for selected parts of the model. The results show that SRules improves on other state-of-the-art techniques and introduces the possibility of creating highly interpretable rules for specific sub-parts of the model.
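The core surrogate idea (though not the recursive SRules procedure itself) can be sketched with scikit-learn: fit a shallow decision tree to the black-box model's predictions rather than the true labels, report its fidelity to the black box, and read the rules off the tree. The data and models below are toy assumptions.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_text

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)

# Black-box model whose behaviour we want to explain.
black_box = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# Surrogate tree fit on the black box's *predictions*, not on the true labels.
surrogate = DecisionTreeClassifier(max_depth=3, random_state=0)
surrogate.fit(X, black_box.predict(X))

fidelity = (surrogate.predict(X) == black_box.predict(X)).mean()
print(f"fidelity to the black box: {fidelity:.2f}")
print(export_text(surrogate, feature_names=[f"x{i}" for i in range(4)]))
```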
+
+
+
+
+ + ☆ xAI-Drop: Don't Use What You Cannot Explain + + +
+ Graph Neural Networks (GNNs) have emerged as the predominant paradigm for +learning from graph-structured data, offering a wide range of applications from +social network analysis to bioinformatics. Despite their versatility, GNNs face +challenges such as oversmoothing, lack of generalization and poor +interpretability, which hinder their wider adoption and reliability in critical +applications. Dropping has emerged as an effective paradigm for reducing noise +during training and improving robustness of GNNs. However, existing approaches +often rely on random or heuristic-based selection criteria, lacking a +principled method to identify and exclude nodes that contribute to noise and +over-complexity in the model. In this work, we argue that explainability should +be a key indicator of a model's robustness throughout its training phase. To +this end, we introduce xAI-Drop, a novel topological-level dropping regularizer +that leverages explainability to pinpoint noisy network elements to be excluded +from the GNN propagation mechanism. An empirical evaluation on diverse +real-world datasets demonstrates that our method outperforms current +state-of-the-art dropping approaches in accuracy, effectively reduces +over-smoothing, and improves explanation quality. + +
+
+
+
+
+ + ☆ SalNAS: Efficient Saliency-prediction Neural Architecture Search with + self-knowledge distillation + + +
+ Recent advancements in deep convolutional neural networks have significantly +improved the performance of saliency prediction. However, the manual +configuration of the neural network architectures requires domain knowledge +expertise and can still be time-consuming and error-prone. To solve this, we +propose a new Neural Architecture Search (NAS) framework for saliency +prediction with two contributions. Firstly, a supernet for saliency prediction +is built with a weight-sharing network containing all candidate architectures, +by integrating a dynamic convolution into the encoder-decoder in the supernet, +termed SalNAS. Secondly, despite the fact that SalNAS is highly efficient +(20.98 million parameters), it can suffer from the lack of generalization. To +solve this, we propose a self-knowledge distillation approach, termed Self-KD, +that trains the student SalNAS with the weighted average information between +the ground truth and the prediction from the teacher model. The teacher model, +while sharing the same architecture, contains the best-performing weights +chosen by cross-validation. Self-KD can generalize well without the need to +compute the gradient in the teacher model, enabling an efficient training +system. By utilizing Self-KD, SalNAS outperforms other state-of-the-art +saliency prediction models in most evaluation rubrics across seven benchmark +datasets while being a lightweight model. The code will be available at +https://github.com/chakkritte/SalNAS + +
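The Self-KD target described above is a weighted average of the ground truth and the teacher's prediction. A minimal sketch with an assumed blending weight alpha and a plain MSE loss follows; the paper's exact weighting scheme and saliency-specific losses may differ.

```python
import numpy as np

def self_kd_target(ground_truth, teacher_pred, alpha=0.7):
    """Blend ground-truth saliency maps with the teacher's predictions
    (the weight alpha is an assumption, not the paper's tuned value)."""
    return alpha * ground_truth + (1.0 - alpha) * teacher_pred

def mse(pred, target):
    return float(np.mean((pred - target) ** 2))

rng = np.random.default_rng(0)
gt = rng.random((2, 32, 32))                                    # toy saliency maps
teacher = np.clip(gt + 0.05 * rng.normal(size=gt.shape), 0, 1)  # teacher close to ground truth
student = rng.random(gt.shape)                                  # untrained student prediction
print(mse(student, self_kd_target(gt, teacher)))                # loss the student would minimize
```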
+
+ comment: Published in Engineering Applications of Artificial Intelligence +
+
+
+
+
+ + ☆ Autonomous Bootstrapping of Quantum Dot Devices + + +
+ Semiconductor quantum dots (QD) are a promising platform for multiple +different qubit implementations, all of which are voltage-controlled by +programmable gate electrodes. However, as the QD arrays grow in size and +complexity, tuning procedures that can fully autonomously handle the increasing +number of control parameters are becoming essential for enabling scalability. +We propose a bootstrapping algorithm for initializing a depletion mode QD +device in preparation for subsequent phases of tuning. During bootstrapping, +the QD device functionality is validated, all gates are characterized, and the +QD charge sensor is made operational. We demonstrate the bootstrapping protocol +in conjunction with a coarse tuning module, showing that the combined algorithm +can efficiently and reliably take a cooled-down QD device to a desired global +state configuration in under 8 minutes with a success rate of 96 %. +Importantly, by following heuristic approaches to QD device initialization and +combining the efficient ray-based measurement with the rapid radio-frequency +reflectometry measurements, the proposed algorithm establishes a reference in +terms of performance, reliability, and efficiency against which alternative +algorithms can be benchmarked. + +
+
+ comment: 9 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ RelBench: A Benchmark for Deep Learning on Relational Databases + + +
+ We present RelBench, a public benchmark for solving predictive tasks over +relational databases with graph neural networks. RelBench provides databases +and tasks spanning diverse domains and scales, and is intended to be a +foundational infrastructure for future research. We use RelBench to conduct the +first comprehensive study of Relational Deep Learning (RDL) (Fey et al., 2024), +which combines graph neural network predictive models with (deep) tabular +models that extract initial entity-level representations from raw tables. +End-to-end learned RDL models fully exploit the predictive signal encoded in +primary-foreign key links, marking a significant shift away from the dominant +paradigm of manual feature engineering combined with tabular models. To +thoroughly evaluate RDL against this prior gold-standard, we conduct an +in-depth user study where an experienced data scientist manually engineers +features for each task. In this study, RDL learns better models whilst reducing +human work needed by more than an order of magnitude. This demonstrates the +power of deep learning for solving predictive tasks over relational databases, +opening up many new research opportunities enabled by RelBench. + +
+
+
+
+
+ + ☆ Orca: Ocean Significant Wave Height Estimation with Spatio-temporally + Aware Large Language Models + + +
+ Significant wave height (SWH) is a vital metric in marine science, and accurate SWH estimation is crucial for various applications, e.g., marine energy development, fishery, early warning systems for potential risks, etc. Traditional SWH estimation methods that are based on numerical models and physical theories are hindered by computational inefficiencies. Recently, machine learning has emerged as an appealing alternative to improve accuracy and reduce computational time. However, due to limited observational technology and high costs, the scarcity of real-world data restricts the potential of machine learning models. To overcome these limitations, we propose an ocean SWH estimation framework, namely Orca. Specifically, Orca enhances the limited spatio-temporal reasoning abilities of classic LLMs with a novel spatio-temporally aware encoding module. By segmenting the limited buoy observational data temporally, encoding the buoys' locations spatially, and designing prompt templates, Orca capitalizes on the robust generalization ability of LLMs to estimate significant wave height effectively with limited data. Experimental results on the Gulf of Mexico demonstrate that Orca achieves state-of-the-art performance in SWH estimation.
+
+
+
+
+ + ☆ Denoising ESG: quantifying data uncertainty from missing data with + Machine Learning and prediction intervals + + +
+ Environmental, Social, and Governance (ESG) datasets are frequently plagued +by significant data gaps, leading to inconsistencies in ESG ratings due to +varying imputation methods. This paper explores the application of established +machine learning techniques for imputing missing data in a real-world ESG +dataset, emphasizing the quantification of uncertainty through prediction +intervals. By employing multiple imputation strategies, this study assesses the +robustness of imputation methods and quantifies the uncertainty associated with +missing data. The findings highlight the importance of probabilistic machine +learning models in providing better understanding of ESG scores, thereby +addressing the inherent risks of wrong ratings due to incomplete data. This +approach improves imputation practices to enhance the reliability of ESG +ratings. + +
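One standard way to obtain the prediction intervals discussed above is multiple stochastic imputation: impute the missing cells many times with a model that samples from its posterior and take quantiles over the draws. Below is a sketch with scikit-learn's IterativeImputer on a synthetic ESG-like column; the data, missingness rate, and number of draws are assumptions, not the paper's setup.

```python
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
X[:, 3] = X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.normal(size=200)  # correlated "ESG" column
missing = rng.random(200) < 0.3
X_obs = X.copy()
X_obs[missing, 3] = np.nan

# Multiple stochastic imputations -> per-cell prediction intervals.
draws = np.stack([
    IterativeImputer(sample_posterior=True, random_state=s).fit_transform(X_obs)[:, 3]
    for s in range(20)
])
lo, hi = np.percentile(draws[:, missing], [5, 95], axis=0)
coverage = np.mean((X[missing, 3] >= lo) & (X[missing, 3] <= hi))
print(f"empirical coverage of the 90% intervals: {coverage:.2f}")
```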
+
+
+
+
+ + ☆ Aircraft Trajectory Segmentation-based Contrastive Coding: A Framework + for Self-supervised Trajectory Representation + + +
+ Air traffic trajectory recognition has gained significant interest within the +air traffic management community, particularly for fundamental tasks such as +classification and clustering. This paper introduces Aircraft Trajectory +Segmentation-based Contrastive Coding (ATSCC), a novel self-supervised time +series representation learning framework designed to capture semantic +information in air traffic trajectory data. The framework leverages the +segmentable characteristic of trajectories and ensures consistency within the +self-assigned segments. Intensive experiments were conducted on datasets from +three different airports, totaling four datasets, comparing the learned +representation's performance of downstream classification and clustering with +other state-of-the-art representation learning techniques. The results show +that ATSCC outperforms these methods by aligning with the labels defined by +aeronautical procedures. ATSCC is adaptable to various airport configurations +and scalable to incomplete trajectories. This research has expanded upon +existing capabilities, achieving these improvements independently without +predefined inputs such as airport configurations, maneuvering procedures, or +labeled data. + +
+
+ comment: 16 pages, 7 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ MimiQ: Low-Bit Data-Free Quantization of Vision Transformers + + +
+ Data-free quantization (DFQ) is a technique that creates a lightweight network from its full-precision counterpart without the original training data, often through a synthetic dataset. Although several DFQ methods have been proposed for vision transformer (ViT) architectures, they fail to achieve efficacy in low-bit settings. Examining the existing methods, we identify that their synthetic data produce misaligned attention maps, while those of the real samples are highly aligned. From the observation of aligned attention, we find that aligning attention maps of synthetic data helps to improve the overall performance of quantized ViTs. Motivated by this finding, we devise MimiQ, a novel DFQ method designed for ViTs that focuses on inter-head attention similarity. First, we generate synthetic data by aligning head-wise attention responses in relation to spatial query patches. Then, we apply head-wise structural attention distillation to align the attention maps of the quantized network to those of the full-precision teacher. The experimental results show that the proposed method significantly outperforms baselines, setting a new state-of-the-art performance for data-free ViT quantization.
+
+ comment: Author Preprint +
+
+
+
+
+ + ☆ ImagiNet: A Multi-Content Dataset for Generalizable Synthetic Image + Detection via Contrastive Learning + + +
+ Generative models, such as diffusion models (DMs), variational autoencoders +(VAEs), and generative adversarial networks (GANs), produce images with a level +of authenticity that makes them nearly indistinguishable from real photos and +artwork. While this capability is beneficial for many industries, the +difficulty of identifying synthetic images leaves online media platforms +vulnerable to impersonation and misinformation attempts. To support the +development of defensive methods, we introduce ImagiNet, a high-resolution and +balanced dataset for synthetic image detection, designed to mitigate potential +biases in existing resources. It contains 200K examples, spanning four content +categories: photos, paintings, faces, and uncategorized. Synthetic images are +produced with open-source and proprietary generators, whereas real counterparts +of the same content type are collected from public datasets. The structure of +ImagiNet allows for a two-track evaluation system: i) classification as real or +synthetic and ii) identification of the generative model. To establish a +baseline, we train a ResNet-50 model using a self-supervised contrastive +objective (SelfCon) for each track. The model demonstrates state-of-the-art +performance and high inference speed across established benchmarks, achieving +an AUC of up to 0.99 and balanced accuracy ranging from 86% to 95%, even under +social network conditions that involve compression and resizing. Our data and +code are available at https://github.com/delyan-boychev/imaginet. + +
+
+ comment: 24 pages, 9 figures, 9 tables +
+
+
+
+
+ + ☆ Classification of freshwater snails of the genus \emph{Radomaniola} with + multimodal triplet networks ICML 2024 + + +
+ In this paper, we present our first proposal of a machine learning system for +the classification of freshwater snails of the genus \emph{Radomaniola}. We +elaborate on the specific challenges encountered during system design, and how +we tackled them; namely a small, very imbalanced dataset with a high number of +classes and high visual similarity between classes. We then show how we +employed triplet networks and the multiple input modalities of images, +measurements, and genetic information to overcome these challenges and reach a +performance comparable to that of a trained domain expert. + +
+
+ comment: Spotlight at ICML 2024 AI for Science workshop +
+
+
+
+
+ + ☆ On the Effects of Irrelevant Variables in Treatment Effect Estimation + with Deep Disentanglement ECAI-2024 + + +
+ Estimating treatment effects from observational data is paramount in +healthcare, education, and economics, but current deep disentanglement-based +methods to address selection bias are insufficiently handling irrelevant +variables. We demonstrate in experiments that this leads to prediction errors. +We disentangle pre-treatment variables with a deep embedding method and +explicitly identify and represent irrelevant variables, additionally to +instrumental, confounding and adjustment latent factors. To this end, we +introduce a reconstruction objective and create an embedding space for +irrelevant variables using an attached autoencoder. Instead of relying on +serendipitous suppression of irrelevant variables as in previous deep +disentanglement approaches, we explicitly force irrelevant variables into this +embedding space and employ orthogonalization to prevent irrelevant information +from leaking into the latent space representations of the other factors. Our +experiments with synthetic and real-world benchmark datasets show that we can +better identify irrelevant variables and more precisely predict treatment +effects than previous methods, while prediction quality degrades less when +additional irrelevant variables are introduced. + +
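The orthogonalization idea can be illustrated as a penalty on the cross-covariance between the irrelevant-variable embedding and the other latent factors; driving it to zero discourages irrelevant information from leaking into those factors. A numpy sketch of such a Frobenius-norm penalty follows (the encoders producing the embeddings are assumed and not shown, and this is not necessarily the paper's exact formulation).

```python
import numpy as np

def orthogonality_penalty(Z_irrelevant, Z_factors):
    """Squared Frobenius norm of the cross-covariance between two column-centered
    embedding blocks; a small value means the blocks share little information."""
    A = Z_irrelevant - Z_irrelevant.mean(axis=0)
    B = Z_factors - Z_factors.mean(axis=0)
    C = (A.T @ B) / len(A)  # cross-covariance matrix
    return float(np.sum(C ** 2))

rng = np.random.default_rng(0)
Z_factors = rng.normal(size=(128, 16))                        # e.g. confounder embedding
Z_independent = rng.normal(size=(128, 8))                     # well-separated irrelevant block
Z_leaky = Z_factors[:, :8] + 0.1 * rng.normal(size=(128, 8))  # leaks factor information
print(orthogonality_penalty(Z_independent, Z_factors))        # small
print(orthogonality_penalty(Z_leaky, Z_factors))              # much larger
```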
+
+ comment: Paper is accepted at ECAI-2024 +
+
+
+
+
+ + ☆ Collision Probability Distribution Estimation via Temporal Difference + Learning + + +
+ We introduce CollisionPro, a pioneering framework designed to estimate +cumulative collision probability distributions using temporal difference +learning, specifically tailored to applications in robotics, with a particular +emphasis on autonomous driving. This approach addresses the demand for +explainable artificial intelligence (XAI) and seeks to overcome limitations +imposed by model-based approaches and conservative constraints. We formulate +our framework within the context of reinforcement learning to pave the way for +safety-aware agents. Nevertheless, we assert that our approach could prove +beneficial in various contexts, including a safety alert system or analytical +purposes. A comprehensive examination of our framework is conducted using a +realistic autonomous driving simulator, illustrating its high sample efficiency +and reliable prediction capabilities for previously unseen collision events. +The source code is publicly available. + +
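As a toy stand-in for the general idea of bootstrapping collision probabilities from experience, the following tabular TD(0) sketch estimates, on a symmetric random walk, the probability of eventually hitting a collision state before a safe exit. The paper's framework estimates full cumulative distributions with function approximation, which this sketch does not attempt.

```python
import numpy as np

rng = np.random.default_rng(0)
n_states = 10            # state 0 = collision, state 9 = safe exit (both absorbing)
V = np.zeros(n_states)   # V[s] ~ probability of eventually colliding from state s
alpha = 0.05

for _ in range(20_000):
    s = int(rng.integers(1, n_states - 1))
    while 0 < s < n_states - 1:
        s_next = s + int(rng.choice([-1, 1]))
        if s_next == 0:
            target = 1.0                 # collision observed
        elif s_next == n_states - 1:
            target = 0.0                 # safe exit reached
        else:
            target = V[s_next]           # bootstrap from the next state's estimate
        V[s] += alpha * (target - V[s])  # TD(0) update
        s = s_next

print(np.round(V[1:-1], 2))  # analytic collision probabilities are 1 - s/9 for s = 1..8
```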
+
+ comment: Code: https://github.com/UniBwTAS/CollisionPro +
+
+
+
+
+ + ☆ Classification of Alzheimer's Dementia vs. Healthy subjects by studying + structural disparities in fMRI Time-Series of DMN + + +
+ Time series from different regions of interest (ROI) of default mode network +(DMN) from Functional Magnetic Resonance Imaging (fMRI) can reveal significant +differences between healthy and unhealthy people. Here, we propose the utility +of an existing metric quantifying the lack/presence of structure in a signal +called, "deviation from stochasticity" (DS) measure to characterize +resting-state fMRI time series. The hypothesis is that differences in the level +of structure in the time series can lead to discrimination between the subject +groups. In this work, an autoencoder-based model is utilized to learn efficient +representations of data by training the network to reconstruct its input data. +The proposed methodology is applied on fMRI time series of 50 healthy +individuals and 50 subjects with Alzheimer's Disease (AD), obtained from +publicly available ADNI database. DS measure for healthy fMRI as expected turns +out to be different compared to that of AD. Peak classification accuracy of 95% +was obtained using Gradient Boosting classifier, using the DS measure applied +on 100 subjects. + +
+
+
+
+
+ + ☆ Mixture of Nested Experts: Adaptive Processing of Visual Tokens + + +
+ The visual medium (images and videos) naturally contains a large amount of information redundancy, thereby providing a great opportunity for leveraging efficiency in processing. While Vision Transformer (ViT) based models scale effectively to large data regimes, they fail to capitalize on this inherent redundancy, leading to higher computational costs. Mixture of Experts (MoE) networks demonstrate scalability while maintaining the same inference-time costs, but they come with a larger parameter footprint. We present Mixture of Nested Experts (MoNE), which utilizes a nested structure for experts, wherein individual experts fall on an increasing compute-accuracy curve. Given a compute budget, MoNE learns to dynamically choose tokens in a priority order, and thus redundant tokens are processed through cheaper nested experts. Using this framework, we achieve performance equivalent to the baseline models, while reducing inference-time compute by over two-fold. We validate our approach on standard image and video datasets - ImageNet-21K, Kinetics400, and Something-Something-v2. We further highlight MoNE's adaptability by showcasing its ability to maintain strong performance across different inference-time compute budgets on videos, using only a single trained model.
+
+
+
+
+ + ☆ Can I trust my anomaly detection system? A case study based on + explainable AI + + +
+ Generative models based on variational autoencoders are a popular technique +for detecting anomalies in images in a semi-supervised context. A common +approach employs the anomaly score to detect the presence of anomalies, and it +is known to reach high level of accuracy on benchmark datasets. However, since +anomaly scores are computed from reconstruction disparities, they often obscure +the detection of various spurious features, raising concerns regarding their +actual efficacy. This case study explores the robustness of an anomaly +detection system based on variational autoencoder generative models through the +use of eXplainable AI methods. The goal is to get a different perspective on +the real performances of anomaly detectors that use reconstruction differences. +In our case study we discovered that, in many cases, samples are detected as +anomalous for the wrong or misleading factors. + +
+
+ comment: World Conference on eXplainable Artificial Intelligence +
+
+
+
+
+ + ☆ Inference acceleration for large language models using "stairs" assisted + greedy generation + + +
+ Large Language Models (LLMs) with billions of parameters are known for their impressive predictive capabilities but require substantial resources to run. With their massive rise in popularity, even a small reduction in required resources could have an impact on the environment. On the other hand, smaller models require fewer resources but may sacrifice accuracy. In this work, we propose an implementation of "stairs"-assisted greedy generation. It is a modified assisted generation methodology that makes use of a smaller model's fast generation, a large model's batch prediction, and "stairs" validation in order to achieve a speed-up in prediction generation. Results show between 9.58 and 17.24 percent inference time reduction compared to a stand-alone large LLM prediction in a text generation task without a loss in accuracy.
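The assisted-generation pattern this builds on (draft several tokens with the small model, verify them with the large model, keep the longest agreeing prefix) can be sketched with toy next-token functions; the output always matches the large model's greedy decoding, and the speed-up comes from verifying draft positions in one batched call. The "stairs" validation step itself is the paper's addition and is not reproduced here.

```python
VOCAB = "abcdefgh"

def small_next(ctx):
    """Toy draft model: just repeats the last token (often wrong on purpose)."""
    return ctx[-1]

def large_next(ctx):
    """Toy target model: deterministically cycles through the vocabulary."""
    return VOCAB[(VOCAB.index(ctx[-1]) + 1) % len(VOCAB)]

def assisted_greedy(prompt, draft_len=4, max_new=8):
    seq = list(prompt)
    while len(seq) - len(prompt) < max_new:
        # 1) Draft a short continuation with the cheap model.
        draft, ctx = [], list(seq)
        for _ in range(draft_len):
            t = small_next(ctx)
            draft.append(t)
            ctx.append(t)
        # 2) Verify the draft with the large model (one batched pass in practice).
        ctx = list(seq)
        for t in draft:
            v = large_next(ctx)
            seq.append(v)    # always keep the large model's greedy choice
            ctx.append(v)
            if v != t:       # first mismatch: discard the rest of the draft
                break
    return "".join(seq[:len(prompt) + max_new])

print(assisted_greedy("a"))  # identical to greedy decoding with the large model alone
```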
+
+ comment: Accepted at the 29th International Conference on Information Society + and University Studies (IVUS 2024) +
+
+
+
+
+ + ☆ Noise-Resilient Unsupervised Graph Representation Learning via Multi-Hop + Feature Quality Estimation CIKM 2024 + + +
+ Unsupervised graph representation learning (UGRL) based on graph neural networks (GNNs) has received increasing attention owing to its efficacy in handling graph-structured data. However, existing UGRL methods ideally assume that the node features are noise-free, which makes them fail to distinguish between useful information and noise when applied to real data with noisy features, thus affecting the quality of learned representations. This urges us to take noisy node features into account in real-world UGRL. With empirical analysis, we reveal that feature propagation, the essential operation in GNNs, acts as a "double-edged sword" in handling noisy features - it can both denoise and diffuse noise, leading to varying feature quality across nodes, even within the same node at different hops. Building on this insight, we propose a novel UGRL method based on Multi-hop feature Quality Estimation (MQE for short). Unlike most UGRL models that directly utilize propagation-based GNNs to generate representations, our approach aims to learn representations through estimating the quality of propagated features at different hops. Specifically, we introduce a Gaussian model that utilizes a learnable "meta-representation" as a condition to estimate the expectation and variance of multi-hop propagated features via neural networks. In this way, the "meta-representation" captures the semantic and structural information underlying multiple propagated features but is naturally less susceptible to interference by noise, thereby serving as a high-quality node representation beneficial for downstream tasks. Extensive experiments on multiple real-world datasets demonstrate the effectiveness of MQE in learning reliable node representations in scenarios with diverse types of feature noise.
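The multi-hop propagated features whose per-hop quality MQE estimates are just repeated applications of a normalized adjacency to the node feature matrix. A small numpy sketch of that propagation follows; the symmetric normalization is the common GNN convention and an assumption here, while the Gaussian quality estimator and the learnable meta-representation are the paper's learned components and are not shown.

```python
import numpy as np

def normalized_adj(A):
    """Symmetric normalization D^{-1/2} (A + I) D^{-1/2}, a standard GNN propagation matrix."""
    A_hat = A + np.eye(len(A))
    d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
    return A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]

def multi_hop_features(A, X, hops=3):
    """Return [X, PX, P^2 X, ...]: the propagated features whose quality is estimated per hop."""
    P = normalized_adj(A)
    feats, H = [X], X
    for _ in range(hops):
        H = P @ H
        feats.append(H)
    return feats

A = np.array([[0, 1, 0, 0],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
X = np.random.default_rng(0).normal(size=(4, 2))
for hop, H in enumerate(multi_hop_features(A, X)):
    print(hop, np.round(H[0], 3))
```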
+
+ comment: Accepted by CIKM 2024. 11 pages, 8 figures +
+
+
+
+
+ + ☆ Practical and Robust Safety Guarantees for Advanced Counterfactual + Learning to Rank CIKM 2024 + + +
+ Counterfactual learning to rank (CLTR) can be risky; various circumstances can cause it to produce sub-optimal models that hurt performance when deployed. Safe CLTR was introduced to mitigate these risks when using inverse propensity scoring to correct for position bias. However, the existing safety measure for CLTR is not applicable to state-of-the-art CLTR: it cannot handle trust bias, and its guarantees rely on specific assumptions about user behavior. Our contributions are two-fold. First, we generalize the existing safe CLTR approach to make it applicable to state-of-the-art doubly robust (DR) CLTR and trust bias. Second, we propose a novel approach, proximal ranking policy optimization (PRPO), that provides safety in deployment without assumptions about user behavior. PRPO removes incentives for learning ranking behavior that is too dissimilar to a safe ranking model. Thereby, PRPO imposes a limit on how much learned models can degrade performance metrics, without relying on any specific user assumptions. Our experiments show that both our novel safe doubly robust method and PRPO provide higher performance than the existing safe inverse propensity scoring approach. However, when circumstances are unexpected, the safe doubly robust approach can become unsafe and cause detrimental performance. In contrast, PRPO always maintains safety, even in maximally adversarial situations. By avoiding assumptions, PRPO is the first method with unconditional safety in deployment that translates to robust safety for real-world applications.
+
&#13;
+
+ comment: Full paper at CIKM 2024 +
+
+
+
+
+ + ☆ Boosting Graph Foundation Model from Structural Perspective + + +
+ Graph foundation models have recently attracted significant attention due to their strong generalizability. Although existing methods resort to language models to learn unified semantic representations across domains, they disregard the unique structural characteristics of graphs from different domains. To address the problem, in this paper we boost graph foundation models from a structural perspective and propose BooG. The model constructs virtual super nodes to unify the structural characteristics of graph data from different domains. Specifically, the super nodes fuse the information of anchor nodes and class labels, where each anchor node captures the information of a node or a graph instance to be classified. Instead of using the raw graph structure, we connect super nodes to all nodes within their neighborhood by virtual edges. This new structure allows for effective information aggregation while unifying cross-domain structural characteristics. Additionally, we propose a novel pre-training objective based on contrastive learning, which learns more expressive representations for graph data and generalizes effectively to different domains and downstream tasks. Experimental results on various datasets and tasks demonstrate the superior performance of BooG. We provide our code and data here: https://anonymous.4open.science/r/BooG-EE42/.
+
&#13;
+
+
+
+
+ + ☆ Aero-Nef: Neural Fields for Rapid Aircraft Aerodynamics Simulations + + +
+ This paper presents a methodology to learn surrogate models of steady-state fluid dynamics simulations on meshed domains, based on Implicit Neural Representations (INRs). The proposed models can be applied directly to unstructured domains for different flow conditions, handle non-parametric 3D geometric variations, and generalize to unseen shapes at test time. The coordinate-based formulation naturally leads to robustness with respect to discretization, allowing an excellent trade-off between computational cost (memory footprint and training time) and accuracy. The method is demonstrated on two industrially relevant applications: a RANS dataset of the two-dimensional compressible flow over a transonic airfoil and a dataset of the surface pressure distribution over 3D wings, including shape, inflow condition, and control surface deflection variations. On the considered test cases, our approach achieves a test error more than three times lower and significantly improves generalization error on unseen geometries compared to state-of-the-art Graph Neural Network architectures. Remarkably, the method can perform inference five orders of magnitude faster than the high-fidelity solver on the RANS transonic airfoil dataset. Code is available at https://gitlab.isae-supaero.fr/gi.catalani/aero-nepf
+
&#13;
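A minimal coordinate-based network in the spirit of implicit neural representations is sketched below: it maps a mesh-node coordinate plus a global condition vector (e.g. inflow parameters) to field values. It is not the Aero-Nef architecture; the Fourier-feature encoding and all sizes are assumptions.

```python
# Coordinate-based (INR-style) surrogate sketch; sizes are illustrative.
import torch
import torch.nn as nn

class CoordinateNet(nn.Module):
    def __init__(self, coord_dim=2, cond_dim=3, hidden=128, out_dim=4, n_freq=8):
        super().__init__()
        self.freqs = 2.0 ** torch.arange(n_freq)             # Fourier feature frequencies
        in_dim = coord_dim * 2 * n_freq + cond_dim
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.GELU(),
            nn.Linear(hidden, hidden), nn.GELU(),
            nn.Linear(hidden, out_dim),
        )

    def forward(self, coords, cond):
        # coords: (N, coord_dim) node positions; cond: (N, cond_dim) flow conditions
        ang = coords.unsqueeze(-1) * self.freqs              # (N, coord_dim, n_freq)
        feats = torch.cat([ang.sin(), ang.cos()], dim=-1).flatten(1)
        return self.net(torch.cat([feats, cond], dim=-1))

model = CoordinateNet()
coords = torch.rand(1000, 2)      # arbitrary (unstructured) node locations
cond = torch.rand(1000, 3)        # e.g. Mach number, angle of attack, ...
pred = model(coords, cond)        # predicted field values at each node
```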
+
+ comment: 32 pages +
+
+
+
+
+ + ☆ Sentiment Analysis of Lithuanian Online Reviews Using Large Language + Models + + +
+ Sentiment analysis is a widely researched area within Natural Language Processing (NLP), attracting significant interest due to the advent of automated solutions. Despite this, the task remains challenging because of the inherent complexity of languages and the subjective nature of sentiments. It is even more challenging for less-studied and less-resourced languages such as Lithuanian. Our review of existing Lithuanian NLP research reveals that traditional machine learning methods and classification algorithms have limited effectiveness for the task. In this work, we address sentiment analysis of Lithuanian five-star-based online reviews from multiple domains that we collect and clean. We apply transformer models to this task for the first time, exploring the capabilities of pre-trained multilingual Large Language Models (LLMs), specifically focusing on fine-tuning BERT and T5 models. Given the inherent difficulty of the task, the fine-tuned models perform quite well, especially when the sentiments themselves are less ambiguous, reaching testing accuracies of 80.74% and 89.61% on the most popular one- and five-star reviews, respectively. They significantly outperform the current commercial state-of-the-art general-purpose LLM, GPT-4. We openly share our fine-tuned LLMs online.
+
&#13;
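A minimal sketch of the kind of fine-tuning described above, using the standard Hugging Face transformers API; the checkpoint, the toy two-example dataset, and the hyperparameters are illustrative assumptions rather than the paper's setup.

```python
# Fine-tuning a multilingual BERT for 5-class review sentiment (illustrative).
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

model_name = "bert-base-multilingual-cased"          # assumed multilingual checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

raw = Dataset.from_dict({"text": ["puikus viešbutis", "labai prastas aptarnavimas"],
                         "label": [4, 0]})           # toy examples, star ratings 0..4

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

ds = raw.map(tokenize, batched=True)
args = TrainingArguments(output_dir="lt-sentiment", num_train_epochs=1,
                         per_device_train_batch_size=16, learning_rate=2e-5)
trainer = Trainer(model=model, args=args, train_dataset=ds)
trainer.train()
```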
+
+ comment: Accepted at the 29th International Conference on Information Society + and University Studies (IVUS 2024) +
+
+
+
+
+ + ☆ Efficient Shield Synthesis via State-Space Transformation + + +
+ We consider the problem of synthesizing safety strategies for control systems, also known as shields. Since the state space is infinite, shields are typically computed over a finite-state abstraction, with the most common abstraction being a rectangular grid. However, for many systems, such a grid does not align well with the safety property or the system dynamics. As a result, a coarse grid is rarely sufficient, while a fine grid is typically computationally infeasible to obtain. In this paper, we show that appropriate state-space transformations can still allow the use of a coarse grid at almost no computational overhead. We demonstrate in three case studies that our transformation-based synthesis outperforms a standard synthesis by several orders of magnitude. In the first two case studies, we use domain knowledge to select a suitable transformation. In the third case study, we instead report on engineering a transformation without domain knowledge.
+
&#13;
+
+
+
+
+ + ☆ BEExAI: Benchmark to Evaluate Explainable AI + + +
+ Recent research in explainability has given rise to numerous post-hoc +attribution methods aimed at enhancing our comprehension of the outputs of +black-box machine learning models. However, evaluating the quality of +explanations lacks a cohesive approach and a consensus on the methodology for +deriving quantitative metrics that gauge the efficacy of explainability +post-hoc attribution methods. Furthermore, with the development of increasingly +complex deep learning models for diverse data applications, the need for a +reliable way of measuring the quality and correctness of explanations is +becoming critical. We address this by proposing BEExAI, a benchmark tool that +allows large-scale comparison of different post-hoc XAI methods, employing a +set of selected evaluation metrics. + +
+
+
+
+
+ + ☆ Making Multi-Axis Gaussian Graphical Models Scalable to Millions of + Samples and Features + + +
+ Gaussian graphical models can be used to extract conditional dependencies between the features of a dataset. This is often done by making an independence assumption about the samples, but this assumption is rarely satisfied in reality. However, state-of-the-art approaches that avoid this assumption are not scalable, with $O(n^3)$ runtime and $O(n^2)$ space complexity. In this paper, we introduce a method that has $O(n^2)$ runtime and $O(n)$ space complexity, without assuming independence. We validate our model on both synthetic and real-world datasets, showing that our method's accuracy is comparable to that of prior work. We demonstrate that our approach can be used on unprecedentedly large datasets, such as a real-world 1,000,000-cell scRNA-seq dataset; this was impossible with previous approaches. Our method maintains the flexibility of prior work, such as the ability to handle multi-modal tensor-variate datasets and the ability to work with data of arbitrary marginal distributions. An additional advantage of our method is that, unlike prior work, our hyperparameters are easily interpretable.
+
&#13;
+
+ comment: 39 pages (48 with appendix+references), 8 figures, 7 tables +
+
+
+
+
+ + ☆ Yucca: A Deep Learning Framework For Medical Image Analysis + + +
+ Medical image analysis using deep learning frameworks has advanced healthcare +by automating complex tasks, but many existing frameworks lack flexibility, +modularity, and user-friendliness. To address these challenges, we introduce +Yucca, an open-source AI framework available at +https://github.com/Sllambias/yucca, designed specifically for medical imaging +applications and built on PyTorch and PyTorch Lightning. Yucca features a +three-tiered architecture: Functional, Modules, and Pipeline, providing a +comprehensive and customizable solution. Evaluated across diverse tasks such as +cerebral microbleeds detection, white matter hyperintensity segmentation, and +hippocampus segmentation, Yucca achieves state-of-the-art results, +demonstrating its robustness and versatility. Yucca offers a powerful, +flexible, and user-friendly platform for medical image analysis, inviting +community contributions to advance its capabilities and impact. + +
+
+
+
+
+ + ☆ OpenUAS: Embeddings of Cities in Japan with Anchor Data for Cross-city + Analysis of Area Usage Patterns + + +
+ We publicly release OpenUAS, a dataset of area embeddings based on urban +usage patterns, including embeddings for over 1.3 million 50-meter square +meshes covering a total area of 3,300 square kilometers. This dataset is +valuable for analyzing area functions in fields such as market analysis, urban +planning, transportation infrastructure, and infection prediction. It captures +the characteristics of each area in the city, such as office districts and +residential areas, by employing an area embedding technique that utilizes +location information typically obtained by GPS. Numerous area embedding +techniques have been proposed, and while the public release of such embedding +datasets is technically feasible, it has not been realized. One of the +obstacles has been the integration of data from different cities and periods +into a unified space without sharing raw location data. We address this issue +by developing an anchoring method that establishes anchors within a shared +embedding space. We publicly release this anchor dataset along with area +embedding datasets from several periods in eight major Japanese cities. This +dataset allows users to analyze urban usage patterns in Japanese cities and +embed their urban dataset into the same embedding space using the anchoring +method. Our key contributions include the development of the anchoring method, +releasing area embedding datasets for Japanese cities, and providing tools for +effective data utilization. + +
+
+
+
+
+ + ☆ Deep Image Priors for Magnetic Resonance Fingerprinting with pretrained + Bloch-consistent denoising autoencoders + + +
+ The estimation of multi-parametric quantitative maps from Magnetic Resonance Fingerprinting (MRF) compressed sampled acquisitions, albeit successful, remains a challenge due to the high undersampling rate and the artifacts that naturally occur during image reconstruction. Whilst state-of-the-art DL methods can successfully address the task, to fully exploit their capabilities they often require training on a paired dataset, in an area where ground truth is seldom available. In this work, we propose a method that combines a deep image prior (DIP) module with a Bloch-consistency-enforcing autoencoder and, without requiring ground truth, can tackle the problem, resulting in a method that is faster than DIP-MRF with equivalent or better accuracy.
+
&#13;
+
+ comment: 4 pages, 3 figures 1 table, presented at ISBI 2024 +
+
+
+
+
+ + ☆ Imitation Learning for Intra-Day Power Grid Operation through Topology + Actions + + +
+ Power grid operation is becoming increasingly complex due to the increase in generation of renewable energy. The recent series of Learning To Run a Power Network (L2RPN) competitions have encouraged the use of artificial agents to assist human dispatchers in operating power grids. In this paper we study the performance of imitation learning for day-ahead power grid operation through topology actions. In particular, we consider two rule-based expert agents: a greedy agent and an N-1 agent. While the latter is more computationally expensive since it takes N-1 safety considerations into account, it exhibits a much higher operational performance. We train a fully-connected neural network (FCNN) on expert state-action pairs and evaluate it in two ways. First, we find that classification accuracy is limited despite extensive hyperparameter tuning, due to class imbalance and class overlap. Second, as a power system agent, the FCNN performs only slightly worse than the expert agents. Furthermore, hybrid agents, which incorporate minimal additional simulations, match the expert agents' performance with significantly lower computational cost. Consequently, imitation learning shows promise for developing fast, high-performing power grid agents, motivating its further exploration in future L2RPN studies.
+
&#13;
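The imitation-learning component reduces to behavior cloning: a fully-connected network trained with cross-entropy on expert state-action pairs. The sketch below assumes placeholder dimensions and random stand-in data instead of the L2RPN expert dataset.

```python
# Behavior cloning sketch: FCNN trained on expert (state, topology-action) pairs.
import torch
import torch.nn as nn

state_dim, n_actions = 128, 50                      # illustrative sizes
policy = nn.Sequential(nn.Linear(state_dim, 256), nn.ReLU(),
                       nn.Linear(256, 256), nn.ReLU(),
                       nn.Linear(256, n_actions))
opt = torch.optim.Adam(policy.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

states = torch.randn(1024, state_dim)               # stand-in for expert states
actions = torch.randint(0, n_actions, (1024,))      # stand-in for expert actions

for epoch in range(5):
    logits = policy(states)
    loss = loss_fn(logits, actions)                  # imitate the expert's choices
    opt.zero_grad(); loss.backward(); opt.step()
```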
+
+ comment: To be presented at the Machine Learning for Sustainable Power Systems + 2024 workshop and to be published in the corresponding Springer + Communications in Computer and Information Science proceedings +
+
+
+
+
+ + ☆ Anomalous State Sequence Modeling to Enhance Safety in Reinforcement + Learning + + +
+ The deployment of artificial intelligence (AI) in decision-making applications requires ensuring an appropriate level of safety and reliability, particularly in changing environments that contain a large number of unknown observations. To address this challenge, we propose a novel safe reinforcement learning (RL) approach that utilizes anomalous state sequences to enhance RL safety. Our proposed solution, Safe Reinforcement Learning with Anomalous State Sequences (AnoSeqs), consists of two stages. First, we train an agent in a non-safety-critical offline 'source' environment to collect safe state sequences. Next, we use these safe sequences to build an anomaly detection model that can detect potentially unsafe state sequences in a 'target' safety-critical environment where failures can have high costs. The estimated risk from the anomaly detection model is utilized to train a risk-averse RL policy in the target environment; this involves adjusting the reward function to penalize the agent for visiting anomalous states deemed unsafe by our anomaly model. In experiments on multiple safety-critical benchmarking environments, including self-driving cars, our solution approach successfully learns safer policies and shows that sequential anomaly detection can provide an effective supervisory signal for training safety-aware RL agents.
+
&#13;
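The reward-shaping step can be illustrated with a simple environment wrapper that subtracts an anomaly-score penalty from the reward. The scorer and penalty weight below are assumptions, not the paper's implementation.

```python
# Reward shaping with an external anomaly scorer (illustrative sketch).
import gymnasium as gym

class AnomalyPenaltyWrapper(gym.Wrapper):
    def __init__(self, env, anomaly_score, weight=1.0):
        super().__init__(env)
        self.anomaly_score = anomaly_score    # callable: observation -> score in [0, 1]
        self.weight = weight

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        penalty = self.weight * self.anomaly_score(obs)   # penalize "anomalous" states
        return obs, reward - penalty, terminated, truncated, info

# toy scorer: distance of the cart from the track centre treated as an "anomaly"
env = AnomalyPenaltyWrapper(gym.make("CartPole-v1"),
                            anomaly_score=lambda o: float(abs(o[0]) / 2.4))
obs, _ = env.reset()
obs, r, term, trunc, _ = env.step(env.action_space.sample())
```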
+
+
+
+
+ + ☆ AI-Powered Energy algorithmic Trading: Integrating Hidden Markov Models + with Neural Networks + + +
+ In the field of quantitative finance, machine learning methods have become +essential for alpha generation. This paper presents a pioneering method that +uniquely combines Hidden Markov Models (HMM) and neural networks, creating a +dual-model alpha generation system integrated with Black-Litterman portfolio +optimization. The methodology, implemented on the QuantConnect platform, aims +to predict future price movements and optimize trading strategies. +Specifically, it filters for highly liquid, top-cap energy stocks to ensure +stable and predictable performance while also accounting for broker payments. +QuantConnect was selected because of its robust framework and to guarantee +experimental reproducibility. The algorithm achieved a 31% return between June +1, 2023, and January 1, 2024, with a Sharpe ratio of 1.669, demonstrating its +potential. The findings suggest significant improvements in trading strategy +performance through the combined use of the HMM and neural networks. This study +explores the architecture of the algorithm, data pre-processing techniques, +model training procedures, and performance evaluation, highlighting its +practical applicability and effectiveness in real-world trading environments. +The full code and backtesting data are available under the MIT license. + +
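The HMM half of such a system can be sketched with hmmlearn: fit a Gaussian HMM on returns and read off the inferred regime. The two-regime setup and synthetic data are assumptions; the neural-network and Black-Litterman components are not shown.

```python
# Regime detection with a Gaussian HMM on synthetic daily returns (illustrative).
import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.default_rng(0)
returns = np.concatenate([rng.normal(0.001, 0.01, 250),     # calm regime
                          rng.normal(-0.002, 0.03, 250)])   # volatile regime
X = returns.reshape(-1, 1)

hmm = GaussianHMM(n_components=2, covariance_type="full", n_iter=200, random_state=0)
hmm.fit(X)
regimes = hmm.predict(X)                   # most likely regime per day
bullish = int(np.argmax(hmm.means_.ravel()))
signal = (regimes == bullish).astype(int)  # toy long/flat signal from the regimes
```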
+
+ comment: 14 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Online Multi-Source Domain Adaptation through Gaussian Mixtures and + Dataset Dictionary Learning + + +
+ This paper addresses the challenge of online multi-source domain adaptation +(MSDA) in transfer learning, a scenario where one needs to adapt multiple, +heterogeneous source domains towards a target domain that comes in a stream. We +introduce a novel approach for the online fit of a Gaussian Mixture Model +(GMM), based on the Wasserstein geometry of Gaussian measures. We build upon +this method and recent developments in dataset dictionary learning for +proposing a novel strategy in online MSDA. Experiments on the challenging +Tennessee Eastman Process benchmark demonstrate that our approach is able to +adapt \emph{on the fly} to the stream of target domain data. Furthermore, our +online GMM serves as a memory, representing the whole stream of data. + +
+
+ comment: 6 pages, 3 figures, accepted at the IEEE International Workshop on + Machine Learning for Signal Processing 2024 +
+
+
+
+
+ + ☆ Quantum Long Short-Term Memory for Drug Discovery + + +
+ Quantum computing combined with machine learning (ML) is an extremely promising research area, with numerous studies demonstrating that quantum machine learning (QML) is expected to solve scientific problems more effectively than classical ML. In this work, we successfully apply QML to drug discovery, showing that QML can significantly improve model performance and achieve faster convergence compared to classical ML. Moreover, we demonstrate that the accuracy of the QML model improves as the number of qubits increases. We also introduce noise to the QML model and find that it has little effect on our experimental conclusions, illustrating the high robustness of the QML model. This work highlights the potential of quantum computing to yield significant benefits for scientific advancement as qubit quantity and quality improve in the future.
+
&#13;
+
+
+
+
+ + ☆ BackdoorBench: A Comprehensive Benchmark and Analysis of Backdoor + Learning NeurIPS + + +
+ As an emerging approach to explore the vulnerability of deep neural networks (DNNs), backdoor learning has attracted increasing interest in recent years, and many seminal backdoor attack and defense algorithms are being developed successively or concurrently, in a rapid arms race. However, mainly due to the diverse settings and the difficulties of implementation and reproducibility of existing works, there is a lack of a unified and standardized benchmark of backdoor learning, causing unfair comparisons or unreliable conclusions (e.g., misleading, biased or even false conclusions). Consequently, it is difficult to evaluate the current progress and design the future development roadmap of this literature. To alleviate this dilemma, we build a comprehensive benchmark of backdoor learning called BackdoorBench. Our benchmark makes three valuable contributions to the research community. 1) We provide an integrated implementation of state-of-the-art (SOTA) backdoor learning algorithms (currently including 20 attack and 32 defense algorithms), based on an extensible modular codebase. 2) We conduct comprehensive evaluations with 5 poisoning ratios, based on 4 models and 4 datasets, leading to 11,492 pairs of attack-against-defense evaluations in total. 3) Based on the above evaluations, we present abundant analyses from 10 perspectives via 18 useful analysis tools, and provide several inspiring insights about backdoor learning. We hope that our efforts can build a solid foundation for backdoor learning, facilitating researchers in investigating existing algorithms, developing more innovative algorithms, and exploring the intrinsic mechanisms of backdoor learning. Finally, we have created a user-friendly website at http://backdoorbench.com, which collects all important information about BackdoorBench, including the codebase, docs, leaderboard, and model zoo.
+
&#13;
+
+ comment: Substantial extensions based on our previous conference version + "Backdoorbench: A comprehensive benchmark of backdoor learning" published at + NeurIPS D&B Track 2022. 20 backdoor attack algorithms, 32 backdoor defense + algorithms, 11000+ pairs of attack-against-defense evaluations, 10 analyses, + 18 analysis tools +
+
+
+
+
+ + ☆ Detecting and Understanding Vulnerabilities in Language Models via + Mechanistic Interpretability + + +
+ Large Language Models (LLMs), characterized by being trained on broad amounts of data in a self-supervised manner, have shown impressive performance across a wide range of tasks. Indeed, their generative abilities have aroused interest in the application of LLMs across a wide range of contexts. However, neural networks in general, and LLMs in particular, are known to be vulnerable to adversarial attacks, where an imperceptible change to the input can mislead the output of the model. This is a serious concern that impedes the use of LLMs in high-stakes applications, such as healthcare, where a wrong prediction can imply serious consequences. Even though there are many efforts to make LLMs more robust to adversarial attacks, there are almost no works that study \emph{how} and \emph{where} the vulnerabilities that make LLMs prone to adversarial attacks arise. Motivated by these facts, we explore how to localize and understand vulnerabilities, and propose a method, based on Mechanistic Interpretability (MI) techniques, to guide this process. Specifically, this method enables us to detect vulnerabilities related to a concrete task by (i) obtaining the subset of the model that is responsible for that task, (ii) generating adversarial samples for that task, and (iii) using MI techniques together with the previous samples to discover and understand the possible vulnerabilities. We showcase our method on a pretrained GPT-2 Small model carrying out the task of predicting 3-letter acronyms to demonstrate its effectiveness in locating and understanding concrete vulnerabilities of the model.
+
&#13;
+
+
+
+
+ + ☆ RNACG: A Universal RNA Sequence Conditional Generation model based on + Flow-Matching + + +
+ RNA plays a crucial role in diverse life processes. In contrast to the rapid +advancement of protein design methods, the work related to RNA is more +demanding. Most current RNA design approaches concentrate on specified target +attributes and rely on extensive experimental searches. However, these methods +remain costly and inefficient due to practical limitations. In this paper, we +characterize all sequence design issues as conditional generation tasks and +offer parameterized representations for multiple problems. For these problems, +we have developed a universal RNA sequence generation model based on flow +matching, namely RNACG. RNACG can accommodate various conditional inputs and is +portable, enabling users to customize the encoding network for conditional +inputs as per their requirements and integrate it into the generation network. +We evaluated RNACG in RNA 3D structure inverse folding, 2D structure inverse +folding, family-specific sequence generation, and 5'UTR translation efficiency +prediction. RNACG attains superior or competitive performance on these tasks +compared with other methods. RNACG exhibits extensive applicability in sequence +generation and property prediction tasks, providing a novel approach to RNA +sequence design and potential methods for simulation experiments with +large-scale RNA sequence data. + +
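Flow matching itself can be summarized by the following generic conditional training step, which regresses a velocity network onto the straight-line displacement between noise and data. This is not RNACG; the shapes and toy tensors are placeholders.

```python
# Generic conditional flow-matching training step (illustrative, not RNACG).
import torch
import torch.nn as nn

dim, cond_dim = 32, 8
vel_net = nn.Sequential(nn.Linear(dim + cond_dim + 1, 128), nn.SiLU(),
                        nn.Linear(128, dim))
opt = torch.optim.Adam(vel_net.parameters(), lr=1e-3)

def fm_step(x1, cond):
    x0 = torch.randn_like(x1)                       # noise sample
    t = torch.rand(x1.size(0), 1)
    xt = (1 - t) * x0 + t * x1                      # linear interpolation path
    target_v = x1 - x0                              # its constant velocity
    pred_v = vel_net(torch.cat([xt, cond, t], dim=1))
    loss = ((pred_v - target_v) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

x1 = torch.randn(64, dim)                           # stand-in for encoded sequences
cond = torch.randn(64, cond_dim)                    # stand-in for conditional inputs
for _ in range(10):
    fm_step(x1, cond)
```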
+
+
+
+
+ + ☆ Federated Learning based Latent Factorization of Tensors for + Privacy-Preserving QoS Prediction + + +
+ In applications related to big data and service computing, dynamic connections tend to be encountered, especially in the dynamic data of user-perspective quality of service (QoS) in Web services. These data are transformed into high-dimensional and incomplete (HDI) tensors which include abundant temporal pattern information. Latent factorization of tensors (LFT) is an extremely efficient and typical approach for extracting such patterns from an HDI tensor. However, current LFT models require the QoS data to be maintained in a central place (e.g., a central server), which is impossible for increasingly privacy-sensitive users. To address this problem, this article designs a federated learning approach based on latent factorization of tensors (FL-LFT). It builds a data-density-oriented federated learning model to enable isolated users to collaboratively train a global LFT model while protecting users' privacy. Extensive experiments on a QoS dataset collected from the real world verify that FL-LFT shows a remarkable increase in prediction accuracy when compared to state-of-the-art federated learning (FL) approaches.
+
&#13;
+
+
+
+
+ + ☆ Analyzing and reducing the synthetic-to-real transfer gap in Music + Information Retrieval: the task of automatic drum transcription + + +
+ Automatic drum transcription is a critical tool in Music Information Retrieval for extracting and analyzing the rhythm of a music track, but it is limited by the size of the datasets available for training. A popular method used to increase the amount of data is to generate it synthetically from music scores rendered with virtual instruments. This method can produce a virtually infinite quantity of tracks, but empirical evidence shows that models trained on previously created synthetic datasets do not transfer well to real tracks. In this work, besides increasing the amount of data, we identify and evaluate three more strategies that practitioners can use to improve the realism of the generated data and, thus, narrow the synthetic-to-real transfer gap. To explore their efficacy, we used them to build a new synthetic dataset and then measured how the performance of a model scales and, specifically, at what value it stagnates when increasing the number of training tracks for different datasets. By doing this, we show that the aforementioned strategies contribute to making our dataset the one with the most realistic data distribution and the lowest synthetic-to-real transfer gap among the synthetic datasets we evaluated. We conclude by highlighting the limits of training with infinite data in drum transcription and we show how they can be overcome.
+
&#13;
+
+ comment: 21 pages, 4 figures +
+
+
+
+
+ + ☆ Survey and Taxonomy: The Role of Data-Centric AI in Transformer-Based + Time Series Forecasting + + +
+ Alongside the continuous process of improving AI performance through the +development of more sophisticated models, researchers have also focused their +attention to the emerging concept of data-centric AI, which emphasizes the +important role of data in a systematic machine learning training process. +Nonetheless, the development of models has also continued apace. One result of +this progress is the development of the Transformer Architecture, which +possesses a high level of capability in multiple domains such as Natural +Language Processing (NLP), Computer Vision (CV) and Time Series Forecasting +(TSF). Its performance is, however, heavily dependent on input data +preprocessing and output data evaluation, justifying a data-centric approach to +future research. We argue that data-centric AI is essential for training AI +models, particularly for transformer-based TSF models efficiently. However, +there is a gap regarding the integration of transformer-based TSF and +data-centric AI. This survey aims to pin down this gap via the extensive +literature review based on the proposed taxonomy. We review the previous +research works from a data-centric AI perspective and we intend to lay the +foundation work for the future development of transformer-based architecture +and data-centric AI. + +
+
+
+
+
+ + ☆ Revisiting Agnostic PAC Learning + + +
+ PAC learning, dating back to Valiant'84 and Vapnik and Chervonenkis'64,'74, +is a classic model for studying supervised learning. In the agnostic setting, +we have access to a hypothesis set $\mathcal{H}$ and a training set of labeled +samples $(x_1,y_1),\dots,(x_n,y_n) \in \mathcal{X} \times \{-1,1\}$ drawn +i.i.d. from an unknown distribution $\mathcal{D}$. The goal is to produce a +classifier $h : \mathcal{X} \to \{-1,1\}$ that is competitive with the +hypothesis $h^\star_{\mathcal{D}} \in \mathcal{H}$ having the least probability +of mispredicting the label $y$ of a new sample $(x,y)\sim \mathcal{D}$. + Empirical Risk Minimization (ERM) is a natural learning algorithm, where one +simply outputs the hypothesis from $\mathcal{H}$ making the fewest mistakes on +the training data. This simple algorithm is known to have an optimal error in +terms of the VC-dimension of $\mathcal{H}$ and the number of samples $n$. + In this work, we revisit agnostic PAC learning and first show that ERM is in +fact sub-optimal if we treat the performance of the best hypothesis, denoted +$\tau:=\Pr_{\mathcal{D}}[h^\star_{\mathcal{D}}(x) \neq y]$, as a parameter. +Concretely we show that ERM, and any other proper learning algorithm, is +sub-optimal by a $\sqrt{\ln(1/\tau)}$ factor. We then complement this lower +bound with the first learning algorithm achieving an optimal error for nearly +the full range of $\tau$. Our algorithm introduces several new ideas that we +hope may find further applications in learning theory. + +
+
+
+
+
+ + ☆ Sensor Selection via GFlowNets: A Deep Generative Modeling Framework to + Navigate Combinatorial Complexity + + +
+ The performance of sensor arrays in sensing and wireless communications +improves with more elements, but this comes at the cost of increased energy +consumption and hardware expense. This work addresses the challenge of +selecting $k$ sensor elements from a set of $m$ to optimize a generic +Quality-of-Service metric. Evaluating all $\binom{m}{k}$ possible sensor +subsets is impractical, leading to prior solutions using convex relaxations, +greedy algorithms, and supervised learning approaches. The current paper +proposes a new framework that employs deep generative modeling, treating sensor +selection as a deterministic Markov Decision Process where sensor subsets of +size $k$ arise as terminal states. Generative Flow Networks (GFlowNets) are +employed to model an action distribution conditioned on the state. Sampling +actions from the aforementioned distribution ensures that the probability of +arriving at a terminal state is proportional to the performance of the +corresponding subset. Applied to a standard sensor selection scenario, the +developed approach outperforms popular methods which are based on convex +optimization and greedy algorithms. Finally, a multiobjective formulation of +the proposed approach is adopted and applied on the sparse antenna array design +for Integrated Sensing and Communication (ISAC) systems. The multiobjective +variation is shown to perform well in managing the trade-off between radar and +communication performance. + +
+
+
+
+
+ + ☆ Constructing artificial life and materials scientists with accelerated + AI using Deep AndersoNN ICML + + +
+ Deep AndersoNN accelerates AI by exploiting the continuum limit as the number +of explicit layers in a neural network approaches infinity and can be taken as +a single implicit layer, known as a deep equilibrium model. Solving for deep +equilibrium model parameters reduces to a nonlinear fixed point iteration +problem, enabling the use of vector-to-vector iterative solvers and windowing +techniques, such as Anderson extrapolation, for accelerating convergence to the +fixed point deep equilibrium. Here we show that Deep AndersoNN achieves up to +an order of magnitude of speed-up in training and inference. The method is +demonstrated on density functional theory results for industrial applications +by constructing artificial life and materials `scientists' capable of +classifying drugs as strongly or weakly polar, metal-organic frameworks by pore +size, and crystalline materials as metals, semiconductors, and insulators, +using graph images of node-neighbor representations transformed from atom-bond +networks. Results exhibit accuracy up to 98\% and showcase synergy between Deep +AndersoNN and machine learning capabilities of modern computing architectures, +such as GPUs, for accelerated computational life and materials science by +quickly identifying structure-property relationships. This paves the way for +saving up to 90\% of compute required for AI, reducing its carbon footprint by +up to 60 gigatons per year by 2030, and scaling above memory limits of explicit +neural networks in life and materials science, and beyond. + +
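Deep AndersoNN builds on Anderson extrapolation for fixed-point problems of the form x = g(x). A compact, generic Anderson-acceleration loop looks roughly as follows; the history depth, regularization constant, and toy map are illustrative assumptions, not the paper's solver.

```python
# Generic Anderson acceleration for a fixed point x = g(x) (illustrative sketch).
import numpy as np

def anderson(g, x0, m=5, iters=100, tol=1e-10, reg=1e-8):
    xs, gs = [x0], [g(x0)]
    x = gs[-1]
    for _ in range(iters):
        xs.append(x); gs.append(g(x))
        res = np.stack([gk - xk for gk, xk in zip(gs, xs)])[-m:]   # recent residuals
        G = np.stack(gs)[-m:]
        # weights alpha minimising ||alpha @ res|| subject to sum(alpha) = 1
        A = res @ res.T + reg * np.eye(len(res))
        alpha = np.linalg.solve(A, np.ones(len(res)))
        alpha /= alpha.sum()
        x = alpha @ G                               # extrapolated iterate
        if np.linalg.norm(g(x) - x) < tol:
            break
    return x

# toy contraction: the fixed point of cos applied elementwise (~0.739)
x_star = anderson(np.cos, np.zeros(3))
```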
+
+ comment: 7 pages, 5 figures, 2 tables, Accepted by ICML ML4LMS + https://openreview.net/forum?id=qhwyvhqAvI . International Conference on + Machine Learning (ICML). Machine Learning for Life and Material Science + (ML4LMS) Workshop, May 2024 +
+
+
+
+
+ + ☆ Generalization bounds for regression and classification on adaptive + covering input domains + + +
+ Our main focus is on the generalization bound, which serves as an upper limit +for the generalization error. Our analysis delves into regression and +classification tasks separately to ensure a thorough examination. We assume the +target function is real-valued and Lipschitz continuous for regression tasks. +We use the 2-norm and a root-mean-square-error (RMSE) variant to measure the +disparities between predictions and actual values. In the case of +classification tasks, we treat the target function as a one-hot classifier, +representing a piece-wise constant function, and employ 0/1 loss for error +measurement. Our analysis underscores the differing sample complexity required +to achieve a concentration inequality of generalization bounds, highlighting +the variation in learning efficiency for regression and classification tasks. +Furthermore, we demonstrate that the generalization bounds for regression and +classification functions are inversely proportional to a polynomial of the +number of parameters in a network, with the degree depending on the hypothesis +class and the network architecture. These findings emphasize the advantages of +over-parameterized networks and elucidate the conditions for benign overfitting +in such systems. + +
+
+
+
+
+ + ☆ Neural networks for bifurcation and linear stability analysis of steady + states in partial differential equations + + +
+ This research introduces an extended application of neural networks for +solving nonlinear partial differential equations (PDEs). A neural network, +combined with a pseudo-arclength continuation, is proposed to construct +bifurcation diagrams from parameterized nonlinear PDEs. Additionally, a neural +network approach is also presented for solving eigenvalue problems to analyze +solution linear stability, focusing on identifying the largest eigenvalue. The +effectiveness of the proposed neural network is examined through experiments on +the Bratu equation and the Burgers equation. Results from a finite difference +method are also presented as comparison. Varying numbers of grid points are +employed in each case to assess the behavior and accuracy of both the neural +network and the finite difference method. The experimental results demonstrate +that the proposed neural network produces better solutions, generates more +accurate bifurcation diagrams, has reasonable computational times, and proves +effective for linear stability analysis. + +
+
+ comment: Accepted for publication in Applied Mathematics and Computation +
+
+
+
+
+ + ☆ Multiscale Representation Enhanced Temporal Flow Fusion Model for + Long-Term Workload Forecasting CIKM '24 + + +
+ Accurate workload forecasting is critical for efficient resource management +in cloud computing systems, enabling effective scheduling and autoscaling. +Despite recent advances with transformer-based forecasting models, challenges +remain due to the non-stationary, nonlinear characteristics of workload time +series and the long-term dependencies. In particular, inconsistent performance +between long-term history and near-term forecasts hinders long-range +predictions. This paper proposes a novel framework leveraging self-supervised +multiscale representation learning to capture both long-term and near-term +workload patterns. The long-term history is encoded through multiscale +representations while the near-term observations are modeled via temporal flow +fusion. These representations of different scales are fused using an attention +mechanism and characterized with normalizing flows to handle +non-Gaussian/non-linear distributions of time series. Extensive experiments on +9 benchmarks demonstrate superiority over existing methods. + +
+
+ comment: Proceedings of the 33rd ACM International Conference on Information + and Knowledge Management (CIKM '24), October 21--25, 2024, Boise, ID, USA +
+
+
+
+
+ + ☆ Causal Interventional Prediction System for Robust and Explainable + Effect Forecasting CIKM '24 + + +
+ Although the widespread use of AI systems in today's world is growing, many +current AI systems are found vulnerable due to hidden bias and missing +information, especially in the most commonly used forecasting system. In this +work, we explore the robustness and explainability of AI-based forecasting +systems. We provide an in-depth analysis of the underlying causality involved +in the effect prediction task and further establish a causal graph based on +treatment, adjustment variable, confounder, and outcome. Correspondingly, we +design a causal interventional prediction system (CIPS) based on a variational +autoencoder and fully conditional specification of multiple imputations. +Extensive results demonstrate the superiority of our system over +state-of-the-art methods and show remarkable versatility and extensibility in +practice. + +
+
+ comment: Proceedings of the 33rd ACM International Conference on Information + and Knowledge Management (CIKM '24), October 21--25, 2024, Boise, ID, USA +
+
+
+
+
+ + ☆ Revisiting the robustness of post-hoc interpretability methods + + +
+ Post-hoc interpretability methods play a critical role in explainable artificial intelligence (XAI), as they pinpoint portions of data that a trained deep learning model deemed important to make a decision. However, different post-hoc interpretability methods often provide different results, casting doubts on their accuracy. For this reason, several evaluation strategies have been proposed to understand the accuracy of post-hoc interpretability. Many of these evaluation strategies provide a coarse-grained assessment -- i.e., they evaluate how the performance of the model degrades on average by corrupting different data points across multiple samples. While these strategies are effective in selecting the post-hoc interpretability method that is most reliable on average, they fail to provide a sample-level, also referred to as fine-grained, assessment. In other words, they do not measure the robustness of post-hoc interpretability methods. We propose an approach and two new metrics to provide a fine-grained assessment of post-hoc interpretability methods. We show that a method's robustness is generally linked to its coarse-grained performance.
+
&#13;
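One way to phrase a per-sample (fine-grained) check, offered here only as a hedged illustration and not as the paper's two metrics: mask the top-k attributed features of each sample and record how much that sample's prediction drops.

```python
# Per-sample prediction-drop check for attribution methods (illustrative).
import numpy as np

def per_sample_drop(predict, X, attributions, k=5, baseline=0.0):
    """predict: callable mapping an (n, d) array to (n,) scores for the predicted class.
    attributions: (n, d) importance scores from any post-hoc method."""
    before = predict(X)
    X_masked = X.copy()
    top = np.argsort(-attributions, axis=1)[:, :k]          # top-k features per sample
    rows = np.arange(X.shape[0])[:, None]
    X_masked[rows, top] = baseline                           # mask the "important" features
    after = predict(X_masked)
    return before - after        # one score per sample, instead of a dataset average

# toy usage with a linear "model" and its exact attributions
w = np.array([2.0, -1.0, 0.5, 0.0])
X = np.random.rand(10, 4)
drops = per_sample_drop(lambda A: A @ w, X, attributions=X * w, k=2)
```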
+
+
+
+
+ + ☆ Adaptive Soft Error Protection for Deep Learning + + +
+ The rising incidence of soft errors in hardware systems represents a +considerable risk to the reliability of deep learning systems and can +precipitate severe malfunctions. Although essential, soft error mitigation can +impose substantial costs on deep learning systems that are inherently demanding +in terms of computation and memory. Previous research has primarily explored +variations in vulnerability among different components of computing engines or +neural networks, aiming for selective protection to minimize protection +overhead. Our approach diverges from these studies by recognizing that the +susceptibility of deep learning tasks to soft errors is heavily +input-dependent. Notably, some inputs are simpler for deep learning models and +inherently exhibit greater tolerance to soft errors. Conversely, more complex +inputs are prone to soft error impact. Based on these insights, we introduce an +adaptive soft error protection strategy that tailors protection to the +computational demands of individual inputs. To implement this strategy, we +develop a metric for assessing the complexity of inputs and deploy a +lightweight machine learning algorithm to gauge input difficulty. Subsequently, +we employ robust protection for challenging inputs and minimal protection for +simpler ones. Our experimental evaluation across diverse datasets and deep +learning tasks reveals that our adaptive strategy reduces the soft error +protection overhead by an average of 46.9%, without compromising system +reliability. + +
+
+
+
+
+ + ☆ Short-Term Forecasting of Photovoltaic Power Generation Based on Entropy + during the Foggy Winter + + +
+ Solar energy is one of the most promising renewable energy resources. Forecasting photovoltaic power generation is an important way to increase photovoltaic penetration. However, the task of photovoltaic forecasting is complicated by its inherent uncertainty, especially in specific regions during the foggy winter. This paper proposes a novel model to address the problem. A new entropy measure is developed to quantify the uncertainty during the foggy winter. A clustering method and a modified retention network are applied to reduce complexity and to forecast, respectively. We adopt an optimization procedure to tune the hyperparameters. Results are validated for the multivariate forecasting model using the dataset from a photovoltaic power station in Jiangsu Province, China. Experiments show that the proposed model improves forecasting accuracy compared to various models during the foggy winter.
+
&#13;
+
+ comment: The manuscript was submitted to Applied Energy on June 3, 2024 +
+
+
+
+
+ + ☆ Towards a Knowledge guided Multimodal Foundation Model for + Spatio-Temporal Remote Sensing Applications + + +
+ In recent years, there has been increased interest in foundation models for geoscience due to the vast amount of Earth-observing satellite imagery. Existing remote sensing foundation models make use of the various sources of spectral imagery to create large models pretrained on a masked reconstruction task. The embeddings from these foundation models are then used for various downstream remote sensing applications. In this paper we propose a foundational modeling framework for remote sensing geoscience applications that goes beyond this traditional single-modality, masked-autoencoder family of foundation models. The framework leverages the knowledge-guided principles that the spectral imagery captures the impact of the physical drivers on the environmental system, and that the relationship between them is governed by the characteristics of the system. Specifically, our method, called MultiModal Variable Step Forecasting (MM-VSF), uses multimodal data (spectral imagery and weather) as its input and a variable step forecasting task as its pretraining objective. In our evaluation we show that forecasting satellite imagery using weather data can serve as an effective pretraining task for foundation models. We further show the effectiveness of the embeddings from MM-VSF on the downstream task of pixel-wise crop mapping, when compared with a model trained in the traditional setting of single-modality input and masked-reconstruction-based pretraining.
+
&#13;
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ ComNeck: Bridging Compressed Image Latents and Multimodal LLMs via + Universal Transform-Neck + + +
+ This paper presents the first-ever study of adapting compressed image latents +to suit the needs of downstream vision tasks that adopt Multimodal Large +Language Models (MLLMs). MLLMs have extended the success of large language +models to modalities (e.g. images) beyond text, but their billion scale hinders +deployment on resource-constrained end devices. While cloud-hosted MLLMs could +be available, transmitting raw, uncompressed images captured by end devices to +the cloud requires an efficient image compression system. To address this, we +focus on emerging neural image compression and propose a novel framework with a +lightweight transform-neck and a surrogate loss to adapt compressed image +latents for MLLM-based vision tasks. The proposed framework is generic and +applicable to multiple application scenarios, where the neural image codec can +be (1) pre-trained for human perception without updating, (2) fully updated for +joint human and machine perception, or (3) fully updated for only machine +perception. The transform-neck trained with the surrogate loss is universal, +for it can serve various downstream vision tasks enabled by a variety of MLLMs +that share the same visual encoder. Our framework has the striking feature of +excluding the downstream MLLMs from training the transform-neck, and +potentially the neural image codec as well. This stands out from most existing +coding for machine approaches that involve downstream networks in training and +thus could be impractical when the networks are MLLMs. Extensive experiments on +different neural image codecs and various MLLM-based vision tasks show that our +method achieves great rate-accuracy performance with much less complexity, +demonstrating its effectiveness. + +
+
+
+
+
+ + ☆ Realizing Unaligned Block-wise Pruning for DNN Acceleration on Mobile + Devices + + +
+ With the recent proliferation of on-device AI, there is an increasing need to +run computationally intensive DNNs directly on mobile devices. However, the +limited computing and memory resources of these devices necessitate effective +pruning techniques. Block-wise pruning is promising due to its low accuracy +drop tradeoff for speedup gains, but it requires block positions to be aligned +with block size, hindering optimal position selection to minimize model +accuracy drop. Unaligned block pruning (UBP) addresses this by allowing blocks +to be selected at arbitrary positions, yet its practical use is limited by a +time-consuming optimal block selection algorithm and lack of efficient +inference kernels. In this paper, we propose a pseudo-optimal yet fast block +selection algorithm called Block Expansion and Division (BED), which can be +integrated into an iterative model training process. Additionally, we introduce +an efficient inference kernel implementation for mobile devices, enabling a +UBP-based model to achieve similar latency to a DNN model compressed by aligned +block pruning. We demonstrate the superiority of our techniques on a real +mobile phone with MobileNet and ResNet models. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ "A Good Bot Always Knows Its Limitations": Assessing Autonomous System + Decision-making Competencies through Factorized Machine Self-confidence + + +
+ How can intelligent machines assess their competencies in completing tasks? +This question has come into focus for autonomous systems that algorithmically +reason and make decisions under uncertainty. It is argued here that machine +self-confidence -- a form of meta-reasoning based on self-assessments of an +agent's knowledge about the state of the world and itself, as well as its +ability to reason about and execute tasks -- leads to many eminently computable +and useful competency indicators for such agents. This paper presents a +culmination of work on this concept in the form of a computational framework +called Factorized Machine Self-confidence (FaMSeC), which provides an +engineering-focused holistic description of factors driving an algorithmic +decision-making process, including outcome assessment, solver quality, model +quality, alignment quality, and past experience. In FaMSeC, self-confidence +indicators are derived from hierarchical `problem-solving statistics' embedded +within broad classes of probabilistic decision-making algorithms such as Markov +decision processes. The problem-solving statistics are obtained by evaluating +and grading probabilistic exceedance margins with respect to given competency +standards, which are specified for each decision-making competency factor by +the informee (e.g. a non-expert user or an expert system designer). This +approach allows `algorithmic goodness of fit' evaluations to be easily +incorporated into the design of many kinds of autonomous agents via +human-interpretable competency self-assessment reports. Detailed descriptions +and running application examples for a Markov decision process agent show how +two FaMSeC factors (outcome assessment and solver quality) can be practically +computed and reported for a range of possible tasking contexts through novel +use of meta-utility functions, behavior simulations, and surrogate prediction +models. + +
+
+ comment: 59 pages, 22 figures, draft to be submitted for journal review +
+
+
+
+
+ + ☆ Experimenting on Markov Decision Processes with Local Treatments + + +
+ As service systems grow increasingly complex and dynamic, many interventions +become localized, available and taking effect only in specific states. This +paper investigates experiments with local treatments on a widely-used class of +dynamic models, Markov Decision Processes (MDPs). Particularly, we focus on +utilizing the local structure to improve the inference efficiency of the +average treatment effect. We begin by demonstrating the efficiency of classical +inference methods, including model-based estimation and temporal difference +learning under a fixed policy, as well as classical A/B testing with general +treatments. We then introduce a variance reduction technique that exploits the +local treatment structure by sharing information for states unaffected by the +treatment policy. Our new estimator effectively overcomes the variance lower +bound for general treatments while matching the more stringent lower bound +incorporating the local treatment structure. Furthermore, our estimator can +optimally achieve a linear reduction with the number of test arms for a major +part of the variance. Finally, we explore scenarios with perfect knowledge of +the control arm and design estimators that further improve inference +efficiency. + +
+
+
+
+
+ + ☆ AgEval: A Benchmark for Zero-Shot and Few-Shot Plant Stress Phenotyping + with Multimodal LLMs + + +
+ Plant stress phenotyping traditionally relies on expert assessments and specialized models, limiting scalability in agriculture. Recent advances in multimodal large language models (LLMs) offer potential solutions to this challenge. We present AgEval, a benchmark comprising 12 diverse plant stress phenotyping tasks, to evaluate these models' capabilities. Our study assesses zero-shot and few-shot in-context learning performance of state-of-the-art models, including Claude, GPT, Gemini, and LLaVA. Results show significant performance improvements with few-shot learning, with F1 scores increasing from 46.24% to 73.37% in 8-shot identification for the best-performing model. Few-shot examples from other classes in the dataset have negligible or negative impacts, although having the exact category example helps to increase performance by 15.38%. We also quantify the consistency of model performance across different classes within each task, finding that the coefficient of variation (CV) ranges from 26.02% to 58.03% across models, implying that subject-matter expertise on 'difficult' classes is needed to achieve reliable performance. AgEval establishes baseline metrics for multimodal LLMs in agricultural applications, offering insights into their promise for enhancing plant stress phenotyping at scale. Benchmark and code can be accessed at: https://anonymous.4open.science/r/AgEval/
+
&#13;
+
+
+
+
+ + ☆ TopicTag: Automatic Annotation of NMF Topic Models Using Chain of + Thought and Prompt Tuning with LLMs + + +
+ Topic modeling is a technique for organizing and extracting themes from large +collections of unstructured text. Non-negative matrix factorization (NMF) is a +common unsupervised approach that decomposes a term frequency-inverse document +frequency (TF-IDF) matrix to uncover latent topics and segment the dataset +accordingly. While useful for highlighting patterns and clustering documents, +NMF does not provide explicit topic labels, necessitating subject matter +experts (SMEs) to assign labels manually. We present a methodology for +automating topic labeling in documents clustered via NMF with automatic model +determination (NMFk). By leveraging the output of NMFk and employing prompt +engineering, we utilize large language models (LLMs) to generate accurate topic +labels. Our case study on over 34,000 scientific abstracts on Knowledge Graphs +demonstrates the effectiveness of our method in enhancing knowledge management +and document organization. + +
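The NMF side of this pipeline can be sketched with scikit-learn: factorize a TF-IDF matrix, pull the top terms of each topic, and format a prompt that could be sent to an LLM for a label. The corpus, topic count, and prompt wording below are illustrative, and NMFk's automatic model selection is not reproduced.

```python
# TF-IDF + NMF topics, with an LLM labeling prompt per topic (illustrative sketch).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

docs = ["graph embeddings for knowledge graphs",
        "topic models for document clustering",
        "entity linking in knowledge graphs",
        "clustering scientific abstracts with NMF"]

tfidf = TfidfVectorizer(stop_words="english")
X = tfidf.fit_transform(docs)
nmf = NMF(n_components=2, init="nndsvd", random_state=0).fit(X)

terms = tfidf.get_feature_names_out()
for t, row in enumerate(nmf.components_):
    top = [terms[i] for i in row.argsort()[::-1][:5]]        # top terms of topic t
    prompt = f"Suggest a short topic label for a cluster whose key terms are: {', '.join(top)}."
    print(t, prompt)                                          # prompt would go to an LLM
```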
+
+ comment: Accepted to ACM Symposium on Document Engineering 2024 (DocEng 24), + 2024 +
+
+
+
+
+ + ☆ A Method for Fast Autonomy Transfer in Reinforcement Learning + + +
+ This paper introduces a novel reinforcement learning (RL) strategy designed +to facilitate rapid autonomy transfer by utilizing pre-trained critic value +functions from multiple environments. Unlike traditional methods that require +extensive retraining or fine-tuning, our approach integrates existing +knowledge, enabling an RL agent to adapt swiftly to new settings without +requiring extensive computational resources. Our contributions include +development of the Multi-Critic Actor-Critic (MCAC) algorithm, establishing its +convergence, and empirical evidence demonstrating its efficacy. Our +experimental results show that MCAC significantly outperforms the baseline +actor-critic algorithm, achieving up to 22.76x faster autonomy transfer and +higher reward accumulation. This advancement underscores the potential of +leveraging accumulated knowledge for efficient adaptation in RL applications. + +
+
+
+
+
+ + ☆ Graphite: A Graph-based Extreme Multi-Label Short Text Classifier for + Keyphrase Recommendation + + +
+ Keyphrase Recommendation has been a pivotal problem in advertising and +e-commerce where advertisers/sellers are recommended keyphrases (search +queries) to bid on to increase their sales. It is a challenging task due to the +plethora of items shown on online platforms and various possible queries that +users search while showing varying interest in the displayed items. Moreover, +query/keyphrase recommendations need to be made in real-time and in a +resource-constrained environment. This problem can be framed as an Extreme +Multi-label (XML) Short text classification by tagging the input text with +keywords as labels. Traditional neural network models are either infeasible or +have slower inference latency due to large label spaces. We present Graphite, a +graph-based classifier model that provides real-time keyphrase recommendations +that are on par with standard text classification models. Furthermore, it +doesn't utilize GPU resources, which can be limited in production environments. +Due to its lightweight nature and smaller footprint, it can train on very large +datasets, where state-of-the-art XML models fail due to extreme resource +requirements. Graphite is deterministic, transparent, and intrinsically more +interpretable than neural network-based models. We present a comprehensive +analysis of our model's performance across forty categories spanning eBay's +English-speaking sites. + +
+
+
+
+
+ + ☆ CoMMIT: Coordinated Instruction Tuning for Multimodal Large Language + Models + + +
+ Instruction tuning in multimodal large language models (MLLMs) aims to
+smoothly integrate a backbone LLM with a pre-trained feature encoder for
+downstream tasks. The major challenge is how to efficiently find the synergy
+through cooperative learning, where LLMs adapt their reasoning abilities in
+downstream tasks while feature encoders adjust their encoding to provide more
+relevant modal information. In this paper, we analyze MLLM instruction tuning
+from both theoretical and empirical perspectives, where we find that unbalanced
+learning between the two components, i.e., the feature encoder and the LLM, can
+cause diminishing learning gradients that slow model convergence and often lead
+to sub-optimal results due to insufficient learning. Inspired by our findings,
+we propose a measurement to quantitatively evaluate the learning balance, based
+on which we further design a dynamic learning scheduler that better coordinates
+the learning. In addition, we introduce an auxiliary loss regularization method
+to promote updating of the generation distribution of MLLMs considering the
+learning state of each model component, which potentially prevents the
+gradients of each component from diminishing and enables a more accurate
+estimation of the learning balance coefficient. We conduct experiments with
+multiple LLM backbones and feature encoders, where our techniques are
+model-agnostic and can be generically integrated with various MLLM backbones.
+Experimental results on multiple downstream tasks and modalities in vision and
+audio demonstrate the proposed method's better efficiency and effectiveness in
+MLLM instruction tuning.
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Futga: Towards Fine-grained Music Understanding through + Temporally-enhanced Generative Augmentation + + +
+ Existing music captioning methods are limited to generating concise global
+descriptions of short music clips, which fail to capture fine-grained musical
+characteristics and time-aware musical changes. To address these limitations,
+we propose FUTGA, a model equipped with fine-grained music understanding
+capabilities through learning from generative augmentation with temporal
+compositions. We leverage existing music caption datasets and large language
+models (LLMs) to synthesize fine-grained music captions with structural
+descriptions and time boundaries for full-length songs. Augmented by the
+proposed synthetic dataset, FUTGA is able to identify the music's temporal
+changes at key transition points and their musical functions, as well as
+generate detailed descriptions for each music segment. We further introduce a
+full-length music caption dataset generated by FUTGA as an augmentation of
+the MusicCaps and Song Describer datasets. We evaluate the automatically
+generated captions on several downstream tasks, including music generation and
+retrieval. The experiments demonstrate the quality of the generated captions
+and the improved performance that the proposed music captioning approach
+achieves on various downstream tasks. Our code and datasets can be found at
+\href{https://huggingface.co/JoshuaW1997/FUTGA}{\textcolor{blue}{https://huggingface.co/JoshuaW1997/FUTGA}}.
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Importance Corrected Neural JKO Sampling + + +
+ In order to sample from an unnormalized probability density function, we
+propose to combine continuous normalizing flows (CNFs) with
+rejection-resampling steps based on importance weights. We relate the iterative
+training of CNFs with regularized velocity fields to a JKO scheme and prove
+convergence of the involved velocity fields to the velocity field of the
+Wasserstein gradient flow (WGF). The alternation of local flow steps and
+non-local rejection-resampling steps makes it possible to overcome local minima
+or slow convergence of the WGF for multimodal distributions. Since the
+proposals of the rejection steps are generated by the model itself, they do not
+suffer from common drawbacks of classical rejection schemes. The arising model
+can be trained iteratively, reduces the reverse Kullback-Leibler (KL) loss
+function in each step, allows iid samples to be generated, and moreover allows
+the underlying generated density to be evaluated. Numerical examples show that
+our method yields accurate results on various test distributions, including
+high-dimensional multimodal targets, and significantly outperforms the state of
+the art in almost all cases.
+
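+ A toy version of the rejection-resampling ingredient mentioned above: samples
+ are kept with probability proportional to their importance weight and the
+ rejected slots are refilled by multinomial resampling of the kept points. The
+ CNF/JKO training that produces the proposal is not shown, and the paper's
+ exact scheme differs in how proposals are generated.
+
+     import numpy as np
+
+     def importance_rejection_resample(samples, log_target, log_proposal, rng):
+         log_w = log_target(samples) - log_proposal(samples)
+         w = np.exp(log_w - log_w.max())          # normalized so the maximum weight is 1
+         accept = rng.random(len(samples)) < w    # accept each sample with probability w
+         kept, kept_w = samples[accept], w[accept]
+         if kept.shape[0] and (~accept).any():
+             # Refill rejected slots by resampling kept points according to their weights.
+             p = kept_w / kept_w.sum()
+             refill = kept[rng.choice(kept.shape[0], size=int((~accept).sum()), p=p)]
+             samples = np.concatenate([kept, refill])
+         return samples
+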
+
+
+
+
+ + ☆ Neural Surrogate HMC: Accelerated Hamiltonian Monte Carlo with a Neural + Network Surrogate Likelihood SP + + +
+ Bayesian Inference with Markov Chain Monte Carlo requires efficient +computation of the likelihood function. In some scientific applications, the +likelihood must be computed by numerically solving a partial differential +equation, which can be prohibitively expensive. We demonstrate that some such +problems can be made tractable by amortizing the computation with a surrogate +likelihood function implemented by a neural network. We show that this has two +additional benefits: reducing noise in the likelihood evaluations and providing +fast gradient calculations. In experiments, the approach is applied to a model +of heliospheric transport of galactic cosmic rays, where it enables efficient +sampling from the posterior of latent parameters in the Parker equation. + +
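+ The core computational trick is that, once a neural surrogate of the
+ log-likelihood is trained, its automatic-differentiation gradient can drive
+ the Hamiltonian dynamics in place of repeated PDE solves. A minimal leapfrog
+ sketch, with the surrogate and prior passed in as callables returning scalars:
+
+     import torch
+
+     def leapfrog(theta, momentum, surrogate_loglik, log_prior,
+                  step_size=0.01, n_steps=20):
+         def grad_log_post(x):
+             # Gradient of the surrogate log-posterior via autograd (no PDE solve).
+             x = x.detach().requires_grad_(True)
+             logp = surrogate_loglik(x) + log_prior(x)
+             return torch.autograd.grad(logp, x)[0]
+
+         theta, momentum = theta.clone(), momentum.clone()
+         momentum = momentum + 0.5 * step_size * grad_log_post(theta)
+         for _ in range(n_steps - 1):
+             theta = theta + step_size * momentum
+             momentum = momentum + step_size * grad_log_post(theta)
+         theta = theta + step_size * momentum
+         momentum = momentum + 0.5 * step_size * grad_log_post(theta)
+         return theta, momentum
+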
+
+ comment: 5 pages, 3 figures, accepted at SPAICE Conference 2024 +
+
+
+
+
+ + ☆ Event-based Optical Flow on Neuromorphic Processor: ANN vs. SNN + Comparison based on Activation Sparsification + + +
+ Spiking neural networks (SNNs) for event-based optical flow are claimed to be
+computationally more efficient than their artificial neural network (ANN)
+counterparts, but a fair comparison is missing in the literature. In this work,
+we propose an event-based optical flow solution based on activation
+sparsification and a neuromorphic processor, SENECA. SENECA has an event-driven
+processing mechanism that can exploit the sparsity in ANN activations and SNN
+spikes to accelerate the inference of both types of neural networks. The ANN
+and the SNN for comparison have similar low activation/spike density (~5%)
+thanks to our novel sparsification-aware training. In the hardware-in-loop
+experiments designed to deduce the average time and energy consumption, the SNN
+takes 44.9ms and consumes 927.0 microjoules, which are 62.5% and 75.2% of the
+ANN's consumption, respectively. We find that the SNN's higher efficiency is
+attributable to its lower pixel-wise spike density (43.5% vs. 66.5%), which
+requires fewer memory access operations for neuron states.
+
+
+ comment: 18 pages, 12 figures, 4 tables +
+
+
+
+
+ + ☆ Dense Self-Supervised Learning for Medical Image Segmentation + + +
+ Deep learning has revolutionized medical image segmentation, but it relies +heavily on high-quality annotations. The time, cost and expertise required to +label images at the pixel-level for each new task has slowed down widespread +adoption of the paradigm. We propose Pix2Rep, a self-supervised learning (SSL) +approach for few-shot segmentation, that reduces the manual annotation burden +by learning powerful pixel-level representations directly from unlabeled +images. Pix2Rep is a novel pixel-level loss and pre-training paradigm for +contrastive SSL on whole images. It is applied to generic encoder-decoder deep +learning backbones (e.g., U-Net). Whereas most SSL methods enforce invariance +of the learned image-level representations under intensity and spatial image +augmentations, Pix2Rep enforces equivariance of the pixel-level +representations. We demonstrate the framework on a task of cardiac MRI +segmentation. Results show improved performance compared to existing semi- and +self-supervised approaches; and a 5-fold reduction in the annotation burden for +equivalent performance versus a fully supervised U-Net baseline. This includes +a 30% (resp. 31%) DICE improvement for one-shot segmentation under +linear-probing (resp. fine-tuning). Finally, we also integrate the novel +Pix2Rep concept with the Barlow Twins non-contrastive SSL, which leads to even +better segmentation performance. + +
+
+ comment: Accepted at MIDL 2024 +
+
+
+
+
+ + ☆ Two-Phase Segmentation Approach for Accurate Left Ventricle Segmentation + in Cardiac MRI using Machine Learning + + +
+ Accurate segmentation of the Left Ventricle (LV) holds substantial importance
+due to its implications in disease detection, regional analysis, and the
+development of complex models for cardiac surgical planning. CMR is the gold
+standard for the diagnosis of several cardiac diseases. The LV in CMR comprises
+three distinct sections: basal, mid-ventricular, and apical. This research
+focuses on the precise segmentation of the LV from Cardiac MRI (CMR) scans,
+leveraging the capabilities of Machine Learning (ML). The central challenge
+in this research revolves around the absence of a set of parameters applicable
+to all three types of LV slices. Parameters optimized for basal slices often
+fall short when applied to mid-ventricular and apical slices, and vice versa.
+To handle this issue, a new method is proposed to enhance LV segmentation. The
+proposed method involves using distinct sets of parameters for each type of
+slice, resulting in a two-phase segmentation approach. The initial phase
+categorizes images into three groups based on the type of LV slice, while the
+second phase aims to segment CMR images using parameters derived from the
+preceding phase. A publicly available dataset, the Automated Cardiac Diagnosis
+Challenge (ACDC), is used. 10-fold cross-validation is applied, achieving a
+mean score of 0.9228. Comprehensive testing indicates that the best parameter
+set for a particular type of slice does not perform adequately for the other
+slice types. All results show that the proposed approach fills a critical void
+in parameter standardization through a two-phase segmentation model for the LV,
+aiming not only to improve the accuracy of cardiac image analysis but also to
+contribute advancements to the field of LV segmentation.
+
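+ In pseudocode, the two-phase idea reduces to routing each slice through a
+ classifier and then segmenting it with a parameter set tuned for that slice
+ type. The parameter values and the classify_slice / segment_lv helpers below
+ are hypothetical placeholders, not the paper's actual settings.
+
+     SLICE_PARAMS = {   # illustrative values only
+         "basal":  {"threshold": 0.45, "min_area": 120},
+         "mid":    {"threshold": 0.50, "min_area": 80},
+         "apical": {"threshold": 0.60, "min_area": 30},
+     }
+
+     def two_phase_segment(slices, classify_slice, segment_lv):
+         masks = []
+         for img in slices:
+             slice_type = classify_slice(img)         # phase 1: basal / mid / apical
+             params = SLICE_PARAMS[slice_type]        # per-type parameter set
+             masks.append(segment_lv(img, **params))  # phase 2: parameterized segmentation
+         return masks
+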
+
+
+
+
+ + ☆ Gender, Race, and Intersectional Bias in Resume Screening via Language + Model Retrieval AAAI + + +
+ Artificial intelligence (AI) hiring tools have revolutionized resume
+screening, and large language models (LLMs) have the potential to do the same.
+However, given the biases embedded within LLMs, it is unclear whether
+they can be used in this scenario without disadvantaging groups based on their
+protected attributes. In this work, we investigate the possibilities of using
+LLMs in a resume screening setting via a document retrieval framework that
+simulates job candidate selection. Using that framework, we then perform a
+resume audit study to determine whether a selection of Massive Text Embedding
+(MTE) models are biased in resume screening scenarios. We simulate this for
+nine occupations, using a collection of over 500 publicly available resumes and
+500 job descriptions. We find that the MTEs are biased, significantly favoring
+White-associated names in 85.1\% of cases and female-associated names in only
+11.1\% of cases, with a minority of cases showing no statistically significant
+differences. Further analyses show that Black males are disadvantaged in up to
+100\% of cases, replicating real-world patterns of bias in employment settings,
+and validate three hypotheses of intersectionality. We also find an impact of
+document length as well as the corpus frequency of names in the selection of
+resumes. These findings have implications for widely used AI tools that are
+automating employment decisions, as well as for fairness and tech policy.
+
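+ The retrieval framing used in the audit can be sketched as follows: a job
+ description and a pool of resumes are embedded with a text-embedding model and
+ the resumes are ranked by cosine similarity, after which name-swapped variants
+ of the same resume can be compared for rank differences. The embed() callable
+ is a placeholder for any MTE model.
+
+     import numpy as np
+
+     def rank_resumes(job_description, resumes, embed):
+         # embed() is assumed to return a 1-D vector for a text string.
+         q = embed(job_description)
+         docs = np.stack([embed(r) for r in resumes])
+         sims = docs @ q / (np.linalg.norm(docs, axis=1) * np.linalg.norm(q) + 1e-12)
+         order = np.argsort(-sims)                    # most similar resume first
+         return [(int(i), float(sims[i])) for i in order]
+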
+
+ comment: To be published in Proceedings of the 2024 AAAI/ACM Conference on AI, + Ethics, and Society; code available at + https://github.com/kyrawilson/Resume-Screening-Bias +
+
+
+
+
+ + ☆ Mixed Newton Method for Optimization in Complex Spaces + + +
+ In this paper, we modify and apply the recently introduced Mixed Newton
+Method, which was originally designed for minimizing real-valued functions of
+complex variables, to the minimization of real-valued functions of real
+variables by extending the functions to complex space. We show that arbitrary
+regularizations preserve the favorable local convergence properties of the
+method, and construct a special type of regularization used to prevent
+convergence to complex minima. We compare several variants of the method
+applied to training neural networks with real and complex parameters.
+
+
+ comment: 16 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ Designing Time-Series Models With Hypernetworks & Adversarial Portfolios + + +
+ This article describes the methods that achieved 4th and 6th place in the +forecasting and investment challenges, respectively, of the M6 competition, +ultimately securing the 1st place in the overall duathlon ranking. In the +forecasting challenge, we tested a novel meta-learning model that utilizes +hypernetworks to design a parametric model tailored to a specific family of +forecasting tasks. This approach allowed us to leverage similarities observed +across individual forecasting tasks while also acknowledging potential +heterogeneity in their data generating processes. The model's training can be +directly performed with backpropagation, eliminating the need for reliance on +higher-order derivatives and is equivalent to a simultaneous search over the +space of parametric functions and their optimal parameter values. The proposed +model's capabilities extend beyond M6, demonstrating superiority over +state-of-the-art meta-learning methods in the sinusoidal regression task and +outperforming conventional parametric models on time-series from the M4 +competition. In the investment challenge, we adjusted portfolio weights to +induce greater or smaller correlation between our submission and that of other +participants, depending on the current ranking, aiming to maximize the +probability of achieving a good rank. + +
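+ One way to picture the hypernetwork component: a small network maps a task
+ descriptor to the weights of a per-series linear forecaster, and both are
+ trained jointly with ordinary backpropagation. This is an illustrative reading
+ of the approach, not the authors' exact architecture.
+
+     import torch
+     import torch.nn as nn
+
+     class HyperForecaster(nn.Module):
+         def __init__(self, task_dim, lookback, horizon, hidden=64):
+             super().__init__()
+             self.lookback, self.horizon = lookback, horizon
+             n_params = horizon * lookback + horizon   # weight matrix + bias
+             self.hyper = nn.Sequential(
+                 nn.Linear(task_dim, hidden), nn.ReLU(), nn.Linear(hidden, n_params))
+
+         def forward(self, task_emb, history):
+             # task_emb: (task_dim,), history: (batch, lookback)
+             p = self.hyper(task_emb)
+             W = p[: self.horizon * self.lookback].view(self.horizon, self.lookback)
+             b = p[self.horizon * self.lookback:]
+             return history @ W.T + b                  # (batch, horizon) forecast
+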
+
+
+
+
+ + ♻ ☆ Matryoshka Multimodal Models + + +
+ Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in
+visual-linguistic reasoning. These models first embed images into a fixed large
+number of visual tokens and then feed them into a Large Language Model (LLM).
+However, this design causes an excessive number of tokens for dense visual
+scenarios such as high-resolution images and videos, leading to great
+inefficiency. While token pruning/merging methods do exist, they produce a
+single-length output for each image and do not afford flexibility in trading
+off information density vs. efficiency. Inspired by the concept of Matryoshka
+Dolls, we propose M3: Matryoshka Multimodal Models, which learns to represent
+visual content as nested sets of visual tokens that capture information across
+multiple coarse-to-fine granularities. Our approach offers several unique
+benefits for LMMs: (1) One can explicitly control the visual granularity per
+test instance during inference, e.g., adjusting the number of tokens used to
+represent an image based on the anticipated complexity or simplicity of the
+content; (2) M3 provides a framework for analyzing the granularity needed for
+existing datasets, where we find that COCO-style benchmarks only need around 9
+visual tokens to obtain accuracy similar to that of using all 576 tokens; (3)
+Our approach provides a foundation to explore the best trade-off between
+performance and visual token length at the sample level, where our
+investigation reveals that a large gap exists between the oracle upper bound
+and current fixed-scale representations.
+
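+ The nested-token idea can be illustrated with simple average pooling over the
+ visual token grid: the same 576 tokens are collapsed into progressively
+ coarser sets, and the model can then work with any chosen granularity. M3
+ learns these nested representations jointly; the pooling below is only a
+ sketch of the coarse-to-fine structure.
+
+     import torch
+     import torch.nn.functional as F
+
+     def nested_visual_tokens(tokens, scales=(24, 12, 6, 3, 1)):
+         # tokens: (batch, 576, dim) from a 24x24 grid, as in LLaVA-style encoders.
+         b, n, d = tokens.shape
+         side = int(n ** 0.5)
+         grid = tokens.transpose(1, 2).reshape(b, d, side, side)
+         out = {}
+         for s in scales:
+             pooled = F.adaptive_avg_pool2d(grid, s)          # (b, d, s, s)
+             out[s * s] = pooled.flatten(2).transpose(1, 2)   # (b, s*s, d)
+         return out
+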
+
+ comment: Project Page: https://matryoshka-mm.github.io/ +
+
+
+
+
+ + ♻ ☆ DCEM: A deep complementary energy method for solid mechanics + + +
+ In recent years, the rapid advancement of deep learning has significantly
+impacted various fields, particularly in solving partial differential equations
+(PDEs) in the realm of solid mechanics, benefiting greatly from the remarkable
+approximation capabilities of neural networks. In solving PDEs,
+Physics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have
+garnered substantial attention. The principles of minimum potential energy and
+minimum complementary energy are two important variational principles in solid
+mechanics. However, the well-known Deep Energy Method (DEM) is based on the
+principle of minimum potential energy, and a counterpart based on the principle
+of minimum complementary energy has been lacking. To bridge this gap, we
+propose the deep complementary energy method (DCEM) based on the principle of
+minimum complementary energy. The output function of DCEM is the stress
+function, which inherently satisfies the equilibrium equation. We present
+numerical results using the Prandtl and Airy stress functions, and compare DCEM
+with existing PINNs and DEM algorithms when modeling representative mechanical
+problems. The results demonstrate that DCEM outperforms DEM in terms of stress
+accuracy and efficiency and has an advantage in dealing with complex
+displacement boundary conditions, which is supported by theoretical analyses
+and numerical simulations. We extend DCEM to DCEM-Plus (DCEM-P), adding terms
+that satisfy partial differential equations. Furthermore, we propose a deep
+complementary energy operator method (DCEM-O) by combining operator learning
+with physical equations. Initially, we train DCEM-O using high-fidelity
+numerical results and then incorporate complementary energy. DCEM-P and DCEM-O
+further enhance the accuracy and efficiency of DCEM.
+
+
+ comment: 58 pages, 30 figures +
+
+
+
+
+ + ♻ ☆ Node Similarities under Random Projections: Limits and Pathological + Cases + + +
+ Random Projections have been widely used to generate embeddings for various +graph learning tasks due to their computational efficiency. The majority of +applications have been justified through the Johnson-Lindenstrauss Lemma. In +this paper, we take a step further and investigate how well dot product and +cosine similarity are preserved by random projections when these are applied +over the rows of the graph matrix. Our analysis provides new asymptotic and +finite-sample results, identifies pathological cases, and tests them with +numerical experiments. We specialize our fundamental results to a ranking +application by computing the probability of random projections flipping the +node ordering induced by their embeddings. We find that, depending on the +degree distribution, the method produces especially unreliable embeddings for +the dot product, regardless of whether the adjacency or the normalized +transition matrix is used. With respect to the statistical noise introduced by +random projections, we show that cosine similarity produces remarkably more +precise approximations. + +
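+ The quantity under study can be probed with a few lines of NumPy: project the
+ rows of a matrix with a Gaussian random matrix and measure how much the
+ pairwise dot products and cosine similarities move. This reproduces the setup,
+ not the paper's asymptotic analysis.
+
+     import numpy as np
+
+     def projection_distortion(X, k=64, seed=0):
+         rng = np.random.default_rng(seed)
+         # Johnson-Lindenstrauss-style Gaussian projection with variance 1/k.
+         R = rng.normal(scale=1.0 / np.sqrt(k), size=(X.shape[1], k))
+         Y = X @ R
+         def cos(M):
+             U = M / np.linalg.norm(M, axis=1, keepdims=True)
+             return U @ U.T
+         dot_err = np.abs(X @ X.T - Y @ Y.T).mean()   # dot-product distortion
+         cos_err = np.abs(cos(X) - cos(Y)).mean()     # cosine-similarity distortion
+         return dot_err, cos_err
+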
+
+
+
+
+ + ♻ ☆ Finding Increasingly Large Extremal Graphs with AlphaZero and Tabu + Search IJCAI 2024 + + +
+ This work studies a central extremal graph theory problem inspired by a 1975 +conjecture of Erd\H{o}s, which aims to find graphs with a given size (number of +nodes) that maximize the number of edges without having 3- or 4-cycles. We +formulate this problem as a sequential decision-making problem and compare +AlphaZero, a neural network-guided tree search, with tabu search, a heuristic +local search method. Using either method, by introducing a curriculum -- +jump-starting the search for larger graphs using good graphs found at smaller +sizes -- we improve the state-of-the-art lower bounds for several sizes. We +also propose a flexible graph-generation environment and a +permutation-invariant network architecture for learning to search in the space +of graphs. + +
+
+ comment: To appear in the proceedings of IJCAI 2024. First three authors + contributed equally, last two authors made equal senior contribution +
+
+
+
+
+ + ♻ ☆ Surpassing Cosine Similarity for Multidimensional Comparisons: Dimension + Insensitive Euclidean Metric (DIEM) + + +
+ The advancement in computational power and hardware efficiency enabled the +tackling of increasingly complex and high-dimensional problems. While +artificial intelligence (AI) achieved remarkable results, the interpretability +of high-dimensional solutions remains challenging. A critical issue is the +comparison of multidimensional quantities, which is essential in techniques +like Principal Component Analysis (PCA), or k-means clustering. Common metrics +such as cosine similarity, Euclidean distance, and Manhattan distance are often +used for such comparisons - for example in muscular synergies of the human +motor control system. However, their applicability and interpretability +diminish as dimensionality increases. This paper provides a comprehensive +analysis of the effects of dimensionality on these metrics. Our results reveal +significant limitations of cosine similarity, particularly its dependency on +the dimensionality of the vectors, leading to biased and less interpretable +outcomes. To address this, we introduce the Dimension Insensitive Euclidean +Metric (DIEM) which demonstrates superior robustness and generalizability +across dimensions. DIEM maintains consistent variability and eliminates the +biases observed in traditional metrics, making it a reliable tool for +high-dimensional comparisons. This novel metric has the potential to replace +cosine similarity, providing a more accurate and insightful method to analyze +multidimensional data in fields ranging from neuromotor control to machine and +deep learning. + +
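+ The dimensionality dependence of cosine similarity that motivates DIEM is easy
+ to verify empirically: for random vectors, the similarity distribution
+ concentrates around zero as the dimension grows, so a fixed similarity value
+ means different things in different dimensions. The DIEM formula itself is not
+ reproduced here.
+
+     import numpy as np
+
+     def cosine_stats_by_dim(dims=(2, 10, 100, 1000), n_pairs=10000, seed=0):
+         rng = np.random.default_rng(seed)
+         stats = {}
+         for d in dims:
+             a = rng.uniform(-1, 1, size=(n_pairs, d))
+             b = rng.uniform(-1, 1, size=(n_pairs, d))
+             cos = (a * b).sum(1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
+             stats[d] = (float(cos.mean()), float(cos.std()))  # spread shrinks with d
+         return stats
+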
+
+ comment: 10 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Jumping Ahead: Improving Reconstruction Fidelity with JumpReLU Sparse + Autoencoders + + +
+ Sparse autoencoders (SAEs) are a promising unsupervised approach for +identifying causally relevant and interpretable linear features in a language +model's (LM) activations. To be useful for downstream tasks, SAEs need to +decompose LM activations faithfully; yet to be interpretable the decomposition +must be sparse -- two objectives that are in tension. In this paper, we +introduce JumpReLU SAEs, which achieve state-of-the-art reconstruction fidelity +at a given sparsity level on Gemma 2 9B activations, compared to other recent +advances such as Gated and TopK SAEs. We also show that this improvement does +not come at the cost of interpretability through manual and automated +interpretability studies. JumpReLU SAEs are a simple modification of vanilla +(ReLU) SAEs -- where we replace the ReLU with a discontinuous JumpReLU +activation function -- and are similarly efficient to train and run. By +utilising straight-through-estimators (STEs) in a principled manner, we show +how it is possible to train JumpReLU SAEs effectively despite the discontinuous +JumpReLU function introduced in the SAE's forward pass. Similarly, we use STEs +to directly train L0 to be sparse, instead of training on proxies such as L1, +avoiding problems like shrinkage. + +
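+ A simplified sketch of the activation at the heart of the method: JumpReLU
+ passes a pre-activation through unchanged when it exceeds a learnable
+ per-latent threshold and zeroes it otherwise, with a surrogate gradient so the
+ threshold remains trainable. The paper derives a kernel-based straight-through
+ estimator; the sigmoid surrogate below is a common simplification, not the
+ authors' exact estimator.
+
+     import torch
+     import torch.nn as nn
+
+     class JumpReLU(nn.Module):
+         def __init__(self, n_latents, init_threshold=0.1, bandwidth=1e-3):
+             super().__init__()
+             self.log_theta = nn.Parameter(
+                 torch.log(torch.full((n_latents,), init_threshold)))
+             self.bandwidth = bandwidth
+
+         def forward(self, z):
+             theta = self.log_theta.exp()
+             hard = (z > theta).to(z.dtype)                    # JumpReLU gate 1[z > theta]
+             soft = torch.sigmoid((z - theta) / self.bandwidth)
+             gate = soft + (hard - soft).detach()              # forward hard, backward soft
+             return z * gate
+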
+
+
+
+
+ + ♻ ☆ TabMDA: Tabular Manifold Data Augmentation for Any Classifier using + Transformers with In-context Subsetting ICML + + +
+ Tabular data is prevalent in many critical domains, yet it is often +challenging to acquire in large quantities. This scarcity usually results in +poor performance of machine learning models on such data. Data augmentation, a +common strategy for performance improvement in vision and language tasks, +typically underperforms for tabular data due to the lack of explicit symmetries +in the input space. To overcome this challenge, we introduce TabMDA, a novel +method for manifold data augmentation on tabular data. This method utilises a +pre-trained in-context model, such as TabPFN, to map the data into an embedding +space. TabMDA performs label-invariant transformations by encoding the data +multiple times with varied contexts. This process explores the learned +embedding space of the underlying in-context models, thereby enlarging the +training dataset. TabMDA is a training-free method, making it applicable to any +classifier. We evaluate TabMDA on five standard classifiers and observe +significant performance improvements across various tabular datasets. Our +results demonstrate that TabMDA provides an effective way to leverage +information from pre-trained in-context models to enhance the performance of +downstream classifiers. Code is available at +https://github.com/AdrianBZG/TabMDA. + +
+
+ comment: Presented at 1st ICML Workshop on In-Context Learning (ICL @ ICML + 2024) +
+
+
+
+
+ + ♻ ☆ FastCLIP: A Suite of Optimization Techniques to Accelerate CLIP Training + with Limited Resources + + +
+ Existing studies of training state-of-the-art Contrastive Language-Image +Pretraining (CLIP) models on large-scale data involve hundreds of or even +thousands of GPUs due to the requirement of a large batch size. However, such a +large amount of resources is not accessible to most people. While advanced +compositional optimization techniques for optimizing global contrastive losses +have been demonstrated effective for removing the requirement of large batch +size, their performance on large-scale data remains underexplored and not +optimized. To bridge the gap, this paper explores several aspects of CLIP +training with limited resources (e.g., up to tens of GPUs). First, we introduce +FastCLIP, a general CLIP training framework built on advanced compositional +optimization techniques while designed and optimized for the distributed +setting. Our framework is equipped with an efficient gradient reduction +strategy to reduce communication overhead. Second, to further boost training +efficiency, we investigate three components of the framework from an +optimization perspective: the schedule of the inner learning rate, the update +rules of the temperature parameter and the model parameters, respectively. +Experiments on different strategies for each component shed light on how to +conduct CLIP training more efficiently. Finally, we benchmark the performance +of FastCLIP and the state-of-the-art training baseline (OpenCLIP) on different +compute scales up to 32 GPUs on 8 nodes, and three data scales ranging from 2.7 +million, 9.1 million to 315 million image-text pairs to demonstrate the +significant improvement of FastCLIP in the resource-limited setting. We release +the code of FastCLIP at https://github.com/Optimization-AI/fast_clip . + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Long-form music generation with latent diffusion + + +
+ Audio-based generative models for music have seen great strides recently, but +so far have not managed to produce full-length music tracks with coherent +musical structure from text prompts. We show that by training a generative +model on long temporal contexts it is possible to produce long-form music of up +to 4m45s. Our model consists of a diffusion-transformer operating on a highly +downsampled continuous latent representation (latent rate of 21.5Hz). It +obtains state-of-the-art generations according to metrics on audio quality and +prompt alignment, and subjective tests reveal that it produces full-length +music with coherent structure. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Gradient Flow based on the Sliced Wasserstein + Distance + + +
+ Safeguarding privacy in sensitive training data is paramount, particularly in +the context of generative modeling. This can be achieved through either +differentially private stochastic gradient descent or a differentially private +metric for training models or generators. In this paper, we introduce a novel +differentially private generative modeling approach based on a gradient flow in +the space of probability measures. To this end, we define the gradient flow of +the Gaussian-smoothed Sliced Wasserstein Distance, including the associated +stochastic differential equation (SDE). By discretizing and defining a +numerical scheme for solving this SDE, we demonstrate the link between +smoothing and differential privacy based on a Gaussian mechanism, due to a +specific form of the SDE's drift term. We then analyze the differential privacy +guarantee of our gradient flow, which accounts for both the smoothing and the +Wiener process introduced by the SDE itself. Experiments show that our proposed +model can generate higher-fidelity data at a low privacy budget compared to a +generator-based model, offering a promising alternative. + +
+
+
+
+
+ + ♻ ☆ CoCo: A Coupled Contrastive Framework for Unsupervised Domain Adaptive + Graph Classification + + +
+ Although graph neural networks (GNNs) have achieved impressive results
+in graph classification, they often need abundant task-specific labels, which
+can be extremely costly to acquire. A credible solution is to explore
+additional labeled graphs to enhance unsupervised learning on the target
+domain. However, how to apply GNNs to domain adaptation remains unsolved owing
+to the insufficient exploration of graph topology and the significant domain
+discrepancy. In this paper, we propose Coupled Contrastive Graph Representation
+Learning (CoCo), which extracts the topological information from coupled
+learning branches and reduces the domain discrepancy with coupled contrastive
+learning. CoCo contains a graph convolutional network branch and a hierarchical
+graph kernel network branch, which explore graph topology in implicit and
+explicit manners. Besides, we incorporate coupled branches into a holistic
+multi-view contrastive learning framework, which not only incorporates graph
+representations learned from complementary views for enhanced understanding,
+but also encourages the similarity between cross-domain example pairs with the
+same semantics for domain alignment. Extensive experiments on popular datasets
+show that CoCo generally outperforms competing baselines across different
+settings.
+
+
+
+
+
+ + ♻ ☆ Multi-fidelity Gaussian process surrogate modeling for regression + problems in physics + + +
+ One of the main challenges in surrogate modeling is the limited availability +of data due to resource constraints associated with computationally expensive +simulations. Multi-fidelity methods provide a solution by chaining models in a +hierarchy with increasing fidelity, associated with lower error, but increasing +cost. In this paper, we compare different multi-fidelity methods employed in +constructing Gaussian process surrogates for regression. Non-linear +autoregressive methods in the existing literature are primarily confined to +two-fidelity models, and we extend these methods to handle more than two levels +of fidelity. Additionally, we propose enhancements for an existing method +incorporating delay terms by introducing a structured kernel. We demonstrate +the performance of these methods across various academic and real-world +scenarios. Our findings reveal that multi-fidelity methods generally have a +smaller prediction error for the same computational cost as compared to the +single-fidelity method, although their effectiveness varies across different +scenarios. + +
+
+
+
+
+ + ♻ ☆ Merit-based Fair Combinatorial Semi-Bandit with Unrestricted Feedback + Delays ECAI 2024 + + +
+ We study the stochastic combinatorial semi-bandit problem with unrestricted
+feedback delays under merit-based fairness constraints. This is motivated by
+applications such as crowdsourcing and online advertising, where feedback is
+not immediately available and fairness among different choices (or arms) is
+crucial. We consider two types of unrestricted feedback delays:
+reward-independent delays, where the feedback delays are independent of the
+rewards, and reward-dependent delays, where the feedback delays are correlated
+with the rewards. Furthermore, we introduce merit-based fairness constraints to
+ensure a fair selection of the arms. We define the reward regret and the
+fairness regret and present new bandit algorithms to select arms under
+unrestricted feedback delays based on their merits. We prove that our
+algorithms all achieve sublinear expected reward regret and expected fairness
+regret, with a dependence on the quantiles of the delay distribution. We also
+conduct extensive experiments using synthetic and real-world data and show that
+our algorithms can fairly select arms with different feedback delays.
+
+
+ comment: 28 pages, 9 figures, accepted for 27th European Conference on + Artificial Intelligence (ECAI 2024), Source code added, Typo fixed +
+
+
+
+
+ + ♻ ☆ Leveraging Time-Series Foundation Models in Smart Agriculture for Soil + Moisture Forecasting KDD '24 + + +
+ The recent surge in foundation models for natural language processing and
+computer vision has fueled innovation across various domains. Inspired by this
+progress, we explore the potential of foundation models for time-series
+forecasting in smart agriculture, a field often plagued by limited data
+availability. Specifically, this work presents a novel application of
+$\texttt{TimeGPT}$, a state-of-the-art (SOTA) time-series foundation model, to
+predict soil water potential ($\psi_\mathrm{soil}$), a key indicator of field
+water status that is typically used for irrigation advice. Traditionally, this
+task relies on a wide array of input variables. We explore
+$\texttt{TimeGPT}$'s ability to forecast $\psi_\mathrm{soil}$ in: ($i$) a
+zero-shot setting, ($ii$) a fine-tuned setting relying solely on historic
+$\psi_\mathrm{soil}$ measurements, and ($iii$) a fine-tuned setting where we
+also add exogenous variables to the model. We compare $\texttt{TimeGPT}$'s
+performance to established SOTA baseline models for forecasting
+$\psi_\mathrm{soil}$. Our results demonstrate that $\texttt{TimeGPT}$ achieves
+competitive forecasting accuracy using only historical $\psi_\mathrm{soil}$
+data, highlighting its remarkable potential for agricultural applications. This
+research paves the way for foundation time-series models for sustainable
+development in agriculture by enabling forecasting tasks that were
+traditionally reliant on extensive data collection and domain expertise.
+
+
+ comment: 7 pages, accepted at KDD '24 - Fragile Earth Workshop +
+
+
+
+
+ + ♻ ☆ Dynamic Spiking Graph Neural Networks + + +
+ The integration of Spiking Neural Networks (SNNs) and Graph Neural Networks
+(GNNs) is gradually attracting attention due to the low power consumption and
+high efficiency in processing the non-Euclidean data represented by graphs.
+However, as a common problem, dynamic graph representation learning faces
+challenges such as high complexity and large memory overheads. Current work
+often uses SNNs instead of Recurrent Neural Networks (RNNs) by using binary
+features instead of continuous ones for efficient training, which overlooks
+graph structure information and leads to the loss of details during
+propagation. Additionally, optimizing dynamic spiking models typically requires
+propagation of information across time steps, which increases memory
+requirements. To address these challenges, we present a framework named
+\underline{Dy}namic \underline{S}p\underline{i}king \underline{G}raph
+\underline{N}eural Networks (\method{}). To mitigate the information loss
+problem, \method{} propagates early-layer information directly to the last
+layer for information compensation. To accommodate the memory requirements, we
+apply implicit differentiation on the equilibrium state, which does not
+rely on the exact reverse of the forward computation. While traditional
+implicit differentiation methods are usually used for static situations,
+\method{} extends this approach to the dynamic graph setting. Extensive
+experiments on three large-scale real-world dynamic graph datasets validate the
+effectiveness of \method{} on dynamic node classification tasks with lower
+computational costs.
+
+
+
+
+
+ + ♻ ☆ Privacy-preserving data release leveraging optimal transport and + particle gradient descent + + +
+ We present a novel approach for differentially private data synthesis of +protected tabular datasets, a relevant task in highly sensitive domains such as +healthcare and government. Current state-of-the-art methods predominantly use +marginal-based approaches, where a dataset is generated from private estimates +of the marginals. In this paper, we introduce PrivPGD, a new generation method +for marginal-based private data synthesis, leveraging tools from optimal +transport and particle gradient descent. Our algorithm outperforms existing +methods on a large range of datasets while being highly scalable and offering +the flexibility to incorporate additional domain-specific constraints. + +
+
+ comment: Published at the Forty-first International Conference on Machine + Learning +
+
+
+
+
+ + ♻ ☆ Non-Clashing Teaching Maps for Balls in Graphs COLT 2024 + + +
+ Recently, Kirkpatrick et al. [ALT 2019] and Fallat et al. [JMLR 2023] +introduced non-clashing teaching and showed it is the most efficient machine +teaching model satisfying the Goldman-Mathias collusion-avoidance criterion. A +teaching map $T$ for a concept class $\mathcal{C}$ assigns a (teaching) set +$T(C)$ of examples to each concept $C \in \mathcal{C}$. A teaching map is +non-clashing if no pair of concepts are consistent with the union of their +teaching sets. The size of a non-clashing teaching map (NCTM) $T$ is the +maximum size of a teaching set $T(C)$, $C \in \mathcal{C}$. The non-clashing +teaching dimension NCTD$(\mathcal{C})$ of $\mathcal{C}$ is the minimum size of +an NCTM for $\mathcal{C}$. NCTM$^+$ and NCTD$^+(\mathcal{C})$ are defined +analogously, except the teacher may only use positive examples. + We study NCTMs and NCTM$^+$s for the concept class $\mathcal{B}(G)$ +consisting of all balls of a graph $G$. We show that the associated decision +problem B-NCTD$^+$ for NCTD$^+$ is NP-complete in split, co-bipartite, and +bipartite graphs. Surprisingly, we even prove that, unless the ETH fails, +B-NCTD$^+$ does not admit an algorithm running in time +$2^{2^{o(\text{vc})}}\cdot n^{O(1)}$, nor a kernelization algorithm outputting +a kernel with $2^{o(\text{vc})}$ vertices, where vc is the vertex cover number +of $G$. We complement these lower bounds with matching upper bounds. These are +extremely rare results: it is only the second problem in NP to admit such a +tight double-exponential lower bound parameterized by vc, and only one of very +few problems to admit such an ETH-based conditional lower bound on the number +of vertices in a kernel. For trees, interval graphs, cycles, and trees of +cycles, we derive NCTM$^+$s or NCTMs for $\mathcal{B}(G)$ of size proportional +to its VC-dimension, and for Gromov-hyperbolic graphs, we design an approximate +NCTM$^+$ of size 2. + +
+
+ comment: Published in the proceedings of COLT 2024. Shortened abstract due to + character limit +
+
+
+
+
+ + ♻ ☆ Identifiable latent bandits: Combining observational data and + exploration for personalized healthcare + + +
+ Bandit algorithms hold great promise for improving personalized +decision-making but are notoriously sample-hungry. In most health applications, +it is infeasible to fit a new bandit for each patient, and observable variables +are often insufficient to determine optimal treatments, ruling out applying +contextual bandits learned from multiple patients. Latent bandits offer both +rapid exploration and personalization beyond what context variables can reveal +but require that a latent variable model can be learned consistently. In this +work, we propose bandit algorithms based on nonlinear independent component +analysis that can be provably identified from observational data to a degree +sufficient to infer the optimal action in a new bandit instance consistently. +We verify this strategy in simulated data, showing substantial improvement over +learning independent multi-armed bandits for every instance. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Robust and Resource-Efficient Data-Free Knowledge Distillation by + Generative Pseudo Replay AAAI + + +
+ Data-Free Knowledge Distillation (KD) allows knowledge transfer from a +trained neural network (teacher) to a more compact one (student) in the absence +of original training data. Existing works use a validation set to monitor the +accuracy of the student over real data and report the highest performance +throughout the entire process. However, validation data may not be available at +distillation time either, making it infeasible to record the student snapshot +that achieved the peak accuracy. Therefore, a practical data-free KD method +should be robust and ideally provide monotonically increasing student accuracy +during distillation. This is challenging because the student experiences +knowledge degradation due to the distribution shift of the synthetic data. A +straightforward approach to overcome this issue is to store and rehearse the +generated samples periodically, which increases the memory footprint and +creates privacy concerns. We propose to model the distribution of the +previously observed synthetic samples with a generative network. In particular, +we design a Variational Autoencoder (VAE) with a training objective that is +customized to learn the synthetic data representations optimally. The student +is rehearsed by the generative pseudo replay technique, with samples produced +by the VAE. Hence knowledge degradation can be prevented without storing any +samples. Experiments on image classification benchmarks show that our method +optimizes the expected value of the distilled model accuracy while eliminating +the large memory overhead incurred by the sample-storing methods. + +
+
+ comment: AAAI Conference on Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Generalization Error Bounds for Learning under Censored Feedback + + +
+ Generalization error bounds from learning theory provide statistical +guarantees on how well an algorithm will perform on previously unseen data. In +this paper, we characterize the impacts of data non-IIDness due to censored +feedback (a.k.a. selective labeling bias) on such bounds. We first derive an +extension of the well-known Dvoretzky-Kiefer-Wolfowitz (DKW) inequality, which +characterizes the gap between empirical and theoretical CDFs given IID data, to +problems with non-IID data due to censored feedback. We then use this CDF error +bound to provide a bound on the generalization error guarantees of a classifier +trained on such non-IID data. We show that existing generalization error bounds +(which do not account for censored feedback) fail to correctly capture the +model's generalization guarantees, verifying the need for our bounds. We +further analyze the effectiveness of (pure and bounded) exploration techniques, +proposed by recent literature as a way to alleviate censored feedback, on +improving our error bounds. Together, our findings illustrate how a decision +maker should account for the trade-off between strengthening the generalization +guarantees of an algorithm and the costs incurred in data collection when +future data availability is limited by censored feedback. + +
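+ For reference, the classical DKW inequality that this work extends bounds the
+ gap between the empirical CDF $\hat F_n$ of $n$ i.i.d. samples and the true
+ CDF $F$: $\Pr\big(\sup_{x} |\hat F_n(x) - F(x)| > \varepsilon\big) \le
+ 2e^{-2n\varepsilon^2}$ for all $\varepsilon > 0$. The paper's contribution is
+ to characterize how this gap behaves when the samples are no longer i.i.d.
+ because of censored feedback.
+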
+
+
+
+
+ + ♻ ☆ Introducing δ-XAI: a novel sensitivity-based method for local AI + explanations + + +
+ Explainable Artificial Intelligence (XAI) is central to the debate on +integrating Artificial Intelligence (AI) and Machine Learning (ML) algorithms +into clinical practice. High-performing AI/ML models, such as ensemble learners +and deep neural networks, often lack interpretability, hampering clinicians' +trust in their predictions. To address this, XAI techniques are being developed +to describe AI/ML predictions in human-understandable terms. One promising +direction is the adaptation of sensitivity analysis (SA) and global sensitivity +analysis (GSA), which inherently rank model inputs by their impact on +predictions. Here, we introduce a novel delta-XAI method that provides local +explanations of ML model predictions by extending the delta index, a GSA +metric. The delta-XAI index assesses the impact of each feature's value on the +predicted output for individual instances in both regression and classification +problems. We formalize the delta-XAI index and provide code for its +implementation. The delta-XAI method was evaluated on simulated scenarios using +linear regression models, with Shapley values serving as a benchmark. Results +showed that the delta-XAI index is generally consistent with Shapley values, +with notable discrepancies in models with highly impactful or extreme feature +values. The delta-XAI index demonstrated higher sensitivity in detecting +dominant features and handling extreme feature values. Qualitatively, the +delta-XAI provides intuitive explanations by leveraging probability density +functions, making feature rankings clearer and more explainable for +practitioners. Overall, the delta-XAI method appears promising for robustly +obtaining local explanations of ML model predictions. Further investigations in +real-world clinical settings will be conducted to evaluate its impact on +AI-assisted clinical workflows. + +
+
+
+
+
+ + ♻ ☆ Generalized Groves of Neural Additive Models: Pursuing transparent and + accurate machine learning models in finance + + +
+ While machine learning methods have significantly improved model performance
+over traditional methods, their black-box structure makes it difficult for
+researchers to interpret results. For highly regulated financial industries,
+model transparency is just as important as accuracy. Without understanding how
+models work, even highly accurate machine learning methods are unlikely to be
+accepted. We address this issue by introducing a novel class of transparent
+machine learning models known as generalized groves of neural additive models.
+The generalized groves of neural additive models separate features into three
+categories: linear features, individual nonlinear features, and interacted
+nonlinear features. Additionally, interactions in the last category are only
+local. A stepwise selection algorithm distinguishes the linear and nonlinear
+components, and interacted groups are carefully verified by applying additive
+separation criteria. Through some empirical examples in finance, we demonstrate
+that generalized groves of neural additive models exhibit high accuracy and
+transparency with predominantly linear terms and only sparse nonlinear ones.
+
+
+
+
+
+ + ♻ ☆ Quasi-Framelets: Robust Graph Neural Networks via Adaptive Framelet + Convolution + + +
+ This paper aims to provide a novel design of a multiscale framelet +convolution for spectral graph neural networks (GNNs). While current spectral +methods excel in various graph learning tasks, they often lack the flexibility +to adapt to noisy, incomplete, or perturbed graph signals, making them fragile +in such conditions. Our newly proposed framelet convolution addresses these +limitations by decomposing graph data into low-pass and high-pass spectra +through a finely-tuned multiscale approach. Our approach directly designs +filtering functions within the spectral domain, allowing for precise control +over the spectral components. The proposed design excels in filtering out +unwanted spectral information and significantly reduces the adverse effects of +noisy graph signals. Our approach not only enhances the robustness of GNNs but +also preserves crucial graph features and structures. Through extensive +experiments on diverse, real-world graph datasets, we demonstrate that our +framelet convolution achieves superior performance in node classification +tasks. It exhibits remarkable resilience to noisy data and adversarial attacks, +highlighting its potential as a robust solution for real-world graph +applications. This advancement opens new avenues for more adaptive and reliable +spectral GNN architectures. + +
+
+
+
+
+ + ♻ ☆ Noise-Aware Algorithm for Heterogeneous Differentially Private Federated + Learning + + +
+ High utility and rigorous data privacy are among the main goals of a federated
+learning (FL) system, which learns a model from the data distributed among some
+clients. The latter is typically pursued by using differential privacy in
+FL (DPFL). There is often heterogeneity in clients' privacy requirements, and
+existing DPFL works either assume uniform privacy requirements for clients or
+are not applicable when the server is not fully trusted (our setting).
+Furthermore, there is often heterogeneity in the batch and/or dataset sizes of
+clients, which, as shown, results in extra variation in the DP noise level
+across clients' model updates. With these sources of heterogeneity,
+straightforward aggregation strategies, e.g., assigning clients aggregation
+weights proportional to their privacy parameters, will lead to lower utility.
+We propose Robust-HDP, which efficiently estimates the true noise level in
+clients' model updates and considerably reduces the noise level in the
+aggregated model updates. Robust-HDP improves utility and convergence speed,
+while being robust to clients that may maliciously send falsified privacy
+parameters to the server. Extensive experimental results on multiple datasets
+and our theoretical analysis confirm the effectiveness of Robust-HDP. Our code
+can be found here.
+
+
+ comment: Proceedings of the 41 st International Conference on Machine + Learning, Vienna, Austria. PMLR 235, 2024 +
+
+
+
+
+ + ♻ ☆ Deep-ELA: Deep Exploratory Landscape Analysis with Self-Supervised + Pretrained Transformers for Single- and Multi-Objective Continuous + Optimization Problems + + +
+ In many recent works, the potential of Exploratory Landscape Analysis (ELA) +features to numerically characterize, in particular, single-objective +continuous optimization problems has been demonstrated. These numerical +features provide the input for all kinds of machine learning tasks on +continuous optimization problems, ranging, i.a., from High-level Property +Prediction to Automated Algorithm Selection and Automated Algorithm +Configuration. Without ELA features, analyzing and understanding the +characteristics of single-objective continuous optimization problems is -- to +the best of our knowledge -- very limited. + Yet, despite their usefulness, as demonstrated in several past works, ELA +features suffer from several drawbacks. These include, in particular, (1.) a +strong correlation between multiple features, as well as (2.) its very limited +applicability to multi-objective continuous optimization problems. As a remedy, +recent works proposed deep learning-based approaches as alternatives to ELA. In +these works, e.g., point-cloud transformers were used to characterize an +optimization problem's fitness landscape. However, these approaches require a +large amount of labeled training data. + Within this work, we propose a hybrid approach, Deep-ELA, which combines (the +benefits of) deep learning and ELA features. Specifically, we pre-trained four +transformers on millions of randomly generated optimization problems to learn +deep representations of the landscapes of continuous single- and +multi-objective optimization problems. Our proposed framework can either be +used out-of-the-box for analyzing single- and multi-objective continuous +optimization problems, or subsequently fine-tuned to various tasks focussing on +algorithm behavior and problem understanding. + +
+
+
+
+
+ + ♻ ☆ MSegRNN:Enhanced SegRNN Model with Mamba for Long-Term Time Series + Forecasting + + +
+ Long time series forecasting aims to utilize historical information to
+forecast future states over extended horizons. Traditional RNN-based series
+forecasting methods struggle to effectively address long-term dependencies and
+gradient issues in long time series problems. Recently, SegRNN has emerged as a
+leading RNN-based model tailored for long-term series forecasting,
+demonstrating state-of-the-art performance while maintaining a streamlined
+architecture through innovative segmentation and parallel decoding techniques.
+Nevertheless, SegRNN has several limitations: its fixed segmentation disrupts
+data continuity and fails to effectively leverage information across different
+segments, and its segmentation strategy does not fundamentally address the
+issue of information loss within the recurrent structure. To address these
+issues, we propose the MSegRNN method with three key enhancements: we introduce
+an implicit segmentation structure to decompose the time series and map it to
+segmented hidden states, resulting in denser information exchange during the
+segmentation phase. Additionally, we incorporate residual structures in the
+encoding layer to mitigate information loss within the recurrent structure.
+Finally, we integrate the Mamba architecture to extract time-series information
+more effectively. Experiments on several real-world long time series
+forecasting datasets demonstrate that our model surpasses the performance of
+current state-of-the-art models.
+
+
+
+
+
+ + ♻ ☆ Unsupervised Training of Convex Regularizers using Maximum Likelihood + Estimation + + +
+ Imaging is a standard example of an inverse problem, where the task of
+reconstructing a ground truth from a noisy measurement is ill-posed. Recent
+state-of-the-art approaches for imaging use deep learning, spearheaded by
+unrolled and end-to-end models and trained on various image datasets. However,
+many such methods require the availability of ground truth data, which may be
+unavailable or expensive, leading to a fundamental barrier that cannot be
+bypassed by choice of architecture. Unsupervised learning presents an
+alternative paradigm that bypasses this requirement, as models can be learned
+directly on noisy data without any ground truths. A principled
+Bayesian approach to unsupervised learning is to maximize the marginal
+likelihood with respect to the given noisy measurements, which is intrinsically
+linked to classical variational regularization. We propose an unsupervised
+approach using maximum marginal likelihood estimation to train a convex neural
+network-based image regularization term directly on noisy measurements,
+improving upon previous work in both model expressiveness and dataset size.
+Experiments demonstrate that the proposed method produces priors that are near
+competitive when compared to the analogous supervised training method for
+various image corruption operators, maintaining significantly better
+generalization properties when compared to end-to-end methods. Moreover, we
+provide a detailed theoretical analysis of the convergence properties of our
+proposed algorithm.
+
+
+
+
+
+ + ♻ ☆ Deep NURBS -- Admissible Physics-informed Neural Networks + + +
+ In this study, we propose a new numerical scheme for physics-informed neural
+networks (PINNs) that enables precise and inexpensive solutions of partial
+differential equations (PDEs) on arbitrary geometries while strictly
+enforcing Dirichlet boundary conditions. The proposed approach combines
+admissible NURBS parametrizations, required to define the physical domain and
+the Dirichlet boundary conditions, with a PINN solver. The fundamental boundary
+conditions are automatically satisfied in this novel Deep NURBS framework. We
+verified our new approach using two-dimensional elliptic PDEs on
+arbitrary geometries, including non-Lipschitz domains. Compared to the
+classical PINN solver, the Deep NURBS estimator has a remarkably high
+convergence rate for all the studied problems. Moreover, a desirable accuracy
+was achieved for most of the studied PDEs using only one hidden layer of neural
+networks. This novel approach is expected to pave the way for more effective
+solutions to high-dimensional problems by allowing for more realistic
+physics-informed statistical learning to solve PDE-based variational problems.
+
+
+
+
+
+ + ♻ ☆ Robust Fully-Asynchronous Methods for Distributed Training over General + Architecture + + +
+ Perfect synchronization in distributed machine learning problems is +inefficient and even impossible due to the existence of latency, package losses +and stragglers. We propose a Robust Fully-Asynchronous Stochastic Gradient +Tracking method (R-FAST), where each device performs local computation and +communication at its own pace without any form of synchronization. Different +from existing asynchronous distributed algorithms, R-FAST can eliminate the +impact of data heterogeneity across devices and allow for packet losses by +employing a robust gradient tracking strategy that relies on properly designed +auxiliary variables for tracking and buffering the overall gradient vector. +More importantly, the proposed method utilizes two spanning-tree graphs for +communication so long as both share at least one common root, enabling flexible +designs in communication architectures. We show that R-FAST converges in +expectation to a neighborhood of the optimum with a geometric rate for smooth +and strongly convex objectives; and to a stationary point with a sublinear rate +for general non-convex settings. Extensive experiments demonstrate that R-FAST +runs 1.5-2 times faster than synchronous benchmark algorithms, such as +Ring-AllReduce and D-PSGD, while still achieving comparable accuracy, and +outperforms existing asynchronous SOTA algorithms, such as AD-PSGD and OSGP, +especially in the presence of stragglers. + +
+
+ comment: This paper has been accepted for publication as a regular paper in + the IEEE Transactions on Signal and Information Processing over Networks +
+
+
+
+
+ + ♻ ☆ The Shape of Money Laundering: Subgraph Representation Learning on the + Blockchain with the Elliptic2 Dataset KDD + + +
+ Subgraph representation learning is a technique for analyzing local +structures (or shapes) within complex networks. Enabled by recent developments +in scalable Graph Neural Networks (GNNs), this approach encodes relational +information at a subgroup level (multiple connected nodes) rather than at a +node level of abstraction. We posit that certain domain applications, such as +anti-money laundering (AML), are inherently subgraph problems and mainstream +graph techniques have been operating at a suboptimal level of abstraction. This +is due in part to the scarcity of annotated datasets of real-world size and +complexity, as well as the lack of software tools for managing subgraph GNN +workflows at scale. To enable work in fundamental algorithms as well as domain +applications in AML and beyond, we introduce Elliptic2, a large graph dataset +containing 122K labeled subgraphs of Bitcoin clusters within a background graph +consisting of 49M node clusters and 196M edge transactions. The dataset +provides subgraphs known to be linked to illicit activity for learning the set +of "shapes" that money laundering exhibits in cryptocurrency and accurately +classifying new criminal activity. Along with the dataset we share our graph +techniques, software tooling, promising early experimental results, and new +domain insights already gleaned from this approach. Taken together, we find +immediate practical value in this approach and the potential for a new standard +in anti-money laundering and forensic analytics in cryptocurrencies and other +financial networks. + +
+
+ comment: KDD MLF Workshop 2024. Dataset can be accessed at + http://elliptic.co/elliptic2. Code can be accessed at + https://github.com/MITIBMxGraph/Elliptic2 +
+
+
+
+
+ + ♻ ☆ Unfolding Time: Generative Modeling for Turbulent Flows in 4D ICML 2024 + + +
+ A recent study in turbulent flow simulation demonstrated the potential of +generative diffusion models for fast 3D surrogate modeling. This approach +eliminates the need for specifying initial states or performing lengthy +simulations, significantly accelerating the process. While adept at sampling +individual frames from the learned manifold of turbulent flow states, the +previous model lacks the capability to generate sequences, hindering analysis +of dynamic phenomena. This work addresses this limitation by introducing a 4D +generative diffusion model and a physics-informed guidance technique that +enables the generation of realistic sequences of flow states. Our findings +indicate that the proposed method can successfully sample entire subsequences +from the turbulent manifold, even though generalizing from individual frames to +sequences remains a challenging task. This advancement opens doors for the +application of generative modeling in analyzing the temporal evolution of +turbulent flows, providing valuable insights into their complex dynamics. + +
+
+ comment: AI4Science Workshop @ ICML 2024 +
+
+
+
+
+ + ♻ ☆ MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial + Representation Learning ECCV 2024 + + +
+ The volume of unlabelled Earth observation (EO) data is huge, but many +important applications lack labelled training data. However, EO data offers the +unique opportunity to pair data from different modalities and sensors +automatically based on geographic location and time, at virtually no human +labor cost. We seize this opportunity to create MMEarth, a diverse multi-modal +pretraining dataset at global scale. Using this new corpus of 1.2 million +locations, we propose a Multi-Pretext Masked Autoencoder (MP-MAE) approach to +learn general-purpose representations for optical satellite images. Our +approach builds on the ConvNeXt V2 architecture, a fully convolutional masked +autoencoder (MAE). Drawing upon a suite of multi-modal pretext tasks, we +demonstrate that our MP-MAE approach outperforms both MAEs pretrained on +ImageNet and MAEs pretrained on domain-specific satellite images. This is shown +on several downstream tasks including image classification and semantic +segmentation. We find that pretraining with multi-modal pretext tasks notably +improves the linear probing performance compared to pretraining on optical +satellite images only. This also leads to better label efficiency and parameter +efficiency which are crucial aspects in global scale applications. + +
+
+ comment: Accepted for ECCV 2024. Data and code: + https://vishalned.github.io/mmearth Update arXiv v2 (ECCV): 1. Dataset fix: + Removed duplicates and corrected ERA5 yearly statistics. 2. Data augmentation + fix: Random crops are now aligned. 3. Test metrics fix: Metrics are now + overall instead of mini-batch averages, matching GEO-Bench metrics. 4. + Pretrained on MMEarth v001 & evaluated on GEO-Bench v1.0 +
+
+
+
+
+ + ♻ ☆ A data balancing approach towards design of an expert system for Heart + Disease Prediction + + +
+ Heart disease is a serious global health issue that claims millions of lives
+every year. Early detection and precise prediction are critical to the
+prevention and successful treatment of heart related issues. Much research
+utilizes machine learning (ML) models to forecast cardiac disease and enable
+early detection. To perform predictive analysis on the "Heart disease health
+indicators" dataset, we employed five machine learning methods in this paper:
+Decision Tree (DT), Random Forest (RF), Linear Discriminant Analysis, Extra
+Tree Classifier, and AdaBoost. The model is further examined using various
+feature selection (FS) techniques. To enhance the baseline model, we have
+separately applied four FS techniques: Sequential Forward FS, Sequential
+Backward FS, Correlation Matrix, and Chi2. Lastly, K-means SMOTE oversampling
+is applied to the models to enable additional analysis. The findings show that,
+for predicting heart disease, ensemble approaches, in particular random
+forests, performed better than individual classifiers. Smoking, blood pressure,
+cholesterol, and physical inactivity were among the major predictors
+identified. The Random Forest and Decision Tree models reached an accuracy of
+99.83%. This paper demonstrates how machine learning models can improve the
+accuracy of heart disease prediction, especially when using ensemble
+methodologies. The models provide a more accurate risk assessment than
+traditional methods since they incorporate a large number of factors and
+complex algorithms.
+
+
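The described pipeline (feature selection, SMOTE-style oversampling, random forest) can be sketched with scikit-learn and imbalanced-learn. The snippet below uses synthetic data and plain SMOTE rather than the paper's dataset and its K-means SMOTE variant, so the numbers will not match the reported 99.83%.

```python
# Illustrative sketch of the pipeline described above (chi2 feature selection,
# SMOTE oversampling, random forest) on synthetic data, not the
# "Heart disease health indicators" dataset. The paper uses the K-means SMOTE
# variant; plain SMOTE is used here as a simple stand-in.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_samples=5000, n_features=20, weights=[0.85, 0.15],
                           random_state=0)
X = MinMaxScaler().fit_transform(X)                    # chi2 needs non-negative features
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

selector = SelectKBest(chi2, k=10).fit(X_tr, y_tr)     # feature selection
X_tr_sel, X_te_sel = selector.transform(X_tr), selector.transform(X_te)

X_bal, y_bal = SMOTE(random_state=0).fit_resample(X_tr_sel, y_tr)  # oversample minority

clf = RandomForestClassifier(n_estimators=300, random_state=0).fit(X_bal, y_bal)
print("test accuracy:", accuracy_score(y_te, clf.predict(X_te_sel)))
```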
+
+
+
+
+ + ♻ ☆ Response Theory via Generative Score Modeling + + +
+ We introduce an approach for analyzing the responses of dynamical systems to +external perturbations that combines score-based generative modeling with the +Generalized Fluctuation-Dissipation Theorem (GFDT). The methodology enables +accurate estimation of system responses, including those with non-Gaussian +statistics. We numerically validate our approach using time-series data from +three different stochastic partial differential equations of increasing +complexity: an Ornstein-Uhlenbeck process with spatially correlated noise, a +modified stochastic Allen-Cahn equation, and the 2D Navier-Stokes equations. We +demonstrate the improved accuracy of the methodology over conventional methods +and discuss its potential as a versatile tool for predicting the statistical +behavior of complex dynamical systems. + +
+
+
+
+
+ + ♻ ☆ Statistical Test on Diffusion Model-based Generated Images by Selective + Inference + + +
+ AI technology for generating images, such as diffusion models, has advanced +rapidly. However, there is no established framework for quantifying the +reliability of AI-generated images, which hinders their use in critical +decision-making tasks, such as medical image diagnosis. In this study, we +propose a method to quantify the reliability of decision-making tasks that rely +on images produced by diffusion models within a statistical testing framework. +The core concept of our statistical test involves using a selective inference +framework, in which the statistical test is conducted under the condition that +the images are produced by a trained diffusion model. As a case study, we study +a diffusion model-based anomaly detection task for medical images. With our +approach, the statistical significance of medical image diagnostic outcomes can +be quantified in terms of a p-value, enabling decision-making with a controlled +error rate. We demonstrate the theoretical soundness and practical +effectiveness of our statistical test through numerical experiments on both +synthetic and brain image datasets. + +
+
+ comment: 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ The Power of Combining Data and Knowledge: GPT-4o is an Effective + Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of + Lung Cancer + + +
+ Lymph node metastasis (LNM) is a crucial factor in determining the initial
+treatment for patients with lung cancer, yet accurate preoperative diagnosis of
+LNM remains challenging. Recently, large language models (LLMs) have garnered
+significant attention due to their remarkable text generation capabilities.
+Leveraging the extensive medical knowledge learned from vast corpora, LLMs can
+estimate probabilities for clinical problems, though their performance has
+historically been inferior to data-driven machine learning models. In this
+paper, we propose a novel ensemble method that combines the medical knowledge
+acquired by LLMs with the latent patterns identified by machine learning models
+to enhance LNM prediction performance. Initially, we developed machine learning
+models using patient data. We then designed a prompt template to integrate the
+patient data with the predicted probability from the machine learning model.
+Subsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI,
+to estimate the likelihood of LNM based on patient data and then adjust the
+estimate using the machine learning output. Finally, we collected three outputs
+from GPT-4o using the same prompt and ensembled these results as the final
+prediction. Using the proposed method, our models achieved an AUC value of
+0.765 and an AP value of 0.415 for LNM prediction, significantly improving
+predictive performance compared to baseline machine learning models. The
+experimental results indicate that GPT-4o can effectively leverage its medical
+knowledge and the probabilities predicted by machine learning models to achieve
+more accurate LNM predictions. These findings demonstrate that LLMs can perform
+well in clinical risk prediction tasks, offering a new paradigm for integrating
+medical knowledge and patient data in clinical predictions.
+
+
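A hypothetical sketch of the knowledge-and-data ensembling step: the machine learning probability is embedded in a prompt, the LLM is queried three times, and the returned estimates are averaged. The prompt wording and the `query_llm` callable are placeholders, not the paper's exact template or any particular API client.

```python
# Hypothetical sketch of the ensembling step described above: the ML model's
# probability is embedded in a prompt, the LLM is queried three times, and the
# three returned probabilities are averaged. `query_llm` is a placeholder for
# whatever chat-completion client is used; it is not shown here.
from statistics import mean

PROMPT_TEMPLATE = (
    "Patient data: {patient}\n"
    "A machine learning model estimates the probability of lymph node "
    "metastasis as {ml_prob:.2f}.\n"
    "Considering both the patient data and this estimate, give your own "
    "probability of lymph node metastasis as a number between 0 and 1."
)

def ensemble_lnm_probability(patient: str, ml_prob: float, query_llm,
                             n_samples: int = 3) -> float:
    prompt = PROMPT_TEMPLATE.format(patient=patient, ml_prob=ml_prob)
    samples = [float(query_llm(prompt)) for _ in range(n_samples)]
    return mean(samples)

# Example with a stub standing in for a real LLM call:
print(ensemble_lnm_probability("65y, tumor 2.1 cm, ...", 0.42, lambda p: "0.40"))
```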
+
+
+
+
+ + ♻ ☆ Benchmarking Domain Adaptation for Chemical Processes on the Tennessee + Eastman Process ECML-PKDD 2024 + + +
+ In system monitoring, automatic fault diagnosis seeks to infer the systems' +state based on sensor readings, e.g., through machine learning models. In this +context, it is of key importance that, based on historical data, these systems +are able to generalize to incoming data. In parallel, many factors may induce +changes in the data probability distribution, hindering the possibility of such +models to generalize. In this sense, domain adaptation is an important +framework for adapting models to different probability distributions. In this +paper, we propose a new benchmark, based on the Tennessee Eastman Process of +Downs and Vogel (1993), for benchmarking domain adaptation methods in the +context of chemical processes. Besides describing the process, and its +relevance for domain adaptation, we describe a series of data processing steps +for reproducing our benchmark. We then test 11 domain adaptation strategies on +this novel benchmark, showing that optimal transport-based techniques +outperform other strategies. + +
+
+ comment: 16 pages, 9 figures, 5 tables. Accepted as a Workshop paper at the + ECML-PKDD 2024 conference +
+
+
+
+
+ + ♻ ☆ Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical + Imaging + + +
+ Medical imaging cohorts are often confounded by factors such as acquisition +devices, hospital sites, patient backgrounds, and many more. As a result, deep +learning models tend to learn spurious correlations instead of causally related +features, limiting their generalizability to new and unseen data. This problem +can be addressed by minimizing dependence measures between intermediate +representations of task-related and non-task-related variables. These measures +include mutual information, distance correlation, and the performance of +adversarial classifiers. Here, we benchmark such dependence measures for the +task of preventing shortcut learning. We study a simplified setting using +Morpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our +results provide insights into how to mitigate confounding factors in medical +imaging. + +
+
+ comment: Accepted to the 15th International Workshop on Machine Learning in + Medical Imaging (MLMI 2024); new version: appendix moved to the end, after + the references +
+
+
+
+
+ + ♻ ☆ Decision Machines: Enhanced Decision Trees + + +
+ This paper presents Decision Machines (DMs), an innovative evolution of +traditional binary decision trees, which leverages matrix computations to +significantly enhance both computational efficiency and interpretability. By +explicitly mapping the dependencies between predictions and binary tests within +a vector space, DMs offer a streamlined approach to navigating decision paths. +We integrate decision trees with kernel methods, ensemble methods and attention +mechanisms. The integration of these elements not only bolsters the +hierarchical structure of decision trees but also aligns with the computational +efficiency of matrix computations. Our work bridges the gap between traditional +machine learning algorithms and modern deep learning techniques, providing a +novel foundation for further research and application in the field of machine +learning. + +
+
+
+
+
+ + ♻ ☆ A learning theory for quantum photonic processors and beyond + + +
+ We consider the tasks of learning quantum states, measurements and channels +generated by continuous-variable (CV) quantum circuits. This family of circuits +is suited to describe optical quantum technologies and in particular it +includes state-of-the-art photonic processors capable of showing quantum +advantage. We define classes of functions that map classical variables, encoded +into the CV circuit parameters, to outcome probabilities evaluated on those +circuits. We then establish efficient learnability guarantees for such classes, +by computing bounds on their pseudo-dimension or covering numbers, showing that +CV quantum circuits can be learned with a sample complexity that scales +polynomially with the circuit's size, i.e., the number of modes. Our results +show that CV circuits can be trained efficiently using a number of training +samples that, unlike their finite-dimensional counterpart, does not scale with +the circuit depth. + +
+
+ comment: 27+5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Enhancing Training Efficiency Using Packing with Flash Attention + + +
+ Padding is often used when tuning LLMs: special tokens are added to shorter
+training examples to match the length of the longest sequence in each batch.
+While this ensures uniformity for batch processing, it introduces
+inefficiencies by including irrelevant padding tokens in the computation and
+wastes GPU resources. On the other hand, the Hugging Face SFT trainer offers
+the option to use packing to combine multiple training examples up to the
+maximum sequence length. This allows for maximal utilization of GPU resources.
+However, without proper masking of each packed training example, attention will
+not be computed correctly when using the SFT trainer. We enable and then
+analyse packing and Flash Attention with proper attention masking of each
+example, and show the benefits of this training paradigm.
+
+
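The masking requirement can be made concrete: when several examples are packed into one row, attention must be block-diagonal (causal within each example) and position ids should restart per example. The sketch below builds such a mask from the packed example lengths; it illustrates the constraint rather than reproducing the SFT trainer's internals.

```python
# Sketch of the masking idea: when several examples are packed into one row,
# the attention mask must be block-diagonal so tokens never attend across
# example boundaries, and position ids restart per example. This mirrors what
# packing with proper masking has to guarantee; it is not the SFT trainer code.
import torch

def packed_attention_mask(lengths):
    """Boolean (L, L) mask for one packed row; True = attention allowed."""
    total = sum(lengths)
    mask = torch.zeros(total, total, dtype=torch.bool)
    start = 0
    for n in lengths:
        causal = torch.tril(torch.ones(n, n, dtype=torch.bool))
        mask[start:start + n, start:start + n] = causal   # causal within the example only
        start += n
    return mask

def packed_position_ids(lengths):
    return torch.cat([torch.arange(n) for n in lengths])  # positions restart per example

print(packed_attention_mask([3, 2]).int())
print(packed_position_ids([3, 2]))
```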
+
+
+
+
+ + ♻ ☆ Adaptive maximization of social welfare + + +
+ We consider the problem of repeatedly choosing policies to maximize social +welfare. Welfare is a weighted sum of private utility and public revenue. +Earlier outcomes inform later policies. Utility is not observed, but indirectly +inferred. Response functions are learned through experimentation. We derive a +lower bound on regret, and a matching adversarial upper bound for a variant of +the Exp3 algorithm. Cumulative regret grows at a rate of $T^{2/3}$. This +implies that (i) welfare maximization is harder than the multi-armed bandit +problem (with a rate of $T^{1/2}$ for finite policy sets), and (ii) our +algorithm achieves the optimal rate. For the stochastic setting, if social +welfare is concave, we can achieve a rate of $T^{1/2}$ (for continuous policy +sets), using a dyadic search algorithm. We analyze an extension to nonlinear +income taxation, and sketch an extension to commodity taxation. We compare our +setting to monopoly pricing (which is easier), and price setting for bilateral +trade (which is harder). + +
+
+
+
+
+ + ♻ ☆ Revolutionizing Binary Decision Tree Traversals with Arithmetical + Representations + + +
+ This paper introduces an innovative method for traversing binary decision +trees using arithmetic operations. We present a suite of binary tree traversal +algorithms that leverage novel representation matrices to flatten the full +binary tree structure and embed the aggregated internal node decisions into a +single vector. Our approach, grounded in maximum inner product search, offers +new insights into decision tree partitioning. + +
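One way to read this (an illustrative construction, not necessarily the paper's exact matrices): evaluate all internal-node tests at once into a ±1 vector, encode every root-to-leaf path as a ±1 vector over the internal nodes (0 for nodes off the path), and recover the predicted leaf by maximum inner product, since only the fully matching path attains the maximal score.

```python
# Illustrative construction of arithmetic tree traversal via maximum inner
# product search (my reading of the idea, not necessarily the paper's matrices):
# the full tree is flattened, all node decisions are embedded in one vector, and
# the leaf whose path decisions all match scores highest.
import numpy as np

features   = np.array([0, 1, 1])         # feature tested at internal nodes 0,1,2
thresholds = np.array([0.5, 0.3, 0.7])

# rows: leaves LL, LR, RL, RR; columns: internal nodes 0,1,2 (+1 right, -1 left, 0 off-path)
paths = np.array([[-1, -1,  0],
                  [-1, +1,  0],
                  [+1,  0, -1],
                  [+1,  0, +1]])
leaf_values = np.array([10.0, 20.0, 30.0, 40.0])

def predict(x):
    decisions = np.where(x[features] > thresholds, 1, -1)   # all tests evaluated at once
    leaf = np.argmax(paths @ decisions)                      # maximum inner product
    return leaf_values[leaf]

print(predict(np.array([0.2, 0.9])))   # left at root, right at node 1 -> 20.0
print(predict(np.array([0.8, 0.1])))   # right at root, left at node 2 -> 30.0
```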
+
+
+
+
+ + ♻ ☆ A Wasserstein perspective of Vanilla GANs + + +
+ The empirical success of Generative Adversarial Networks (GANs) caused an +increasing interest in theoretical research. The statistical literature is +mainly focused on Wasserstein GANs and generalizations thereof, which +especially allow for good dimension reduction properties. Statistical results +for Vanilla GANs, the original optimization problem, are still rather limited +and require assumptions such as smooth activation functions and equal +dimensions of the latent space and the ambient space. To bridge this gap, we +draw a connection from Vanilla GANs to the Wasserstein distance. By doing so, +existing results for Wasserstein GANs can be extended to Vanilla GANs. In +particular, we obtain an oracle inequality for Vanilla GANs in Wasserstein +distance. The assumptions of this oracle inequality are designed to be +satisfied by network architectures commonly used in practice, such as +feedforward ReLU networks. By providing a quantitative result for the +approximation of a Lipschitz function by a feedforward ReLU network with +bounded H\"older norm, we conclude a rate of convergence for Vanilla GANs as +well as Wasserstein GANs as estimators of the unknown probability distribution. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Policy Blending as Inference for Reactive Robot Control ICRA 2023 + + +
+ Motion generation in cluttered, dense, and dynamic environments is a central +topic in robotics, rendered as a multi-objective decision-making problem. +Current approaches trade-off between safety and performance. On the one hand, +reactive policies guarantee fast response to environmental changes at the risk +of suboptimal behavior. On the other hand, planning-based motion generation +provides feasible trajectories, but the high computational cost may limit the +control frequency and thus safety. To combine the benefits of reactive policies +and planning, we propose a hierarchical motion generation method. Moreover, we +adopt probabilistic inference methods to formalize the hierarchical model and +stochastic optimization. We realize this approach as a weighted product of +stochastic, reactive expert policies, where planning is used to adaptively +compute the optimal weights over the task horizon. This stochastic optimization +avoids local optima and proposes feasible reactive plans that find paths in +cluttered and dense environments. Our extensive experimental study in planar +navigation and 6DoF manipulation shows that our proposed hierarchical motion +generation method outperforms both myopic reactive controllers and online +re-planning methods. + +
+
+ comment: 8 pages, 5 figures, 1 table, accepted at ICRA 2023 +
+
+
+
+
+ + ♻ ☆ Robust Deep Hawkes Process under Label Noise of Both Event and + Occurrence ECAI2024 + + +
+ Integrating deep neural networks with the Hawkes process has significantly +improved predictive capabilities in finance, health informatics, and +information technology. Nevertheless, these models often face challenges in +real-world settings, particularly due to substantial label noise. This issue is +of significant concern in the medical field, where label noise can arise from +delayed updates in electronic medical records or misdiagnoses, leading to +increased prediction risks. Our research indicates that deep Hawkes process +models exhibit reduced robustness when dealing with label noise, particularly +when it affects both event types and timing. To address these challenges, we +first investigate the influence of label noise in approximated intensity +functions and present a novel framework, the Robust Deep Hawkes Process (RDHP), +to overcome the impact of label noise on the intensity function of Hawkes +models, considering both the events and their occurrences. We tested RDHP using +multiple open-source benchmarks with synthetic noise and conducted a case study +on obstructive sleep apnea-hypopnea syndrome (OSAHS) in a real-world setting +with inherent label noise. The results demonstrate that RDHP can effectively +perform classification and regression tasks, even in the presence of noise +related to events and their timing. To the best of our knowledge, this is the +first study to successfully address both event and time label noise in deep +Hawkes process models, offering a promising solution for medical applications, +specifically in diagnosing OSAHS. + +
+
+ comment: ECAI2024 +
+
+
+
+
+ + ♻ ☆ Understanding Robust Overfitting from the Feature Generalization + Perspective + + +
+ Adversarial training (AT) constructs robust neural networks by incorporating +adversarial perturbations into natural data. However, it is plagued by the +issue of robust overfitting (RO), which severely damages the model's +robustness. In this paper, we investigate RO from a novel feature +generalization perspective. Specifically, we design factor ablation experiments +to assess the respective impacts of natural data and adversarial perturbations +on RO, identifying that the inducing factor of RO stems from natural data. +Given that the only difference between adversarial and natural training lies in +the inclusion of adversarial perturbations, we further hypothesize that +adversarial perturbations degrade the generalization of features in natural +data and verify this hypothesis through extensive experiments. Based on these +findings, we provide a holistic view of RO from the feature generalization +perspective and explain various empirical behaviors associated with RO. To +examine our feature generalization perspective, we devise two representative +methods, attack strength and data augmentation, to prevent the feature +generalization degradation during AT. Extensive experiments conducted on +benchmark datasets demonstrate that the proposed methods can effectively +mitigate RO and enhance adversarial robustness. + +
+
+
+
+
+ + ♻ ☆ Aligning Cyber Space with Physical World: A Comprehensive Survey on + Embodied AI + + +
+ Embodied Artificial Intelligence (Embodied AI) is crucial for achieving
+Artificial General Intelligence (AGI) and serves as a foundation for various
+applications that bridge cyberspace and the physical world. Recently, the
+emergence of Multi-modal Large Models (MLMs) and World Models (WMs) has
+attracted significant attention due to their remarkable perception,
+interaction, and reasoning capabilities, making them a promising architecture
+for the brain of embodied agents. However, there is no comprehensive survey of
+Embodied AI in the era of MLMs. In this survey, we give a comprehensive
+exploration of the latest advancements in Embodied AI. Our analysis first
+navigates the forefront of representative works on embodied robots and
+simulators, to fully understand the research focuses and their limitations.
+Then, we analyze four main research targets: 1) embodied perception, 2)
+embodied interaction, 3) embodied agent, and 4) sim-to-real adaptation,
+covering the state-of-the-art methods, essential paradigms, and comprehensive
+datasets. Additionally, we explore the complexities of MLMs in virtual and real
+embodied agents, highlighting their significance in facilitating interactions
+in dynamic digital and physical environments. Finally, we summarize the
+challenges and limitations of embodied AI and discuss their potential future
+directions. We hope this survey will serve as a foundational reference for the
+research community and inspire continued innovation. The associated project can
+be found at https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List.
+
+
+
+ comment: The first comprehensive review of Embodied AI in the era of MLMs, 36 + pages. We also provide the paper list for Embodied AI: + https://github.com/HCPLab-SYSU/Embodied_AI_Paper_List +
+
+
+
+
+ + ♻ ☆ SAPI: Surroundings-Aware Vehicle Trajectory Prediction at Intersections + + +
+ In this work, we propose SAPI, a deep learning model to predict vehicle
+trajectories at intersections. SAPI uses an abstract way to represent and
+encode the surrounding environment by utilizing information from real-time
+maps, right-of-way, and surrounding traffic. The proposed model consists of two
+encoders, based on convolutional neural networks (CNNs) and recurrent neural
+networks (RNNs), and one decoder. A refiner is proposed to conduct a look-back
+operation inside the model, in order to make full use of raw history trajectory
+information. We evaluate SAPI on a proprietary dataset collected in real-world
+intersections through autonomous vehicles. It is demonstrated that SAPI shows
+promising performance when predicting vehicle trajectories at intersections,
+and outperforms benchmark methods. The average displacement error (ADE) and
+final displacement error (FDE) for 6-second prediction are 1.84 m and 4.32 m,
+respectively. We also show that the proposed model can accurately predict
+vehicle trajectories in different scenarios.
+
+
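For reference, the ADE/FDE numbers quoted above follow the standard definitions: ADE averages the Euclidean displacement error over the prediction horizon and FDE takes the error at the final timestep, as in the small sketch below.

```python
# Standard ADE/FDE metrics: ADE averages the Euclidean error over every
# predicted timestep, FDE takes the error at the final step.
import numpy as np

def ade_fde(pred, gt):
    """pred, gt: arrays of shape (T, 2) with x/y positions over the horizon."""
    errors = np.linalg.norm(pred - gt, axis=-1)   # per-timestep displacement error
    return errors.mean(), errors[-1]

pred = np.array([[0.0, 0.0], [1.0, 0.1], [2.0, 0.4]])
gt   = np.array([[0.0, 0.0], [1.0, 0.0], [2.0, 0.0]])
print(ade_fde(pred, gt))   # (mean error over 3 steps, error at the last step)
```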
+
+
+
+
+ + ♻ ☆ LLM4SGG: Large Language Models for Weakly Supervised Scene Graph + Generation CVPR 2024 + + +
+ Weakly-Supervised Scene Graph Generation (WSSGG) research has recently +emerged as an alternative to the fully-supervised approach that heavily relies +on costly annotations. In this regard, studies on WSSGG have utilized image +captions to obtain unlocalized triplets while primarily focusing on grounding +the unlocalized triplets over image regions. However, they have overlooked the +two issues involved in the triplet formation process from the captions: 1) +Semantic over-simplification issue arises when extracting triplets from +captions, where fine-grained predicates in captions are undesirably converted +into coarse-grained predicates, resulting in a long-tailed predicate +distribution, and 2) Low-density scene graph issue arises when aligning the +triplets in the caption with entity/predicate classes of interest, where many +triplets are discarded and not used in training, leading to insufficient +supervision. To tackle the two issues, we propose a new approach, i.e., Large +Language Model for weakly-supervised SGG (LLM4SGG), where we mitigate the two +issues by leveraging the LLM's in-depth understanding of language and reasoning +ability during the extraction of triplets from captions and alignment of +entity/predicate classes with target data. To further engage the LLM in these +processes, we adopt the idea of Chain-of-Thought and the in-context few-shot +learning strategy. To validate the effectiveness of LLM4SGG, we conduct +extensive experiments on Visual Genome and GQA datasets, showing significant +improvements in both Recall@K and mean Recall@K compared to the +state-of-the-art WSSGG methods. A further appeal is that LLM4SGG is +data-efficient, enabling effective model training with a small amount of +training images. + +
+
+ comment: 8 pages; CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Predictive Pipelined Decoding: A Compute-Latency Trade-off for Exact LLM + Decoding ICML 2023 + + +
+ This paper presents "Predictive Pipelined Decoding (PPD)," an approach that +speeds up greedy decoding in Large Language Models (LLMs) while maintaining the +exact same output as the original decoding. Unlike conventional strategies, PPD +employs additional compute resources to parallelize the initiation of +subsequent token decoding during the current token decoding. This method +reduces decoding latency and reshapes the understanding of trade-offs in LLM +decoding strategies. We have developed a theoretical framework that allows us +to analyze the trade-off between computation and latency. Using this framework, +we can analytically estimate the potential reduction in latency associated with +our proposed method, achieved through the assessment of the match rate, +represented as p_correct. The results demonstrate that the use of extra +computational resources has the potential to accelerate LLM decoding. +Additionally, we implement PPD and conduct preliminary experiments to +empirically validate its efficacy, addressing potential practical overheads not +covered by theoretical analysis. + +
+
+ comment: ES-FoMo Workshop at ICML 2023 / Published in TMLR +
+
+
+
+
+ + ♻ ☆ Physics Informed Kolmogorov-Arnold Neural Networks for Dynamical + Analysis via Efficent-KAN and WAV-KAN + + +
+ Physics-informed neural networks have proven to be a powerful tool for +solving differential equations, leveraging the principles of physics to inform +the learning process. However, traditional deep neural networks often face +challenges in achieving high accuracy without incurring significant +computational costs. In this work, we implement the Physics-Informed +Kolmogorov-Arnold Neural Networks (PIKAN) through efficient-KAN and WAV-KAN, +which utilize the Kolmogorov-Arnold representation theorem. PIKAN demonstrates +superior performance compared to conventional deep neural networks, achieving +the same level of accuracy with fewer layers and reduced computational +overhead. We explore both B-spline and wavelet-based implementations of PIKAN +and benchmark their performance across various ordinary and partial +differential equations using unsupervised (data-free) and supervised +(data-driven) techniques. For certain differential equations, the data-free +approach suffices to find accurate solutions, while in more complex scenarios, +the data-driven method enhances the PIKAN's ability to converge to the correct +solution. We validate our results against numerical solutions and achieve $99 +\%$ accuracy in most scenarios. + +
+
+
+
+
+ + ♻ ☆ InstructIE: A Bilingual Instruction-based Information Extraction Dataset ISWC 2024 + + +
+ Large language models can perform well on general natural language tasks, but +their effectiveness is still suboptimal for information extraction (IE). Recent +works indicate that the main reason lies in the lack of extensive data on IE +instructions. Note that the existing datasets on IE instructions not only have +limited coverage but also involve high construction costs. To address this +issue, we introduce InstructIE, a bilingual instruction-based IE dataset, which +covers 12 diverse domains. We propose KG2Instruction, a framework specifically +for the automatic generation of such datasets. Additionally, we manually +annotate the test set. Experimental results demonstrate that large language +models trained with InstructIE can not only obtain better IE capabilities but +also enhance zero-shot performance compared with baselines. + +
+
+ comment: ISWC 2024; project homepage: + https://www.zjukg.org/project/InstructIE/ dataset: + https://huggingface.co/datasets/zjunlp/InstructIE +
+
+
+
+
+ + ♻ ☆ Mathematical models for off-ball scoring prediction in basketball + + +
+ In professional basketball, the accurate prediction of scoring opportunities +based on strategic decision-making is crucial for spatial and player +evaluations. However, traditional models often face challenges in accounting +for the complexities of off-ball movements, which are essential for +comprehensive performance evaluations. In this study, we propose two +mathematical models to predict off-ball scoring opportunities in basketball, +considering pass-to-score and dribble-to-score sequences: the Ball Movement for +Off-ball Scoring (BMOS) and the Ball Intercept and Movement for Off-ball +Scoring (BIMOS) models. The BMOS model adapts principles from the Off-Ball +Scoring Opportunities (OBSO) model, originally designed for soccer, to +basketball, whereas the BIMOS model also incorporates the likelihood of +interception during ball movements. We evaluated these models using player +tracking data from 630 NBA games in the 2015-2016 regular season, demonstrating +that the BIMOS model outperforms the BMOS model in terms of team scoring +prediction accuracy, while also highlighting its potential for further +development. Overall, the BIMOS model provides valuable insights for tactical +analysis and player evaluation in basketball. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ How Does Fine-Tuning Impact Out-of-Distribution Detection for + Vision-Language Models? + + +
+ Recent large vision-language models such as CLIP have shown remarkable
+out-of-distribution (OOD) detection and generalization performance. However,
+their zero-shot in-distribution (ID) accuracy is often limited for downstream
+datasets. Recent CLIP-based fine-tuning methods such as prompt learning have
+demonstrated significant improvements in ID classification and OOD
+generalization where OOD labels are available. Nonetheless, it remains unclear
+whether the model is reliable under semantic shifts without OOD labels. In this
+paper, we aim to bridge the gap and present a comprehensive study to understand
+how fine-tuning impacts OOD detection for few-shot downstream tasks. By framing
+OOD detection as multi-modal concept matching, we establish a connection
+between fine-tuning methods and various OOD scores. Our results suggest that a
+proper choice of OOD scores is essential for CLIP-based fine-tuning. In
+particular, the maximum concept matching (MCM) score consistently provides a
+promising solution. We also show that prompt learning achieves state-of-the-art
+OOD detection performance, surpassing the zero-shot counterpart.
+
+
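The MCM score referenced above is commonly computed as the maximum softmax probability over temperature-scaled cosine similarities between the image embedding and the ID class (concept) embeddings. The sketch below uses random tensors in place of CLIP features to show the computation.

```python
# Sketch of a maximum concept matching (MCM) style OOD score: cosine similarities
# between an image embedding and the ID class (concept) embeddings are softmax-
# normalized with a temperature, and the maximum probability is the score.
# Low values suggest the input is out-of-distribution. Embeddings here are
# random placeholders standing in for CLIP features.
import torch
import torch.nn.functional as F

def mcm_score(image_feat, concept_feats, temperature=1.0):
    image_feat = F.normalize(image_feat, dim=-1)
    concept_feats = F.normalize(concept_feats, dim=-1)
    sims = image_feat @ concept_feats.T / temperature     # scaled cosine similarities
    return F.softmax(sims, dim=-1).max(dim=-1).values     # max matching probability

torch.manual_seed(0)
image_feat = torch.randn(4, 512)        # batch of image embeddings (placeholder)
concept_feats = torch.randn(10, 512)    # one embedding per ID class (placeholder)
print(mcm_score(image_feat, concept_feats, temperature=0.1))
```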
+
+ comment: Accepted to IJCV 2023 +
+
+
+
+
+ + ♻ ☆ Automated Design and Optimization of Distributed Filtering Circuits via + Reinforcement Learning + + +
+ Designing distributed filter circuits (DFCs) is complex and time-consuming, +involving setting and optimizing multiple hyperparameters. Traditional +optimization methods, such as using the commercial finite element solver HFSS +(High-Frequency Structure Simulator) to enumerate all parameter combinations +with fixed steps and then simulate each combination, are not only +time-consuming and labor-intensive but also rely heavily on the expertise and +experience of electronics engineers, making it difficult to adapt to rapidly +changing design requirements. Additionally, these commercial tools struggle +with precise adjustments when parameters are sensitive to numerical changes, +resulting in limited optimization effectiveness. This study proposes a novel +end-to-end automated method for DFC design. The proposed method harnesses +reinforcement learning (RL) algorithms, eliminating the dependence on the +design experience of engineers. Thus, it significantly reduces the subjectivity +and constraints associated with circuit design. The experimental findings +demonstrate clear improvements in design efficiency and quality when comparing +the proposed method with traditional engineer-driven methods. Furthermore, the +proposed method achieves superior performance when designing complex or rapidly +evolving DFCs, highlighting the substantial potential of RL in circuit design +automation. In particular, compared to the existing DFC automation design +method CircuitGNN, our method achieves an average performance improvement of +8.72%. Additionally, the execution efficiency of our method is 2000 times +higher than CircuitGNN on the CPU and 241 times higher on the GPU. + +
+
+
+
+
+ + ♻ ☆ PersonaGym: Evaluating Persona Agents and LLMs + + +
+ Persona agents, which are LLM agents that act according to an assigned
+persona, have demonstrated impressive contextual response capabilities across
+various applications. These persona agents offer significant enhancements
+across diverse sectors, such as education, healthcare, and entertainment, where
+model developers can align agent responses to different user requirements,
+thereby broadening the scope of agent applications. However, evaluating persona
+agent performance is incredibly challenging due to the complexity of assessing
+persona adherence in free-form interactions across various environments that
+are relevant to each persona agent. We introduce PersonaGym, the first dynamic
+evaluation framework for assessing persona agents, and PersonaScore, the first
+automated human-aligned metric grounded in decision theory for comprehensive
+large-scale evaluation of persona agents. Our evaluation of 6 open and
+closed-source LLMs, using a benchmark encompassing 200 personas and 10,000
+questions, reveals significant opportunities for advancement in persona agent
+capabilities across state-of-the-art models. For example, Claude 3.5 Sonnet
+only has a 2.97% relative improvement in PersonaScore over GPT-3.5 despite
+being a much more advanced model. Importantly, we find that increased model
+size and complexity do not necessarily imply enhanced persona agent
+capabilities, thereby highlighting the pressing need for algorithmic and
+architectural invention towards faithful and performant persona agents.
+
+
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Do We Really Need Graph Convolution During Training? Light Post-Training + Graph-ODE for Efficient Recommendation CIKM 2024 + + +
+ The efficiency and scalability of graph convolution networks (GCNs) in +training recommender systems (RecSys) have been persistent concerns, hindering +their deployment in real-world applications. This paper presents a critical +examination of the necessity of graph convolutions during the training phase +and introduces an innovative alternative: the Light Post-Training Graph +Ordinary-Differential-Equation (LightGODE). Our investigation reveals that the +benefits of GCNs are more pronounced during testing rather than training. +Motivated by this, LightGODE utilizes a novel post-training graph convolution +method that bypasses the computation-intensive message passing of GCNs and +employs a non-parametric continuous graph ordinary-differential-equation (ODE) +to dynamically model node representations. This approach drastically reduces +training time while achieving fine-grained post-training graph convolution to +avoid the distortion of the original training embedding space, termed the +embedding discrepancy issue. We validate our model across several real-world +datasets of different scales, demonstrating that LightGODE not only outperforms +GCN-based models in terms of efficiency and effectiveness but also +significantly mitigates the embedding discrepancy commonly associated with +deeper graph convolution layers. Our LightGODE challenges the prevailing +paradigms in RecSys training and suggests re-evaluating the role of graph +convolutions, potentially guiding future developments of efficient large-scale +graph-based RecSys. + +
+
+ comment: Accepted to CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Blocking Bandits + + +
+ We consider a novel stochastic multi-armed bandit setting, where playing an +arm makes it unavailable for a fixed number of time slots thereafter. This +models situations where reusing an arm too often is undesirable (e.g. making +the same product recommendation repeatedly) or infeasible (e.g. compute job +scheduling on machines). We show that with prior knowledge of the rewards and +delays of all the arms, the problem of optimizing cumulative reward does not +admit any pseudo-polynomial time algorithm (in the number of arms) unless +randomized exponential time hypothesis is false, by mapping to the PINWHEEL +scheduling problem. Subsequently, we show that a simple greedy algorithm that +plays the available arm with the highest reward is asymptotically $(1-1/e)$ +optimal. When the rewards are unknown, we design a UCB based algorithm which is +shown to have $c \log T + o(\log T)$ cumulative regret against the greedy +algorithm, leveraging the free exploration of arms due to the unavailability. +Finally, when all the delays are equal the problem reduces to Combinatorial +Semi-bandits providing us with a lower bound of $c' \log T+ \omega(\log T)$. + +
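The greedy policy analyzed above is easy to state in code: at every round, play the available arm with the highest known mean reward, after which that arm is blocked for its delay. A minimal simulation with made-up rewards and delays:

```python
# Sketch of the greedy policy for blocking bandits with known rewards: at every
# round play the available arm with the highest reward; playing arm i then makes
# it unavailable for delays[i] subsequent rounds.
import numpy as np

def greedy_blocking_bandit(rewards, delays, horizon):
    rewards, delays = np.asarray(rewards, float), np.asarray(delays, int)
    next_free = np.zeros(len(rewards), int)    # first round each arm is available again
    total = 0.0
    for t in range(horizon):
        available = np.where(next_free <= t)[0]
        if len(available) == 0:
            continue                            # no arm available this round
        arm = available[np.argmax(rewards[available])]
        total += rewards[arm]
        next_free[arm] = t + 1 + delays[arm]    # arm blocked for delays[arm] rounds
    return total

print(greedy_blocking_bandit(rewards=[1.0, 0.8, 0.3], delays=[2, 1, 0], horizon=12))
```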
+
+
+
+
+ + ♻ ☆ Fair Incentives for Repeated Engagement + + +
+ We study a decision-maker's problem of finding optimal monetary incentive +schemes for retention when faced with agents whose participation decisions +(stochastically) depend on the incentive they receive. Our focus is on policies +constrained to fulfill two fairness properties that preclude outcomes wherein +different groups of agents experience different treatment on average. We +formulate the problem as a high-dimensional stochastic optimization problem, +and study it through the use of a closely related deterministic variant. We +show that the optimal static solution to this deterministic variant is +asymptotically optimal for the dynamic problem under fairness constraints. +Though solving for the optimal static solution gives rise to a non-convex +optimization problem, we uncover a structural property that allows us to design +a tractable, fast-converging heuristic policy. Traditional schemes for +retention ignore fairness constraints; indeed, the goal in these is to use +differentiation to incentivize repeated engagement with the system. Our work +(i) shows that even in the absence of explicit discrimination, dynamic policies +may unintentionally discriminate between agents of different types by varying +the type composition of the system, and (ii) presents an asymptotically optimal +policy to avoid such discriminatory outcomes. + +
+
+
+
+
+ + ♻ ☆ Direct Preference Optimization: Your Language Model is Secretly a Reward + Model + + +
+ While large-scale unsupervised language models (LMs) learn broad world +knowledge and some reasoning skills, achieving precise control of their +behavior is difficult due to the completely unsupervised nature of their +training. Existing methods for gaining such steerability collect human labels +of the relative quality of model generations and fine-tune the unsupervised LM +to align with these preferences, often with reinforcement learning from human +feedback (RLHF). However, RLHF is a complex and often unstable procedure, first +fitting a reward model that reflects the human preferences, and then +fine-tuning the large unsupervised LM using reinforcement learning to maximize +this estimated reward without drifting too far from the original model. In this +paper we introduce a new parameterization of the reward model in RLHF that +enables extraction of the corresponding optimal policy in closed form, allowing +us to solve the standard RLHF problem with only a simple classification loss. +The resulting algorithm, which we call Direct Preference Optimization (DPO), is +stable, performant, and computationally lightweight, eliminating the need for +sampling from the LM during fine-tuning or performing significant +hyperparameter tuning. Our experiments show that DPO can fine-tune LMs to align +with human preferences as well as or better than existing methods. Notably, +fine-tuning with DPO exceeds PPO-based RLHF in ability to control sentiment of +generations, and matches or improves response quality in summarization and +single-turn dialogue while being substantially simpler to implement and train. + +
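The resulting objective is simple enough to write down directly. The sketch below implements the widely published form of the DPO loss, -log σ(β[(log π(y_w|x) − log π_ref(y_w|x)) − (log π(y_l|x) − log π_ref(y_l|x))]), from pre-computed summed token log-probabilities; the numbers are placeholders.

```python
# Sketch of the DPO loss on a batch of preference pairs, written from the
# standard published form: given summed log-probabilities of the chosen (y_w)
# and rejected (y_l) responses under the trained policy and the frozen reference
# model, the loss is -log(sigmoid(beta * (delta_chosen - delta_rejected))).
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

# toy example: these log-probs are placeholders for sums of per-token log-probs
policy_chosen   = torch.tensor([-12.0, -9.5])
policy_rejected = torch.tensor([-14.0, -9.0])
ref_chosen      = torch.tensor([-13.0, -10.0])
ref_rejected    = torch.tensor([-13.5, -9.2])
print(dpo_loss(policy_chosen, policy_rejected, ref_chosen, ref_rejected))
```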
+
+
+
+
+ + ♻ ☆ MagMax: Leveraging Model Merging for Seamless Continual Learning ECCV2024 + + +
+ This paper introduces a continual learning approach named MagMax, which +utilizes model merging to enable large pre-trained models to continuously learn +from new data without forgetting previously acquired knowledge. Distinct from +traditional continual learning methods that aim to reduce forgetting during +task training, MagMax combines sequential fine-tuning with a maximum magnitude +weight selection for effective knowledge integration across tasks. Our initial +contribution is an extensive examination of model merging techniques, revealing +that simple approaches like weight averaging and random weight selection +surprisingly hold up well in various continual learning contexts. More +importantly, we present MagMax, a novel model-merging strategy that enables +continual learning of large pre-trained models for successive tasks. Our +thorough evaluation demonstrates the superiority of MagMax in various +scenarios, including class- and domain-incremental learning settings. The code +is available at this URL: https://github.com/danielm1405/magmax. + +
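A minimal reading of the max-magnitude merging step (simplified; the paper additionally fine-tunes sequentially before merging): build task vectors as fine-tuned minus pre-trained weights, keep the entry with the largest absolute value per parameter across tasks, and add the result back to the pre-trained weights.

```python
# Simplified sketch of max-magnitude merging: compute task vectors (fine-tuned
# minus pre-trained weights) and, for every parameter entry, keep the task-vector
# value with the largest absolute magnitude, then add it back to the pre-trained
# weights. This is a minimal reading of the described strategy, not the full
# pipeline (which fine-tunes sequentially before merging).
import torch

def magmax_merge(pretrained, finetuned_list):
    merged = {}
    for name, base in pretrained.items():
        task_vectors = torch.stack([ft[name] - base for ft in finetuned_list])
        idx = task_vectors.abs().argmax(dim=0, keepdim=True)       # per-entry argmax magnitude
        merged[name] = base + torch.gather(task_vectors, 0, idx).squeeze(0)
    return merged

# toy state dicts with a single parameter tensor
pre  = {"w": torch.zeros(2, 2)}
ft_a = {"w": torch.tensor([[ 0.5, -0.1], [0.0,  0.2]])}
ft_b = {"w": torch.tensor([[-0.2,  0.3], [0.4, -0.1]])}
print(magmax_merge(pre, [ft_a, ft_b])["w"])   # [[0.5, 0.3], [0.4, 0.2]]
```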
+
+ comment: Accepted for ECCV2024 +
+
+
+
+
+ + ♻ ☆ New methods to compute the generalized chi-square distribution + + +
+ We present several new mathematical methods (ray-trace, inverse Fourier +transform and ellipse) and open-source software to compute the cdf, pdf and +inverse cdf of the generalized chi-square distribution. Some methods are geared +for speed, while others are designed to be accurate far into the tails, using +which we can also measure large values of the discriminability index d' between +multinormals. We characterize the performance and limitations of these and +previous methods, and recommend the best methods to use for each part of each +type of distribution. We also demonstrate the speed and accuracy of our new +methods against previous methods across a wide sample of distributions. + +
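For orientation only (this is not one of the paper's methods): a generalized chi-square variable can be written as a weighted sum of noncentral chi-square terms plus a Gaussian term, and its cdf can be checked by naive Monte Carlo near the bulk of the distribution, which is exactly where such a baseline stops being useful compared to tail-accurate methods.

```python
# Naive Monte Carlo reference (not the paper's methods) for the cdf of a
# generalized chi-square variable, defined here as
#   X = sum_i w_i * chi2_noncentral(k_i, lambda_i) + s*Z + m,  Z ~ N(0,1).
# Monte Carlo is fine near the bulk of the distribution but useless far into
# the tails, which is where specialized methods are needed.
import numpy as np

def gx2_cdf_mc(x, w, k, lam, s=0.0, m=0.0, n_samples=500_000, seed=0):
    rng = np.random.default_rng(seed)
    w, k, lam = map(np.asarray, (w, k, lam))
    chi2_terms = rng.noncentral_chisquare(k, lam, size=(n_samples, len(w)))
    samples = chi2_terms @ w + s * rng.standard_normal(n_samples) + m
    return np.mean(samples <= x)

# P(X <= 5) for X = 1*chi2_nc(1, 0.5) + 2*chi2_nc(2, 1.0) + 0.5*Z
print(gx2_cdf_mc(5.0, w=[1.0, 2.0], k=[1, 2], lam=[0.5, 1.0], s=0.5))
```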
+
+
+
+
+ + ♻ ☆ Infinite dSprites for Disentangled Continual Learning: Separating Memory + Edits from Generalization + + +
+ The ability of machine learning systems to learn continually is hindered by +catastrophic forgetting, the tendency of neural networks to overwrite +previously acquired knowledge when learning a new task. Existing methods +mitigate this problem through regularization, parameter isolation, or +rehearsal, but they are typically evaluated on benchmarks comprising only a +handful of tasks. In contrast, humans are able to learn over long time horizons +in dynamic, open-world environments, effortlessly memorizing unfamiliar objects +and reliably recognizing them under various transformations. To make progress +towards closing this gap, we introduce Infinite dSprites, a parsimonious tool +for creating continual classification and disentanglement benchmarks of +arbitrary length and with full control over generative factors. We show that +over a sufficiently long time horizon, the performance of all major types of +continual learning methods deteriorates on this simple benchmark. This result +highlights an important and previously overlooked aspect of continual learning: +given a finite modelling capacity and an arbitrarily long learning horizon, +efficient learning requires memorizing class-specific information and +accumulating knowledge about general mechanisms. In a simple setting with +direct supervision on the generative factors, we show how learning +class-agnostic transformations offers a way to circumvent catastrophic +forgetting and improve classification accuracy over time. Our approach sets the +stage for continual learning over hundreds of tasks with explicit control over +memorization and forgetting, emphasizing open-set classification and one-shot +generalization. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ AUGCAL: Improving Sim2Real Adaptation by Uncertainty Calibration on + Augmented Synthetic Images ICLR 2024 + + +
+ Synthetic data (SIM) drawn from simulators have emerged as a popular +alternative for training models where acquiring annotated real-world images is +difficult. However, transferring models trained on synthetic images to +real-world applications can be challenging due to appearance disparities. A +commonly employed solution to counter this SIM2REAL gap is unsupervised domain +adaptation, where models are trained using labeled SIM data and unlabeled REAL +data. Mispredictions made by such SIM2REAL adapted models are often associated +with miscalibration - stemming from overconfident predictions on real data. In +this paper, we introduce AUGCAL, a simple training-time patch for unsupervised +adaptation that improves SIM2REAL adapted models by - (1) reducing overall +miscalibration, (2) reducing overconfidence in incorrect predictions and (3) +improving confidence score reliability by better guiding misclassification +detection - all while retaining or improving SIM2REAL performance. Given a base +SIM2REAL adaptation algorithm, at training time, AUGCAL involves replacing +vanilla SIM images with strongly augmented views (AUG intervention) and +additionally optimizing for a training time calibration loss on augmented SIM +predictions (CAL intervention). We motivate AUGCAL using a brief analytical +justification of how to reduce miscalibration on unlabeled REAL data. Through +our experiments, we empirically show the efficacy of AUGCAL across multiple +adaptation methods, backbones, tasks and shifts. + +
+
+ comment: Published at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Auto-Regressive Next-Token Predictors are Universal Learners + + +
+ Large language models display remarkable capabilities in logical and +mathematical reasoning, allowing them to solve complex tasks. Interestingly, +these abilities emerge in networks trained on the simple task of next-token +prediction. In this work, we present a theoretical framework for studying +auto-regressive next-token predictors. We demonstrate that even simple models +such as linear next-token predictors, trained on Chain-of-Thought (CoT) data, +can approximate any function efficiently computed by a Turing machine. We +introduce a new complexity measure -- length complexity -- which measures the +number of intermediate tokens in a CoT sequence required to approximate some +target function, and analyze the interplay between length complexity and other +notions of complexity. Finally, we show experimentally that simple next-token +predictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs), +display non-trivial performance on text generation and arithmetic tasks. Our +results demonstrate that the power of today's LLMs can be attributed, to a +great extent, to the auto-regressive next-token training scheme, and not +necessarily to a particular choice of architecture. + +
+
+
+
+
+ + ♻ ☆ Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language + Models ICRA 2024 + + +
+ We introduce Dream2Real, a robotics framework which integrates +vision-language models (VLMs) trained on 2D data into a 3D object rearrangement +pipeline. This is achieved by the robot autonomously constructing a 3D +representation of the scene, where objects can be rearranged virtually and an +image of the resulting arrangement rendered. These renders are evaluated by a +VLM, so that the arrangement which best satisfies the user instruction is +selected and recreated in the real world with pick-and-place. This enables +language-conditioned rearrangement to be performed zero-shot, without needing +to collect a training dataset of example arrangements. Results on a series of +real-world tasks show that this framework is robust to distractors, +controllable by language, capable of understanding complex multi-object +relations, and readily applicable to both tabletop and 6-DoF rearrangement +tasks. + +
+
+ comment: ICRA 2024. Project webpage with robot videos: + https://www.robot-learning.uk/dream2real +
+
+
+
+
+ + ♻ ☆ Chain of Code: Reasoning with a Language Model-Augmented Code Emulator ICML 2024 + + +
+ Code provides a general syntactic structure to build complex programs and +perform precise computations when paired with a code interpreter - we +hypothesize that language models (LMs) can leverage code-writing to improve +Chain of Thought reasoning not only for logic and arithmetic tasks, but also +for semantic ones (and in particular, those that are a mix of both). For +example, consider prompting an LM to write code that counts the number of times +it detects sarcasm in an essay: the LM may struggle to write an implementation +for "detect_sarcasm(string)" that can be executed by the interpreter (handling +the edge cases would be insurmountable). However, LMs may still produce a valid +solution if they not only write code, but also selectively "emulate" the +interpreter by generating the expected output of "detect_sarcasm(string)". In +this work, we propose Chain of Code (CoC), a simple yet surprisingly effective +extension that improves LM code-driven reasoning. The key idea is to encourage +LMs to format semantic sub-tasks in a program as flexible pseudocode that the +interpreter can explicitly catch undefined behaviors and hand off to simulate +with an LM (as an "LMulator"). Experiments demonstrate that Chain of Code +outperforms Chain of Thought and other baselines across a variety of +benchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of 12% over +Chain of Thought. In a nutshell, CoC broadens the scope of reasoning questions +that LMs can answer by "thinking in code". + +
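A toy version of the interpreter/LMulator split described above: each pseudocode line is run by the Python interpreter, and when a line fails because it calls a helper the interpreter cannot resolve, the value is produced by an LM instead. `lm_emulate` is a stub here, not the paper's implementation.

```python
# Toy sketch of the interpreter / "LMulator" split: each line of LM-written
# pseudocode is executed with Python's exec; when a line fails because it calls
# a helper the interpreter cannot resolve (e.g. detect_sarcasm), control is
# handed to an LM to produce the value instead. `lm_emulate` is a stub here.
def lm_emulate(line: str, state: dict):
    # placeholder: a real system would prompt an LM with the program state and line
    if "detect_sarcasm" in line:
        return {"is_sarcastic": True}
    raise NotImplementedError(line)

def run_chain_of_code(program: str):
    state: dict = {}
    for line in program.strip().splitlines():
        try:
            exec(line, {}, state)                    # exact computation by the interpreter
        except (NameError, NotImplementedError):
            state.update(lm_emulate(line, state))    # hand semantic steps to the LM
    return state

program = """
essay = "Oh great, another meeting."
is_sarcastic = detect_sarcasm(essay)
count = 1 if is_sarcastic else 0
"""
print(run_chain_of_code(program))
```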
+
+ comment: ICML 2024 Oral; Project webpage: https://chain-of-code.github.io +
+
+
+
+
+ + ♻ ☆ SpotlessSplats: Ignoring Distractors in 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) is a promising technique for 3D reconstruction,
+offering efficient training and rendering speeds, making it suitable for
+real-time applications. However, current methods require highly controlled
+environments (no moving people or wind-blown elements, and consistent lighting)
+to meet the inter-view consistency assumption of 3DGS. This makes
+reconstruction of real-world captures problematic. We present SpotLessSplats,
+an approach that leverages pre-trained and general-purpose features coupled
+with robust optimization to effectively ignore transient distractors. Our
+method achieves state-of-the-art reconstruction quality both visually and
+quantitatively on casual captures. Additional results available at:
+https://spotlesssplats.github.io
+
+
+
+
+
+
+ + ♻ ☆ There is more to graphs than meets the eye: Learning universal features + with self-supervision + + +
+ We study the problem of learning features through self-supervision that are
+generalisable to multiple graphs. State-of-the-art graph self-supervision
+restricts training to only one graph, resulting in graph-specific models that
+are incompatible with different but related graphs. We hypothesize that
+training with multiple graphs belonging to the same family can improve the
+quality of the learnt representations. However, learning universal features
+from disparate node/edge features in different graphs is non-trivial. To
+address this challenge, we first homogenise the disparate features with
+graph-specific encoders that transform the features into a common space. A
+universal representation learning module then learns generalisable features on
+this common space. We show that compared to traditional self-supervision with
+one graph, our approach results in (1) better performance on downstream node
+classification, (2) learning features that can be re-used for unseen graphs of
+the same family, (3) more efficient training and (4) compact yet generalisable
+models. We also show the ability of the proposed framework to deliver these
+benefits for relatively larger graphs. In this paper, we present a principled
+way to design foundation graph models that learn from more than one graph in an
+end-to-end manner, while bridging the gap between self-supervised and
+supervised performance.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2302.11939, + arXiv:2301.13287, arXiv:2305.12686, arXiv:2305.02299 +
+
+
+
+
+ + ♻ ☆ Information Leakage Detection through Approximate Bayes-optimal + Prediction + + +
+ In today's data-driven world, the proliferation of publicly available information raises security concerns due to the information leakage (IL) problem. IL
+involves unintentionally exposing sensitive information to unauthorized parties via observable system information. Conventional statistical approaches,
+which rely on estimating mutual information (MI) between observable and secret information to detect ILs, face challenges from the curse of dimensionality,
+convergence, computational complexity, and MI misestimation. Though effective, emerging supervised machine learning based approaches to detecting ILs are
+limited to binary system sensitive information and lack a comprehensive framework. To address these limitations, we establish a theoretical framework using
+statistical learning theory and information theory to quantify and detect IL accurately. Using automated machine learning, we demonstrate that MI can be
+accurately estimated by approximating the typically unknown Bayes predictor's log-loss and accuracy. Based on this, we show how MI can effectively be
+estimated to detect ILs. Our method outperforms state-of-the-art baselines in an empirical study considering synthetic and real-world OpenSSL TLS server
+datasets.
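+ A minimal sketch of the underlying estimation idea (our own reading, not the paper's implementation): for a discrete secret S and observables O,
+I(O;S) = H(S) - H(S|O), and H(S|O) can be approximated by the cross-entropy (log-loss) of a well-fit classifier predicting S from O, so the gap serves as a
+leakage score. The classifier choice and cross-validation setup below are illustrative assumptions.
+
+# Sketch: estimate mutual information I(O; S) as H(S) - CE(S | O), where CE is
+# the cross-entropy (log-loss, in nats) of a classifier predicting the secret S
+# from observables O, approximating the Bayes predictor's log-loss.
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import cross_val_predict
+from sklearn.metrics import log_loss
+
+def estimate_leakage(O: np.ndarray, S: np.ndarray) -> float:
+    # Marginal entropy H(S) in nats from the empirical label distribution.
+    _, counts = np.unique(S, return_counts=True)
+    p = counts / counts.sum()
+    h_s = -np.sum(p * np.log(p))
+
+    # Out-of-fold predicted probabilities approximate the Bayes predictor.
+    clf = RandomForestClassifier(n_estimators=200, random_state=0)
+    proba = cross_val_predict(clf, O, S, cv=5, method="predict_proba")
+    ce = log_loss(S, proba)                # estimate of conditional entropy H(S|O)
+
+    return max(0.0, h_s - ce)              # MI >= 0; a positive value flags leakage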
+
+ comment: Under submission in Information Sciences +
+
+
+
+
+ + ♻ ☆ Physics-informed Discretization-independent Deep Compositional Operator + Network + + +
+ Solving parametric Partial Differential Equations (PDEs) for a broad range of parameters is a critical challenge in scientific computing. To this end,
+neural operators, which predict the PDE solution for variable PDE parameter inputs, have been successfully used. However, the training of neural operators
+typically demands large training datasets, the acquisition of which can be prohibitively expensive. To address this challenge, physics-informed training can
+offer a cost-effective strategy. However, current physics-informed neural operators face limitations, either in handling irregular domain shapes or in
+generalizing to various discrete representations of PDE parameters. In this research, we introduce a novel physics-informed model architecture which can
+generalize to various discrete representations of PDE parameters and irregular domain shapes. Particularly, inspired by deep operator neural networks, our
+model repeatedly applies a discretization-independent learning of the parameter embedding, which is then integrated with the response embeddings through
+multiple compositional layers for greater expressivity. Numerical results demonstrate the accuracy and efficiency of the proposed method.
+
+
+
+
+ + ♻ ☆ FloorSet -- a VLSI Floorplanning Dataset with Design Constraints of + Real-World SoCs + + +
+ Floorplanning for systems-on-a-chip (SoCs) and its sub-systems is a crucial +and non-trivial step of the physical design flow. It represents a difficult +combinatorial optimization problem. A typical large scale SoC with 120 +partitions generates a search-space of nearly 10E250. As novel machine learning +(ML) approaches emerge to tackle such problems, there is a growing need for a +modern benchmark that comprises a large training dataset and performance +metrics that better reflect real-world constraints and objectives compared to +existing benchmarks. To address this need, we present FloorSet -- two +comprehensive datasets of synthetic fixed-outline floorplan layouts that +reflect the distribution of real SoCs. Each dataset has 1M training samples and +100 test samples where each sample is a synthetic floor-plan. FloorSet-Prime +comprises fully-abutted rectilinear partitions and near-optimal wire-length. A +simplified dataset that reflects early design phases, FloorSet-Lite comprises +rectangular partitions, with under 5 percent white-space and near-optimal +wire-length. Both datasets define hard constraints seen in modern design flows +such as shape constraints, edge-affinity, grouping constraints, and +pre-placement constraints. FloorSet is intended to spur fundamental research on +large-scale constrained optimization problems. Crucially, FloorSet alleviates +the core issue of reproducibility in modern ML driven solutions to such +problems. FloorSet is available as an open-source repository for the research +community. + +
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Synthetic Counterfactual Faces + + +
+ Computer vision systems have been deployed in various applications involving biometrics like human faces. These systems can identify social media users,
+search for missing persons, and verify the identity of individuals. While computer vision models are often evaluated for accuracy on available benchmarks,
+more annotated data is necessary to learn about their robustness and fairness against semantic distributional shifts in input data, especially in face
+data. Among annotated data, counterfactual examples grant strong explainability characteristics. Because collecting natural face data is prohibitively
+expensive, we put forth a generative AI-based framework to construct targeted, counterfactual, high-quality synthetic face data. Our synthetic data pipeline
+has many use cases, including sensitivity evaluations of face recognition systems and probes of image understanding systems. The pipeline is validated with
+multiple user studies. We showcase the efficacy of our face generation pipeline on a leading commercial vision model. We identify facial attributes that
+cause vision systems to fail.
+
+ comment: Paper under review. Full text and results will be updated after + acceptance +
+
+
+
+
+ + ♻ ☆ Personalized Steering of Large Language Models: Versatile Steering + Vectors Through Bi-directional Preference Optimization + + +
+ Researchers have been studying approaches to steer the behavior of Large +Language Models (LLMs) and build personalized LLMs tailored for various +applications. While fine-tuning seems to be a direct solution, it requires +substantial computational resources and may significantly affect the utility of +the original LLM. Recent endeavors have introduced more lightweight strategies, +focusing on extracting "steering vectors" to guide the model's output toward +desired behaviors by adjusting activations within specific layers of the LLM's +transformer architecture. However, such steering vectors are directly extracted +from the activations of human preference data and thus often lead to suboptimal +results and occasional failures, especially in alignment-related scenarios. +This work proposes an innovative approach that could produce more effective +steering vectors through bi-directional preference optimization. Our method is +designed to allow steering vectors to directly influence the generation +probability of contrastive human preference data pairs, thereby offering a more +precise representation of the target behavior. By carefully adjusting the +direction and magnitude of the steering vector, we enabled personalized control +over the desired behavior across a spectrum of intensities. Extensive +experimentation across various open-ended generation tasks, particularly +focusing on steering AI personas, has validated the efficacy of our approach. +Moreover, we comprehensively investigate critical alignment-concerning +scenarios, such as managing truthfulness, mitigating hallucination, and +addressing jailbreaking attacks. Remarkably, our method can still demonstrate +outstanding steering effectiveness across these scenarios. Furthermore, we +showcase the transferability of our steering vectors across different +models/LoRAs and highlight the synergistic benefits of applying multiple +vectors simultaneously. + +
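+ For readers unfamiliar with activation steering, the sketch below shows only the generic mechanism of adding a precomputed steering vector to one
+transformer layer's hidden states via a PyTorch forward hook; the vector itself, the layer index, and the scale are illustrative assumptions, and the
+paper's bi-directional preference optimization of the vector is not reproduced here.
+
+# Sketch: apply a precomputed steering vector to one transformer layer's hidden
+# states at inference time via a forward hook. The vector, layer index, and
+# scale are illustrative assumptions; how the vector is optimized is the paper's
+# contribution and is not shown here.
+import torch
+
+def add_steering_hook(model, layer_module, steering_vec: torch.Tensor, scale: float = 1.0):
+    def hook(module, inputs, output):
+        hidden = output[0] if isinstance(output, tuple) else output
+        hidden = hidden + scale * steering_vec.to(hidden.dtype).to(hidden.device)
+        return (hidden, *output[1:]) if isinstance(output, tuple) else hidden
+    return layer_module.register_forward_hook(hook)
+
+# Usage (hypothetical module path and layer index):
+# handle = add_steering_hook(model, model.model.layers[13], steering_vec, scale=2.0)
+# ... generate text ...
+# handle.remove()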
+
+
+
+
+ + ♻ ☆ GATE: How to Keep Out Intrusive Neighbors ICML + + +
+ Graph Attention Networks (GATs) are designed to provide flexible neighborhood +aggregation that assigns weights to neighbors according to their importance. In +practice, however, GATs are often unable to switch off task-irrelevant +neighborhood aggregation, as we show experimentally and analytically. To +address this challenge, we propose GATE, a GAT extension that holds three major +advantages: i) It alleviates over-smoothing by addressing its root cause of +unnecessary neighborhood aggregation. ii) Similarly to perceptrons, it benefits +from higher depth as it can still utilize additional layers for (non-)linear +feature transformations in case of (nearly) switched-off neighborhood +aggregation. iii) By down-weighting connections to unrelated neighbors, it +often outperforms GATs on real-world heterophilic datasets. To further validate +our claims, we construct a synthetic test bed to analyze a model's ability to +utilize the appropriate amount of neighborhood aggregation, which could be of +independent interest. + +
+
+ comment: 26 pages. Published at the International Conference on Machine + Learning (ICML), 2024 +
+
+
+
+
+ + ♻ ☆ SMARLA: A Safety Monitoring Approach for Deep Reinforcement Learning + Agents + + +
+ Deep reinforcement learning algorithms (DRL) are increasingly being used in +safety-critical systems. Ensuring the safety of DRL agents is a critical +concern in such contexts. However, relying solely on testing is not sufficient +to ensure safety as it does not offer guarantees. Building safety monitors is +one solution to alleviate this challenge. This paper proposes SMARLA, a machine +learning-based safety monitoring approach designed for DRL agents. For +practical reasons, SMARLA is agnostic to the type of DRL agent's inputs. +Further, it is designed to be black-box (as it does not require access to the +internals or training data of the agent) by leveraging state abstraction to +facilitate the learning of safety violation prediction models from the agent's +states using a reduced state space. We quantitatively and qualitatively +validated SMARLA on three well-known RL case studies. Empirical results reveal +that SMARLA achieves accurate violation prediction with a low false positive +rate and can predict safety violations at an early stage, approximately halfway +through the execution of the agent, before violations occur. + +
+
+
+
+
+ + ♻ ☆ To the Max: Reinventing Reward in Reinforcement Learning + + +
+ In reinforcement learning (RL), different reward functions can define the +same optimal policy but result in drastically different learning performance. +For some, the agent gets stuck with a suboptimal behavior, and for others, it +solves the task efficiently. Choosing a good reward function is hence an +extremely important yet challenging problem. In this paper, we explore an +alternative approach for using rewards for learning. We introduce +\textit{max-reward RL}, where an agent optimizes the maximum rather than the +cumulative reward. Unlike earlier works, our approach works for deterministic +and stochastic environments and can be easily combined with state-of-the-art RL +algorithms. In the experiments, we study the performance of max-reward RL +algorithms in two goal-reaching environments from Gymnasium-Robotics and +demonstrate its benefits over standard RL. The code is available at +https://github.com/veviurko/To-the-Max. + +
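+ The objective change can be stated in one line: instead of the discounted cumulative return sum_t gamma^t r_t, the agent optimizes the maximum reward
+max_t r_t along a trajectory. A toy illustration of the two returns (our own example, not the paper's code):
+
+# Toy illustration of the objective change: standard RL maximizes the
+# (discounted) cumulative reward, while max-reward RL maximizes the single
+# best reward along the trajectory.
+def cumulative_return(rewards, gamma=0.99):
+    return sum((gamma ** t) * r for t, r in enumerate(rewards))
+
+def max_return(rewards):
+    return max(rewards)
+
+rewards = [0.0, 0.1, 0.0, 5.0, 0.2]    # hypothetical per-step rewards
+print(cumulative_return(rewards))       # ~5.14 (discounted sum with gamma = 0.99)
+print(max_return(rewards))              # 5.0  (best single step)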
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ AxiomVision: Accuracy-Guaranteed Adaptive Visual Model Selection for + Perspective-Aware Video Analytics ACM MM 2024 + + +
+ The rapid evolution of multimedia and computer vision technologies requires adaptive visual model deployment strategies to effectively handle diverse tasks
+and varying environments. This work introduces AxiomVision, a novel framework that can guarantee accuracy by leveraging edge computing to dynamically select
+the most efficient visual models for video analytics under diverse scenarios. Utilizing a tiered edge-cloud architecture, AxiomVision enables the deployment
+of a broad spectrum of visual models, from lightweight to complex DNNs, that can be tailored to specific scenarios while considering camera source impacts.
+In addition, AxiomVision provides three core innovations: (1) a dynamic visual model selection mechanism utilizing continual online learning, (2) an online
+method that efficiently accounts for the influence of the camera's perspective, and (3) a topology-driven grouping approach that accelerates the model
+selection process. With rigorous theoretical guarantees, these advancements provide a scalable and effective solution for visual tasks inherent to
+multimedia systems, such as object detection, classification, and counting. Empirically, AxiomVision achieves a 25.7% improvement in accuracy.
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ HeadsetOff: Enabling Photorealistic Video Conferencing on Economical VR + Headsets + + +
+ Virtual Reality (VR) headsets have become increasingly popular for remote +collaboration, but video conferencing poses challenges when the user's face is +covered by the headset. Existing solutions have limitations in terms of +accessibility. In this paper, we propose HeadsetOff, a novel system that +achieves photorealistic video conferencing on economical VR headsets by +leveraging voice-driven face reconstruction. HeadsetOff consists of three main +components: a multimodal attention-based predictor, a generator, and an +adaptive controller. The predictor effectively predicts user future behavior +based on different modalities. The generator employs voice input, head motion, +and eye blink to animate the human face. The adaptive controller dynamically +selects the appropriate generator model based on the trade-off between video +quality and delay, aiming to maximize Quality of Experience while minimizing +latency. Experimental results demonstrate the effectiveness of HeadsetOff in +achieving high-quality, low-latency video conferencing on economical VR +headsets. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ☆ MambaGesture: Enhancing Co-Speech Gesture Generation with Mamba and + Disentangled Multi-Modality Fusion ACM MM 2024 + + +
+ Co-speech gesture generation is crucial for producing synchronized and +realistic human gestures that accompany speech, enhancing the animation of +lifelike avatars in virtual environments. While diffusion models have shown +impressive capabilities, current approaches often overlook a wide range of +modalities and their interactions, resulting in less dynamic and contextually +varied gestures. To address these challenges, we present MambaGesture, a novel +framework integrating a Mamba-based attention block, MambaAttn, with a +multi-modality feature fusion module, SEAD. The MambaAttn block combines the +sequential data processing strengths of the Mamba model with the contextual +richness of attention mechanisms, enhancing the temporal coherence of generated +gestures. SEAD adeptly fuses audio, text, style, and emotion modalities, +employing disentanglement to deepen the fusion process and yield gestures with +greater realism and diversity. Our approach, rigorously evaluated on the +multi-modal BEAT dataset, demonstrates significant improvements in Fr\'echet +Gesture Distance (FGD), diversity scores, and beat alignment, achieving +state-of-the-art performance in co-speech gesture generation. + +
+
+ comment: Accepted to ACM MM 2024 +
+
+
+
+
+ + ☆ ComNeck: Bridging Compressed Image Latents and Multimodal LLMs via + Universal Transform-Neck + + +
+ This paper presents the first-ever study of adapting compressed image latents to suit the needs of downstream vision tasks that adopt Multimodal Large
+Language Models (MLLMs). MLLMs have extended the success of large language models to modalities (e.g. images) beyond text, but their billion-parameter scale
+hinders deployment on resource-constrained end devices. While cloud-hosted MLLMs could be available, transmitting raw, uncompressed images captured by end
+devices to the cloud requires an efficient image compression system. To address this, we focus on emerging neural image compression and propose a novel
+framework with a lightweight transform-neck and a surrogate loss to adapt compressed image latents for MLLM-based vision tasks. The proposed framework is
+generic and applicable to multiple application scenarios, where the neural image codec can be (1) pre-trained for human perception without updating, (2)
+fully updated for joint human and machine perception, or (3) fully updated for only machine perception. The transform-neck trained with the surrogate loss
+is universal, for it can serve various downstream vision tasks enabled by a variety of MLLMs that share the same visual encoder. Our framework has the
+striking feature of excluding the downstream MLLMs from training the transform-neck, and potentially the neural image codec as well. This stands out from
+most existing coding-for-machines approaches that involve downstream networks in training and thus could be impractical when the networks are MLLMs.
+Extensive experiments on different neural image codecs and various MLLM-based vision tasks show that our method achieves great rate-accuracy performance
+with much less complexity, demonstrating its effectiveness.
+
+
+
+
+ + ☆ LoginMEA: Local-to-Global Interaction Network for Multi-modal Entity + Alignment ECAI 2024 + + +
+ Multi-modal entity alignment (MMEA) aims to identify equivalent entities between two multi-modal knowledge graphs (MMKGs), whose entities can be associated
+with relational triples and related images. Most previous studies treat the graph structure as a special modality, and fuse different modality information
+with separate uni-modal encoders, neglecting valuable relational associations in modalities. Other studies refine each uni-modal information with graph
+structures, but may introduce unnecessary relations in specific modalities. To this end, we propose a novel local-to-global interaction network for MMEA,
+termed as LoginMEA. Particularly, we first fuse local multi-modal interactions to generate holistic entity semantics and then refine them with global
+relational interactions of entity neighbors. In this design, the uni-modal information is fused adaptively, and can be refined with relations accordingly.
+To enrich local interactions of multi-modal entity information, we devise modality weights and low-rank interactive fusion, allowing diverse impacts and
+element-level interactions among modalities. To capture global interactions of graph structures, we adopt relation reflection graph attention networks,
+which fully capture relational associations between entities. Extensive experiments demonstrate superior results of our method over 5 cross-KG or bilingual
+benchmark datasets, indicating the effectiveness of capturing local and global interactions.
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ☆ BRIDGE: Bridging Gaps in Image Captioning Evaluation with Stronger + Visual Cues ECCV 2024 + + +
+ Effectively aligning with human judgment when evaluating machine-generated +image captions represents a complex yet intriguing challenge. Existing +evaluation metrics like CIDEr or CLIP-Score fall short in this regard as they +do not take into account the corresponding image or lack the capability of +encoding fine-grained details and penalizing hallucinations. To overcome these +issues, in this paper, we propose BRIDGE, a new learnable and reference-free +image captioning metric that employs a novel module to map visual features into +dense vectors and integrates them into multi-modal pseudo-captions which are +built during the evaluation process. This approach results in a multimodal +metric that properly incorporates information from the input image without +relying on reference captions, bridging the gap between human judgment and +machine-generated image captions. Experiments spanning several datasets +demonstrate that our proposal achieves state-of-the-art results compared to +existing reference-free evaluation scores. Our source code and trained models +are publicly available at: https://github.com/aimagelab/bridge-score. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Contrasting Deepfakes Diffusion via Contrastive Learning and + Global-Local Similarities ECCV 2024 + + +
+ Discerning between authentic content and that generated by advanced AI +methods has become increasingly challenging. While previous research primarily +addresses the detection of fake faces, the identification of generated natural +images has only recently surfaced. This prompted the recent exploration of +solutions that employ foundation vision-and-language models, like CLIP. +However, the CLIP embedding space is optimized for global image-to-text +alignment and is not inherently designed for deepfake detection, neglecting the +potential benefits of tailored training and local image features. In this +study, we propose CoDE (Contrastive Deepfake Embeddings), a novel embedding +space specifically designed for deepfake detection. CoDE is trained via +contrastive learning by additionally enforcing global-local similarities. To +sustain the training of our model, we generate a comprehensive dataset that +focuses on images generated by diffusion models and encompasses a collection of +9.2 million images produced by using four different generators. Experimental +results demonstrate that CoDE achieves state-of-the-art accuracy on the newly +collected dataset, while also showing excellent generalization capabilities to +unseen image generators. Our source code, trained models, and collected dataset +are publicly available at: https://github.com/aimagelab/CoDE. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ UNQA: Unified No-Reference Quality Assessment for Audio, Image, Video, + and Audio-Visual Content + + +
+ As multimedia data flourishes on the Internet, quality assessment (QA) of multimedia data becomes paramount for digital media applications. Since
+multimedia data includes multiple modalities including audio, image, video, and audio-visual (A/V) content, researchers have developed a range of QA methods
+to evaluate the quality of different modality data. While they exclusively focus on addressing single-modality QA issues, a unified QA model that can handle
+diverse media across multiple modalities is still missing, whereas the latter can better resemble human perception behaviour and also has a wider range of
+applications. In this paper, we propose the Unified No-reference Quality Assessment model (UNQA) for audio, image, video, and A/V content, which tries to
+train a single QA model across different media modalities. To tackle the issue of inconsistent quality scales among different QA databases, we develop a
+multi-modality strategy to jointly train UNQA on multiple QA databases. Based on the input modality, UNQA selectively extracts the spatial features, motion
+features, and audio features, and calculates a final quality score via the four corresponding modality regression modules. Compared with existing QA
+methods, UNQA has two advantages: 1) the multi-modality training strategy makes the QA model learn more general and robust quality-aware feature
+representations, as evidenced by the superior performance of UNQA compared to state-of-the-art QA methods; 2) UNQA reduces the number of models required to
+assess multimedia data across different modalities and is easier to deploy in practical applications.
+
+
+
+
+ + ♻ ☆ Leveraging Pre-trained AudioLDM for Sound Generation: A Benchmark Study + + +
+ Deep neural networks have recently achieved breakthroughs in sound +generation. Despite the outstanding sample quality, current sound generation +models face issues on small-scale datasets (e.g., overfitting), significantly +limiting performance. In this paper, we make the first attempt to investigate +the benefits of pre-training on sound generation with AudioLDM, the +cutting-edge model for audio generation, as the backbone. Our study +demonstrates the advantages of the pre-trained AudioLDM, especially in +data-scarcity scenarios. In addition, the baselines and evaluation protocol for +sound generation systems are not consistent enough to compare different studies +directly. Aiming to facilitate further study on sound generation tasks, we +benchmark the sound generation task on various frequently-used datasets. We +hope our results on transfer learning and benchmarks can provide references for +further research on conditional sound generation. + +
+
+ comment: Updated for EUSIPCO 2023 proceedings version +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 28 + +
+
+
+ + ☆ You shall know a piece by the company it keeps. Chess plays as a data + for word2vec models + + +
+ In this paper, I apply linguistic methods of analysis to non-linguistic data, +chess plays, metaphorically equating one with the other and seeking analogies. +Chess game notations are also a kind of text, and one can consider the records +of moves or positions of pieces as words and statements in a certain language. +In this article I show how word embeddings (word2vec) can work on chess game +texts instead of natural language texts. I don't see how this representation of +chess data can be used productively. It's unlikely that these vector models +will help engines or people choose the best move. But in a purely academic +sense, it's clear that such methods of information representation capture +something important about the very nature of the game, which doesn't +necessarily lead to a win. + +
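+ A minimal sketch of the setup described above, under our own assumptions about file format and hyperparameters: each game's move list is treated as a
+sentence and each SAN move as a word, then a standard gensim word2vec model is trained on the corpus.
+
+# Sketch: train word2vec on chess games by treating each game's move list as a
+# sentence and each SAN move as a token. The file name and hyperparameters are
+# illustrative assumptions.
+from gensim.models import Word2Vec
+
+# e.g. one game per line: "e4 e5 Nf3 Nc6 Bb5 a6 ..."
+with open("games_san.txt") as f:
+    games = [line.split() for line in f if line.strip()]
+
+model = Word2Vec(games, vector_size=100, window=5, min_count=5, sg=1, epochs=10)
+
+# Moves that occur in similar contexts end up with similar vectors.
+print(model.wv.most_similar("Nf3", topn=5))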
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Meta-Rewarding Language Models: Self-Improving Alignment with + LLM-as-a-Meta-Judge + + +
+ Large Language Models (LLMs) are rapidly surpassing human knowledge in many +domains. While improving these models traditionally relies on costly human +data, recent self-rewarding mechanisms (Yuan et al., 2024) have shown that LLMs +can improve by judging their own responses instead of relying on human +labelers. However, existing methods have primarily focused on improving model +responses rather than judgment capabilities, resulting in rapid saturation +during iterative training. To address this issue, we introduce a novel +Meta-Rewarding step to the self-improvement process, where the model judges its +own judgements and uses that feedback to refine its judgment skills. +Surprisingly, this unsupervised approach improves the model's ability to judge +{\em and} follow instructions, as demonstrated by a win rate improvement of +Llama-3-8B-Instruct from 22.9% to 39.4% on AlpacaEval 2, and 20.6% to 29.1% on +Arena-Hard. These results strongly suggest the potential for self-improving +models without human supervision. + +
+
+
+
+
+ + ☆ SaulLM-54B & SaulLM-141B: Scaling Up Domain Adaptation for the Legal + Domain + + +
+ In this paper, we introduce SaulLM-54B and SaulLM-141B, two large language models (LLMs) tailored for the legal sector. These models, which feature
+architectures of 54 billion and 141 billion parameters, respectively, are based on the Mixtral architecture. The development of SaulLM-54B and SaulLM-141B
+is guided by large-scale domain adaptation, divided into three strategies: (1) the exploitation of continued pretraining involving a base corpus that
+includes over 540 billion legal tokens, (2) the implementation of a specialized legal instruction-following protocol, and (3) the alignment of model outputs
+with human preferences in legal interpretations. The integration of synthetically generated data in the second and third steps enhances the models'
+capabilities in interpreting and processing legal texts, effectively reaching state-of-the-art performance and outperforming previous open-source models on
+LegalBench-Instruct. This work explores the trade-offs involved in domain-specific adaptation at this scale, offering insights that may inform future
+studies on domain adaptation using strong decoder models. Building upon SaulLM-7B, this study refines the approach to produce an LLM better equipped for
+legal tasks. We are releasing base, instruct, and aligned versions on top of SaulLM-54B and SaulLM-141B under the MIT License to facilitate reuse and
+collaborative research.
+
+
+
+
+ + ☆ Memory-efficient Training of LLMs with Larger Mini-batches + + +
+ Training with larger mini-batches improves the performance and convergence +rate of training machine learning models. However, training with large +mini-batches becomes prohibitive for Large Language Models (LLMs) with billions +of parameters, due to the large GPU memory requirement. To address this +problem, we propose finding small mini-batches that simulate the dynamics of +training with larger mini-batches. Specifically, we formulate selecting smaller +mini-batches of examples that closely capture gradients of large mini-batches +as a submodular maximization problem. Nevertheless, the very large +dimensionality of the gradients makes the problem very challenging to solve. To +address this, we leverage ideas from zeroth-order optimization and neural +network pruning to find lower-dimensional gradient estimates that allow finding +high-quality subsets effectively with a limited amount of memory. We prove the +superior convergence rate of training on the small mini-batches found by our +method and empirically show its effectiveness. Our method can effectively +reduce the memory requirement by 2x and speed up training by 1.3x, as we +confirm for fine-tuning Phi-2 on MathInstruct. Our method can be easily stacked +with LoRA and other memory-efficient methods to further reduce the memory +requirements of training LLMs. + +
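+ As a rough illustration of the selection idea (a simplified sketch under our own assumptions, not the authors' submodular algorithm), one can greedily pick
+a small subset whose mean low-dimensional gradient estimate best matches the mean gradient of the full large mini-batch:
+
+# Simplified sketch: greedily pick a small subset of examples whose mean
+# (low-dimensional) gradient estimate best matches the mean gradient of the
+# full large mini-batch.
+import numpy as np
+
+def select_subset(grads: np.ndarray, k: int) -> list:
+    """grads: (n, d) low-dimensional per-example gradient estimates."""
+    target = grads.mean(axis=0)
+    chosen, current = [], np.zeros(grads.shape[1])
+    for _ in range(k):
+        best_i, best_err = None, np.inf
+        for i in range(len(grads)):
+            if i in chosen:
+                continue
+            cand = (current * len(chosen) + grads[i]) / (len(chosen) + 1)
+            err = np.linalg.norm(cand - target)
+            if err < best_err:
+                best_i, best_err = i, err
+        chosen.append(best_i)
+        current = (current * (len(chosen) - 1) + grads[best_i]) / len(chosen)
+    return chosen
+
+# Example: from a "large" batch of 256 examples, keep 32 that track its gradient.
+# subset = select_subset(per_example_grad_estimates, k=32)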
+
+ comment: 15 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Are LLMs Good Annotators for Discourse-level Event Relation Extraction? + + +
+ Large Language Models (LLMs) have demonstrated proficiency in a wide array of natural language processing tasks. However, their effectiveness on
+discourse-level event relation extraction (ERE) tasks remains unexplored. In this paper, we assess the effectiveness of LLMs in addressing discourse-level
+ERE tasks characterized by lengthy documents and intricate relations encompassing coreference, temporal, causal, and subevent types. Evaluation is
+conducted using a commercial model, GPT-3.5, and an open-source model, LLaMA-2. Our study reveals a notable underperformance of LLMs compared to the
+baseline established through supervised learning. Although Supervised Fine-Tuning (SFT) can improve LLM performance, it does not scale well compared to the
+smaller supervised baseline model. Our quantitative and qualitative analysis shows that LLMs have several weaknesses when applied to extracting event
+relations, including a tendency to fabricate event mentions, and failures to capture transitivity rules among relations, detect long-distance relations, or
+comprehend contexts with dense event mentions.
+
+
+
+
+ + ☆ Motamot: A Dataset for Revealing the Supremacy of Large Language Models + over Transformer Models in Bengali Political Sentiment Analysis + + +
+ Sentiment analysis is the process of identifying and categorizing people's +emotions or opinions regarding various topics. Analyzing political sentiment is +critical for understanding the complexities of public opinion processes, +especially during election seasons. It gives significant information on voter +preferences, attitudes, and current trends. In this study, we investigate +political sentiment analysis during Bangladeshi elections, specifically +examining how effectively Pre-trained Language Models (PLMs) and Large Language +Models (LLMs) capture complex sentiment characteristics. Our study centers on +the creation of the "Motamot" dataset, comprising 7,058 instances annotated +with positive and negative sentiments, sourced from diverse online newspaper +portals, forming a comprehensive resource for political sentiment analysis. We +meticulously evaluate the performance of various PLMs including BanglaBERT, +Bangla BERT Base, XLM-RoBERTa, mBERT, and sahajBERT, alongside LLMs such as +Gemini 1.5 Pro and GPT 3.5 Turbo. Moreover, we explore zero-shot and few-shot +learning strategies to enhance our understanding of political sentiment +analysis methodologies. Our findings underscore BanglaBERT's commendable +accuracy of 88.10% among PLMs. However, the exploration into LLMs reveals even +more promising results. Through the adept application of Few-Shot learning +techniques, Gemini 1.5 Pro achieves an impressive accuracy of 96.33%, +surpassing the remarkable performance of GPT 3.5 Turbo, which stands at 94%. +This underscores Gemini 1.5 Pro's status as the superior performer in this +comparison. + +
+
+ comment: Accepted for publication in "The IEEE Region 10 Symposium (TENSYMP + 2024)" +
+
+
+
+
+ + ☆ Open Sentence Embeddings for Portuguese with the Serafim PT* encoders + family + + +
+ Sentence encoders encode the semantics of their input, enabling key downstream applications such as classification, clustering, or retrieval. In this
+paper, we present Serafim PT*, a family of open-source sentence encoders for Portuguese with various sizes, suited to different hardware/compute budgets.
+Each model exhibits state-of-the-art performance and is made openly available under a permissive license, allowing its use for both commercial and research
+purposes. Besides the sentence encoders, this paper contributes a systematic study and lessons learned concerning the selection criteria of learning
+objectives and parameters that support top-performing encoders.
+
+
+
+
+ + ☆ Impact of Decoding Methods on Human Alignment of Conversational LLMs + + +
+ To be included in chatbot systems, large language models (LLMs) must be aligned with human conversational conventions. However, being trained mainly on
+web-scraped data gives existing LLMs a voice closer to informational text than actual human speech. In this paper, we examine the effect of decoding methods
+on the alignment between LLM-generated and human conversations, including Beam Search, Top K Sampling, and Nucleus Sampling. We present new measures of
+alignment in substance, style, and psychometric orientation, and experiment with two conversation datasets. Our results provide subtle insights: better
+alignment is attributed to fewer beams in Beam Search and lower values of P in Nucleus Sampling. We also find that task-oriented and open-ended datasets
+perform differently in terms of alignment, indicating the significance of taking into account the context of the interaction.
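+ For reference, nucleus (top-p) sampling, one of the decoding methods compared above, keeps the smallest set of tokens whose cumulative probability exceeds
+p, renormalizes, and samples from that set; a minimal sketch (our own illustration):
+
+# Sketch of nucleus (top-p) sampling: keep the smallest set of tokens whose
+# cumulative probability exceeds p, renormalize, and sample from that set.
+import numpy as np
+
+def nucleus_sample(probs: np.ndarray, p: float = 0.9, rng=None) -> int:
+    rng = rng or np.random.default_rng()
+    order = np.argsort(probs)[::-1]            # tokens from most to least likely
+    cum = np.cumsum(probs[order])
+    cutoff = np.searchsorted(cum, p) + 1       # smallest prefix with mass >= p
+    keep = order[:cutoff]
+    kept = probs[keep] / probs[keep].sum()     # renormalize within the nucleus
+    return int(rng.choice(keep, p=kept))
+
+# Lower p keeps fewer tokens, which the study links to better human alignment.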
+
+
+
+
+ + ☆ Visual Riddles: a Commonsense and World Knowledge Challenge for Large + Vision and Language Models + + +
+ Imagine observing someone scratching their arm; to understand why, additional +context would be necessary. However, spotting a mosquito nearby would +immediately offer a likely explanation for the person's discomfort, thereby +alleviating the need for further information. This example illustrates how +subtle visual cues can challenge our cognitive skills and demonstrates the +complexity of interpreting visual scenarios. To study these skills, we present +Visual Riddles, a benchmark aimed to test vision and language models on visual +riddles requiring commonsense and world knowledge. The benchmark comprises 400 +visual riddles, each featuring a unique image created by a variety of +text-to-image models, question, ground-truth answer, textual hint, and +attribution. Human evaluation reveals that existing models lag significantly +behind human performance, which is at 82\% accuracy, with Gemini-Pro-1.5 +leading with 40\% accuracy. Our benchmark comes with automatic evaluation tasks +to make assessment scalable. These findings underscore the potential of Visual +Riddles as a valuable resource for enhancing vision and language models' +capabilities in interpreting complex visual scenarios. + +
+
+ comment: https://visual-riddles.github.io/ +
+
+
+
+
+ + ☆ ASI-Seg: Audio-Driven Surgical Instrument Segmentation with Surgeon + Intention Understanding IROS 2024 + + +
+ Surgical instrument segmentation is crucial in surgical scene understanding, thereby facilitating surgical safety. Existing algorithms directly detect all
+instruments of pre-defined categories in the input image, lacking the capability to segment specific instruments according to the surgeon's intention.
+During different stages of surgery, surgeons exhibit varying preferences and focus toward different surgical instruments. Therefore, an instrument
+segmentation algorithm that adheres to the surgeon's intention can minimize distractions from irrelevant instruments and assist surgeons to a great extent.
+The recent Segment Anything Model (SAM) reveals the capability to segment objects following prompts, but the manual annotations for prompts are impractical
+during surgery. To address these limitations in operating rooms, we propose an audio-driven surgical instrument segmentation framework, named ASI-Seg, to
+accurately segment the required surgical instruments by parsing the audio commands of surgeons. Specifically, we propose an intention-oriented multimodal
+fusion to interpret the segmentation intention from audio commands and retrieve relevant instrument details to facilitate segmentation. Moreover, to guide
+ASI-Seg to segment the required surgical instruments, we devise a contrastive learning prompt encoder to effectively distinguish the required instruments
+from the irrelevant ones. Therefore, our ASI-Seg promotes the workflow in operating rooms, thereby providing targeted support and reducing the cognitive
+load on surgeons. Extensive experiments are performed to validate the ASI-Seg framework, which reveals remarkable advantages over classical
+state-of-the-art and medical SAMs in both semantic segmentation and intention-oriented segmentation. The source code is available at
+https://github.com/Zonmgin-Zhang/ASI-Seg.
+
+ comment: This work is accepted by IROS 2024 (Oral) +
+
+
+
+
+ + ☆ LLAVADI: What Matters For Multimodal Large Language Models Distillation + + +
+ The recent surge in Multimodal Large Language Models (MLLMs) has showcased their remarkable potential for achieving generalized intelligence by integrating
+visual understanding into Large Language Models. Nevertheless, the sheer model size of MLLMs leads to substantial memory and computational demands that
+hinder their widespread deployment. In this work, we do not propose a new efficient model structure or train small-scale MLLMs from scratch. Instead, we
+focus on what matters for training small-scale MLLMs through knowledge distillation, which is the first step from the multimodal distillation perspective.
+Our extensive studies involve training strategies, model choices, and distillation algorithms in the knowledge distillation process. These results show
+that joint token and logit alignment plays a critical role in teacher-student frameworks. In addition, we draw a series of intriguing observations from
+this study. With proper strategies, evaluated across different benchmarks, even a 2.7B small-scale model can perform on par with larger models with 7B or
+13B parameters. Our code and models will be publicly available for further research.
+
+
+
+
+ + ☆ Word Segmentation for Asian Languages: Chinese, Korean, and Japanese + + +
+ We provide a detailed overview of various approaches to word segmentation of Asian languages, specifically Chinese, Korean, and Japanese. For each
+language, the approaches to word segmentation differ. We also include our analysis of the advantages and disadvantages of each method. In addition, there
+is room for future work in this field.
+
+
+
+
+ + ♻ ☆ A Language Agent for Autonomous Driving + + +
+ Human-level driving is an ultimate goal of autonomous driving. Conventional +approaches formulate autonomous driving as a perception-prediction-planning +framework, yet their systems do not capitalize on the inherent reasoning +ability and experiential knowledge of humans. In this paper, we propose a +fundamental paradigm shift from current pipelines, exploiting Large Language +Models (LLMs) as a cognitive agent to integrate human-like intelligence into +autonomous driving systems. Our approach, termed Agent-Driver, transforms the +traditional autonomous driving pipeline by introducing a versatile tool library +accessible via function calls, a cognitive memory of common sense and +experiential knowledge for decision-making, and a reasoning engine capable of +chain-of-thought reasoning, task planning, motion planning, and +self-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive +common sense and robust reasoning capabilities, thus enabling a more nuanced, +human-like approach to autonomous driving. We evaluate our approach on the +large-scale nuScenes benchmark, and extensive experiments substantiate that our +Agent-Driver significantly outperforms the state-of-the-art driving methods by +a large margin. Our approach also demonstrates superior interpretability and +few-shot learning ability to these methods. + +
+
+ comment: COLM 2024. Project Page: https://usc-gvl.github.io/Agent-Driver/ +
+
+
+
+
+ + ♻ ☆ LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large + Multimodal Models + + +
+ Visual instruction tuning has made considerable strides in enhancing the capabilities of Large Multimodal Models (LMMs). However, existing open LMMs
+largely focus on single-image tasks, while their applications to multi-image scenarios remain less explored. Additionally, prior LMM research separately
+tackles different scenarios, making it impossible to generalize across scenarios with new emerging capabilities. To this end, we introduce
+LLaVA-NeXT-Interleave, which simultaneously tackles Multi-image, Multi-frame (video), Multi-view (3D), and Multi-patch (single-image) scenarios in LMMs. To
+enable these capabilities, we regard the interleaved data format as a general template and compile the M4-Instruct dataset with 1,177.6k samples, spanning
+4 primary domains with 14 tasks and 41 datasets. We also curate the LLaVA-Interleave Bench to comprehensively evaluate the multi-image performance of LMMs.
+Through extensive experiments, LLaVA-NeXT-Interleave achieves leading results in multi-image, video, and 3D benchmarks, while maintaining the performance
+of single-image tasks. Besides, our model also exhibits several emerging capabilities, e.g., transferring tasks across different settings and modalities.
+Code is available at https://github.com/LLaVA-VL/LLaVA-NeXT
+
+ comment: Project Page: + https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/ +
+
+
+
+
+ + ♻ ☆ Towards Completeness-Oriented Tool Retrieval for Large Language Models CIKM 2024 + + +
+ Recently, integrating external tools with Large Language Models (LLMs) has gained significant attention as an effective strategy to mitigate the
+limitations inherent in their pre-training data. However, real-world systems often incorporate a wide array of tools, making it impractical to input all
+tools into LLMs due to length limitations and latency constraints. Therefore, to fully exploit the potential of tool-augmented LLMs, it is crucial to
+develop an effective tool retrieval system. Existing tool retrieval methods primarily focus on semantic matching between user queries and tool
+descriptions, frequently leading to the retrieval of redundant, similar tools. Consequently, these methods fail to provide a complete set of diverse tools
+necessary for addressing the multifaceted problems encountered by LLMs. In this paper, we propose a novel model-agnostic COllaborative Learning-based Tool
+Retrieval approach, COLT, which captures not only the semantic similarities between user queries and tool descriptions but also takes into account the
+collaborative information of tools. Specifically, we first fine-tune the PLM-based retrieval models to capture the semantic relationships between queries
+and tools in the semantic learning stage. Subsequently, we construct three bipartite graphs among queries, scenes, and tools and introduce a dual-view
+graph collaborative learning framework to capture the intricate collaborative relationships among tools during the collaborative learning stage. Extensive
+experiments on both the open benchmark and the newly introduced ToolLens dataset show that COLT achieves superior performance. Notably, the performance of
+BERT-mini (11M) with our proposed model framework outperforms BERT-large (340M), which has 30 times more parameters. Furthermore, we will release ToolLens
+publicly to facilitate future research on tool retrieval.
+
+ comment: Accepted by CIKM 2024; GitHub: https://github.com/quchangle1/COLT +
+
+
+
+
+ + ♻ ☆ Keep the Cost Down: A Review on Methods to Optimize LLM' s KV-Cache + Consumption + + +
+ Large Language Models (LLMs), epitomized by ChatGPT's release in late 2022, have revolutionized various industries with their advanced language
+comprehension. However, their efficiency is challenged by the Transformer architecture's struggle with handling long texts. KV-Cache has emerged as a
+pivotal solution to this issue, converting the time complexity of token generation from quadratic to linear, albeit with increased GPU memory overhead
+proportional to conversation length. With the development of the LLM community and academia, various KV-Cache compression methods have been proposed. In
+this review, we dissect the various properties of KV-Cache and elaborate on various methods currently used to optimize the KV-Cache space usage of LLMs.
+These methods span the pre-training phase, deployment phase, and inference phase, and we summarize the commonalities and differences among these methods.
+Additionally, we list some metrics for evaluating the long-text capabilities of large language models, from both efficiency and capability perspectives.
+Our review thus sheds light on the evolving landscape of LLM optimization, offering insights into future advancements in this dynamic field.
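+ For context, the sketch below shows only the mechanism being optimized, not any particular compression method from the review: a per-layer cache stores the
+keys and values of all previous tokens so each new token attends to them without recomputation, which is why memory grows linearly with conversation length.
+
+# Sketch of a per-layer KV cache: store past keys/values and let each new token
+# attend to the cached tensors instead of recomputing them.
+import torch
+
+class KVCache:
+    def __init__(self):
+        self.k = None  # (batch, heads, seq, head_dim)
+        self.v = None
+
+    def append(self, k_new, v_new):
+        self.k = k_new if self.k is None else torch.cat([self.k, k_new], dim=2)
+        self.v = v_new if self.v is None else torch.cat([self.v, v_new], dim=2)
+        return self.k, self.v
+
+def attend_one_step(q_new, cache, k_new, v_new):
+    k, v = cache.append(k_new, v_new)
+    scores = q_new @ k.transpose(-2, -1) / (q_new.shape[-1] ** 0.5)
+    return torch.softmax(scores, dim=-1) @ v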
+
+ comment: to be published in CoLM 2024 +
+
+
+
+
+ + ♻ ☆ BERT-Enhanced Retrieval Tool for Homework Plagiarism Detection System + + +
+ Text plagiarism detection is a common natural language processing task that aims to detect whether a given text contains plagiarism or copying from other
+texts. In existing research, detection of high-level plagiarism is still a challenge due to the lack of high-quality datasets. In this paper, we propose a
+plagiarized text data generation method based on GPT-3.5, which produces 32,927 pairs of text plagiarism detection datasets covering a wide range of
+plagiarism methods, bridging the gap in this part of research. Meanwhile, we propose a plagiarism identification method based on Faiss and BERT that offers
+high efficiency and high accuracy. Our experiments show that this model outperforms other models on several metrics, achieving 98.86%, 98.90%, 98.86%, and
+0.9888 for Accuracy, Precision, Recall, and F1 Score, respectively. At the end, we also provide a user-friendly demo platform that allows users to upload a
+text library and intuitively participate in the plagiarism analysis.
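+ A minimal sketch of the retrieval step described above, with the encoder name, index type, and similarity threshold as illustrative assumptions rather than
+the authors' exact configuration:
+
+# Sketch of the retrieval step: embed candidate and reference texts with a
+# BERT-style sentence encoder and use a Faiss index for fast similarity search.
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+encoder = SentenceTransformer("all-MiniLM-L6-v2")   # assumed encoder choice
+
+reference_texts = ["..."]                            # the text library to search against
+ref_emb = encoder.encode(reference_texts, normalize_embeddings=True)
+index = faiss.IndexFlatIP(ref_emb.shape[1])          # inner product == cosine on normalized vectors
+index.add(np.asarray(ref_emb, dtype="float32"))
+
+def most_similar(query: str, threshold: float = 0.85):
+    q = encoder.encode([query], normalize_embeddings=True).astype("float32")
+    scores, ids = index.search(q, 5)
+    return [(reference_texts[i], float(s)) for i, s in zip(ids[0], scores[0]) if s >= threshold]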
+
+ comment: arXiv admin note: text overlap with arXiv:1604.06573 by other authors +
+
+
+
+
+ + ♻ ☆ Evolving Diverse Red-team Language Models in Multi-round Multi-agent + Games + + +
+ The primary challenge in deploying Large Language Models (LLMs) is ensuring their harmlessness. A red team can identify vulnerabilities by attacking an
+LLM, helping to attain safety. However, current efforts heavily rely on single-round prompt designs and unilateral red team optimizations against fixed
+blue teams. These static approaches lead to significant reductions in generation diversity, known as mode collapse, which makes it difficult to discover
+the potential risks in the increasingly complex human-LLM interactions. Here we introduce the dynamic Red Team Game (RTG) to comprehensively analyze the
+multi-round offensive and defensive interactions between the red team and blue team. Furthermore, we develop a Gamified Red Team Solver (GRTS) with
+diversity measures to mitigate mode collapse and theoretically guarantee the convergence of an approximate Nash equilibrium, which results in better
+strategies for both teams. Empirical results demonstrate that GRTS explores diverse and implicit attacks to adaptively exploit various LLMs, surpassing the
+constraints of specific modes. Insightfully, the geometric structure we unveil for the red-team task aligns with the spinning-top hypothesis, confirming
+the necessity of constructing a diverse LLM population as a promising proxy for heterogeneous human expert red-teamers. This paves the way for scalable
+toxicity detection and safe alignment for LLMs.
+
+
+
+
+ + ♻ ☆ Investigating and Mitigating the Multimodal Hallucination Snowballing in + Large Vision-Language Models ACL 2024 + + +
+ Though advanced in understanding visual information with human languages, +Large Vision-Language Models (LVLMs) still suffer from multimodal +hallucinations. A natural concern is that during multimodal interaction, the +generated hallucinations could influence the LVLMs' subsequent generation. +Thus, we raise a question: When presented with a query relevant to the +previously generated hallucination, will LVLMs be misled and respond +incorrectly, even though the ground visual information exists? To answer this, +we propose a framework called MMHalSnowball to evaluate LVLMs' behaviors when +encountering generated hallucinations, where LVLMs are required to answer +specific visual questions within a curated hallucinatory conversation. +Crucially, our experiment shows that the performance of open-source LVLMs drops +by at least $31\%$, indicating that LVLMs are prone to accept the generated +hallucinations and make false claims that they would not have supported without +distractions. We term this phenomenon Multimodal Hallucination Snowballing. To +mitigate this, we further propose a training-free method called Residual Visual +Decoding, where we revise the output distribution of LVLMs with the one derived +from the residual visual input, providing models with direct access to the +visual information. Experiments show that our method can mitigate more than +$24\%$ of the snowballed multimodal hallucination while maintaining +capabilities. + +
+
+ comment: Accepted to ACL 2024 Main Conference. 21 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Domain-Specific Pretraining of Language Models: A Comparative Study in + the Medical Field + + +
+ There are many cases where LLMs are used for specific tasks in a single +domain. These usually require less general, but more domain-specific knowledge. +Highly capable, general-purpose state-of-the-art language models like GPT-4 or +Claude-3-opus can often be used for such tasks, but they are very large and +cannot be run locally, even if they were not proprietary. This can be a problem +when working with sensitive data. This paper focuses on domain-specific and +mixed-domain pretraining as potentially more efficient methods than general +pretraining for specialized language models. We will take a look at work +related to domain-specific pretraining, specifically in the medical area, and +compare benchmark results of specialized language models to general-purpose +language models. + +
+
+
+
+
+ + ♻ ☆ Personality testing of Large Language Models: Limited temporal + stability, but highlighted prosociality + + +
+ As Large Language Models (LLMs) continue to gain popularity due to their human-like traits and the intimacy they offer to users, their societal impact
+inevitably expands. This leads to the rising necessity for comprehensive studies to fully understand LLMs and reveal their potential opportunities,
+drawbacks, and overall societal impact. With that in mind, this research conducted an extensive investigation into seven LLMs, aiming to assess the
+temporal stability and inter-rater agreement of their responses on personality instruments at two time points. In addition, the LLMs' personality profiles
+were analyzed and compared to human normative data. The findings revealed varying levels of inter-rater agreement in the LLMs' responses over a short time,
+with some LLMs showing higher agreement (e.g., Llama3 and GPT-4o) compared to others (e.g., GPT-4 and Gemini). Furthermore, agreement depended on the
+instruments used as well as on the domain or trait. This implies variable robustness in LLMs' ability to reliably simulate stable personality
+characteristics. For scales which showed at least fair agreement, LLMs displayed a mostly socially desirable profile in both agentic and communal domains,
+as well as a prosocial personality profile reflected in higher agreeableness and conscientiousness and lower Machiavellianism. Exhibiting temporal
+stability and coherent responses on personality traits is crucial for AI systems due to their societal impact and AI safety concerns.
+
+ comment: 21 pages, 1 table +
+
+
+
+
+ + ♻ ☆ Style Transfer with Multi-iteration Preference Optimization + + +
+ Numerous recent techniques for text style transfer characterize their +approaches as variants of reinforcement learning and preference optimization. +In this work, we consider the relationship between these approaches and a class +of optimization approaches developed primarily for (non-neural) statistical +machine translation, formerly known as `tuning'. Inspired by these techniques +from the past, we improve upon established preference optimization approaches, +incorporating multiple iterations of exploration and optimization, and choosing +contrastive examples by following a `hope' vs `fear' sampling strategy. +Cognizant of the difference between machine translation and style transfer, +however, we further tailor our framework with a new pseudo-parallel generation +method and a dynamic weighted reward aggregation method to tackle the lack of +parallel data and the need for a multi-objective reward. We evaluate our model +on two commonly used text style transfer datasets. Through automatic and human +evaluation results we show the effectiveness and the superiority of our model +compared to state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Authorship Style Transfer with Policy Optimization + + +
+ Authorship style transfer aims to rewrite a given text into a specified +target while preserving the original meaning in the source. Existing approaches +rely on the availability of a large number of target style exemplars for model +training. However, these overlook cases where a limited number of target style +examples are available. The development of parameter-efficient transfer +learning techniques and policy optimization (PO) approaches suggest lightweight +PO is a feasible approach to low-resource style transfer. In this work, we +propose a simple two-stage tune-and-optimize technique for low-resource textual +style transfer. We apply our technique to authorship transfer as well as a +larger-data native language style task and in both cases find it outperforms +state-of-the-art baseline models. + +
+
+
+
+
+ + ♻ ☆ MaskMoE: Boosting Token-Level Learning via Routing Mask in + Mixture-of-Experts + + +
+ Scaling the size of a model enhances its capabilities but significantly +increases computation complexity. Mixture-of-Experts models (MoE) address the +issue by allowing model size to scale up without substantially increasing +training or inference costs. Despite their promising results, MoE models +encounter several challenges. Primarily, for dynamic routing methods, the +dispersion of training tokens across multiple experts can lead to underfitting, +particularly for infrequent tokens. Additionally, while fixed routing methods +can mitigate that issue, they compromise on the diversity of representations. +In this paper, we propose \textbf{MaskMoE}, a method designed to enhance +token-level learning by employing a routing \textbf{mask}ing technique within +the \textbf{M}ixture-\textbf{o}f-\textbf{E}xperts model. MaskMoE is capable of +maintaining representation diversity while achieving more comprehensive +training. Experimental results demonstrate that our method outperforms previous +dominant Mixture-of-Experts models in terms of both perplexity (PPL) and +downstream task performance. + +
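+ One possible reading of a routing mask (an illustrative sketch under our own assumptions, not the authors' implementation): rare tokens are pinned to a
+fixed expert so their training signal is concentrated, while frequent tokens keep dynamic routing.
+
+# Illustrative routing mask: rare tokens are restricted to a deterministic
+# expert derived from their token id, while frequent tokens keep full
+# dynamic top-1 routing. This is a sketch of the general idea only.
+import torch
+
+def masked_top1_routing(router_logits, token_ids, rare_token_mask, n_experts):
+    """router_logits: (tokens, n_experts); rare_token_mask: (vocab,) bool."""
+    pinned_expert = token_ids % n_experts            # fixed expert per rare token
+    allowed = torch.ones_like(router_logits, dtype=torch.bool)
+    rare = rare_token_mask[token_ids]
+    allowed[rare] = False
+    allowed[rare, pinned_expert[rare]] = True
+    masked_logits = router_logits.masked_fill(~allowed, float("-inf"))
+    return masked_logits.argmax(dim=-1)              # chosen expert per token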
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Large Language Models in Biomedical and Health Informatics: A Review + with Bibliometric Analysis + + +
+ Large Language Models (LLMs) have rapidly become important tools in +Biomedical and Health Informatics (BHI), enabling new ways to analyze data, +treat patients, and conduct research. This study aims to provide a +comprehensive overview of LLM applications in BHI, highlighting their +transformative potential and addressing the associated ethical and practical +challenges. We reviewed 1,698 research articles from January 2022 to December +2023, categorizing them by research themes and diagnostic categories. +Additionally, we conducted network analysis to map scholarly collaborations and +research dynamics. Our findings reveal a substantial increase in the potential +applications of LLMs to a variety of BHI tasks, including clinical decision +support, patient interaction, and medical document analysis. Notably, LLMs are +expected to be instrumental in enhancing the accuracy of diagnostic tools and +patient care protocols. The network analysis highlights dense and dynamically +evolving collaborations across institutions, underscoring the interdisciplinary +nature of LLM research in BHI. A significant trend was the application of LLMs +in managing specific disease categories such as mental health and neurological +disorders, demonstrating their potential to influence personalized medicine and +public health strategies. LLMs hold promising potential to further transform +biomedical research and healthcare delivery. While promising, the ethical +implications and challenges of model validation call for rigorous scrutiny to +optimize their benefits in clinical settings. This survey serves as a resource +for stakeholders in healthcare, including researchers, clinicians, and +policymakers, to understand the current state and future potential of LLMs in +BHI. + +
+
+ comment: 62 pages, 9 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Enhancing Content-based Recommendation via Large Language Model CIKM 2024 + + +
+ In real-world applications, users express different behaviors when they +interact with different items, including implicit click/like interactions and +explicit comment/review interactions. Nevertheless, almost all recommender +works focus on how to describe user preferences via the implicit +click/like interactions, to find the synergy among people. For the content-based +explicit comment/review interactions, some works attempt to utilize them to +mine semantic knowledge to enhance recommender models. However, they still +neglect the following two points: (1) Content semantics are a form of universal world +knowledge; how do we extract the multi-aspect semantic information to empower +different domains? (2) The user/item ID feature is a fundamental element for +recommender models; how do we align the ID and content semantic feature spaces? +In this paper, we propose a `plugin' semantic knowledge transferring method, +\textbf{LoID}, which includes two major components: (1) LoRA-based large +language model pretraining to extract multi-aspect semantic information; (2) an +ID-based contrastive objective to align their feature spaces. We conduct +extensive experiments with SOTA baselines on real-world datasets, with the detailed +results demonstrating significant improvements from our method LoID. + +
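As a rough illustration of the second component, an ID-based contrastive objective can be written as a standard InfoNCE loss that pulls each item's ID embedding toward the content embedding produced by the LoRA-tuned language model. This is a generic sketch under assumed names and temperature, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def id_content_contrastive_loss(id_emb, content_emb, temperature=0.07):
    """InfoNCE-style alignment between ID embeddings and content embeddings.

    id_emb, content_emb: (batch, dim); row i of both tensors refers to the same
    item, so matching rows are positives and all other rows are negatives.
    """
    id_emb = F.normalize(id_emb, dim=-1)
    content_emb = F.normalize(content_emb, dim=-1)
    logits = id_emb @ content_emb.t() / temperature      # (batch, batch)
    labels = torch.arange(id_emb.size(0), device=id_emb.device)
    # symmetric loss: ID -> content and content -> ID
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
```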
+
+ comment: Accepted at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Ink and Individuality: Crafting a Personalised Narrative in the Age of + LLMs + + +
+ Individuality and personalization comprise the distinctive characteristics +that make each writer unique and influence their words in order to effectively +engage readers while conveying authenticity. However, our growing reliance on +LLM-based writing assistants risks compromising our creativity and +individuality over time. We often overlook the negative impacts of this trend +on our creativity and uniqueness, despite the possible consequences. This study +investigates these concerns by performing a brief survey to explore different +perspectives and concepts, as well as trying to understand people's viewpoints, +in conjunction with past studies in the area. Addressing these issues is +essential for improving human-computer interaction systems and enhancing +writing assistants for personalization and individuality. + +
+
+ comment: 8 Pages, 4 Figures. Accepted in The Third Workshop on Intelligent and + Interactive Writing Assistants at CHI 2024 +
+
+
+
+
+ + ♻ ☆ LLMs as Writing Assistants: Exploring Perspectives on Sense of Ownership + and Reasoning + + +
+ Sense of ownership in writing confines our investment of thoughts, time, and +contribution, leading to attachment to the output. However, using writing +assistants introduces a mental dilemma, as some content isn't directly our +creation. For instance, we tend to credit Large Language Models (LLMs) more in +creative tasks, even though all tasks are equal for them. Additionally, while +we may not claim complete ownership of LLM-generated content, we freely claim +authorship. We conduct a short survey to examine these issues and understand +underlying cognitive processes in order to gain a better knowledge of +human-computer interaction in writing and improve writing aid systems. + +
+
+ comment: 8 Pages, 3 Figures. Accepted in The Third Workshop on Intelligent and + Interactive Writing Assistants at CHI 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 28 + +
+
+
+ + ☆ Look Hear: Gaze Prediction for Speech-directed Human Attention ECCV 2024 + + +
+ For computer systems to effectively interact with humans using spoken +language, they need to understand how the words being generated affect the +users' moment-by-moment attention. Our study focuses on the incremental +prediction of attention as a person is seeing an image and hearing a referring +expression defining the object in the scene that should be fixated by gaze. To +predict the gaze scanpaths in this incremental object referral task, we +developed the Attention in Referral Transformer model or ART, which predicts +the human fixations spurred by each word in a referring expression. ART uses a +multimodal transformer encoder to jointly learn gaze behavior and its +underlying grounding tasks, and an autoregressive transformer decoder to +predict, for each word, a variable number of fixations based on fixation +history. To train ART, we created RefCOCO-Gaze, a large-scale dataset of 19,738 +human gaze scanpaths, corresponding to 2,094 unique image-expression pairs, +from 220 participants performing our referral task. In our quantitative and +qualitative analyses, ART not only outperforms existing methods in scanpath +prediction, but also appears to capture several human attention patterns, such +as waiting, scanning, and verification. + +
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ☆ Bridging the Gap: Studio-like Avatar Creation from a Monocular Phone + Capture ECCV 2024 + + +
+ Creating photorealistic avatars for individuals traditionally involves +extensive capture sessions with complex and expensive studio devices like the +LightStage system. While recent strides in neural representations have enabled +the generation of photorealistic and animatable 3D avatars from quick phone +scans, they have the capture-time lighting baked-in, lack facial details and +have missing regions in areas such as the back of the ears. Thus, they lag in +quality compared to studio-captured avatars. In this paper, we propose a method +that bridges this gap by generating studio-like illuminated texture maps from +short, monocular phone captures. We do this by parameterizing the phone texture +maps using the $W^+$ space of a StyleGAN2, enabling near-perfect +reconstruction. Then, we finetune a StyleGAN2 by sampling in the $W^+$ +parameterized space using a very small set of studio-captured textures as an +adversarial training signal. To further enhance the realism and accuracy of +facial details, we super-resolve the output of the StyleGAN2 using carefully +designed diffusion model that is guided by image gradients of the +phone-captured texture map. Once trained, our method excels at producing +studio-like facial texture maps from casual monocular smartphone videos. +Demonstrating its capabilities, we showcase the generation of photorealistic, +uniformly lit, complete avatars from monocular phone captures. +\href{http://shahrukhathar.github.io/2024/07/22/Bridging.html}{The project page +can be found here.} + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Forecast-PEFT: Parameter-Efficient Fine-Tuning for Pre-trained Motion + Forecasting Models + + +
+ Recent progress in motion forecasting has been substantially driven by +self-supervised pre-training. However, adapting pre-trained models for specific +downstream tasks, especially motion prediction, through extensive fine-tuning +is often inefficient. This inefficiency arises because motion prediction +closely aligns with the masked pre-training tasks, and traditional full +fine-tuning methods fail to fully leverage this alignment. To address this, we +introduce Forecast-PEFT, a fine-tuning strategy that freezes the majority of +the model's parameters, focusing adjustments on newly introduced prompts and +adapters. This approach not only preserves the pre-learned representations but +also significantly reduces the number of parameters that need retraining, +thereby enhancing efficiency. This tailored strategy, supplemented by our +method's capability to efficiently adapt to different datasets, enhances model +efficiency and ensures robust performance across datasets without the need for +extensive retraining. Our experiments show that Forecast-PEFT outperforms +traditional full fine-tuning methods in motion prediction tasks, achieving +higher accuracy with only 17% of the trainable parameters typically required. +Moreover, our comprehensive adaptation, Forecast-FT, further improves +prediction performance, evidencing up to a 9.6% enhancement over conventional +baseline methods. Code will be available at +https://github.com/csjfwang/Forecast-PEFT. + +
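In practice, this style of parameter-efficient fine-tuning amounts to freezing the pre-trained backbone and leaving only the newly introduced prompt and adapter parameters trainable. A minimal PyTorch-style sketch, assuming parameters can be identified by name (the keyword matching is an assumption, not the authors' code):

```python
import torch.nn as nn

def apply_peft_freezing(model: nn.Module, trainable_keywords=("prompt", "adapter")):
    """Freeze the pre-trained backbone; keep only prompt/adapter params trainable."""
    for name, param in model.named_parameters():
        param.requires_grad = any(key in name for key in trainable_keywords)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable} / {total} ({trainable / total:.1%})")
    return model
```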
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Exploring the Adversarial Robustness of CLIP for AI-generated Image + Detection + + +
+ In recent years, many forensic detectors have been proposed to detect +AI-generated images and prevent their use for malicious purposes. Convolutional +neural networks (CNNs) have long been the dominant architecture in this field +and have been the subject of intense study. However, recently proposed +Transformer-based detectors have been shown to match or even outperform +CNN-based detectors, especially in terms of generalization. In this paper, we +study the adversarial robustness of AI-generated image detectors, focusing on +Contrastive Language-Image Pretraining (CLIP)-based methods that rely on Visual +Transformer backbones and comparing their performance with CNN-based methods. +We study the robustness to different adversarial attacks under a variety of +conditions and analyze both numerical results and frequency-domain patterns. +CLIP-based detectors are found to be vulnerable to white-box attacks just like +CNN-based detectors. However, attacks do not easily transfer between CNN-based +and CLIP-based methods. This is also confirmed by the different distribution of +the adversarial noise patterns in the frequency domain. Overall, this analysis +provides new insights into the properties of forensic detectors that can help +to develop more effective strategies. + +
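For readers unfamiliar with the white-box setting studied here, the simplest member of this attack family is the fast gradient sign method (FGSM), which perturbs the input in the direction that increases the detector's loss. The sketch below shows FGSM only as an illustration; the paper evaluates a broader set of attacks and conditions.

```python
import torch
import torch.nn.functional as F

def fgsm_attack(detector, images, labels, eps=4 / 255):
    """Single-step white-box FGSM attack on a real/fake image detector.

    detector: model returning class logits; images: tensor with values in [0, 1].
    """
    images = images.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(detector(images), labels)
    loss.backward()
    adv_images = images + eps * images.grad.sign()       # step that increases the loss
    return adv_images.clamp(0.0, 1.0).detach()
```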
+
+
+
+
+ + ☆ Improving Domain Adaptation Through Class Aware Frequency Transformation + + +
+ In this work, we explore the use of frequency transformation for +reducing the domain shift between the source and target domains (e.g., synthetic +images and real images, respectively) towards solving the Domain Adaptation task. +Most Unsupervised Domain Adaptation (UDA) algorithms focus on reducing +the global domain shift between labelled source and unlabelled target domains +by matching the marginal distributions under a small domain gap assumption. UDA +performance degrades in cases where the domain gap between the source and +target distributions is large. In order to bring the source and the target +domains closer, we propose a novel approach based on the traditional image +processing technique of Class Aware Frequency Transformation (CAFT), which utilizes +pseudo-label-based class-consistent low-frequency swapping to improve the +overall performance of existing UDA algorithms. The proposed approach, when +compared with state-of-the-art deep learning based methods, is +computationally more efficient and can easily be plugged into any existing UDA +algorithm to improve its performance. Additionally, we introduce a novel +approach based on the absolute difference of the top-2 class prediction probabilities +(ADT2P) for filtering target pseudo labels into clean and noisy sets. Samples +with clean pseudo labels can be used to improve the performance of unsupervised +learning algorithms. We name the overall framework CAFT++. We evaluate it +on top of different UDA algorithms across many public domain +adaptation datasets. Our extensive experiments indicate that CAFT++ is able to +achieve significant performance gains across all the popular benchmarks. + +
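The low-frequency swapping at the heart of CAFT can be illustrated as a Fourier-domain amplitude swap: the low-frequency amplitudes of a source image are replaced with those of a (pseudo-label class-consistent) target image while the source phases are kept. The NumPy sketch below is a generic version of this operation; the band-width hyperparameter `beta` and its default value are assumptions.

```python
import numpy as np

def low_freq_amplitude_swap(src_img, tgt_img, beta=0.05):
    """Swap the low-frequency amplitude of src_img with that of tgt_img while
    keeping src_img's phase. src_img, tgt_img: float arrays of shape (H, W, C)."""
    src_fft = np.fft.fft2(src_img, axes=(0, 1))
    tgt_fft = np.fft.fft2(tgt_img, axes=(0, 1))
    src_amp, src_pha = np.abs(src_fft), np.angle(src_fft)
    tgt_amp = np.abs(tgt_fft)

    # shift so that low frequencies sit at the centre of the spectrum
    src_amp = np.fft.fftshift(src_amp, axes=(0, 1))
    tgt_amp = np.fft.fftshift(tgt_amp, axes=(0, 1))
    h, w = src_img.shape[:2]
    bh, bw = max(1, int(h * beta)), max(1, int(w * beta))
    ch, cw = h // 2, w // 2
    src_amp[ch - bh:ch + bh, cw - bw:cw + bw] = tgt_amp[ch - bh:ch + bh, cw - bw:cw + bw]
    src_amp = np.fft.ifftshift(src_amp, axes=(0, 1))

    mixed = src_amp * np.exp(1j * src_pha)
    return np.real(np.fft.ifft2(mixed, axes=(0, 1)))
```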
+
+ comment: Accepted at the International Journal of Computer Vision +
+
+
+
+
+ + ☆ Cycle3D: High-quality and Consistent Image-to-3D Generation via + Generation-Reconstruction Cycle + + +
+ Recent 3D large reconstruction models typically employ a two-stage process, +first generating multi-view images with a multi-view diffusion model, and +then utilizing a feed-forward model to reconstruct the images into 3D content. However, +multi-view diffusion models often produce low-quality and inconsistent images, +adversely affecting the quality of the final 3D reconstruction. To address this +issue, we propose a unified 3D generation framework called Cycle3D, which +cyclically utilizes a 2D diffusion-based generation module and a feed-forward +3D reconstruction module during the multi-step diffusion process. Concretely, +the 2D diffusion model is applied to generate high-quality textures, and the +reconstruction model guarantees multi-view consistency. Moreover, the 2D diffusion +model can further control the generated content and inject reference-view +information for unseen views, thereby enhancing the diversity and texture +consistency of 3D generation during the denoising process. Extensive +experiments demonstrate the superior ability of our method to create 3D content +with high quality and consistency compared with state-of-the-art baselines. + +
+
+ comment: Project page: https://pku-yuangroup.github.io/Cycle3D/ +
+
+
+
+
+ + ☆ Temporal Feature Matters: A Framework for Diffusion Model Quantization + + +
+ Diffusion models, widely used for image generation, face significant +challenges related to their broad applicability due to prolonged inference +times and high memory demands. Efficient Post-Training Quantization (PTQ) is +crucial to address these issues in traditional models. Unlike those models, +diffusion models critically rely on the time-step $t$ for effective multi-round +denoising. Typically, $t$ from the finite set $\{1, \ldots, T\}$ is encoded +into a hypersensitive temporal feature by several modules, entirely independent +of the sampling data. However, existing PTQ methods do not optimize these +modules individually. Instead, they employ unsuitable reconstruction objectives +and complex calibration methods, leading to significant disturbances in the +temporal feature and denoising trajectory. To address these challenges, we +introduce a novel quantization framework: 1)~TIB-based Maintenance: Based on +our innovative Temporal Information Block~(TIB) definition, Temporal +Information-aware Reconstruction~(TIAR) and Finite Set Calibration~(FSC) are +developed to efficiently align full precision temporal features. 2)~Cache-based +Maintenance: Instead of indirect and complex optimization of the related +modules, pre-computing and caching quantized counterparts of the temporal features +is used to minimize errors. 3)~Disturbance-aware Selection: Temporal feature +errors are employed to guide a fine-grained selection for superior +maintenance. This framework preserves most of the temporal information and +ensures high-quality end-to-end generation. Extensive testing on various +datasets and diffusion models confirms our superior results. Notably, our +approach closely matches the performance of the full-precision model under +4-bit quantization. Furthermore, the quantized SD-XL model achieves hardware +acceleration of 2.20$\times$ on CPU and 5.76$\times$ on GPU, demonstrating its +efficiency. + +
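The cache-based maintenance idea exploits the fact that the temporal feature depends only on the discrete time step, not on the sampled data, so its quantized counterpart can be pre-computed once per step and looked up at inference time. A simplified sketch, with the callables `time_embedding` and `quantize` standing in for whatever embedding modules and quantizer a given pipeline uses (names are assumptions):

```python
import torch

@torch.no_grad()
def build_temporal_feature_cache(time_embedding, quantize, num_steps):
    """Pre-compute quantized temporal features for every discrete time step.

    time_embedding: callable mapping a (1,) tensor holding t to a feature tensor.
    quantize:       callable applying the chosen quantizer to a tensor.
    """
    cache = {}
    for t in range(1, num_steps + 1):
        feature = time_embedding(torch.tensor([t], dtype=torch.float32))
        cache[t] = quantize(feature)   # reused at inference instead of recomputing
    return cache
```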
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2311.16503 +
+
+
+
+
+ + ☆ XLIP: Cross-modal Attention Masked Modelling for Medical Language-Image + Pre-Training + + +
+ Vision-and-language pretraining (VLP) in the medical field utilizes +contrastive learning on image-text pairs to achieve effective transfer across +tasks. Yet, current VLP approaches with the masked modelling strategy face two +challenges when applied to the medical domain. First, current models struggle +to accurately reconstruct key pathological features due to the scarcity of +medical data. Second, most methods only adopt either paired image-text or +image-only data, failing to exploit the combination of both paired and unpaired +data. To this end, this paper proposes an XLIP (Masked modelling for medical +Language-Image Pre-training) framework to enhance pathological learning and +feature learning via unpaired data. First, we introduce the attention-masked +image modelling (AttMIM) and entity-driven masked language modelling (EntMLM) +modules, which learn to reconstruct pathological visual and textual tokens +via multi-modal feature interaction, thus improving medical-enhanced features. +The AttMIM module masks a portion of the image features that are highly +responsive to textual features. This allows XLIP to efficiently improve the +reconstruction of highly similar image data in medicine. Second, our XLIP +capitalizes on unpaired data to enhance multimodal learning by introducing +disease-kind prompts. The experimental results show that XLIP achieves SOTA +zero-shot and fine-tuning classification performance on five datasets. Our code +will be available at https://github.com/White65534/XLIP + +
+
+
+
+
+ + ☆ UniVoxel: Fast Inverse Rendering by Unified Voxelization of Scene + Representation ECCV2024 + + +
+ Typical inverse rendering methods focus on learning implicit neural scene +representations by modeling the geometry, materials and illumination +separately, which entails significant computations for optimization. In this +work we design a Unified Voxelization framework for explicit learning of scene +representations, dubbed UniVoxel, which allows for efficient modeling of the +geometry, materials and illumination jointly, thereby accelerating the inverse +rendering significantly. To be specific, we propose to encode a scene into a +latent volumetric representation, based on which the geometry, materials and +illumination can be readily learned via lightweight neural networks in a +unified manner. Particularly, an essential design of UniVoxel is that we +leverage local Spherical Gaussians to represent the incident light radiance, +which enables the seamless integration of modeling illumination into the +unified voxelization framework. Such novel design enables our UniVoxel to model +the joint effects of direct lighting, indirect lighting and light visibility +efficiently without expensive multi-bounce ray tracing. Extensive experiments +on multiple benchmarks covering diverse scenes demonstrate that UniVoxel boosts +the optimization efficiency significantly compared to other methods, reducing +the per-scene training time from hours to 18 minutes, while achieving favorable +reconstruction quality. Code is available at +https://github.com/freemantom/UniVoxel. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ VersusDebias: Universal Zero-Shot Debiasing for Text-to-Image Models via + SLM-Based Prompt Engineering and Generative Adversary + + +
+ With the rapid development of Text-to-Image (T2I) models, biases against +social demographic groups in human image generation attract more and more +concern. Existing methods are designed for certain models with fixed prompts, +and are unable to accommodate the trend of rapidly updated T2I +models and variable prompts in practical scenes. Additionally, they fail to +consider the possibility of hallucinations, leading to deviations between +expected and actual results. To address this issue, we introduce VersusDebias, +a novel and universal debiasing framework for biases in T2I models, consisting +of a generative adversarial mechanism (GAM) and a debiasing generation +mechanism using a small language model (SLM). The self-adaptive GAM generates +specialized attribute arrays for each prompt to diminish the influence of +hallucinations from T2I models. The SLM uses prompt engineering to generate +debiased prompts for the T2I model, providing zero-shot debiasing ability and +custom optimization for different models. Extensive experiments demonstrate +VersusDebias's capability to rectify biases in arbitrary models across multiple +protected attributes simultaneously, including gender, race, and age. +Furthermore, VersusDebias outperforms existing methods in both zero-shot and +few-shot settings, illustrating its extraordinary utility. Our work is openly +accessible to the research community to ensure reproducibility. + +
+
+
+
+
+ + ☆ Ego-VPA: Egocentric Video Understanding with Parameter-efficient + Adaptation + + +
+ Video understanding typically requires fine-tuning the large backbone when +adapting to new domains. In this paper, we leverage the egocentric video +foundation models (Ego-VFMs) based on video-language pre-training and propose a +parameter-efficient adaptation for egocentric video tasks, namely Ego-VPA. It +employs a local sparse approximation for each video frame/text feature using +the basis prompts, and the selected basis prompts are used to synthesize +video/text prompts. Since the basis prompts are shared across frames and +modalities, it models context fusion and cross-modal transfer in an efficient +fashion. Experiments show that Ego-VPA excels in lightweight adaptation (with +only 0.84% learnable parameters), largely improving over baselines and reaching +the performance of full fine-tuning. + +
+
+
+
+
+ + ☆ Solving Short-Term Relocalization Problems In Monocular Keyframe Visual + SLAM Using Spatial And Semantic Data + + +
+ In Monocular Keyframe Visual Simultaneous Localization and Mapping (MKVSLAM) +frameworks, when incremental position tracking fails, global pose has to be +recovered in a short-time window, also known as short-term relocalization. This +capability is crucial for mobile robots to have reliable navigation, build +accurate maps, and have precise behaviors around human collaborators. This +paper focuses on the development of robust short-term relocalization +capabilities for mobile robots using a monocular camera system. A novel +multimodal keyframe descriptor is introduced, that contains semantic +information of objects detected in the environment and the spatial information +of the camera. Using this descriptor, a new Keyframe-based Place Recognition +(KPR) method is proposed that is formulated as a multi-stage keyframe filtering +algorithm, leading to a new relocalization pipeline for MKVSLAM systems. The +proposed approach is evaluated over several indoor GPS denied datasets and +demonstrates accurate pose recovery, in comparison to a bag-of-words approach. + +
+
+ comment: 8 pages, Keywords: VSLAM, Localization, Semantics. Presented in 2024 + IEEE/ASME International Conference on Advanced Intelligent Mechatronics (AIM) +
+
+
+
+
+ + ☆ Detached and Interactive Multimodal Learning ACM MM 24 + + +
+ Recently, Multimodal Learning (MML) has gained significant interest as it +compensates for single-modality limitations through comprehensive complementary +information within multimodal data. However, traditional MML methods generally +use the joint learning framework with a uniform learning objective that can +lead to the modality competition issue, where feedback predominantly comes from +certain modalities, limiting the full potential of others. In response to this +challenge, this paper introduces DI-MML, a novel detached MML framework +designed to learn complementary information across modalities under the premise +of avoiding modality competition. Specifically, DI-MML addresses competition by +separately training each modality encoder with isolated learning objectives. It +further encourages cross-modal interaction via a shared classifier that defines +a common feature space and employing a dimension-decoupled unidirectional +contrastive (DUC) loss to facilitate modality-level knowledge transfer. +Additionally, to account for varying reliability in sample pairs, we devise a +certainty-aware logit weighting strategy to effectively leverage complementary +information at the instance level during inference. Extensive experiments +conducted on audio-visual, flow-image, and front-rear view datasets show the +superior performance of our proposed method. The code is released at +https://github.com/fanyunfeng-bit/DI-MML. + +
+
+ comment: Accepted by ACM MM 24 +
+
+
+
+
+ + ☆ Large-scale cervical precancerous screening via AI-assisted cytology + whole slide image analysis + + +
+ Cervical cancer continues to be the leading gynecological malignancy, posing +a persistent threat to women's health on a global scale. Early screening via +cytology Whole Slide Image (WSI) diagnosis is critical to prevent this cancer's +progression and improve survival rates, but a pathologist's single review suffers +from inevitable false negatives due to the immense number of cells that need to be +reviewed within a WSI. Though computer-aided automated diagnostic models can +serve as a strong complement to pathologists, their effectiveness is hampered by +the paucity of extensive and detailed annotations, coupled with limited +interpretability and robustness. These factors significantly hinder their +practical applicability and reliability in clinical settings. To tackle these +challenges, we develop an AI approach, a Scalable Technology for +Robust and Interpretable Diagnosis built on Extensive data (STRIDE) of cervical +cytology. STRIDE addresses the bottleneck of limited annotations by integrating +patient-level labels with a small portion of cell-level labels through an +end-to-end training strategy, facilitating scalable learning across extensive +datasets. To further improve robustness to the real-world domain shifts of +cytology slide-making and imaging, STRIDE employs color adversarial sample +training that mimics staining and imaging variations. Lastly, to achieve +pathologist-level interpretability for trustworthiness in clinical +settings, STRIDE can generate explanatory textual descriptions that simulate +pathologists' diagnostic processes through cell image feature and textual +description alignment. Conducting extensive experiments and evaluations in 183 +medical centers with a dataset of 341,889 WSIs and 0.1 billion cells from +cervical cytology patients, STRIDE has demonstrated remarkable superiority +over previous state-of-the-art techniques. + +
+
+
+
+
+ + ☆ EPD: Long-term Memory Extraction, Context-awared Planning and + Multi-iteration Decision @ EgoPlan Challenge ICML 2024 + + +
+ In this technical report, we present our solution for the EgoPlan Challenge +in ICML 2024. To address the real-world egocentric task planning problem, we +introduce a novel planning framework which comprises three stages: long-term +memory Extraction, context-awared Planning, and multi-iteration Decision, named +EPD. Given the task goal, task progress, and current observation, the +extraction model first extracts task-relevant memory information from the +progress video, transforming the complex long video into summarized memory +information. The planning model then combines the context of the memory +information with fine-grained visual information from the current observation +to predict the next action. Finally, through multi-iteration decision-making, +the decision model comprehensively understands the task situation and current +state to make the most realistic planning decision. On the EgoPlan-Test set, +EPD achieves a planning accuracy of 53.85% over 1,584 egocentric task planning +questions. We have made all codes available at https://github.com/Kkskkkskr/EPD . + +
+
+
+
+
+ + ☆ WeCromCL: Weakly Supervised Cross-Modality Contrastive Learning for + Transcription-only Supervised Text Spotting ECCV 2024 + + +
+ Transcription-only Supervised Text Spotting aims to learn text spotters +relying only on transcriptions but no text boundaries for supervision, thus +eliminating expensive boundary annotation. The crux of this task lies in +locating each transcription in scene text images without location annotations. +In this work, we formulate this challenging problem as a Weakly Supervised +Cross-modality Contrastive Learning problem, and design a simple yet effective +model dubbed WeCromCL that is able to detect each transcription in a scene +image in a weakly supervised manner. Unlike typical methods for cross-modality +contrastive learning that focus on modeling the holistic semantic correlation +between an entire image and a text description, our WeCromCL conducts atomistic +contrastive learning to model the character-wise appearance consistency between +a text transcription and its correlated region in a scene image to detect an +anchor point for the transcription in a weakly supervised manner. The detected +anchor points by WeCromCL are further used as pseudo location labels to guide +the learning of text spotting. Extensive experiments on four challenging +benchmarks demonstrate the superior performance of our model over other +methods. Code will be released. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Skeleton-based Group Activity Recognition via Spatial-Temporal Panoramic + Graph + + +
+ Group Activity Recognition aims to understand collective activities from +videos. Existing solutions primarily rely on the RGB modality, which encounters +challenges such as background variations, occlusions, motion blurs, and +significant computational overhead. Meanwhile, current keypoint-based methods +offer a lightweight and informative representation of human motions but +necessitate accurate individual annotations and specialized interaction +reasoning modules. To address these limitations, we design a panoramic graph +that incorporates multi-person skeletons and objects to encapsulate group +activity, offering an effective alternative to RGB video. This panoramic graph +enables Graph Convolutional Network (GCN) to unify intra-person, inter-person, +and person-object interactive modeling through spatial-temporal graph +convolutions. In practice, we develop a novel pipeline that extracts skeleton +coordinates using pose estimation and tracking algorithms and employ +Multi-person Panoramic GCN (MP-GCN) to predict group activities. Extensive +experiments on Volleyball and NBA datasets demonstrate that the MP-GCN achieves +state-of-the-art performance in both accuracy and efficiency. Notably, our +method outperforms RGB-based approaches by using only estimated 2D keypoints as +input. Code is available at https://github.com/mgiant/MP-GCN + +
+
+
+
+
+ + ♻ ☆ A Language Agent for Autonomous Driving + + +
+ Human-level driving is an ultimate goal of autonomous driving. Conventional +approaches formulate autonomous driving as a perception-prediction-planning +framework, yet their systems do not capitalize on the inherent reasoning +ability and experiential knowledge of humans. In this paper, we propose a +fundamental paradigm shift from current pipelines, exploiting Large Language +Models (LLMs) as a cognitive agent to integrate human-like intelligence into +autonomous driving systems. Our approach, termed Agent-Driver, transforms the +traditional autonomous driving pipeline by introducing a versatile tool library +accessible via function calls, a cognitive memory of common sense and +experiential knowledge for decision-making, and a reasoning engine capable of +chain-of-thought reasoning, task planning, motion planning, and +self-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive +common sense and robust reasoning capabilities, thus enabling a more nuanced, +human-like approach to autonomous driving. We evaluate our approach on the +large-scale nuScenes benchmark, and extensive experiments substantiate that our +Agent-Driver significantly outperforms the state-of-the-art driving methods by +a large margin. Our approach also demonstrates superior interpretability and +few-shot learning ability to these methods. + +
+
+ comment: COLM 2024. Project Page: https://usc-gvl.github.io/Agent-Driver/ +
+
+
+
+
+ + ♻ ☆ SportsNGEN: Sustained Generation of Realistic Multi-player Sports + Gameplay + + +
+ We present a transformer decoder based sports simulation engine, SportsNGEN, +trained on sports player and ball tracking sequences, that is capable of +generating sustained gameplay and accurately mimicking the decision making of +real players. By training on a large database of professional tennis tracking +data, we demonstrate that simulations produced by SportsNGEN can be used to +predict the outcomes of rallies, determine the best shot choices at any point, +and evaluate counterfactual or what if scenarios to inform coaching decisions +and elevate broadcast coverage. By combining the generated simulations with a +shot classifier and logic to start and end rallies, the system is capable of +simulating an entire tennis match. We evaluate SportsNGEN by comparing +statistics of the simulations with those of real matches between the same +players. We show that the model output sampling parameters are crucial to +simulation realism and that SportsNGEN is probabilistically well-calibrated to +real data. In addition, a generic version of SportsNGEN can be customized to a +specific player by fine-tuning on the subset of match data that includes that +player. Finally, we show qualitative results indicating the same approach works +for football. + +
+
+
+
+
+ + ♻ ☆ EcoSense: Energy-Efficient Intelligent Sensing for In-Shore Ship + Detection through Edge-Cloud Collaboration + + +
+ Detecting marine objects inshore presents challenges owing to algorithmic +intricacies and complexities in system deployment. We propose a +difficulty-aware edge-cloud collaborative sensing system that splits the task +into object localization and fine-grained classification. Objects are +classified either at the edge or within the cloud, based on their estimated +difficulty. The framework comprises a low-power device-tailored front-end model +for object localization, classification, and difficulty estimation, along with +a transformer-graph convolutional network-based back-end model for fine-grained +classification. Our system demonstrates superior performance (mAP@0.5 +4.3%) +on widely used marine object detection datasets, significantly reducing both +data transmission volume (by 95.43%) and energy consumption (by 72.7%) at the +system level. We validate the proposed system across various embedded system +platforms and in real-world scenarios involving drone deployment. + +
+
+
+
+
+ + ♻ ☆ LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large + Multimodal Models + + +
+ Visual instruction tuning has made considerable strides in enhancing the +capabilities of Large Multimodal Models (LMMs). However, existing open LMMs +largely focus on single-image tasks, and their applications to multi-image +scenarios remain less explored. Additionally, prior LMM research separately +tackles different scenarios, making it impossible to generalize across +scenarios with new emerging capabilities. To this end, we introduce +LLaVA-NeXT-Interleave, which simultaneously tackles Multi-image, Multi-frame +(video), Multi-view (3D), and Multi-patch (single-image) scenarios in LMMs. To +enable these capabilities, we regard the interleaved data format as a general +template and compile the M4-Instruct dataset with 1,177.6k samples, spanning 4 +primary domains with 14 tasks and 41 datasets. We also curate the +LLaVA-Interleave Bench to comprehensively evaluate the multi-image performance +of LMMs. Through extensive experiments, LLaVA-NeXT-Interleave achieves leading +results in multi-image, video, and 3D benchmarks, while maintaining the +performance of single-image tasks. Besides, our model also exhibits several +emerging capabilities, e.g., transferring tasks across different settings and +modalities. Code is available at https://github.com/LLaVA-VL/LLaVA-NeXT + +
+
+ comment: Project Page: + https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/ +
+
+
+
+
+ + ♻ ☆ DragAPart: Learning a Part-Level Motion Prior for Articulated Objects + + +
+ We introduce DragAPart, a method that, given an image and a set of drags as +input, generates a new image of the same object that responds to the action of +the drags. Differently from prior works that focused on repositioning objects, +DragAPart predicts part-level interactions, such as opening and closing a +drawer. We study this problem as a proxy for learning a generalist motion +model, not restricted to a specific kinematic structure or object category. We +start from a pre-trained image generator and fine-tune it on a new synthetic +dataset, Drag-a-Move, which we introduce. Combined with a new encoding for the +drags and dataset randomization, the model generalizes well to real images and +different categories. Compared to prior motion-controlled generators, we +demonstrate much better part-level motion understanding. + +
+
+ comment: Project page: https://dragapart.github.io/ +
+
+
+
+
+ + ♻ ☆ JointViT: Modeling Oxygen Saturation Levels with Joint Supervision on + Long-Tailed OCTA + + +
+ The oxygen saturation level in the blood (SaO2) is crucial for health, +particularly in relation to sleep-related breathing disorders. However, +continuous monitoring of SaO2 is time-consuming and highly variable depending +on patients' conditions. Recently, optical coherence tomography angiography +(OCTA) has shown promising development in rapidly and effectively screening +eye-related lesions, offering the potential for diagnosing sleep-related +disorders. To bridge this gap, our paper presents three key contributions. +Firstly, we propose JointViT, a novel model based on the Vision Transformer +architecture, incorporating a joint loss function for supervision. Secondly, we +introduce a balancing augmentation technique during data preprocessing to +improve the model's performance, particularly on the long-tail distribution +within the OCTA dataset. Lastly, through comprehensive experiments on the OCTA +dataset, our proposed method significantly outperforms other state-of-the-art +methods, achieving improvements of up to 12.28% in overall accuracy. This +advancement lays the groundwork for the future utilization of OCTA in +diagnosing sleep-related disorders. See project website +https://steve-zeyu-zhang.github.io/JointViT + +
+
+ comment: Accepted to MIUA 2024 Oral +
+
+
+
+
+ + ♻ ☆ Semi-Mamba-UNet: Pixel-Level Contrastive and Pixel-Level + Cross-Supervised Visual Mamba-based UNet for Semi-Supervised Medical Image + Segmentation + + +
+ Medical image segmentation is essential in diagnostics, treatment planning, +and healthcare, with deep learning offering promising advancements. Notably, +the convolutional neural network (CNN) excels in capturing local image +features, whereas the Vision Transformer (ViT) adeptly models long-range +dependencies through multi-head self-attention mechanisms. Despite their +strengths, both the CNN and ViT face challenges in efficiently processing +long-range dependencies in medical images, often requiring substantial +computational resources. This issue, combined with the high cost and limited +availability of expert annotations, poses significant obstacles to achieving +precise segmentation. To address these challenges, this study introduces +Semi-Mamba-UNet, which integrates a purely visual Mamba-based U-shaped +encoder-decoder architecture with a conventional CNN-based UNet into a +semi-supervised learning (SSL) framework. This innovative SSL approach +leverages both networks to generate pseudo-labels and cross-supervise one +another at the pixel level simultaneously, drawing inspiration from consistency +regularisation techniques. Furthermore, we introduce a self-supervised +pixel-level contrastive learning strategy that employs a pair of projectors to +enhance the feature learning capabilities further, especially on unlabelled +data. Semi-Mamba-UNet was comprehensively evaluated on two publicly available +segmentation dataset and compared with seven other SSL frameworks with both +CNN- or ViT-based UNet as the backbone network, highlighting the superior +performance of the proposed method. The source code of Semi-Mamba-Unet, all +baseline SSL frameworks, the CNN- and ViT-based networks, and the two +corresponding datasets are made publicly accessible. + +
+
+
+
+
+ + ♻ ☆ BUSClean: Open-source software for breast ultrasound image + pre-processing and knowledge extraction for medical AI + + +
+ Development of artificial intelligence (AI) for medical imaging demands +curation and cleaning of large-scale clinical datasets comprising hundreds of +thousands of images. Some modalities, such as mammography, contain highly +standardized imaging. In contrast, breast ultrasound imaging (BUS) can contain +many irregularities not indicated by scan metadata, such as enhanced scan +modes, sonographer annotations, or additional views. We present an open-source +software solution for automatically processing clinical BUS datasets. The +algorithm performs BUS scan filtering, cleaning, and knowledge extraction from +sonographer annotations. Its modular design enables users to adapt it to new +settings. Experiments on an internal testing dataset of 430 clinical BUS images +achieve >95% sensitivity and >98% specificity in detecting every type of text +annotation, >98% sensitivity and specificity in detecting scans with blood flow +highlighting, alternative scan modes, or invalid scans. A case study on a +completely external, public dataset of BUS scans found that BUSClean identified +text annotations and scans with blood flow highlighting with 88.6% and 90.9% +sensitivity and 98.3% and 99.9% specificity, respectively. Adaptation of the +lesion caliper detection method to account for a type of caliper specific to +the case study demonstrates intended use of BUSClean in new data distributions +and improved performance in lesion caliper detection from 43.3% and 93.3% +out-of-the-box to 92.1% and 92.3% sensitivity and specificity, respectively. +Source code, example notebooks, and sample data are available at +https://github.com/hawaii-ai/bus-cleaning. + +
+
+
+
+
+ + ♻ ☆ N2F2: Hierarchical Scene Understanding with Nested Neural Feature Fields ECCV 2024 + + +
+ Understanding complex scenes at multiple levels of abstraction remains a +formidable challenge in computer vision. To address this, we introduce Nested +Neural Feature Fields (N2F2), a novel approach that employs hierarchical +supervision to learn a single feature field, wherein different dimensions +within the same high-dimensional feature encode scene properties at varying +granularities. Our method allows for a flexible definition of hierarchies, +tailored to either the physical dimensions or semantics or both, thereby +enabling a comprehensive and nuanced understanding of scenes. We leverage a 2D +class-agnostic segmentation model to provide semantically meaningful pixel +groupings at arbitrary scales in the image space, and query the CLIP +vision-encoder to obtain language-aligned embeddings for each of these +segments. Our proposed hierarchical supervision method then assigns different +nested dimensions of the feature field to distill the CLIP embeddings using +deferred volumetric rendering at varying physical scales, creating a +coarse-to-fine representation. Extensive experiments show that our approach +outperforms the state-of-the-art feature field distillation methods on tasks +such as open-vocabulary 3D segmentation and localization, demonstrating the +effectiveness of the learned nested feature field. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Overcome Modal Bias in Multi-modal Federated Learning via Balanced + Modality Selection ECCV24 + + +
+ Selecting proper clients to participate in each federated learning (FL) round +is critical to effectively harness a broad range of distributed data. Existing +client selection methods simply consider the mining of distributed uni-modal +data, yet, their effectiveness may diminish in multi-modal FL (MFL) as the +modality imbalance problem not only impedes the collaborative local training +but also leads to a severe global modality-level bias. We empirically reveal +that local training with a certain single modality may contribute more to the +global model than training with all local modalities. To effectively exploit +the distributed multiple modalities, we propose a novel Balanced Modality +Selection framework for MFL (BMSFed) to overcome the modal bias. On the one +hand, we introduce a modal enhancement loss during local training to alleviate +local imbalance based on the aggregated global prototypes. On the other hand, +we propose the modality selection aiming to select subsets of local modalities +with great diversity and achieving global modal balance simultaneously. Our +extensive experiments on audio-visual, colored-gray, and front-back datasets +showcase the superiority of BMSFed over baselines and its effectiveness in +multi-modal data exploitation. + +
+
+ comment: Accepted by ECCV24, 23 pages +
+
+
+
+
+ + ♻ ☆ Weighted Ensemble Models Are Strong Continual Learners ECCV2024 + + +
+ In this work, we study the problem of continual learning (CL) where the goal +is to learn a model on a sequence of tasks, such that the data from the +previous tasks becomes unavailable while learning on the current task data. CL +is essentially a balancing act between being able to learn on the new task +(i.e., plasticity) and maintaining the performance on the previously learned +concepts (i.e., stability). Intending to address the stability-plasticity +trade-off, we propose to perform weight-ensembling of the model parameters of +the previous and current tasks. This weighted-ensembled model, which we call +Continual Model Averaging (or CoMA), attains high accuracy on the current task +by leveraging plasticity, while not deviating too far from the previous weight +configuration, ensuring stability. We also propose an improved variant of CoMA, +named Continual Fisher-weighted Model Averaging (or CoFiMA), that selectively +weighs each parameter in the weights ensemble by leveraging the Fisher +information of the weights of the model. Both variants are conceptually simple, +easy to implement, and effective in attaining state-of-the-art performance on +several standard CL benchmarks. Code is available at: +https://github.com/IemProg/CoFiMA. + +
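Both variants operate purely in weight space, so the merging step itself can be sketched in a few lines: a plain average for CoMA-style merging and a Fisher-weighted average for CoFiMA-style merging. This is an illustrative sketch under assumed state-dict conventions, not the released code.

```python
import torch

@torch.no_grad()
def weight_space_ensemble(prev_state, curr_state, alpha=0.5, fisher=None):
    """Merge previous-task and current-task checkpoints in weight space.

    fisher=None gives a plain CoMA-style average; passing dicts of per-parameter
    Fisher information ({"prev": ..., "curr": ...}) gives a CoFiMA-style
    Fisher-weighted average. Both state dicts must share the same keys/shapes.
    """
    merged = {}
    for name, w_curr in curr_state.items():
        w_prev = prev_state[name]
        if fisher is None:
            merged[name] = alpha * w_curr + (1.0 - alpha) * w_prev
        else:
            f_prev, f_curr = fisher["prev"][name], fisher["curr"][name]
            merged[name] = (f_curr * w_curr + f_prev * w_prev) / (f_curr + f_prev + 1e-12)
    return merged
```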
+
+ comment: Accepted for ECCV2024, Code: https://github.com/IemProg/CoFiMA +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Interpretable Triplet Importance for Personalized Ranking CIKM 2024 + + +
+ Personalized item ranking has been a crucial component contributing to the +performance of recommender systems. As a representative approach, pairwise +ranking directly optimizes the ranking with user implicit feedback by +constructing (\textit{user}, \textit{positive item}, \textit{negative item}) +triplets. Several recent works have noticed that treating all triplets equally +may hardly achieve the best effects. They assign different importance scores to +negative items, user-item pairs, or triplets, respectively. However, almost all +the generated importance scores are groundless and hard to interpret, thus far +from trustworthy and transparent. To tackle these issues, we propose the +\textit{Triplet Shapley} -- a Shapley value-based method to measure the triplet +importance in an interpretable manner. Due to the huge number of triplets, we +transform the original Shapley value calculation into a Monte Carlo (MC) +approximation, for which a guarantee of unbiasedness is also provided. To +stabilize the MC approximation, we adopt a control +covariates-based method. Finally, we utilize the triplet Shapley values to guide +the resampling of important triplets to benefit the model learning. +Extensive experiments are conducted on six public datasets involving classical +matrix factorization- and graph neural network-based recommendation models. +Empirical results and subsequent analysis show that our model consistently +outperforms the state-of-the-art methods. + +
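The Monte Carlo approximation mentioned above averages the marginal contribution of each triplet over random permutations. A schematic version is shown below; the `utility` oracle (e.g., a validation metric of a model trained on a subset) and the permutation count are assumptions, and the paper's variance-reduction correction is omitted.

```python
import random

def mc_triplet_shapley(triplets, utility, num_permutations=100):
    """Monte Carlo estimate of per-triplet Shapley values.

    triplets: list of hashable (user, pos_item, neg_item) tuples.
    utility:  callable mapping a list of triplets to a scalar utility.
    """
    values = {t: 0.0 for t in triplets}
    for _ in range(num_permutations):
        perm = random.sample(triplets, len(triplets))    # random permutation
        prefix, prev_utility = [], utility([])
        for t in perm:
            prefix.append(t)
            current_utility = utility(prefix)
            values[t] += (current_utility - prev_utility) / num_permutations
            prev_utility = current_utility
    return values
```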
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Enhancing Taobao Display Advertising with Multimodal Representations: + Challenges, Approaches and Insights CIKM 2024 + + +
+ Despite the recognized potential of multimodal data to improve model +accuracy, many large-scale industrial recommendation systems, including Taobao +display advertising system, predominantly depend on sparse ID features in their +models. In this work, we explore approaches to leverage multimodal data to +enhance the recommendation accuracy. We start from identifying the key +challenges in adopting multimodal data in a manner that is both effective and +cost-efficient for industrial systems. To address these challenges, we +introduce a two-phase framework, including: 1) the pre-training of multimodal +representations to capture semantic similarity, and 2) the integration of these +representations with existing ID-based models. Furthermore, we detail the +architecture of our production system, which is designed to facilitate the +deployment of multimodal representations. Since the integration of multimodal +representations in mid-2023, we have observed significant performance +improvements in Taobao display advertising system. We believe that the insights +we have gathered will serve as a valuable resource for practitioners seeking to +leverage multimodal data in their systems. + +
+
+ comment: Accepted at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Towards Completeness-Oriented Tool Retrieval for Large Language Models CIKM 2024 + + +
+ Recently, integrating external tools with Large Language Models (LLMs) has +gained significant attention as an effective strategy to mitigate the +limitations inherent in their pre-training data. However, real-world systems +often incorporate a wide array of tools, making it impractical to input all +tools into LLMs due to length limitations and latency constraints. Therefore, +to fully exploit the potential of tool-augmented LLMs, it is crucial to develop +an effective tool retrieval system. Existing tool retrieval methods primarily +focus on semantic matching between user queries and tool descriptions, +frequently leading to the retrieval of redundant, similar tools. Consequently, +these methods fail to provide a complete set of diverse tools necessary for +addressing the multifaceted problems encountered by LLMs. In this paper, we +propose a novel model-agnostic COllaborative Learning-based Tool Retrieval +approach, COLT, which captures not only the semantic similarities between user +queries and tool descriptions but also takes into account the collaborative +information of tools. Specifically, we first fine-tune the PLM-based retrieval +models to capture the semantic relationships between queries and tools in the +semantic learning stage. Subsequently, we construct three bipartite graphs +among queries, scenes, and tools and introduce a dual-view graph collaborative +learning framework to capture the intricate collaborative relationships among +tools during the collaborative learning stage. Extensive experiments on both +the open benchmark and the newly introduced ToolLens dataset show that COLT +achieves superior performance. Notably, the performance of BERT-mini (11M) with +our proposed model framework outperforms BERT-large (340M), which has 30 times +more parameters. Furthermore, we will release ToolLens publicly to facilitate +future research on tool retrieval. + +
+
+ comment: Accepted by CIKM 2024; GitHub: https://github.com/quchangle1/COLT +
+
+
+
+
+ + ♻ ☆ Intent-guided Heterogeneous Graph Contrastive Learning for + Recommendation + + +
+ Contrastive Learning (CL)-based recommender systems have gained prominence in +the context of Heterogeneous Graph (HG) due to their capacity to enhance the +consistency of representations across different views. However, existing +frameworks often neglect the fact that user-item interactions within HG are +governed by diverse latent intents (e.g., brand preferences or demographic +characteristics of item audiences), which are pivotal in capturing fine-grained +relations. The exploration of these underlying intents, particularly through +the lens of meta-paths in HGs, presents us with two principal challenges: i) +How to integrate CL with intents; ii) How to mitigate noise from +meta-path-driven intents. + To address these challenges, we propose an innovative framework termed +Intent-guided Heterogeneous Graph Contrastive Learning (IHGCL), which designed +to enhance CL-based recommendation by capturing the intents contained within +meta-paths. Specifically, the IHGCL framework includes: i) a meta-path-based +Dual Contrastive Learning (DCL) approach to effectively integrate intents into +the recommendation, constructing intent-intent contrast and intent-interaction +contrast; ii) a Bottlenecked AutoEncoder (BAE) that combines mask propagation +with the information bottleneck principle to significantly reduce noise +perturbations introduced by meta-paths. Empirical evaluations conducted across +six distinct datasets demonstrate the superior performance of our IHGCL +framework relative to conventional baseline methods. Our model implementation +is available at https://github.com/wangyu0627/IHGCL. + +
+
+ comment: 14pages, 11figures +
+
+
+
+
+ + ♻ ☆ BERT-Enhanced Retrieval Tool for Homework Plagiarism Detection System + + +
+ Text plagiarism detection is a common natural language processing task +that aims to detect whether a given text contains plagiarism or copying from +other texts. In existing research, detection of high-level plagiarism is still +a challenge due to the lack of high-quality datasets. In this paper, we propose +a plagiarized text data generation method based on GPT-3.5, which produces +32,927 text pairs for plagiarism detection covering a wide range of +plagiarism methods, bridging the gap in this part of the research. Meanwhile, we +propose a plagiarism identification method based on Faiss with BERT that offers high +efficiency and high accuracy. Our experiments show that this model outperforms +other models on several metrics, achieving 98.86%, 98.90%, +98.86%, and 0.9888 for accuracy, precision, recall, and F1 score, respectively. +At the end, we also provide a user-friendly demo platform that allows users to +upload a text library and intuitively participate in the plagiarism analysis. + +
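A retrieval pipeline of this kind typically indexes L2-normalised BERT sentence embeddings in a Faiss inner-product index and queries it for near-duplicate library texts. The sketch below shows generic Faiss usage under the assumption that the embeddings are already computed as 2D float32 arrays; it is not the authors' exact configuration.

```python
import faiss
import numpy as np

def build_plagiarism_index(doc_embeddings: np.ndarray) -> faiss.IndexFlatIP:
    """Index L2-normalised document embeddings so that inner product = cosine."""
    embeddings = np.ascontiguousarray(doc_embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)                 # in-place row normalisation
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index

def most_similar(index, query_embedding: np.ndarray, k: int = 5):
    """Return (similarities, document ids) of the k nearest library texts;
    query_embedding must be a (1, dim) float32 array."""
    query = np.ascontiguousarray(query_embedding, dtype=np.float32)
    faiss.normalize_L2(query)
    return index.search(query, k)
```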
+
+ comment: arXiv admin note: text overlap with arXiv:1604.06573 by other authors +
+
+
+
+
+ + ♻ ☆ Enhancing Content-based Recommendation via Large Language Model CIKM 2024 + + +
+ In real-world applications, users express different behaviors when they +interact with different items, including implicit click/like interactions and +explicit comment/review interactions. Nevertheless, almost all recommender +works focus on how to describe user preferences via the implicit +click/like interactions, to find the synergy among people. For the content-based +explicit comment/review interactions, some works attempt to utilize them to +mine semantic knowledge to enhance recommender models. However, they still +neglect the following two points: (1) Content semantics are a form of universal world +knowledge; how do we extract the multi-aspect semantic information to empower +different domains? (2) The user/item ID feature is a fundamental element for +recommender models; how do we align the ID and content semantic feature spaces? +In this paper, we propose a `plugin' semantic knowledge transferring method, +\textbf{LoID}, which includes two major components: (1) LoRA-based large +language model pretraining to extract multi-aspect semantic information; (2) an +ID-based contrastive objective to align their feature spaces. We conduct +extensive experiments with SOTA baselines on real-world datasets, with the detailed +results demonstrating significant improvements from our method LoID. + +
+
+ comment: Accepted at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Ink and Individuality: Crafting a Personalised Narrative in the Age of + LLMs + + +
+ Individuality and personalization comprise the distinctive characteristics +that make each writer unique and influence their words in order to effectively +engage readers while conveying authenticity. However, our growing reliance on +LLM-based writing assistants risks compromising our creativity and +individuality over time. We often overlook the negative impacts of this trend +on our creativity and uniqueness, despite the possible consequences. This study +investigates these concerns by performing a brief survey to explore different +perspectives and concepts, as well as trying to understand people's viewpoints, +in conjunction with past studies in the area. Addressing these issues is +essential for improving human-computer interaction systems and enhancing +writing assistants for personalization and individuality. + +
+
+ comment: 8 Pages, 4 Figures. Accepted in The Third Workshop on Intelligent and + Interactive Writing Assistants at CHI 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 16 + +
+
+
+ + ☆ Memory-efficient Training of LLMs with Larger Mini-batches + + +
+ Training with larger mini-batches improves the performance and convergence +rate of training machine learning models. However, training with large +mini-batches becomes prohibitive for Large Language Models (LLMs) with billions +of parameters, due to the large GPU memory requirement. To address this +problem, we propose finding small mini-batches that simulate the dynamics of +training with larger mini-batches. Specifically, we formulate selecting smaller +mini-batches of examples that closely capture gradients of large mini-batches +as a submodular maximization problem. Nevertheless, the very large +dimensionality of the gradients makes the problem very challenging to solve. To +address this, we leverage ideas from zeroth-order optimization and neural +network pruning to find lower-dimensional gradient estimates that allow finding +high-quality subsets effectively with a limited amount of memory. We prove the +superior convergence rate of training on the small mini-batches found by our +method and empirically show its effectiveness. Our method can effectively +reduce the memory requirement by 2x and speed up training by 1.3x, as we +confirm for fine-tuning Phi-2 on MathInstruct. Our method can be easily stacked +with LoRA and other memory-efficient methods to further reduce the memory +requirements of training LLMs. + +
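One common way to instantiate the submodular selection step is greedy maximization of a facility-location objective over low-dimensional per-example gradient estimates. The sketch below is a generic illustration under that assumption; the paper's exact objective and gradient estimator may differ:

import numpy as np

def select_coreset(grad_estimates, k):
    # Greedy facility-location selection: pick k examples whose gradients best cover the batch.
    sim = grad_estimates @ grad_estimates.T          # [n, n] pairwise gradient similarities
    best = np.zeros(sim.shape[0])                    # how well each example is currently covered
    selected = []
    for _ in range(k):
        gains = np.maximum(sim, best[:, None]).sum(axis=0) - best.sum()
        j = int(np.argmax(gains))                    # example with the largest marginal gain
        best = np.maximum(best, sim[:, j])
        selected.append(j)
    return selected

The selected indices define the small mini-batch that stands in for the large one at each training step.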
+
+ comment: 15 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Sharp Bounds for Poly-GNNs and the Effect of Graph Noise + + +
+ We investigate the classification performance of graph neural networks with +graph-polynomial features, poly-GNNs, on the problem of semi-supervised node +classification. We analyze poly-GNNs under a general contextual stochastic +block model (CSBM) by providing a sharp characterization of the rate of +separation between classes in their output node representations. A question of +interest is whether this rate depends on the depth of the network $k$, i.e., +whether deeper networks can achieve a faster separation? We provide a negative +answer to this question: for a sufficiently large graph, a depth $k > 1$ +poly-GNN exhibits the same rate of separation as a depth $k=1$ counterpart. Our +analysis highlights and quantifies the impact of ``graph noise'' in deep GNNs +and shows how noise in the graph structure can dominate other sources of signal +in the graph, negating any benefit further aggregation provides. Our analysis +also reveals subtle differences between even and odd-layered GNNs in how the +feature noise propagates. + +
+
+
+
+
+ + ☆ Neural stochastic Volterra equations: learning path-dependent dynamics + + +
+ Stochastic Volterra equations (SVEs) serve as mathematical models for the +time evolutions of random systems with memory effects and irregular behaviour. +We introduce neural stochastic Volterra equations as a physics-inspired +architecture, generalizing the class of neural stochastic differential +equations, and provide some theoretical foundation. Numerical experiments on +various SVEs, like the disturbed pendulum equation, the generalized +Ornstein--Uhlenbeck process and the rough Heston model are presented, comparing +the performance of neural SVEs, neural SDEs and Deep Operator Networks +(DeepONets). + +
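To make the modeled object concrete, the sketch below discretizes a scalar SVE with a singular power-law kernel using a left-point Euler scheme; the drift, diffusion, and kernel choices are illustrative assumptions, and in the neural SVE setting they would be replaced by learned networks:

import numpy as np

def simulate_sve(x0=1.0, T=1.0, n=500, H=0.3, seed=0):
    # Euler scheme for X_t = x0 + int_0^t K(t-s) b(X_s) ds + int_0^t K(t-s) sigma(X_s) dW_s.
    rng = np.random.default_rng(seed)
    dt = T / n
    b = lambda x: -x                   # drift (illustrative)
    sigma = lambda x: 0.3 + 0.0 * x    # diffusion (illustrative)
    K = lambda u: u ** (H - 0.5)       # power-law kernel giving rough, path-dependent dynamics
    dW = rng.normal(0.0, np.sqrt(dt), n)
    X = np.full(n + 1, float(x0))
    for i in range(1, n + 1):
        lags = (i - np.arange(i)) * dt                 # strictly positive time lags t_i - s_j
        ker = K(lags)
        X[i] = x0 + np.sum(ker * b(X[:i])) * dt + np.sum(ker * sigma(X[:i]) * dW[:i])
    return X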
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ☆ Overcoming Uncertain Incompleteness for Robust Multimodal Sequential + Diagnosis Prediction via Knowledge Distillation and Random Data Erasing + + +
+ In this paper, we present NECHO v2, a novel framework designed to enhance the predictive accuracy of multimodal sequential patient diagnoses under uncertain missing visit sequences, a common challenge in clinical settings. Firstly, we modify NECHO to handle uncertain modality representation dominance under imperfect data. Next, we develop a systematic knowledge distillation scheme by employing the modified NECHO as both teacher and student. It encompasses modality-wise contrastive and hierarchical distillation, transformer representation random distillation, and other distillations to align representations tightly and effectively. We also utilise random erasing on individual data points within sequences during both training and distillation of the teacher to lightly simulate scenarios with missing visit information and thereby foster effective knowledge transfer. As a result, NECHO v2 demonstrates its effectiveness by showing superiority in multimodal sequential diagnosis prediction in both balanced and imbalanced incomplete settings on multimodal healthcare data.
+
+ comment: 5 pages, 1 figure, and 4 tables +
+
+
+
+
+ + ♻ ☆ Towards Interpretable Physical-Conceptual Catchment-Scale Hydrological + Modeling using the Mass-Conserving-Perceptron + + +
+ We investigate the applicability of machine learning technologies to the +development of parsimonious, interpretable, catchment-scale hydrologic models +using directed-graph architectures based on the mass-conserving perceptron +(MCP) as the fundamental computational unit. Here, we focus on architectural +complexity (depth) at a single location, rather than universal applicability +(breadth) across large samples of catchments. The goal is to discover a minimal +representation (numbers of cell-states and flow paths) that represents the +dominant processes that can explain the input-state-output behaviors of a given +catchment, with particular emphasis given to simulating the full range (high, +medium, and low) of flow dynamics. We find that a HyMod Like architecture with +three cell-states and two major flow pathways achieves such a representation at +our study location, but that the additional incorporation of an input-bypass +mechanism significantly improves the timing and shape of the hydrograph, while +the inclusion of bi-directional groundwater mass exchanges significantly +enhances the simulation of baseflow. Overall, our results demonstrate the +importance of using multiple diagnostic metrics for model evaluation, while +highlighting the need for properly selecting and designing the training metrics +based on information-theoretic foundations that are better suited to extracting +information across the full range of flow dynamics. This study sets the stage +for interpretable regional-scale MCP-based hydrological modeling (using large +sample data) by using neural architecture search to determine appropriate +minimal representations for catchments in different hydroclimatic regimes. + +
+
+ comment: 65 pages, 8 Figures, 4 Tables, 1 Supplementary Material +
+
+
+
+
+ + ♻ ☆ Revealing the Power of Masked Autoencoders in Traffic Forecasting CIKM 2024 + + +
+ Traffic forecasting, crucial for urban planning, requires accurate +predictions of spatial-temporal traffic patterns across urban areas. Existing +research mainly focuses on designing complex models that capture +spatial-temporal dependencies among variables explicitly. However, this field +faces challenges related to data scarcity and model stability, which results in +limited performance improvement. To address these issues, we propose +Spatial-Temporal Masked AutoEncoders (STMAE), a plug-and-play framework +designed to enhance existing spatial-temporal models on traffic prediction. +STMAE consists of two learning stages. In the pretraining stage, an encoder +processes partially visible traffic data produced by a dual-masking strategy, +including biased random walk-based spatial masking and patch-based temporal +masking. Subsequently, two decoders aim to reconstruct the masked counterparts +from both spatial and temporal perspectives. The fine-tuning stage retains the +pretrained encoder and integrates it with decoders from existing backbones to +improve forecasting accuracy. Our results on traffic benchmarks show that STMAE +can largely enhance the forecasting capabilities of various spatial-temporal +models. + +
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ♻ ☆ SportsNGEN: Sustained Generation of Realistic Multi-player Sports + Gameplay + + +
+ We present a transformer decoder based sports simulation engine, SportsNGEN, +trained on sports player and ball tracking sequences, that is capable of +generating sustained gameplay and accurately mimicking the decision making of +real players. By training on a large database of professional tennis tracking +data, we demonstrate that simulations produced by SportsNGEN can be used to +predict the outcomes of rallies, determine the best shot choices at any point, +and evaluate counterfactual or what if scenarios to inform coaching decisions +and elevate broadcast coverage. By combining the generated simulations with a +shot classifier and logic to start and end rallies, the system is capable of +simulating an entire tennis match. We evaluate SportsNGEN by comparing +statistics of the simulations with those of real matches between the same +players. We show that the model output sampling parameters are crucial to +simulation realism and that SportsNGEN is probabilistically well-calibrated to +real data. In addition, a generic version of SportsNGEN can be customized to a +specific player by fine-tuning on the subset of match data that includes that +player. Finally, we show qualitative results indicating the same approach works +for football. + +
+
+
+
+
+ + ♻ ☆ Flow Score Distillation for Diverse Text-to-3D Generation + + +
+ Recent advancements in Text-to-3D generation have yielded remarkable progress, particularly through methods that rely on Score Distillation Sampling (SDS). While SDS exhibits the capability to create impressive 3D assets, it is hindered by its inherent maximum-likelihood-seeking essence, resulting in limited diversity in generation outcomes. In this paper, we discover that the Denoising Diffusion Implicit Models (DDIM) generation process (i.e., the PF-ODE) can be succinctly expressed using an analogue of the SDS loss. Going one step further, one can see SDS as a generalized DDIM generation process. Following this insight, we show that the noise sampling strategy in the noise addition stage significantly restricts the diversity of generation results. To address this limitation, we present an innovative noise sampling approach and introduce a novel text-to-3D method called Flow Score Distillation (FSD). Our validation experiments across various text-to-image diffusion models demonstrate that FSD substantially enhances generation diversity without compromising quality.
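For context, the SDS objective referred to above is usually written (in the notation of the score-distillation literature, not this paper's) with the gradient

\nabla_\theta \mathcal{L}_{\mathrm{SDS}}(\theta) = \mathbb{E}_{t,\epsilon}\!\left[ w(t)\,\big(\hat{\epsilon}_\phi(\mathbf{x}_t;\, y, t) - \epsilon\big)\,\frac{\partial \mathbf{x}}{\partial \theta} \right],

where \mathbf{x} = g(\theta) is the rendered image, \mathbf{x}_t = \alpha_t \mathbf{x} + \sigma_t \epsilon is its noised version, and \hat{\epsilon}_\phi is the pretrained diffusion model's noise prediction. The abstract's observation is that how \epsilon is sampled in this noising step is what limits diversity.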
+
+ comment: Consistent Flow Distillation is an improved version of this paper +
+
+
+
+
+ + ♻ ☆ Deep neural networks for choice analysis: Enhancing behavioral + regularity with gradient regularization + + +
+ Deep neural networks (DNNs) frequently present behaviorally irregular +patterns, significantly limiting their practical potentials and theoretical +validity in travel behavior modeling. This study proposes strong and weak +behavioral regularities as novel metrics to evaluate the monotonicity of +individual demand functions (known as the "law of demand"), and further designs +a constrained optimization framework with six gradient regularizers to enhance +DNNs' behavioral regularity. The proposed framework is applied to travel survey +data from Chicago and London to examine the trade-off between predictive power +and behavioral regularity for large vs. small sample scenarios and in-domain +vs. out-of-domain generalizations. The results demonstrate that, unlike models +with strong behavioral foundations such as the multinomial logit, the benchmark +DNNs cannot guarantee behavioral regularity. However, gradient regularization +(GR) increases DNNs' behavioral regularity by around 6 percentage points (pp) +while retaining their relatively high predictive power. In the small sample +scenario, GR is more effective than in the large sample scenario, +simultaneously improving behavioral regularity by about 20 pp and +log-likelihood by around 1.7%. Comparing with the in-domain generalization of +DNNs, GR works more effectively in out-of-domain generalization: it drastically +improves the behavioral regularity of poorly performing benchmark DNNs by +around 65 pp, indicating the criticality of behavioral regularization for +enhancing model transferability and application in forecasting. Moreover, the +proposed framework is applicable to other NN-based choice models such as +TasteNets. Future studies could use behavioral regularity as a metric along +with log-likelihood in evaluating travel demand models, and investigate other +methods to further enhance behavioral regularity when adopting complex machine +learning models. + +
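As a concrete example of the kind of regularizer described here, one option is to penalize positive sensitivity of an alternative's predicted choice probability to its own cost, which is one reading of the "law of demand". The sketch below is illustrative only; the paper evaluates six different gradient regularizers, and the function and argument names are assumptions:

import torch

def demand_gradient_penalty(model, x, cost_index, alt_index):
    # Penalize dP(choose alt)/d(own cost) > 0 to encourage downward-sloping demand.
    x = x.clone().requires_grad_(True)
    prob = torch.softmax(model(x), dim=-1)[:, alt_index]
    grad = torch.autograd.grad(prob.sum(), x, create_graph=True)[0][:, cost_index]
    return torch.relu(grad).pow(2).mean()

# Training objective (sketch): loss = cross_entropy + lambda_reg * demand_gradient_penalty(...)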
+
+
+
+
+ + ♻ ☆ Structure learning of Hamiltonians from real-time evolution + + +
+ We study the problem of Hamiltonian structure learning from real-time +evolution: given the ability to apply $e^{-\mathrm{i} Ht}$ for an unknown local +Hamiltonian $H = \sum_{a = 1}^m \lambda_a E_a$ on $n$ qubits, the goal is to +recover $H$. This problem is already well-understood under the assumption that +the interaction terms, $E_a$, are given, and only the interaction strengths, +$\lambda_a$, are unknown. But how efficiently can we learn a local Hamiltonian +without prior knowledge of its interaction structure? + We present a new, general approach to Hamiltonian learning that not only +solves the challenging structure learning variant, but also resolves other open +questions in the area, all while achieving the gold standard of +Heisenberg-limited scaling. In particular, our algorithm recovers the +Hamiltonian to $\varepsilon$ error with total evolution time $O(\log +(n)/\varepsilon)$, and has the following appealing properties: (1) it does not +need to know the Hamiltonian terms; (2) it works beyond the short-range +setting, extending to any Hamiltonian $H$ where the sum of terms interacting +with a qubit has bounded norm; (3) it evolves according to $H$ in constant time +$t$ increments, thus achieving constant time resolution. As an application, we +can also learn Hamiltonians exhibiting power-law decay up to accuracy +$\varepsilon$ with total evolution time beating the standard limit of +$1/\varepsilon^2$. + +
+
+ comment: 52 pages; v2 discussed more literature, qualified some claims +
+
+
+
+
+ + ♻ ☆ LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large + Multimodal Models + + +
+ Visual instruction tuning has made considerable strides in enhancing the capabilities of Large Multimodal Models (LMMs). However, existing open LMMs largely focus on single-image tasks, and their application to multi-image scenarios remains less explored. Additionally, prior LMM research tackles different scenarios separately, making it impossible to generalize across scenarios with newly emerging capabilities. To this end, we introduce LLaVA-NeXT-Interleave, which simultaneously tackles Multi-image, Multi-frame (video), Multi-view (3D), and Multi-patch (single-image) scenarios in LMMs. To enable these capabilities, we regard the interleaved data format as a general template and compile the M4-Instruct dataset with 1,177.6k samples, spanning 4 primary domains with 14 tasks and 41 datasets. We also curate the LLaVA-Interleave Bench to comprehensively evaluate the multi-image performance of LMMs. Through extensive experiments, LLaVA-NeXT-Interleave achieves leading results in multi-image, video, and 3D benchmarks, while maintaining the performance of single-image tasks. Moreover, our model also exhibits several emerging capabilities, e.g., transferring tasks across different settings and modalities. Code is available at https://github.com/LLaVA-VL/LLaVA-NeXT
+
+ comment: Project Page: + https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/ +
+
+
+
+
+ + ♻ ☆ Weight fluctuations in (deep) linear neural networks and a derivation of + the inverse-variance flatness relation + + +
+ We investigate the stationary (late-time) training regime of single- and +two-layer underparameterized linear neural networks within the continuum limit +of stochastic gradient descent (SGD) for synthetic Gaussian data. In the case +of a single-layer network in the weakly underparameterized regime, the spectrum +of the noise covariance matrix deviates notably from the Hessian, which can be +attributed to the broken detailed balance of SGD dynamics. The weight +fluctuations are in this case generally anisotropic, but effectively experience +an isotropic loss. For an underparameterized two-layer network, we describe the +stochastic dynamics of the weights in each layer and analyze the associated +stationary covariances. We identify the inter-layer coupling as a distinct +source of anisotropy for the weight fluctuations. In contrast to the +single-layer case, the weight fluctuations are effectively subject to an +anisotropic loss, the flatness of which is inversely related to the fluctuation +variance. We thereby provide an analytical derivation of the recently observed +inverse variance-flatness relation in a model of a deep linear neural network. + +
+
+ comment: 27 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Efficient PAC Learnability of Dynamical Systems Over Multilayer Networks ICML 2024 + + +
+ Networked dynamical systems are widely used as formal models of real-world cascading phenomena, such as the spread of diseases and information. Prior research has addressed the problem of learning the behavior of an unknown dynamical system when the underlying network has a single layer. In this work, we study the learnability of dynamical systems over multilayer networks, which are more realistic and challenging. First, we present an efficient PAC learning algorithm with provable guarantees to show that the learner only requires a small number of training examples to infer an unknown system. We further provide a tight analysis of the Natarajan dimension, which measures the model complexity. Asymptotically, our bound on the Natarajan dimension is tight for almost all multilayer graphs. The techniques and insights from our work provide the theoretical foundations for future investigations of learning problems for multilayer dynamical systems.
+
+ comment: Accepted at ICML 2024 +
+
+
+
+
+ + ♻ ☆ GraphSL: An Open-Source Library for Graph Source Localization Approaches + and Benchmark Datasets + + +
+ We introduce GraphSL, a new library for studying the graph source localization problem. Graph diffusion and graph source localization are inverse problems in nature: graph diffusion predicts information diffusions from information sources, while graph source localization predicts information sources from information diffusions. GraphSL facilitates the exploration of various graph diffusion models for simulating information diffusions and enables the evaluation of cutting-edge source localization approaches on established benchmark datasets. The source code of GraphSL is made available at the GitHub repository (https://github.com/xianggebenben/GraphSL). Bug reports and feedback can be directed to the GitHub issues page (https://github.com/xianggebenben/GraphSL/issues).
+
+
+
+
+ + ♻ ☆ FLAIM: AIM-based Synthetic Data Generation in the Federated Setting KDD 2024 + + +
+ Preserving individual privacy while enabling collaborative data sharing is +crucial for organizations. Synthetic data generation is one solution, producing +artificial data that mirrors the statistical properties of private data. While +numerous techniques have been devised under differential privacy, they +predominantly assume data is centralized. However, data is often distributed +across multiple clients in a federated manner. In this work, we initiate the +study of federated synthetic tabular data generation. Building upon a SOTA +central method known as AIM, we present DistAIM and FLAIM. We first show that +it is straightforward to distribute AIM, extending a recent approach based on +secure multi-party computation which necessitates additional overhead, making +it less suited to federated scenarios. We then demonstrate that naively +federating AIM can lead to substantial degradation in utility under the +presence of heterogeneity. To mitigate both issues, we propose an augmented +FLAIM approach that maintains a private proxy of heterogeneity. We simulate our +methods across a range of benchmark datasets under different degrees of +heterogeneity and show we can improve utility while reducing overhead. + +
+
+ comment: Accepted to KDD 2024 +
+
+
+
+
+ + ♻ ☆ Learning on Graphs with Large Language Models(LLMs): A Deep Dive into + Model Robustness + + +
+ Large Language Models (LLMs) have demonstrated remarkable performance across various natural language processing tasks. Recently, several LLM-based pipelines have been developed to enhance learning on graphs with text attributes, showcasing promising performance. However, graphs are well known to be susceptible to adversarial attacks, and it remains unclear whether LLMs exhibit robustness in learning on graphs. To address this gap, our work aims to explore the potential of LLMs in the context of adversarial attacks on graphs. Specifically, we investigate the robustness against graph structural and textual perturbations in terms of two dimensions: LLMs-as-Enhancers and LLMs-as-Predictors. Through extensive experiments, we find that, compared to shallow models, both LLMs-as-Enhancers and LLMs-as-Predictors offer superior robustness against structural and textual attacks. Based on these findings, we carried out additional analyses to investigate the underlying causes. Furthermore, we have made our benchmark library openly available to facilitate quick and fair evaluations, and to encourage ongoing innovative research in this field.
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ The Future is Meta: Metadata, Formats and Perspectives towards + Interactive and Personalized AV Content + + +
+ The production of media content has undergone tremendous changes in recent +years. Multiple daily content updates are just as common for some platforms as +is processing the provided content specifically for their target audiences. +Such features are made possible through metadata, which make information +accessible by categorizing it. In conjunction with AI-supported tools, metadata +are shaping the future of audio-visual content production, distribution and +consumption. It allows editors to effectively search through archives like in +the Tailored Media Project, broadcasters to provide content that is adapted to +users' surroundings like in the ARD Audiothek unterwegs project, or give users +the ability to experience audio-visual content from different perspectives like +in the ORPHEUS project. Although these projects provide comprehensive insight +into the potential of metadata, their integration in existing infrastructures +meets several limitations. For example, content-related metadata may initially +be generated at some point during the production process but will then be lost +at later stages due to current standards and incomplete software +implementations. In our contribution, we will discuss requirements and +potential approaches and give an outlook on possible fields of application and +use-cases. + +
+
+ comment: 12 pages, 4 figures, submitted to the Tonmeistertagung 32 +
+
+
+
+
+ + ☆ Detached and Interactive Multimodal Learning ACM MM 24 + + +
+ Recently, Multimodal Learning (MML) has gained significant interest as it +compensates for single-modality limitations through comprehensive complementary +information within multimodal data. However, traditional MML methods generally +use the joint learning framework with a uniform learning objective that can +lead to the modality competition issue, where feedback predominantly comes from +certain modalities, limiting the full potential of others. In response to this +challenge, this paper introduces DI-MML, a novel detached MML framework +designed to learn complementary information across modalities under the premise +of avoiding modality competition. Specifically, DI-MML addresses competition by +separately training each modality encoder with isolated learning objectives. It +further encourages cross-modal interaction via a shared classifier that defines +a common feature space and employing a dimension-decoupled unidirectional +contrastive (DUC) loss to facilitate modality-level knowledge transfer. +Additionally, to account for varying reliability in sample pairs, we devise a +certainty-aware logit weighting strategy to effectively leverage complementary +information at the instance level during inference. Extensive experiments +conducted on audio-visual, flow-image, and front-rear view datasets show the +superior performance of our proposed method. The code is released at +https://github.com/fanyunfeng-bit/DI-MML. + +
+
+ comment: Accepted by ACM MM 24 +
+
+
+
+
+ + ☆ Official-NV: A News Video Dataset for Multimodal Fake News Detection + + +
+ News media, especially video news media, have penetrated every aspect of daily life, which also brings the risk of fake news. Therefore, multimodal fake news detection has recently received more attention. However, the number of fake news detection datasets for the video modality is small, and these datasets are composed of unofficial videos uploaded by users, so they contain a large amount of useless data. To solve this problem, we present in this paper a dataset named Official-NV, which consists of officially published news videos from Xinhua. We crawled videos on Xinhua and then extended the dataset using LLM generation and manual modification. In addition, we benchmarked the dataset presented in this paper using a baseline model to demonstrate the advantage of Official-NV in multimodal fake news detection.
+
+
+
+
+ + ☆ MVPbev: Multi-view Perspective Image Generation from BEV with Test-time + Controllability and Generalizability ACM MM24 + + +
+ This work aims to address the multi-view perspective RGB generation from text +prompts given Bird-Eye-View(BEV) semantics. Unlike prior methods that neglect +layout consistency, lack the ability to handle detailed text prompts, or are +incapable of generalizing to unseen view points, MVPbev simultaneously +generates cross-view consistent images of different perspective views with a +two-stage design, allowing object-level control and novel view generation at +test-time. Specifically, MVPbev firstly projects given BEV semantics to +perspective view with camera parameters, empowering the model to generalize to +unseen view points. Then we introduce a multi-view attention module where +special initialization and de-noising processes are introduced to explicitly +enforce local consistency among overlapping views w.r.t. cross-view homography. +Last but not least, MVPbev further allows test-time instance-level +controllability by refining a pre-trained text-to-image diffusion model. Our +extensive experiments on NuScenes demonstrate that our method is capable of +generating high-resolution photorealistic images from text descriptions with +thousands of training samples, surpassing the state-of-the-art methods under +various evaluation metrics. We further demonstrate the advances of our method +in terms of generalizability and controllability with the help of novel +evaluation metrics and comprehensive human analysis. Our code, data, and model +can be found in \url{https://github.com/kkaiwwana/MVPbev}. + +
+
+ comment: Accepted by ACM MM24 +
+
+
+
+
+ + ☆ An Inverse Partial Optimal Transport Framework for Music-guided Movie + Trailer Generation + + +
+ Trailer generation is a challenging video clipping task that aims to select +highlighting shots from long videos like movies and re-organize them in an +attractive way. In this study, we propose an inverse partial optimal transport +(IPOT) framework to achieve music-guided movie trailer generation. In +particular, we formulate the trailer generation task as selecting and sorting +key movie shots based on audio shots, which involves matching the latent +representations across visual and acoustic modalities. We learn a multi-modal +latent representation model in the proposed IPOT framework to achieve this aim. +In this framework, a two-tower encoder derives the latent representations of +movie and music shots, respectively, and an attention-assisted Sinkhorn +matching network parameterizes the grounding distance between the shots' latent +representations and the distribution of the movie shots. Taking the +correspondence between the movie shots and its trailer music shots as the +observed optimal transport plan defined on the grounding distances, we learn +the model by solving an inverse partial optimal transport problem, leading to a +bi-level optimization strategy. We collect real-world movies and their trailers +to construct a dataset with abundant label information called CMTD and, +accordingly, train and evaluate various automatic trailer generators. Compared +with state-of-the-art methods, our IPOT method consistently shows superiority +in subjective visual effects and objective quantitative measurements. + +
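The Sinkhorn component mentioned above can be illustrated with the standard entropic-regularized iterations below; this is a generic sketch given a grounding-distance matrix, not the paper's attention-assisted network or its bi-level inverse-OT training:

import numpy as np

def sinkhorn_plan(C, a, b, eps=0.05, n_iter=200):
    # Entropic OT: soft assignment between movie shots (rows) and music shots (columns).
    # C: [n, m] grounding distances; a, b: marginal weights summing to 1.
    K = np.exp(-C / eps)
    u = np.ones_like(a)
    for _ in range(n_iter):
        v = b / (K.T @ u)
        u = a / (K @ v)
    return u[:, None] * K * v[None, :]   # transport plan of shape [n, m]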
+
+ comment: acmmm2024 +
+
+
+
+
+ + ☆ Start from Video-Music Retrieval: An Inter-Intra Modal Loss for Cross + Modal Retrieval + + +
+ The burgeoning short video industry has accelerated the advancement of video-music retrieval technology, assisting content creators in selecting appropriate music for their videos. In self-supervised training for video-to-music retrieval, the video and music samples in the dataset are extracted from the same video work, so they are all one-to-one matches. This does not match the real situation: in reality, a video can use different pieces of music as background music, and a piece of music can be used as background music for different videos. Many videos and music tracks that are not in a pair may be compatible, leading to false-negative noise in the dataset. A novel inter-intra modal (II) loss is proposed as a solution. By reducing the variation of the feature distribution within the two modalities before and after the encoder, the II loss can reduce the model's overfitting to such noise without having to remove it in a costly and laborious way. The video-music retrieval framework II-CLVM (Contrastive Learning for Video-Music Retrieval), incorporating the II loss, achieves state-of-the-art performance on the YouTube8M dataset. The framework II-CLVTM shows better performance when retrieving music using multi-modal video information (such as text in videos). Experiments are designed to show that the II loss can effectively alleviate the problem of false-negative noise in retrieval tasks. Experiments also show that the II loss improves various self-supervised and supervised uni-modal and cross-modal retrieval tasks, and can obtain good retrieval models with a small number of training samples.
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ AdaCoder: Adaptive Prompt Compression for Programmatic Visual Question + Answering + + +
+ Visual question answering aims to provide responses to natural language +questions given visual input. Recently, visual programmatic models (VPMs), +which generate executable programs to answer questions through large language +models (LLMs), have attracted research interest. However, they often require +long input prompts to provide the LLM with sufficient API usage details to +generate relevant code. To address this limitation, we propose AdaCoder, an +adaptive prompt compression framework for VPMs. AdaCoder operates in two +phases: a compression phase and an inference phase. In the compression phase, +given a preprompt that describes all API definitions in the Python language +with example snippets of code, a set of compressed preprompts is generated, +each depending on a specific question type. In the inference phase, given an +input question, AdaCoder predicts the question type and chooses the appropriate +corresponding compressed preprompt to generate code to answer the question. +Notably, AdaCoder employs a single frozen LLM and pre-defined prompts, negating +the necessity of additional training and maintaining adaptability across +different powerful black-box LLMs such as GPT and Claude. In experiments, we +apply AdaCoder to ViperGPT and demonstrate that it reduces token length by +71.1%, while maintaining or even improving the performance of visual question +answering. + +
+
+
+
+
+ + ♻ ☆ Overcome Modal Bias in Multi-modal Federated Learning via Balanced + Modality Selection ECCV24 + + +
+ Selecting proper clients to participate in each federated learning (FL) round +is critical to effectively harness a broad range of distributed data. Existing +client selection methods simply consider the mining of distributed uni-modal +data, yet, their effectiveness may diminish in multi-modal FL (MFL) as the +modality imbalance problem not only impedes the collaborative local training +but also leads to a severe global modality-level bias. We empirically reveal +that local training with a certain single modality may contribute more to the +global model than training with all local modalities. To effectively exploit +the distributed multiple modalities, we propose a novel Balanced Modality +Selection framework for MFL (BMSFed) to overcome the modal bias. On the one +hand, we introduce a modal enhancement loss during local training to alleviate +local imbalance based on the aggregated global prototypes. On the other hand, +we propose the modality selection aiming to select subsets of local modalities +with great diversity and achieving global modal balance simultaneously. Our +extensive experiments on audio-visual, colored-gray, and front-back datasets +showcase the superiority of BMSFed over baselines and its effectiveness in +multi-modal data exploitation. + +
+
+ comment: Accepted by ECCV24, 23 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 27 + +
+
+
+ + ☆ Polynomial Regression as a Task for Understanding In-context Learning + Through Finetuning and Alignment ICML + + +
+ Simple function classes have emerged as toy problems to better understand +in-context-learning in transformer-based architectures used for large language +models. But previously proposed simple function classes like linear regression +or multi-layer-perceptrons lack the structure required to explore things like +prompting and alignment within models capable of in-context-learning. We +propose univariate polynomial regression as a function class that is just rich +enough to study prompting and alignment, while allowing us to visualize and +understand what is going on clearly. + +
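A minimal version of the proposed task can be generated as below; the degree, input range, and noise level are illustrative assumptions rather than the paper's exact settings:

import numpy as np

def sample_polynomial_prompt(degree=3, n_context=16, noise=0.05, seed=None):
    # Build an in-context prompt: (x_i, y_i) pairs from a random univariate polynomial plus a query.
    rng = np.random.default_rng(seed)
    coeffs = rng.normal(0.0, 1.0, degree + 1)
    xs = rng.uniform(-1.0, 1.0, n_context + 1)             # last point serves as the query
    ys = np.polyval(coeffs, xs) + rng.normal(0.0, noise, n_context + 1)
    context = list(zip(xs[:-1], ys[:-1]))
    return context, xs[-1], ys[-1]                          # in-context examples, query x, target y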
+
+ comment: ICML Workshop on In-Context Learning +
+
+
+
+
+ + ☆ Inference-Time Selective Debiasing + + +
+ We propose selective debiasing -- an inference-time safety mechanism that +aims to increase the overall quality of models in terms of prediction +performance and fairness in the situation when re-training a model is +prohibitive. The method is inspired by selective prediction, where some +predictions that are considered low quality are discarded at inference time. In +our approach, we identify the potentially biased model predictions and, instead +of discarding them, we debias them using LEACE -- a post-processing debiasing +method. To select problematic predictions, we propose a bias quantification +approach based on KL divergence, which achieves better results than standard UQ +methods. Experiments with text classification datasets demonstrate that +selective debiasing helps to close the performance gap between post-processing +methods and at-training and pre-processing debiasing techniques. + +
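One way to read the selection rule is sketched below: score each prediction by the KL divergence between the model's original distribution and its post-processed (e.g., LEACE-debiased) distribution, and debias only the most-shifted predictions. The quantile threshold and the exact direction of the comparison are assumptions for illustration:

import numpy as np

def kl_divergence(p, q, eps=1e-12):
    p, q = np.clip(p, eps, 1.0), np.clip(q, eps, 1.0)
    return np.sum(p * np.log(p / q), axis=-1)

def selective_debias(probs_raw, probs_debiased, quantile=0.9):
    # Keep the debiased distribution only for predictions whose bias score is above the quantile.
    scores = kl_divergence(probs_raw, probs_debiased)
    mask = scores > np.quantile(scores, quantile)
    return np.where(mask[:, None], probs_debiased, probs_raw)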
+
+
+
+
+ + ☆ Parameter-Efficient Fine-Tuning via Circular Convolution + + +
+ Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large +foundation models, leveraging low-rank matrices $\mathbf{A}$ and $\mathbf{B}$ +to represent weight changes (\textit{i.e.,} $\Delta \mathbf{W} = \mathbf{B} +\mathbf{A}$). This method reduces trainable parameters and mitigates heavy +memory consumption associated with full delta matrices by sequentially +multiplying $\mathbf{A}$ and $\mathbf{B}$ with the activation. Despite its +success, the intrinsic low-rank characteristic may limit its performance. +Although several variants have been proposed to address this issue, they often +overlook the crucial computational and memory efficiency brought by LoRA. In +this paper, we propose \underline{C}ir\underline{c}ular \underline{C}onvolution +\underline{A}daptation (C$^3$A), which not only achieves high-rank adaptation +with enhanced performance but also excels in both computational power and +memory utilization. Extensive experiments demonstrate that C$^3$A consistently +outperforms LoRA and its variants across various fine-tuning tasks. + +
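The core operation can be illustrated with an FFT-based circular convolution, which replaces the dense (or low-rank) weight update with a single length-d kernel applied in O(d log d); this is a generic sketch, and the actual C$^3$A parameterization (block structure, initialization) may differ:

import torch

def circular_conv(x, w):
    # Circular convolution of activations x [batch, d] with kernel w [d] via the FFT.
    d = x.shape[-1]
    return torch.fft.irfft(torch.fft.rfft(x, n=d) * torch.fft.rfft(w, n=d), n=d)

# Example (sketch): delta_h = circular_conv(hidden, kernel) plays the role of B @ (A @ hidden) in LoRA.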
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Do Language Models Have a Critical Period for Language Acquisition? + + +
+ Humans appear to have a critical period (CP) for language acquisition: Second +language (L2) acquisition becomes harder after early childhood, and ceasing +exposure to a first language (L1) after this period (but not before) typically +does not lead to substantial loss of L1 proficiency. It is unknown whether +these CP effects result from innately determined brain maturation or as a +stabilization of neural connections naturally induced by experience. In this +study, we use language models (LMs) to test the extent to which these phenomena +are peculiar to humans, or shared by a broader class of language learners. We +vary the age of exposure by training LMs on language pairs in various +experimental conditions, and find that LMs, which lack any direct analog to +innate maturational stages, do not show CP effects when trained sequentially on +L1 and L2. Our results contradict the claim that CP effects are an inevitable +result of learning in statistical learners, and they are consistent with an +innate mechanism for CP effects. We show that we can reverse-engineer the CP by +introducing a regularizer partway through training to simulate a maturational +decrease in plasticity. All in all, our results suggest that L1 learning on its +own may not be enough to induce a CP, and additional engineering is necessary +to make language models more cognitively plausible. + +
+
+
+
+
+ + ☆ IBMEA: Exploring Variational Information Bottleneck for Multi-modal + Entity Alignment ACM MM 2024 + + +
+ Multi-modal entity alignment (MMEA) aims to identify equivalent entities +between multi-modal knowledge graphs (MMKGs), where the entities can be +associated with related images. Most existing studies integrate multi-modal +information heavily relying on the automatically-learned fusion module, rarely +suppressing the redundant information for MMEA explicitly. To this end, we +explore variational information bottleneck for multi-modal entity alignment +(IBMEA), which emphasizes the alignment-relevant information and suppresses the +alignment-irrelevant information in generating entity representations. +Specifically, we devise multi-modal variational encoders to generate +modal-specific entity representations as probability distributions. Then, we +propose four modal-specific information bottleneck regularizers, limiting the +misleading clues in refining modal-specific entity representations. Finally, we +propose a modal-hybrid information contrastive regularizer to integrate all the +refined modal-specific representations, enhancing the entity similarity between +MMKGs to achieve MMEA. We conduct extensive experiments on two cross-KG and +three bilingual MMEA datasets. Experimental results demonstrate that our model +consistently outperforms previous state-of-the-art methods, and also shows +promising and robust performance in low-resource and high-noise data scenarios. + +
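A generic variational information-bottleneck regularizer of the kind referred to here encodes each modality into a Gaussian posterior and penalizes its KL divergence to a standard normal prior; the sketch below is the textbook formulation, not IBMEA's specific modal regularizers:

import torch

def vib_encode(mu, logvar):
    # Reparameterized sample plus KL(q(z|x) || N(0, I)) for one modality's entity representation.
    z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)
    kl = 0.5 * torch.sum(torch.exp(logvar) + mu ** 2 - 1.0 - logvar, dim=-1)
    return z, kl.mean()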
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ The Impact of LoRA Adapters for LLMs on Clinical NLP Classification + Under Data Limitations + + +
+ Fine-tuning Large Language Models (LLMs) for clinical Natural Language Processing (NLP) poses significant challenges due to the domain gap and limited data availability. This study investigates the effectiveness of various adapter techniques, equivalent to Low-Rank Adaptation (LoRA), for fine-tuning LLMs in a resource-constrained hospital environment. We experimented with four structures (Adapter, Lightweight, TinyAttention, and Gated Residual Network (GRN)) as final layers for clinical note classification. We fine-tuned biomedical pre-trained models, including CamemBERT-bio, AliBERT, and DrBERT, alongside two Transformer-based models. Our extensive experimental results indicate that i) employing adapter structures does not yield significant improvements in fine-tuning biomedical pre-trained LLMs, and ii) simpler Transformer-based models, trained from scratch, perform better under resource constraints. Among the adapter structures, GRN demonstrated superior performance, with accuracy, precision, recall, and an F1 score of 0.88. Moreover, the total training time for LLMs exceeded 1000 hours, compared to under 6 hours for the simpler Transformer-based models, highlighting that LLMs are more suitable for environments with extensive computational resources and larger datasets. Consequently, this study demonstrates that simpler Transformer-based models can be effectively trained from scratch, providing a viable solution for clinical NLP tasks in low-resource environments with limited data availability. By identifying the GRN as the most effective adapter structure, we offer a practical approach to enhance clinical note classification without requiring extensive computational resources.
+
+ comment: Under revisions +
+
+
+
+
+ + ☆ Understanding Memorisation in LLMs: Dynamics, Influencing Factors, and + Implications + + +
+ Understanding whether and to what extent large language models (LLMs) have +memorised training data has important implications for the reliability of their +output and the privacy of their training data. In order to cleanly measure and +disentangle memorisation from other phenomena (e.g. in-context learning), we +create an experimental framework that is based on repeatedly exposing LLMs to +random strings. Our framework allows us to better understand the dynamics, +i.e., the behaviour of the model, when repeatedly exposing it to random +strings. Using our framework, we make several striking observations: (a) we +find consistent phases of the dynamics across families of models (Pythia, Phi +and Llama2), (b) we identify factors that make some strings easier to memorise +than others, and (c) we identify the role of local prefixes and global context +in memorisation. We also show that sequential exposition to different random +strings has a significant effect on memorisation. Our results, often +surprising, have significant downstream implications in the study and usage of +LLMs. + +
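A stripped-down version of such a framework might look like the sketch below: fix a random string, repeatedly fine-tune on it (training loop omitted), and measure how much of it the model reproduces greedily from a short prefix. The model name, string length, and prefix length are illustrative assumptions:

import random, string
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")

rand_str = "".join(random.choices(string.ascii_lowercase, k=64))   # the random string to memorise
ids = tok(rand_str, return_tensors="pt").input_ids

def memorised_fraction(prefix_tokens=8):
    # Fraction of the string's tokens reproduced by greedy decoding from a short prefix.
    with torch.no_grad():
        out = model.generate(ids[:, :prefix_tokens], max_length=ids.shape[1], do_sample=False)
    n = min(out.shape[1], ids.shape[1])
    return (out[0, prefix_tokens:n] == ids[0, prefix_tokens:n]).float().mean().item()

# Interleave calls to memorised_fraction() with repeated fine-tuning passes over rand_str (omitted).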
+
+
+
+
+ + ☆ Stochastic Parrots or ICU Experts? Large Language Models in Critical + Care Medicine: A Scoping Review + + +
+ With the rapid development of artificial intelligence (AI), large language models (LLMs) have shown strong capabilities in natural language understanding, reasoning, and generation, attracting substantial research interest in applying LLMs to health and medicine. Critical care medicine (CCM) provides diagnosis and treatment for critically ill patients who often require intensive monitoring and interventions in intensive care units (ICUs). Can LLMs be applied to CCM? Are LLMs just stochastic parrots, or ICU experts assisting clinical decision-making? This scoping review aims to provide a panoramic portrait of the application of LLMs in CCM. Literature in seven databases, including PubMed, Embase, Scopus, Web of Science, CINAHL, IEEE Xplore, and ACM Digital Library, was searched from January 1, 2019, to June 10, 2024. Peer-reviewed journal and conference articles that discussed the application of LLMs in critical care settings were included. From an initial 619 articles, 24 were selected for final review. This review grouped the applications of LLMs in CCM into three categories: clinical decision support, medical documentation and reporting, and medical education and doctor-patient communication. LLMs have advantages in handling unstructured data and do not require manual feature engineering. Meanwhile, applying LLMs to CCM faces challenges, including hallucinations, poor interpretability, bias and alignment challenges, and privacy and ethics issues. Future research should enhance model reliability and interpretability, integrate up-to-date medical knowledge, and strengthen privacy and ethical guidelines. As LLMs evolve, they could become key tools in CCM to help improve patient outcomes and optimize healthcare delivery. This study is the first review of LLMs in CCM, aiding researchers, clinicians, and policymakers in understanding the current status and future potential of LLMs in CCM.
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ☆ On Behalf of the Stakeholders: Trends in NLP Model Interpretability in + the Era of LLMs + + +
+ Recent advancements in NLP systems, particularly with the introduction of +LLMs, have led to widespread adoption of these systems by a broad spectrum of +users across various domains, impacting decision-making, the job market, +society, and scientific research. This surge in usage has led to an explosion +in NLP model interpretability and analysis research, accompanied by numerous +technical surveys. Yet, these surveys often overlook the needs and perspectives +of explanation stakeholders. In this paper, we address three fundamental +questions: Why do we need interpretability, what are we interpreting, and how? +By exploring these questions, we examine existing interpretability paradigms, +their properties, and their relevance to different stakeholders. We further +explore the practical implications of these paradigms by analyzing trends from +the past decade across multiple research fields. To this end, we retrieved +thousands of papers and employed an LLM to characterize them. Our analysis +reveals significant disparities between NLP developers and non-developer users, +as well as between research fields, underscoring the diverse needs of +stakeholders. For example, explanations of internal model components are rarely +used outside the NLP field. We hope this paper informs the future design, +development, and application of methods that align with the objectives and +requirements of various stakeholders. + +
+
+
+
+
+ + ☆ Towards the Dynamics of a DNN Learning Symbolic Interactions + + +
+ This study proves the two-phase dynamics of a deep neural network (DNN) +learning interactions. Despite the long disappointing view of the faithfulness +of post-hoc explanation of a DNN, in recent years, a series of theorems have +been proven to show that given an input sample, a small number of interactions +between input variables can be considered as primitive inference patterns, +which can faithfully represent every detailed inference logic of the DNN on +this sample. Particularly, it has been observed that various DNNs all learn +interactions of different complexities with two-phase dynamics, and this well +explains how a DNN's generalization power changes from under-fitting to +over-fitting. Therefore, in this study, we prove the dynamics of a DNN +gradually encoding interactions of different complexities, which provides a +theoretically grounded mechanism for the over-fitting of a DNN. Experiments +show that our theory well predicts the real learning dynamics of various DNNs +on different tasks. + +
+
+
+
+
+ + ☆ Why Misinformation is Created? Detecting them by Integrating Intent + Features CIKM 2024 + + +
+ Various social media platforms, e.g., Twitter and Reddit, allow people to +disseminate a plethora of information more efficiently and conveniently. +However, they are inevitably full of misinformation, causing damage to diverse +aspects of our daily lives. To reduce the negative impact, timely +identification of misinformation, namely Misinformation Detection (MD), has +become an active research topic receiving widespread attention. As a complex +phenomenon, the veracity of an article is influenced by various aspects. In +this paper, we are inspired by the opposition of intents between misinformation +and real information. Accordingly, we propose to reason the intent of articles +and form the corresponding intent features to promote the veracity +discrimination of article features. To achieve this, we build a hierarchy of a +set of intents for both misinformation and real information by referring to the +existing psychological theories, and we apply it to reason the intent of +articles by progressively generating binary answers with an encoder-decoder +structure. We form the corresponding intent features and integrate it with the +token features to achieve more discriminative article features for MD. Upon +these ideas, we suggest a novel MD method, namely Detecting Misinformation by +Integrating Intent featuRes (DM-INTER). To evaluate the performance of +DM-INTER, we conduct extensive experiments on benchmark MD datasets. The +experimental results validate that DM-INTER can outperform the existing +baseline MD methods. + +
+
+ comment: 11 pages, 3 figures. Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Harmfully Manipulated Images Matter in Multimodal Misinformation + Detection ACM MM 2024 + + +
+ Nowadays, misinformation is widely spreading over various social media +platforms and causes extremely negative impacts on society. To combat this +issue, automatically identifying misinformation, especially those containing +multimodal content, has attracted growing attention from the academic and +industrial communities, and induced an active research topic named Multimodal +Misinformation Detection (MMD). Typically, existing MMD methods capture the +semantic correlation and inconsistency between multiple modalities, but neglect +some potential clues in multimodal content. Recent studies suggest that +manipulated traces of the images in articles are non-trivial clues for +detecting misinformation. Meanwhile, we find that the underlying intentions +behind the manipulation, e.g., harmful and harmless, also matter in MMD. +Accordingly, in this work, we propose to detect misinformation by learning +manipulation features that indicate whether the image has been manipulated, as +well as intention features regarding the harmful and harmless intentions of the +manipulation. Unfortunately, the manipulation and intention labels that make +these features discriminative are unknown. To overcome the problem, we propose +two weakly supervised signals as alternatives by introducing additional +datasets on image manipulation detection and formulating two classification +tasks as positive and unlabeled learning problems. Based on these ideas, we +propose a novel MMD method, namely Harmfully Manipulated Images Matter in MMD +(HAMI-M3D). Extensive experiments across three benchmark datasets can +demonstrate that HAMI-M3D can consistently improve the performance of any MMD +baselines. + +
+
+ comment: Accepted by ACM MM 2024. Code: + https://github.com/wangbing1416/HAMI-M3D +
+
+
+
+
+ + ☆ FarSSiBERT: A Novel Transformer-based Model for Semantic Similarity + Measurement of Persian Social Networks Informal Texts + + +
+ One fundamental task for NLP is to determine the similarity between two texts +and evaluate the extent of their likeness. The previous methods for the Persian +language have low accuracy and are unable to comprehend the structure and +meaning of texts effectively. Additionally, these methods primarily focus on +formal texts, but in real-world applications of text processing, there is a +need for robust methods that can handle colloquial texts. This requires +algorithms that consider the structure and significance of words based on +context, rather than just the frequency of words. The lack of a proper dataset +for this task in the Persian language makes it important to develop such +algorithms and construct a dataset for Persian text. This paper introduces a +new transformer-based model to measure semantic similarity between Persian +informal short texts from social networks. In addition, a Persian dataset named +FarSSiM has been constructed for this purpose, using real data from social +networks and manually annotated and verified by a linguistic expert team. The +proposed model involves training a large language model using the BERT +architecture from scratch. This model, called FarSSiBERT, is pre-trained on +approximately 104 million Persian informal short texts from social networks, +making it one of a kind in the Persian language. Moreover, a novel specialized +informal language tokenizer is provided that not only performs tokenization on +formal texts well but also accurately identifies tokens that other Persian +tokenizers are unable to recognize. It has been demonstrated that our proposed +model outperforms ParsBERT, laBSE, and multilingual BERT in the Pearson and +Spearman's coefficient criteria. Additionally, the pre-trained large language +model has great potential for use in other NLP tasks on colloquial text and as +a tokenizer for less-known informal words. + +
+
+
+
+
+ + ☆ Addressing Topic Leakage in Cross-Topic Evaluation for Authorship + Verification + + +
+ Authorship verification (AV) aims to identify whether a pair of texts has the +same author. We address the challenge of evaluating AV models' robustness +against topic shifts. The conventional evaluation assumes minimal topic overlap +between training and test data. However, we argue that there can still be topic +leakage in test data, causing misleading model performance and unstable +rankings. To address this, we propose an evaluation method called +Heterogeneity-Informed Topic Sampling (HITS), which creates a smaller dataset +with a heterogeneously distributed topic set. Our experimental results +demonstrate that HITS-sampled datasets yield a more stable ranking of models +across random seeds and evaluation splits. Our contributions include: 1. An +analysis of causes and effects of topic leakage. 2. A demonstration of the HITS +in reducing the effects of topic leakage, and 3. The Robust Authorship +Verification bENchmark (RAVEN) that allows topic shortcut test to uncover AV +models' reliance on topic-specific features. + +
+
+ comment: Accepted to publish at Transactions of the Association for + Computational Linguistics +
+
+
+
+
+ + ♻ ☆ CogErgLLM: Exploring Large Language Model Systems Design Perspective + Using Cognitive Ergonomics ICML 2024 + + +
+ Integrating cognitive ergonomics with LLMs is essential for enhancing safety, +reliability, and user satisfaction in human-AI interactions. Current LLM design +often lacks this integration, leading to systems that may not fully align with +human cognitive capabilities and limitations. Insufficient focus on +incorporating cognitive science methods exacerbates biases in LLM outputs, +while inconsistent application of user-centered design principles results in +sub-optimal user experiences. To address these challenges, our position paper +explores the critical integration of cognitive ergonomics principles into LLM +design, aiming to provide a comprehensive framework and practical guidelines +for ethical LLM development. Through our contributions, we seek to advance +understanding and practice in integrating cognitive ergonomics into LLM +systems, fostering safer, more reliable, and ethically sound human-AI +interactions. + +
+
+ comment: 8 Page, 3 Figures. Accepted to Large Language Models and Cognition @ + ICML 2024 (https://llm-cognition.github.io/#:~:text=CogErgLLM) +
+
+
+
+
+ + ♻ ☆ Evaluating LLMs at Detecting Errors in LLM Responses + + +
+ With Large Language Models (LLMs) being widely used across various tasks,
+detecting errors in their responses is increasingly crucial. However, little
+research has been conducted on error detection of LLM responses. Collecting
+error annotations on LLM responses is challenging due to the subjective nature
+of many NLP tasks, and thus previous research focuses on tasks of little
+practical value (e.g., word sorting) or limited error types (e.g., faithfulness
+in summarization). This work introduces ReaLMistake, the first error detection
+benchmark consisting of objective, realistic, and diverse errors made by LLMs.
+ReaLMistake contains three challenging and meaningful tasks that introduce
+objectively assessable errors in four categories (reasoning correctness,
+instruction-following, context-faithfulness, and parameterized knowledge),
+eliciting naturally observed and diverse errors in responses of GPT-4 and Llama
+2 70B, annotated by experts. We use ReaLMistake to evaluate error detectors
+based on 12 LLMs. Our findings show: 1) Top LLMs like GPT-4 and Claude 3 detect
+errors made by LLMs at very low recall, and all LLM-based error detectors
+perform much worse than humans. 2) Explanations by LLM-based error detectors
+lack reliability. 3) LLM-based error detection is sensitive to small changes
+in prompts but remains challenging to improve. 4) Popular approaches to
+improving LLMs, including self-consistency and majority vote, do not improve
+the error detection performance. Our benchmark and code are provided at
+https://github.com/psunlpgroup/ReaLMistake.
+
+

+
+ comment: COLM 2024, 46 pages, Benchmark and code: + https://github.com/psunlpgroup/ReaLMistake +
+
+
+
+
+ + ♻ ☆ [Call for Papers] The 2nd BabyLM Challenge: Sample-efficient pretraining + on a developmentally plausible corpus + + +
+ After last year's successful BabyLM Challenge, the competition will be hosted +again in 2024/2025. The overarching goals of the challenge remain the same; +however, some of the competition rules will be different. The big changes for +this year's competition are as follows: First, we replace the loose track with +a paper track, which allows (for example) non-model-based submissions, novel +cognitively-inspired benchmarks, or analysis techniques. Second, we are +relaxing the rules around pretraining data, and will now allow participants to +construct their own datasets provided they stay within the 100M-word or +10M-word budget. Third, we introduce a multimodal vision-and-language track, +and will release a corpus of 50% text-only and 50% image-text multimodal data +as a starting point for LM model training. The purpose of this CfP is to +provide rules for this year's challenge, explain these rule changes and their +rationale in greater detail, give a timeline of this year's competition, and +provide answers to frequently asked questions from last year's challenge. + +
+
+
+
+
+ + ♻ ☆ Emotion Detection with Transformers: A Comparative Study + + +
+ In this study, we explore the application of transformer-based models for
+emotion classification on text data. We train and evaluate several pre-trained
+transformer models on the Emotion dataset using different variants of
+transformers. The paper also analyzes some factors that influence the
+performance of the model, such as the fine-tuning of the transformer layer, the
+trainability of the layer, and the preprocessing of the text data. Our analysis
+reveals that commonly applied techniques like removing punctuation and stop
+words can hinder model performance. This might be because transformers' strength
+lies in understanding contextual relationships within text. Elements like
+punctuation and stop words can still convey sentiment or emphasis, and removing
+them might disrupt this context.
+
+

+
+
+
+
+ + ♻ ☆ Extracting Emotion Phrases from Tweets using BART + + +
+ Sentiment analysis is a natural language processing task that aims to +identify and extract the emotional aspects of a text. However, many existing +sentiment analysis methods primarily classify the overall polarity of a text, +overlooking the specific phrases that convey sentiment. In this paper, we +applied an approach to sentiment analysis based on a question-answering +framework. Our approach leverages the power of Bidirectional Autoregressive +Transformer (BART), a pre-trained sequence-to-sequence model, to extract a +phrase from a given text that amplifies a given sentiment polarity. We create a +natural language question that identifies the specific emotion to extract and +then guide BART to pay attention to the relevant emotional cues in the text. We +use a classifier within BART to predict the start and end positions of the +answer span within the text, which helps to identify the precise boundaries of +the extracted emotion phrase. Our approach offers several advantages over most +sentiment analysis studies, including capturing the complete context and +meaning of the text and extracting precise token spans that highlight the +intended sentiment. We achieved an end loss of 87% and Jaccard score of 0.61. + +
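+ As a hedged illustration of this question-answering formulation, the sketch below
+uses the Hugging Face transformers API to predict a start/end span for an emotion
+question; "facebook/bart-base" is only a stand-in to show the interface (its QA head
+is untrained), and a checkpoint fine-tuned for emotion-phrase extraction would be
+needed for meaningful output.
+
+ import torch
+ from transformers import AutoTokenizer, BartForQuestionAnswering
+
+ # Hypothetical setup: a fine-tuned checkpoint would replace "facebook/bart-base".
+ tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
+ model = BartForQuestionAnswering.from_pretrained("facebook/bart-base")
+
+ question = "Which phrase expresses negative sentiment?"
+ context = "The staff were friendly but the endless waiting ruined the evening."
+ inputs = tokenizer(question, context, return_tensors="pt")
+
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ # Pick the most likely start and end positions, then decode the span.
+ # A real decoder would additionally constrain end >= start.
+ start = outputs.start_logits.argmax(dim=-1).item()
+ end = outputs.end_logits.argmax(dim=-1).item()
+ span_ids = inputs["input_ids"][0][start:end + 1]
+ print(tokenizer.decode(span_ids, skip_special_tokens=True))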
+
+
+
+
+ + ♻ ☆ PoliTune: Analyzing the Impact of Data Selection and Fine-Tuning on + Economic and Political Biases in Large Language Models AAAI + + +
+ In an era where language models are increasingly integrated into +decision-making and communication, understanding the biases within Large +Language Models (LLMs) becomes imperative, especially when these models are +applied in the economic and political domains. This work investigates the +impact of fine-tuning and data selection on economic and political biases in +LLMs. In this context, we introduce PoliTune, a fine-tuning methodology to +explore the systematic aspects of aligning LLMs with specific ideologies, +mindful of the biases that arise from their extensive training on diverse +datasets. Distinct from earlier efforts that either focus on smaller models or +entail resource-intensive pre-training, PoliTune employs Parameter-Efficient +Fine-Tuning (PEFT) techniques, which allow for the alignment of LLMs with +targeted ideologies by modifying a small subset of parameters. We introduce a +systematic method for using the open-source LLM Llama3-70B for dataset +selection, annotation, and synthesizing a preferences dataset for Direct +Preference Optimization (DPO) to align the model with a given political +ideology. We assess the effectiveness of PoliTune through both quantitative and +qualitative evaluations of aligning open-source LLMs (Llama3-8B and Mistral-7B) +to different ideologies. Our work analyzes the potential of embedding specific +biases into LLMs and contributes to the dialogue on the ethical application of +AI, highlighting the importance of deploying AI in a manner that aligns with +societal values. + +
+
+ comment: AIES '24: Proceedings of the 2024 AAAI/ACM Conference on AI, Ethics, + and Society +
+
+
+
+
+ + ♻ ☆ Redundancy Aware Multi-Reference Based Gainwise Evaluation of Extractive + Summarization + + +
+ The ROUGE metric is commonly used to evaluate the extractive summarization task,
+but it has been criticized for its lack of semantic awareness and its failure to
+account for the ranking quality of the extractive summarizer. Previous research has
+introduced a gain-based automated metric called Sem-nCG that addresses these
+issues, as it is both rank and semantic aware. However, it does not consider
+the amount of redundancy present in a model summary and currently does not
+support evaluation with multiple reference summaries. It is essential to have a
+model summary that balances importance and diversity, but finding a metric that
+captures both of these aspects is challenging. In this paper, we propose a
+redundancy-aware Sem-nCG metric and demonstrate how the revised Sem-nCG metric
+can be used to evaluate model summaries against multiple references as well,
+which was missing in previous research. Experimental results demonstrate that
+the revised Sem-nCG metric has a stronger correlation with human judgments
+compared to the previous Sem-nCG metric and the traditional ROUGE and BERTScore
+metrics for both single- and multiple-reference scenarios.
+
+

+
+ comment: Accepted to KNOVENS 2024 +
+
+
+
+
+ + ♻ ☆ SQLFixAgent: Towards Semantic-Accurate Text-to-SQL Parsing via + Consistency-Enhanced Multi-Agent Collaboration + + +
+ While fine-tuned large language models (LLMs) excel in generating
+grammatically valid SQL in Text-to-SQL parsing, they often struggle to ensure
+semantic accuracy in queries, leading to user confusion and diminished system
+usability. To tackle this challenge, we introduce SQLFixAgent, an innovative
+multi-agent collaborative framework designed for detecting and repairing
+erroneous SQL. Our framework comprises a core agent, SQLRefiner, alongside two
+auxiliary agents: SQLReviewer and QueryCrafter. The SQLReviewer agent employs
+the rubber duck debugging method to identify potential semantic mismatches
+between the SQL statement and the user query. If an error is detected, the
+QueryCrafter agent generates multiple SQL statements as candidate repairs using
+a fine-tuned SQLTool. Subsequently, leveraging similar repair retrieval and
+failure memory reflexion, the SQLRefiner agent selects the most fitting SQL
+statement from the candidates as the final repair. We evaluated our proposed
+framework on five Text-to-SQL benchmarks. The experimental results show that
+our method consistently enhances the performance of the baseline model,
+specifically achieving an execution accuracy improvement of over 3\% on the
+Bird benchmark. Our framework also has a higher token efficiency compared to
+other advanced methods, making it more competitive.
+
+

+
+
+
+
+ + ♻ ☆ M4LE: A Multi-Ability Multi-Range Multi-Task Multi-Domain Long-Context + Evaluation Benchmark for Large Language Models + + +
+ Managing long sequences has become an important and necessary feature for
+large language models (LLMs). However, it remains an open question how to
+comprehensively and systematically evaluate the long-sequence capability of
+LLMs. One of the reasons is that conventional and widely-used benchmarks mainly
+consist of short sequences. In this paper, we propose M4LE, a Multi-ability,
+Multi-range, Multi-task, Multi-domain benchmark for Long-context Evaluation.
+M4LE is based on a diverse NLP task pool comprising 36 NLP datasets, 11 task
+types and 12 domains. To alleviate the scarcity of tasks with naturally long
+sequences and incorporate multiple-ability assessment, we propose an automatic
+approach (but with negligible human annotations) to convert short-sequence
+tasks into a unified long-sequence scenario where LLMs have to identify single
+or multiple relevant spans in long contexts based on explicit or semantic
+hints. Specifically, the scenario includes five different types of abilities:
+(1) explicit single-span; (2) semantic single-span; (3) explicit multiple-span;
+(4) semantic multiple-span; and (5) global context understanding. The resulting
+samples in M4LE are evenly distributed from 1k to 8k input length. We conducted
+a systematic evaluation on 11 well-established LLMs, especially those optimized
+for long-sequence inputs. Our results reveal that: 1) Current LLMs struggle to
+understand long context, particularly when tasks require multiple-span
+attention. 2) The semantic retrieval task is more difficult for competent LLMs. 3)
+Models fine-tuned on longer text with position interpolation have comparable
+performance to those using Neural Tangent Kernel (NTK) aware scaling methods
+without fine-tuning. We make our benchmark publicly available to encourage
+future research in this challenging area.
+
+

+
+ comment: Code and data are available at https://github.com/KwanWaiChung/M4LE +
+
+
+
+
+ + ♻ ☆ Cost-efficient Crowdsourcing for Span-based Sequence Labeling: Worker + Selection and Data Augmentation CCL 2024 + + +
+ This paper introduces a novel crowdsourcing worker selection algorithm, +enhancing annotation quality and reducing costs. Unlike previous studies +targeting simpler tasks, this study contends with the complexities of label +interdependencies in sequence labeling. The proposed algorithm utilizes a +Combinatorial Multi-Armed Bandit (CMAB) approach for worker selection, and a +cost-effective human feedback mechanism. The challenge of dealing with +imbalanced and small-scale datasets, which hinders offline simulation of worker +selection, is tackled using an innovative data augmentation method termed +shifting, expanding, and shrinking (SES). Rigorous testing on CoNLL 2003 NER +and Chinese OEI datasets showcased the algorithm's efficiency, with an increase +in F1 score up to 100.04% of the expert-only baseline, alongside cost savings +up to 65.97%. The paper also encompasses a dataset-independent test emulating +annotation evaluation through a Bernoulli distribution, which still led to an +impressive 97.56% F1 score of the expert baseline and 59.88% cost savings. +Furthermore, our approach can be seamlessly integrated into Reinforcement +Learning from Human Feedback (RLHF) systems, offering a cost-effective solution +for obtaining human feedback. + +
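+ A small combinatorial-UCB sketch in the spirit of the CMAB worker selection
+described above; the exploration bonus, the [0, 1] feedback scale, and the toy
+per-worker quality values are assumptions for illustration, not the paper's
+algorithm.
+
+ import math, random
+
+ class CUCBWorkerSelector:
+     """Each round, pick the k workers with the highest upper-confidence quality
+     estimate, then update their empirical quality from observed feedback."""
+     def __init__(self, n_workers, k):
+         self.n, self.k = n_workers, k
+         self.counts = [0] * n_workers
+         self.means = [0.0] * n_workers
+         self.t = 0
+
+     def select(self):
+         self.t += 1
+         def ucb(i):
+             if self.counts[i] == 0:
+                 return float("inf")          # force initial exploration
+             return self.means[i] + math.sqrt(1.5 * math.log(self.t) / self.counts[i])
+         return sorted(range(self.n), key=ucb, reverse=True)[:self.k]
+
+     def update(self, worker, quality):
+         self.counts[worker] += 1
+         self.means[worker] += (quality - self.means[worker]) / self.counts[worker]
+
+ if __name__ == "__main__":
+     random.seed(0)
+     true_quality = [0.9, 0.7, 0.5, 0.3]      # hypothetical per-worker accuracy
+     selector = CUCBWorkerSelector(n_workers=4, k=2)
+     for _ in range(200):
+         for w in selector.select():
+             noisy = min(1.0, max(0.0, random.gauss(true_quality[w], 0.1)))
+             selector.update(w, noisy)
+     print([round(m, 2) for m in selector.means])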
+
+ comment: Camera-ready version for CCL 2024 +
+
+
+
+
+ + ♻ ☆ Positive Text Reframing under Multi-strategy Optimization + + +
+ Differing from sentiment transfer, positive reframing seeks to substitute +negative perspectives with positive expressions while preserving the original +meaning. With the emergence of pre-trained language models (PLMs), it is +possible to achieve acceptable results by fine-tuning PLMs. Nevertheless, +generating fluent, diverse and task-constrained reframing text remains a +significant challenge. To tackle this issue, a \textbf{m}ulti-\textbf{s}trategy +\textbf{o}ptimization \textbf{f}ramework (MSOF) is proposed in this paper. +Starting from the objective of positive reframing, we first design positive +sentiment reward and content preservation reward to encourage the model to +transform the negative expressions of the original text while ensuring the +integrity and consistency of the semantics. Then, different decoding +optimization approaches are introduced to improve the quality of text +generation. Finally, based on the modeling formula of positive reframing, we +propose a multi-dimensional re-ranking method that further selects candidate +sentences from three dimensions: strategy consistency, text similarity and +fluency. Extensive experiments on two Seq2Seq PLMs, BART and T5, demonstrate +our framework achieves significant improvements on unconstrained and controlled +positive reframing tasks. + +
+
+
+
+
+ + ♻ ☆ Learning Word Embedding with Better Distance Weighting and Window Size + Scheduling + + +
+ Distributed word representation (a.k.a. word embedding) is a key focus in +natural language processing (NLP). As a highly successful word embedding model, +Word2Vec offers an efficient method for learning distributed word +representations on large datasets. However, Word2Vec lacks consideration for +distances between center and context words. We propose two novel methods, +Learnable Formulated Weights (LFW) and Epoch-based Dynamic Window Size (EDWS), +to incorporate distance information into two variants of Word2Vec, the +Continuous Bag-of-Words (CBOW) model and the Continuous Skip-gram (Skip-gram) +model. For CBOW, LFW uses a formula with learnable parameters that best +reflects the relationship of influence and distance between words to calculate +distance-related weights for average pooling, providing insights for future NLP +text modeling research. For Skip-gram, we improve its dynamic window size +strategy to introduce distance information in a more balanced way. Experiments +prove the effectiveness of LFW and EDWS in enhancing Word2Vec's performance, +surpassing previous state-of-the-art methods. + +
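+ A minimal PyTorch sketch of the general idea of distance-weighted context pooling
+for CBOW; the concrete weighting formula here (a softmax over a learnable linear
+function of distance) is an assumption standing in for the paper's LFW formula, and
+the vocabulary size and dimensions are arbitrary.
+
+ import torch
+ import torch.nn as nn
+
+ class DistanceWeightedCBOW(nn.Module):
+     """CBOW where context embeddings are pooled with weights that are a learnable
+     function of each context word's distance to the center word."""
+     def __init__(self, vocab_size, dim, window):
+         super().__init__()
+         self.emb_in = nn.Embedding(vocab_size, dim)
+         self.emb_out = nn.Linear(dim, vocab_size)
+         self.a = nn.Parameter(torch.tensor(0.5))   # learnable distance decay
+         self.b = nn.Parameter(torch.tensor(0.0))   # learnable offset
+         d = torch.arange(1, window + 1, dtype=torch.float)
+         self.register_buffer("dists", torch.cat([d.flip(0), d]))  # distances of the 2*window context slots
+
+     def forward(self, context_ids):                # (batch, 2*window)
+         weights = torch.softmax(-self.a * self.dists + self.b, dim=0)
+         pooled = (self.emb_in(context_ids) * weights[None, :, None]).sum(dim=1)
+         return self.emb_out(pooled)                # logits over the vocabulary
+
+ if __name__ == "__main__":
+     model = DistanceWeightedCBOW(vocab_size=1000, dim=64, window=3)
+     ctx = torch.randint(0, 1000, (8, 6))           # 8 examples, 6 context slots
+     print(model(ctx).shape)                        # torch.Size([8, 1000])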
+
+
+
+
+ + ♻ ☆ Automate or Assist? The Role of Computational Models in Identifying + Gendered Discourse in US Capital Trial Transcripts + + +
+ The language used by US courtroom actors in criminal trials has long been +studied for biases. However, systematic studies for bias in high-stakes court +trials have been difficult, due to the nuanced nature of bias and the legal +expertise required. Large language models offer the possibility to automate +annotation. But validating the computational approach requires both an +understanding of how automated methods fit in existing annotation workflows and +what they really offer. We present a case study of adding a computational model +to a complex and high-stakes problem: identifying gender-biased language in US +capital trials for women defendants. Our team of experienced death-penalty +lawyers and NLP technologists pursue a three-phase study: first annotating +manually, then training and evaluating computational models, and finally +comparing expert annotations to model predictions. Unlike many typical NLP +tasks, annotating for gender bias in months-long capital trials is complicated, +with many individual judgment calls. Contrary to standard arguments for +automation that are based on efficiency and scalability, legal experts find the +computational models most useful in providing opportunities to reflect on their +own bias in annotation and to build consensus on annotation rules. This +experience suggests that seeking to replace experts with computational models +for complex annotation is both unrealistic and undesirable. Rather, +computational models offer valuable opportunities to assist the legal experts +in annotation-based studies. + +
+
+
+
+
+
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ MaTrRec: Uniting Mamba and Transformer for Sequential Recommendation + + +
+ Sequential recommendation systems aim to provide personalized recommendations
+by analyzing dynamic preferences and dependencies within user behavior
+sequences. Recently, Transformer models have been shown to effectively capture user
+preferences. However, their quadratic computational complexity limits
+recommendation performance on long interaction sequence data. Inspired by Mamba, a
+representative State Space Model (SSM) that efficiently captures
+user preferences in long interaction sequences with linear complexity, we find
+that Mamba's recommendation effectiveness is limited in short interaction
+sequences, as it fails to recall items of actual interest to users and
+exacerbates the data-sparsity cold-start problem. To address this issue, we
+innovatively propose a new model, MaTrRec, which combines the strengths of
+Mamba and Transformer. This model fully leverages Mamba's advantages in
+handling long-term dependencies and Transformer's global attention advantages
+in short-term dependencies, thereby enhancing predictive capabilities on both
+long and short interaction sequence datasets while balancing model efficiency.
+Notably, our model significantly alleviates the data-sparsity cold-start problem,
+with an improvement of up to 33% on the highly sparse Amazon Musical
+Instruments dataset. We conducted extensive experimental evaluations on five
+widely used public datasets. The experimental results show that our model
+outperforms the current state-of-the-art sequential recommendation models on
+all five datasets. The code is available at
+https://github.com/Unintelligentmumu/MaTrRec.
+
+

+
+
+
+
+ + ☆ LawLLM: Law Large Language Model for the US Legal System CIKM 2024 + + +
+ In the rapidly evolving field of legal analytics, finding relevant cases and +accurately predicting judicial outcomes are challenging because of the +complexity of legal language, which often includes specialized terminology, +complex syntax, and historical context. Moreover, the subtle distinctions +between similar and precedent cases require a deep understanding of legal +knowledge. Researchers often conflate these concepts, making it difficult to +develop specialized techniques to effectively address these nuanced tasks. In +this paper, we introduce the Law Large Language Model (LawLLM), a multi-task +model specifically designed for the US legal domain to address these +challenges. LawLLM excels at Similar Case Retrieval (SCR), Precedent Case +Recommendation (PCR), and Legal Judgment Prediction (LJP). By clearly +distinguishing between precedent and similar cases, we provide essential +clarity, guiding future research in developing specialized strategies for these +tasks. We propose customized data preprocessing techniques for each task that +transform raw legal data into a trainable format. Furthermore, we also use +techniques such as in-context learning (ICL) and advanced information retrieval +methods in LawLLM. The evaluation results demonstrate that LawLLM consistently +outperforms existing baselines in both zero-shot and few-shot scenarios, +offering unparalleled multi-task capabilities and filling critical gaps in the +legal domain. + +
+
+ comment: 21 pages, 2 figures, accepted at the 33rd ACM International + Conference on Information and Knowledge Management (CIKM 2024) for the + Applied Research Paper track +
+
+
+
+
+ + ♻ ☆ POSIT: Promotion of Semantic Item Tail via Adversarial Learning KDD'2024 + + +
+ In many recommendations, a handful of popular items (e.g., movies / +television shows, news, etc.) can be dominant in recommendations for many +users. However, we know that in a large catalog of items, users are likely +interested in more than what is popular. The dominance of popular items may +mean that users will not see items that they would probably enjoy. In this +paper, we propose a technique to overcome this problem using adversarial +machine learning. We define a metric to translate the user-level utility metric +in terms of an advantage/disadvantage over items. We subsequently used that +metric in an adversarial learning framework to systematically promote +disadvantaged items. Distinctly, our method integrates a small-capacity model +to produce semantically meaningful weights, leading to an algorithm that +identifies and promotes a semantically similar item within the learning +process. In the empirical study, we evaluated the proposed technique on three +publicly available datasets and seven competitive baselines. The result shows +that our proposed method not only improves the coverage, but also, +surprisingly, improves the overall performance. + +
+
+ comment: EAI-KDD'2024. Code at https://github.com/qiulingxu/POSIT +
+
+
+
+
+ + ♻ ☆ Optimizing Audio Recommendations for the Long-Term: A Reinforcement + Learning Perspective + + +
+ We present a novel podcast recommender system deployed at industrial scale. +This system successfully optimizes personal listening journeys that unfold over +months for hundreds of millions of listeners. In deviating from the pervasive +industry practice of optimizing machine learning algorithms for short-term +proxy metrics, the system substantially improves long-term performance in A/B +tests. The paper offers insights into how our methods cope with attribution, +coordination, and measurement challenges that usually hinder such long-term +optimization. To contextualize these practical insights within a broader +academic framework, we turn to reinforcement learning (RL). Using the language +of RL, we formulate a comprehensive model of users' recurring relationships +with a recommender system. Then, within this model, we identify our approach as +a policy improvement update to a component of the existing recommender system, +enhanced by tailored modeling of value functions and user-state +representations. Illustrative offline experiments suggest this specialized +modeling reduces data requirements by as much as a factor of 120,000 compared +to black-box approaches. + +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Integrating Large Language Models into a Tri-Modal Architecture for + Automated Depression Classification + + +
+ Major Depressive Disorder (MDD) is a pervasive mental health condition that +affects 300 million people worldwide. This work presents a novel, BiLSTM-based +tri-modal model-level fusion architecture for the binary classification of +depression from clinical interview recordings. The proposed architecture +incorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses +a two-shot learning based GPT-4 model to process text data. This is the first +work to incorporate large language models into a multi-modal architecture for +this task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge +cross-validation split and Leave-One-Subject-Out cross-validation split, +surpassing all baseline models and multiple state-of-the-art models. In +Leave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score +of 85.95%, a precision of 80%, and a recall of 92.86%. + +
+
+
+
+
+ + ☆ IBMEA: Exploring Variational Information Bottleneck for Multi-modal + Entity Alignment ACM MM 2024 + + +
+ Multi-modal entity alignment (MMEA) aims to identify equivalent entities +between multi-modal knowledge graphs (MMKGs), where the entities can be +associated with related images. Most existing studies integrate multi-modal +information heavily relying on the automatically-learned fusion module, rarely +suppressing the redundant information for MMEA explicitly. To this end, we +explore variational information bottleneck for multi-modal entity alignment +(IBMEA), which emphasizes the alignment-relevant information and suppresses the +alignment-irrelevant information in generating entity representations. +Specifically, we devise multi-modal variational encoders to generate +modal-specific entity representations as probability distributions. Then, we +propose four modal-specific information bottleneck regularizers, limiting the +misleading clues in refining modal-specific entity representations. Finally, we +propose a modal-hybrid information contrastive regularizer to integrate all the +refined modal-specific representations, enhancing the entity similarity between +MMKGs to achieve MMEA. We conduct extensive experiments on two cross-KG and +three bilingual MMEA datasets. Experimental results demonstrate that our model +consistently outperforms previous state-of-the-art methods, and also shows +promising and robust performance in low-resource and high-noise data scenarios. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Radio Frequency Signal based Human Silhouette Segmentation: A Sequential + Diffusion Approach + + +
+ Radio frequency (RF) signals have been proved to be flexible for human +silhouette segmentation (HSS) under complex environments. Existing studies are +mainly based on a one-shot approach, which lacks a coherent projection ability +from the RF domain. Additionally, the spatio-temporal patterns have not been +fully explored for human motion dynamics in HSS. Therefore, we propose a +two-stage Sequential Diffusion Model (SDM) to progressively synthesize +high-quality segmentation jointly with the considerations on motion dynamics. +Cross-view transformation blocks are devised to guide the diffusion model in a +multi-scale manner for comprehensively characterizing human related patterns in +an individual frame such as directional projection from signal planes. +Moreover, spatio-temporal blocks are devised to fine-tune the frame-level model +to incorporate spatio-temporal contexts and motion dynamics, enhancing the +consistency of the segmentation maps. Comprehensive experiments on a public +benchmark -- HIBER demonstrate the state-of-the-art performance of our method +with an IoU 0.732. Our code is available at https://github.com/ph-w2000/SDM. + +
+
+
+
+
+ + ☆ Magic3DSketch: Create Colorful 3D Models From Sketch-Based 3D Modeling + Guided by Text and Language-Image Pre-Training + + +
+ The requirement for 3D content is growing as AR/VR applications emerge. At
+the same time, 3D modelling is accessible only to skilled experts, because
+traditional methods like Computer-Aided Design (CAD) are often too
+labor-intensive and skill-demanding, making it challenging for novice users.
+Our proposed method, Magic3DSketch, employs a novel technique that encodes
+sketches to predict a 3D mesh, guided by text descriptions and leveraging
+external prior knowledge obtained through text and language-image pre-training.
+The integration of language-image pre-trained neural networks complements the
+sparse and ambiguous nature of single-view sketch inputs. Our method is also
+more useful and offers a higher degree of controllability compared to existing
+text-to-3D approaches, according to our user study. Moreover, Magic3DSketch
+achieves state-of-the-art performance on both synthetic and real datasets with
+the capability of producing more detailed structures and realistic shapes with
+the help of text input. Users are also more satisfied with models obtained by
+Magic3DSketch according to our user study. Additionally, we are also, to our
+knowledge, the first to add color based on text descriptions to the sketch-derived
+shapes. By combining sketches and text guidance with the help of language-image
+pretrained models, our Magic3DSketch can allow novice users to create custom 3D
+models with minimal effort and maximum creative freedom, with the potential to
+revolutionize future 3D modeling pipelines.
+
+

+
+
+
+
+ + ☆ Harmfully Manipulated Images Matter in Multimodal Misinformation + Detection ACM MM 2024 + + +
+ Nowadays, misinformation is widely spreading over various social media +platforms and causes extremely negative impacts on society. To combat this +issue, automatically identifying misinformation, especially those containing +multimodal content, has attracted growing attention from the academic and +industrial communities, and induced an active research topic named Multimodal +Misinformation Detection (MMD). Typically, existing MMD methods capture the +semantic correlation and inconsistency between multiple modalities, but neglect +some potential clues in multimodal content. Recent studies suggest that +manipulated traces of the images in articles are non-trivial clues for +detecting misinformation. Meanwhile, we find that the underlying intentions +behind the manipulation, e.g., harmful and harmless, also matter in MMD. +Accordingly, in this work, we propose to detect misinformation by learning +manipulation features that indicate whether the image has been manipulated, as +well as intention features regarding the harmful and harmless intentions of the +manipulation. Unfortunately, the manipulation and intention labels that make +these features discriminative are unknown. To overcome the problem, we propose +two weakly supervised signals as alternatives by introducing additional +datasets on image manipulation detection and formulating two classification +tasks as positive and unlabeled learning problems. Based on these ideas, we +propose a novel MMD method, namely Harmfully Manipulated Images Matter in MMD +(HAMI-M3D). Extensive experiments across three benchmark datasets can +demonstrate that HAMI-M3D can consistently improve the performance of any MMD +baselines. + +
+
+ comment: Accepted by ACM MM 2024. Code: + https://github.com/wangbing1416/HAMI-M3D +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 62 + +
+
+
+ + ☆ Wolf: Captioning Everything with a World Summarization Framework + + +
+ We propose Wolf, a WOrLd summarization Framework for accurate video +captioning. Wolf is an automated captioning framework that adopts a +mixture-of-experts approach, leveraging complementary strengths of Vision +Language Models (VLMs). By utilizing both image and video models, our framework +captures different levels of information and summarizes them efficiently. Our +approach can be applied to enhance video understanding, auto-labeling, and +captioning. To evaluate caption quality, we introduce CapScore, an LLM-based +metric to assess the similarity and quality of generated captions compared to +the ground truth captions. We further build four human-annotated datasets in +three domains: autonomous driving, general scenes, and robotics, to facilitate +comprehensive comparisons. We show that Wolf achieves superior captioning +performance compared to state-of-the-art approaches from the research community +(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For +instance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise +by 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally, +we establish a benchmark for video captioning and introduce a leaderboard, +aiming to accelerate advancements in video understanding, captioning, and data +alignment. Leaderboard: https://wolfv0.github.io/leaderboard.html. + +
+
+
+
+
+ + ☆ AppWorld: A Controllable World of Apps and People for Benchmarking + Interactive Coding Agents ACL'24 + + +
+ Autonomous agents that address day-to-day digital tasks (e.g., ordering +groceries for a household), must not only operate multiple apps (e.g., notes, +messaging, shopping app) via APIs, but also generate rich code with complex +control flow in an iterative manner based on their interaction with the +environment. However, existing benchmarks for tool use are inadequate, as they +only cover tasks that require a simple sequence of API calls. + To remedy this gap, we built $\textbf{AppWorld Engine}$, a high-quality +execution environment (60K lines of code) of 9 day-to-day apps operable via 457 +APIs and populated with realistic digital activities simulating the lives of +~100 fictitious users. We then created $\textbf{AppWorld Benchmark}$ (40K lines +of code), a suite of 750 natural, diverse, and challenging autonomous agent +tasks requiring rich and interactive code generation. It supports robust +programmatic evaluation with state-based unit tests, allowing for different +ways of completing a task while also checking for unexpected changes, i.e., +collateral damage. The state-of-the-art LLM, GPT-4o, solves only ~49% of our +'normal' tasks and ~30% of 'challenge' tasks, while other models solve at least +16% fewer. This highlights the benchmark's difficulty and AppWorld's potential +to push the frontiers of interactive coding agents. The project website is +available at https://appworld.dev/. + +
+
+ comment: ACL'24 Camera Ready +
+
+
+
+
+ + ☆ Embedding And Clustering Your Data Can Improve Contrastive Pretraining + + +
+ Recent studies of large-scale contrastive pretraining in the text embedding +domain show that using single-source minibatches, rather than mixed-source +minibatches, can substantially improve overall model accuracy. In this work, we +explore extending training data stratification beyond source granularity by +leveraging a pretrained text embedding model and the classic k-means clustering +algorithm to further split training data apart by the semantic clusters within +each source. Experimentally, we observe a notable increase in NDCG@10 when +pretraining a BERT-based text embedding model on query-passage pairs from the +MSMARCO passage retrieval dataset. Additionally, we conceptually connect our +clustering approach to both the Topic Aware Sampling (TAS) aspect of the TAS-B +methodology and the nearest-neighbor-based hard-negative mining aspect of the +ANCE methodology and discuss how this unified view motivates future lines of +research on the organization of contrastive pretraining data. + +
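+ A minimal sketch of the cluster-stratified batching idea, assuming example
+embeddings have already been computed with some text embedding model; the function
+name, cluster count, and batch size are illustrative rather than the paper's setup.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def cluster_stratified_batches(embeddings, n_clusters=8, batch_size=4, seed=0):
+     """Assign each training example to a k-means cluster of its embedding, then
+     yield minibatches drawn from a single cluster at a time."""
+     labels = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(embeddings)
+     rng = np.random.default_rng(seed)
+     for c in range(n_clusters):
+         idx = np.where(labels == c)[0]
+         rng.shuffle(idx)
+         for start in range(0, len(idx) - batch_size + 1, batch_size):
+             yield idx[start:start + batch_size]    # one single-cluster minibatch
+
+ if __name__ == "__main__":
+     fake_embeddings = np.random.rand(100, 32)      # stand-in for real text embeddings
+     print(next(iter(cluster_stratified_batches(fake_embeddings))))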
+
+ comment: 16 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Granularity is crucial when applying differential privacy to text: An + investigation for neural machine translation + + +
+ Applying differential privacy (DP) by means of the DP-SGD algorithm to +protect individual data points during training is becoming increasingly popular +in NLP. However, the choice of granularity at which DP is applied is often +neglected. For example, neural machine translation (NMT) typically operates on +the sentence-level granularity. From the perspective of DP, this setup assumes +that each sentence belongs to a single person and any two sentences in the +training dataset are independent. This assumption is however violated in many +real-world NMT datasets, e.g. those including dialogues. For proper application +of DP we thus must shift from sentences to entire documents. In this paper, we +investigate NMT at both the sentence and document levels, analyzing the +privacy/utility trade-off for both scenarios, and evaluating the risks of not +using the appropriate privacy granularity in terms of leaking personally +identifiable information (PII). Our findings indicate that the document-level +NMT system is more resistant to membership inference attacks, emphasizing the +significance of using the appropriate granularity when working with DP. + +
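+ A small illustration of the granularity point: the privacy unit handed to DP-SGD
+can be a single sentence or a whole document, and this grouping decides whose
+contribution the per-unit clipping and noise bound. The record layout and field
+names below are hypothetical.
+
+ from collections import defaultdict
+
+ def make_dp_units(records, level="document"):
+     """Group training records into privacy units. At sentence level each record is
+     its own unit; at document level all sentences sharing a doc_id form one unit,
+     so DP-SGD bounds the contribution of an entire document (e.g. a dialogue)."""
+     if level == "sentence":
+         return [[r["text"]] for r in records]
+     units = defaultdict(list)
+     for r in records:
+         units[r["doc_id"]].append(r["text"])
+     return list(units.values())
+
+ records = [
+     {"doc_id": "dialog-1", "text": "Hi, my name is Ana."},
+     {"doc_id": "dialog-1", "text": "I live in Lisbon."},
+     {"doc_id": "dialog-2", "text": "The package arrives tomorrow."},
+ ]
+ print(len(make_dp_units(records, "sentence")), "sentence-level units")
+ print(len(make_dp_units(records, "document")), "document-level units")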
+
+
+
+
+ + ☆ The power of Prompts: Evaluating and Mitigating Gender Bias in MT with + LLMs + + +
+ This paper studies gender bias in machine translation through the lens of
+Large Language Models (LLMs). Four widely-used test sets are employed to
+benchmark various base LLMs, comparing their translation quality and gender
+bias against state-of-the-art Neural Machine Translation (NMT) models for
+English to Catalan (En $\rightarrow$ Ca) and English to Spanish (En
+$\rightarrow$ Es) translation directions. Our findings reveal pervasive gender
+bias across all models, with base LLMs exhibiting a higher degree of bias
+compared to NMT models. To combat this bias, we explore prompt engineering
+techniques applied to an instruction-tuned LLM. We identify a prompt structure
+that significantly reduces gender bias by up to 12% on the WinoMT evaluation
+dataset compared to more straightforward prompts. These results significantly
+reduce the gender bias accuracy gap between LLMs and traditional NMT systems.
+
+

+
+
+
+
+ + ☆ Knowledge Graph Structure as Prompt: Improving Small Language Models + Capabilities for Knowledge-based Causal Discovery ISWC'24 + + +
+ Causal discovery aims to estimate causal structures among variables based on +observational data. Large Language Models (LLMs) offer a fresh perspective to +tackle the causal discovery problem by reasoning on the metadata associated +with variables rather than their actual data values, an approach referred to as +knowledge-based causal discovery. In this paper, we investigate the +capabilities of Small Language Models (SLMs, defined as LLMs with fewer than 1 +billion parameters) with prompt-based learning for knowledge-based causal +discovery. Specifically, we present KG Structure as Prompt, a novel approach +for integrating structural information from a knowledge graph, such as common +neighbor nodes and metapaths, into prompt-based learning to enhance the +capabilities of SLMs. Experimental results on three types of biomedical and +open-domain datasets under few-shot settings demonstrate the effectiveness of +our approach, surpassing most baselines and even conventional fine-tuning +approaches trained on full datasets. Our findings further highlight the strong +capabilities of SLMs: in combination with knowledge graphs and prompt-based +learning, SLMs demonstrate the potential to surpass LLMs with larger number of +parameters. Our code and datasets are available on GitHub. + +
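+ As a rough illustration of serializing graph structure into a prompt, the helper
+below inlines shared neighbor nodes and metapaths into a cloze-style query; the
+wording, field layout, and example entities are assumptions, not the paper's
+template.
+
+ def build_kg_prompt(var_a, var_b, common_neighbors, metapaths):
+     """Serialize knowledge-graph context (shared neighbors, metapaths) into a
+     cloze-style prompt for a small language model."""
+     neighbors = ", ".join(common_neighbors) if common_neighbors else "none"
+     paths = "; ".join(" -> ".join(p) for p in metapaths) if metapaths else "none"
+     return (
+         f"Entity A: {var_a}. Entity B: {var_b}. "
+         f"Shared neighbors in the knowledge graph: {neighbors}. "
+         f"Metapaths between them: {paths}. "
+         f"Question: does {var_a} cause {var_b}? Answer: [MASK]"
+     )
+
+ print(build_kg_prompt(
+     "smoking", "lung cancer",
+     common_neighbors=["tar exposure", "chronic inflammation"],
+     metapaths=[["smoking", "carcinogen", "cell mutation", "lung cancer"]],
+ ))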
+
+ comment: accepted at ISWC'24 +
+
+
+
+
+ + ☆ Towards Effective and Efficient Continual Pre-training of Large Language + Models + + +
+ Continual pre-training (CPT) has been an important approach for adapting +language models to specific domains or tasks. To make the CPT approach more +traceable, this paper presents a technical report for continually pre-training +Llama-3 (8B), which significantly enhances the Chinese language ability and +scientific reasoning ability of the backbone model. To enhance the new +abilities while retaining the original abilities, we design specific data +mixture and curriculum strategies by utilizing existing datasets and +synthesizing high-quality datasets. Specifically, we synthesize +multidisciplinary scientific question and answer (QA) pairs based on related +web pages, and subsequently incorporate these synthetic data to improve the +scientific reasoning ability of Llama-3. We refer to the model after CPT as +Llama-3-SynE (Synthetic data Enhanced Llama-3). We also present the tuning +experiments with a relatively small model -- TinyLlama, and employ the derived +findings to train the backbone model. Extensive experiments on a number of +evaluation benchmarks show that our approach can largely improve the +performance of the backbone models, including both the general abilities (+8.81 +on C-Eval and +6.31 on CMMLU) and the scientific reasoning abilities (+12.00 on +MATH and +4.13 on SciEval), without hurting the original capacities. Our model, +data, and codes are available at https://github.com/RUC-GSAI/Llama-3-SynE. + +
+
+ comment: 16 pages, 10 figures, 16 tables +
+
+
+
+
+ + ☆ Towards Generalized Offensive Language Identification + + +
+ The prevalence of offensive content on the internet, encompassing hate speech
+and cyberbullying, is a pervasive issue worldwide. Consequently, it has
+garnered significant attention from the machine learning (ML) and natural
+language processing (NLP) communities. As a result, numerous systems have been
+developed to automatically identify potentially harmful content and mitigate
+its impact. These systems can follow two approaches: (1) using publicly available
+models and application endpoints, including prompting large language models
+(LLMs); or (2) annotating datasets and training ML models on them. However, how
+well either approach generalizes is not well understood. Furthermore,
+the applicability of these systems is often questioned in off-domain and
+practical environments. This paper empirically evaluates the generalizability
+of offensive language detection models and datasets across a novel generalized
+benchmark. We answer three research questions on generalizability. Our findings
+will be useful in creating robust real-world offensive language detection
+systems.
+
+

+
+ comment: Accepted to ASONAM 2024 +
+
+
+
+
+ + ☆ Creating an Aligned Corpus of Sound and Text: The Multimodal Corpus of + Shakespeare and Milton + + +
+ In this work we present a corpus of poems by William Shakespeare and John +Milton that have been enriched with readings from the public domain. We have +aligned all the lines with their respective audio segments, at the line, word, +syllable and phone level, and we have included their scansion. We make a basic +visualization platform for these poems and we conclude by conjecturing possible +future directions. + +
+
+
+
+
+ + ☆ ChatSchema: A pipeline of extracting structured information with Large + Multimodal Models based on schema + + +
+ Objective: This study introduces ChatSchema, an effective method for
+extracting and structuring information from unstructured data in medical paper
+reports using a combination of Large Multimodal Models (LMMs) and Optical
+Character Recognition (OCR) based on the schema. By integrating a predefined
+schema, we intend to enable LMMs to directly extract and standardize
+information according to the schema specifications, facilitating further data
+entry. Method: Our approach involves a two-stage process, including
+classification and extraction for categorizing report scenarios and structuring
+information. We established and annotated a dataset to verify the effectiveness
+of ChatSchema, and evaluated key extraction using precision, recall, F1-score,
+and accuracy metrics. Based on key extraction, we further assessed value
+extraction. We conducted ablation studies on two LMMs to illustrate the
+improvement of structured information extraction with different input modalities
+and methods. Result: We analyzed 100 medical reports from Peking University
+First Hospital and established a ground truth dataset with 2,945 key-value
+pairs. We evaluated ChatSchema using GPT-4o and Gemini 1.5 Pro and found a
+higher overall performance of GPT-4o. The results are as follows: For the
+result of key extraction, key-precision was 98.6%, key-recall was 98.5%, and
+key-F1-score was 98.6%. For the result of value extraction based on correct key
+extraction, the overall accuracy was 97.2%, precision was 95.8%, recall was
+95.8%, and F1-score was 95.8%. An ablation study demonstrated that ChatSchema
+achieved significantly higher overall accuracy and overall F1-score of
+key-value extraction, compared to the Baseline, with increases of 26.9% overall
+accuracy and 27.4% overall F1-score, respectively.
+
+

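+ A hedged sketch of schema-guided extraction: the predefined schema is embedded in
+the instruction and the model's JSON reply is validated against it before data
+entry. The schema fields, prompt wording, and sample report text are hypothetical.
+
+ import json
+
+ schema = {                       # hypothetical report schema
+     "patient_name": "string",
+     "test_date": "YYYY-MM-DD",
+     "hemoglobin_g_dl": "number",
+ }
+
+ def build_extraction_prompt(ocr_text, schema):
+     """Ask the model to reply with a single JSON object matching the schema."""
+     return (
+         "Extract the following fields from the report and reply with one JSON "
+         "object matching this schema (use null for missing fields):\n"
+         f"{json.dumps(schema, indent=2)}\n\nReport text:\n{ocr_text}"
+     )
+
+ def parse_model_reply(reply, schema):
+     data = json.loads(reply)
+     return {k: data.get(k) for k in schema}   # keep only schema-defined keys
+
+ print(build_extraction_prompt("Hb 13.2 g/dL, drawn 2024-05-01, patient Jane Doe", schema))
+ print(parse_model_reply(
+     '{"patient_name": "Jane Doe", "test_date": "2024-05-01", "hemoglobin_g_dl": 13.2}',
+     schema))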
+
+
+
+
+ + ☆ Cluster-norm for Unsupervised Probing of Knowledge + + +
+ The deployment of language models brings challenges in generating reliable +information, especially when these models are fine-tuned using human +preferences. To extract encoded knowledge without (potentially) biased human +labels, unsupervised probing techniques like Contrast-Consistent Search (CCS) +have been developed (Burns et al., 2022). However, salient but unrelated +features in a given dataset can mislead these probes (Farquhar et al., 2023). +Addressing this, we propose a cluster normalization method to minimize the +impact of such features by clustering and normalizing activations of contrast +pairs before applying unsupervised probing techniques. While this approach does +not address the issue of differentiating between knowledge in general and +simulated knowledge - a major issue in the literature of latent knowledge +elicitation (Christiano et al., 2021) - it significantly improves the ability +of unsupervised probes to identify the intended knowledge amidst distractions. + +
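+ A minimal sketch of cluster normalization for contrast-pair activations, assuming
+the activations are available as NumPy arrays; k-means on the concatenated pair and
+per-cluster standardization stand in for the exact procedure.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def cluster_normalize(acts_pos, acts_neg, n_clusters=5, seed=0):
+     """Cluster contrast pairs, then mean/std-normalize each cluster separately so
+     salient but truth-irrelevant directions are damped before fitting an
+     unsupervised probe such as CCS."""
+     pairs = np.concatenate([acts_pos, acts_neg], axis=1)
+     labels = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(pairs)
+     pos_out, neg_out = np.empty_like(acts_pos), np.empty_like(acts_neg)
+     for c in range(n_clusters):
+         mask = labels == c
+         for src, dst in ((acts_pos, pos_out), (acts_neg, neg_out)):
+             mu, sd = src[mask].mean(axis=0), src[mask].std(axis=0) + 1e-8
+             dst[mask] = (src[mask] - mu) / sd
+     return pos_out, neg_out
+
+ if __name__ == "__main__":
+     rng = np.random.default_rng(0)
+     pos, neg = rng.normal(size=(200, 16)), rng.normal(size=(200, 16))
+     norm_pos, norm_neg = cluster_normalize(pos, neg)
+     print(norm_pos.shape, norm_neg.shape)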
+
+ comment: 34 pages, 35 figures +
+
+
+
+
+ + ☆ Adaptive Contrastive Search: Uncertainty-Guided Decoding for Open-Ended + Text Generation + + +
+ Decoding from the output distributions of large language models to produce
+high-quality text is a complex challenge in language modeling. Various
+approaches, such as beam search, sampling with temperature, $k$-sampling,
+nucleus $p$-sampling, typical decoding, contrastive decoding, and contrastive
+search, have been proposed to address this problem, aiming to improve
+coherence, diversity, as well as resemblance to human-generated text. In this
+study, we introduce adaptive contrastive search, a novel decoding strategy
+extending contrastive search by incorporating an adaptive degeneration penalty,
+guided by the estimated uncertainty of the model at each generation step. This
+strategy is designed to enhance both the creativity and diversity of the
+language modeling process while at the same time producing coherent and
+high-quality generated text output. Our findings indicate performance
+enhancement in both aspects, across different model architectures and datasets,
+underscoring the effectiveness of our method in text generation tasks. Our code
+base, datasets, and models are publicly available.
+
+

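+ A toy single-step sketch of contrastive search with an uncertainty-scaled
+degeneration penalty; using the normalized entropy of the candidate distribution to
+set the penalty weight alpha is an assumption for illustration, not the paper's
+exact formulation.
+
+ import torch
+
+ def adaptive_contrastive_step(probs, cand_hidden, ctx_hidden, alpha_max=0.6):
+     """probs: (k,) renormalized top-k probabilities; cand_hidden: (k, d) candidate
+     hidden states; ctx_hidden: (t, d) context hidden states. Returns the index of
+     the selected candidate."""
+     entropy = -(probs * probs.clamp_min(1e-12).log()).sum()
+     alpha = alpha_max * entropy / torch.log(torch.tensor(float(len(probs))))
+     sim = torch.nn.functional.cosine_similarity(
+         cand_hidden.unsqueeze(1), ctx_hidden.unsqueeze(0), dim=-1)   # (k, t)
+     degeneration = sim.max(dim=1).values        # max similarity to any context state
+     scores = (1 - alpha) * probs - alpha * degeneration
+     return scores.argmax().item()
+
+ if __name__ == "__main__":
+     torch.manual_seed(0)
+     probs = torch.softmax(torch.randn(5), dim=0)
+     print(adaptive_contrastive_step(probs, torch.randn(5, 16), torch.randn(10, 16)))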
+
+
+
+
+ + ☆ The BIAS Detection Framework: Bias Detection in Word Embeddings and + Language Models for European Languages + + +
+ The project BIAS: Mitigating Diversity Biases of AI in the Labor Market is a +four-year project funded by the European commission and supported by the Swiss +State Secretariat for Education, Research and Innovation (SERI). As part of the +project, novel bias detection methods to identify societal bias in language +models and word embeddings in European languages are developed, with particular +attention to linguistic and geographic particularities. This technical report +describes the overall architecture and components of the BIAS Detection +Framework. The code described in this technical report is available and will be +updated and expanded continuously with upcoming results from the BIAS project. +The details about the datasets for the different languages are described in +corresponding papers at scientific venues. + +
+
+
+
+
+ + ☆ Every Part Matters: Integrity Verification of Scientific Figures Based + on Multimodal Large Language Models + + +
+ This paper tackles a key issue in the interpretation of scientific figures: +the fine-grained alignment of text and figures. It advances beyond prior +research that primarily dealt with straightforward, data-driven visualizations +such as bar and pie charts and only offered a basic understanding of diagrams +through captioning and classification. We introduce a novel task, Figure +Integrity Verification, designed to evaluate the precision of technologies in +aligning textual knowledge with visual elements in scientific figures. To +support this, we develop a semi-automated method for constructing a large-scale +dataset, Figure-seg, specifically designed for this task. Additionally, we +propose an innovative framework, Every Part Matters (EPM), which leverages +Multimodal Large Language Models (MLLMs) to not only incrementally improve the +alignment and verification of text-figure integrity but also enhance integrity +through analogical reasoning. Our comprehensive experiments show that these +innovations substantially improve upon existing methods, allowing for more +precise and thorough analysis of complex scientific figures. This progress not +only enhances our understanding of multimodal technologies but also stimulates +further research and practical applications across fields requiring the +accurate interpretation of complex visual data. + +
+
+ comment: 28 pages, 11 figures, under review +
+
+
+
+
+ + ☆ Dynamic Language Group-Based MoE: Enhancing Efficiency and Flexibility + for Code-Switching Speech Recognition + + +
+ The Mixture of Experts (MoE) approach is ideally suited for tackling
+multilingual and code-switching (CS) challenges due to its multi-expert
+architecture. This work introduces the DLG-MoE, which is optimized for
+bilingual and CS scenarios. Our novel Dynamic Language Group-based MoE layer
+features a language router with shared weights for explicit language modeling,
+while independent unsupervised routers within the language group handle
+attributes beyond language. This structure not only enhances expert extension
+capabilities but also supports dynamic top-k training, allowing for flexible
+inference across various top-k values and improving overall performance. The
+model requires no pre-training and supports streaming recognition, achieving
+state-of-the-art (SOTA) results with unmatched flexibility compared to other
+methods. The code will be released.
+
+

+
+
+
+
+ + ☆ Learning Robust Named Entity Recognizers From Noisy Data With Retrieval + Augmentation + + +
+ Named entity recognition (NER) models often struggle with noisy inputs, such +as those with spelling mistakes or errors generated by Optical Character +Recognition processes, and learning a robust NER model is challenging. Existing +robust NER models utilize both noisy text and its corresponding gold text for +training, which is infeasible in many real-world applications in which gold +text is not available. In this paper, we consider a more realistic setting in +which only noisy text and its NER labels are available. We propose to retrieve +relevant text of the noisy text from a knowledge corpus and use it to enhance +the representation of the original noisy input. We design three retrieval +methods: sparse retrieval based on lexicon similarity, dense retrieval based on +semantic similarity, and self-retrieval based on task-specific text. After +retrieving relevant text, we concatenate the retrieved text with the original +noisy text and encode them with a transformer network, utilizing self-attention +to enhance the contextual token representations of the noisy text using the +retrieved text. We further employ a multi-view training framework that improves +robust NER without retrieving text during inference. Experiments show that our +retrieval-augmented model achieves significant improvements in various noisy +NER settings. + +
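+ A small sketch of the sparse-retrieval variant described above: character n-gram
+TF-IDF tolerates spelling noise when ranking knowledge-corpus sentences, and the
+retrieved text is concatenated to the noisy input before encoding. The corpus, the
+[SEP] joiner, and the toy noisy sentence are illustrative.
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ def retrieve_context(noisy_text, corpus, top_k=2):
+     """Rank corpus sentences by character n-gram TF-IDF cosine similarity to the
+     noisy input and return the top-k as extra context."""
+     vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
+     matrix = vec.fit_transform(corpus + [noisy_text])
+     sims = cosine_similarity(matrix[-1], matrix[:-1]).ravel()
+     return [corpus[i] for i in sims.argsort()[::-1][:top_k]]
+
+ corpus = [
+     "Barack Obama visited Berlin in 2013.",
+     "The Amazon river flows through Brazil.",
+     "Angela Merkel met Barack Obama in Berlin.",
+ ]
+ noisy = "Barack 0bama vistied Berl1n"
+ augmented_input = noisy + " [SEP] " + " [SEP] ".join(retrieve_context(noisy, corpus))
+ print(augmented_input)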
+
+
+
+
+ + ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
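+ A minimal PyTorch sketch of bidirectional audio-video cross attention followed by
+pooling and classification; the dimensions, head counts, and mean pooling are
+illustrative choices, not the AVT-CA architecture itself.
+
+ import torch
+ import torch.nn as nn
+
+ class CrossModalFusion(nn.Module):
+     """Audio features attend to video features and vice versa; the two attended
+     streams are pooled, concatenated, and classified."""
+     def __init__(self, dim=128, heads=4, n_classes=6):
+         super().__init__()
+         self.a2v = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.v2a = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.classifier = nn.Linear(2 * dim, n_classes)
+
+     def forward(self, audio, video):
+         a_att, _ = self.a2v(query=audio, key=video, value=video)
+         v_att, _ = self.v2a(query=video, key=audio, value=audio)
+         pooled = torch.cat([a_att.mean(dim=1), v_att.mean(dim=1)], dim=-1)
+         return self.classifier(pooled)
+
+ if __name__ == "__main__":
+     model = CrossModalFusion()
+     audio = torch.randn(2, 50, 128)    # (batch, audio frames, feature dim)
+     video = torch.randn(2, 30, 128)    # (batch, video frames, feature dim)
+     print(model(audio, video).shape)   # torch.Size([2, 6])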
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ☆ A Universal Prompting Strategy for Extracting Process Model Information + from Natural Language Text using Large Language Models + + +
+ Over the past decade, extensive research efforts have been dedicated to the +extraction of information from textual process descriptions. Despite the +remarkable progress witnessed in natural language processing (NLP), information +extraction within the Business Process Management domain remains predominantly +reliant on rule-based systems and machine learning methodologies. Data scarcity +has so far prevented the successful application of deep learning techniques. +However, the rapid progress in generative large language models (LLMs) makes it +possible to solve many NLP tasks with very high quality without the need for +extensive data. Therefore, we systematically investigate the potential of LLMs +for extracting information from textual process descriptions, targeting the +detection of process elements such as activities and actors, and relations +between them. Using a heuristic algorithm, we demonstrate the suitability of +the extracted information for process model generation. Based on a novel +prompting strategy, we show that LLMs are able to outperform state-of-the-art +machine learning approaches with absolute performance improvements of up to 8\% +$F_1$ score across three different datasets. We evaluate our prompting strategy +on eight different LLMs, showing it is universally applicable, while also +analyzing the impact of certain prompt parts on extraction quality. The number +of example texts, the specificity of definitions, and the rigour of format +instructions are identified as key for improving the accuracy of extracted +information. Our code, prompts, and data are publicly available. + +
+
+
+
+
+ + ☆ Towards a Multidimensional Evaluation Framework for Empathetic + Conversational Systems + + +
+ Empathetic Conversational Systems (ECS) are built to respond empathetically
+to the user's emotions and sentiments, regardless of the application domain.
+Evaluation approaches in current ECS studies are restricted to offline evaluation
+experiments, primarily for gold-standard comparison and benchmarking, and to user
+evaluation studies for collecting human ratings on specific constructs. These
+methods are inadequate in measuring the actual quality of empathy in
+conversations. In this paper, we propose a multidimensional empathy evaluation
+framework with three new methods for measuring empathy at (i) the structural level
+using three empathy-related dimensions, (ii) the behavioral level using empathy
+behavioral types, and (iii) the overall level using an empathy lexicon, thereby
+fortifying the evaluation process. Experiments were conducted with
+state-of-the-art ECS models and large language models (LLMs) to show the
+framework's usefulness.
+
+

+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Is larger always better? Evaluating and prompting large language models + for non-generative medical tasks + + +
+ The use of Large Language Models (LLMs) in medicine is growing, but their +ability to handle both structured Electronic Health Record (EHR) data and +unstructured clinical notes is not well-studied. This study benchmarks various +models, including GPT-based LLMs, BERT-based models, and traditional clinical +predictive models, for non-generative medical tasks utilizing renowned +datasets. We assessed 14 language models (9 GPT-based and 5 BERT-based) and 7 +traditional predictive models using the MIMIC dataset (ICU patient records) and +the TJH dataset (early COVID-19 EHR data), focusing on tasks such as mortality +and readmission prediction, disease hierarchy reconstruction, and biomedical +sentence matching, comparing both zero-shot and finetuned performance. Results +indicated that LLMs exhibited robust zero-shot predictive capabilities on +structured EHR data when using well-designed prompting strategies, frequently +surpassing traditional models. However, for unstructured medical texts, LLMs +did not outperform finetuned BERT models, which excelled in both supervised and +unsupervised tasks. Consequently, while LLMs are effective for zero-shot +learning on structured data, finetuned BERT models are more suitable for +unstructured texts, underscoring the importance of selecting models based on +specific task requirements and data characteristics to optimize the application +of NLP technology in healthcare. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2402.01713 +
+
+
+
+
+ + ☆ The formation of perceptual space in early phonetic acquisition: a + cross-linguistic modeling approach + + +
+ This study investigates how learners organize perceptual space in early +phonetic acquisition by advancing previous studies in two key aspects. Firstly, +it examines the shape of the learned hidden representation as well as its +ability to categorize phonetic categories. Secondly, it explores the impact of +training models on context-free acoustic information, without involving +contextual cues, on phonetic acquisition, closely mimicking the early language +learning stage. Using a cross-linguistic modeling approach, autoencoder models +are trained on English and Mandarin and evaluated in both native and non-native +conditions, following experimental conditions used in infant language +perception studies. The results demonstrate that unsupervised bottom-up +training on context-free acoustic information leads to comparable learned +representations of perceptual space between native and non-native conditions +for both English and Mandarin, resembling the early stage of universal +listening in infants. These findings provide insights into the organization of +perceptual space during early phonetic acquisition and contribute to our +understanding of the formation and representation of phonetic categories. + +
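A minimal sketch of the modeling setup, assuming frame-level acoustic features such as MFCCs; the layer sizes and training loop below are illustrative and not the study's exact architecture.

```python
# Minimal autoencoder over context-free acoustic frames (e.g., 39-dim MFCC
# vectors); sizes are illustrative, not the study's exact model.
import torch
from torch import nn

class FrameAutoencoder(nn.Module):
    def __init__(self, dim_in=39, dim_hidden=8):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(dim_in, 64), nn.ReLU(),
                                     nn.Linear(64, dim_hidden))
        self.decoder = nn.Sequential(nn.Linear(dim_hidden, 64), nn.ReLU(),
                                     nn.Linear(64, dim_in))

    def forward(self, x):
        z = self.encoder(x)          # hidden representation = learned perceptual space
        return self.decoder(z), z

model = FrameAutoencoder()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
frames = torch.randn(256, 39)        # stand-in for acoustic frames
for _ in range(100):
    recon, _ = model(frames)
    loss = nn.functional.mse_loss(recon, frames)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```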
+
+ comment: 51 pages +
+
+
+
+
+ + ☆ A Reliable Common-Sense Reasoning Socialbot Built Using LLMs and + Goal-Directed ASP + + +
+ The development of large language models (LLMs), such as GPT, has enabled the +construction of several socialbots, like ChatGPT, that are receiving a lot of +attention for their ability to simulate a human conversation. However, the +conversation is not guided by a goal and is hard to control. In addition, +because LLMs rely more on pattern recognition than deductive reasoning, they +can give confusing answers and have difficulty integrating multiple topics into +a cohesive response. These limitations often lead the LLM to deviate from the +main topic to keep the conversation interesting. We propose AutoCompanion, a +socialbot that uses an LLM model to translate natural language into predicates +(and vice versa) and employs commonsense reasoning based on Answer Set +Programming (ASP) to hold a social conversation with a human. In particular, we +rely on s(CASP), a goal-directed implementation of ASP as the backend. This +paper presents the framework design and how an LLM is used to parse user +messages and generate a response from the s(CASP) engine output. To validate +our proposal, we describe (real) conversations in which the chatbot's goal is +to keep the user entertained by talking about movies and books, and s(CASP) +ensures (i) correctness of answers, (ii) coherence (and precision) during the +conversation, which it dynamically regulates to achieve its specific purpose, +and (iii) no deviation from the main topic. + +
+
+
+
+
+ + ☆ Towards More Accurate Prediction of Human Empathy and Emotion in Text + and Multi-turn Conversations by Combining Advanced NLP, Transformers-based + Networks, and Linguistic Methodologies + + +
+ Based on the WASSA 2022 Shared Task on Empathy Detection and Emotion +Classification, we predict the level of empathic concern and personal distress +displayed in essays. For the first stage of this project we implemented a +Feed-Forward Neural Network using sentence-level embeddings as features. We +experimented with four different embedding models for generating the inputs to +the neural network. The subsequent stage builds upon the previous work and we +have implemented three types of revisions. The first revision focuses on the +enhancements to the model architecture and the training approach. The second +revision focuses on handling class imbalance using stratified data sampling. +The third revision focuses on leveraging lexical resources, where we apply four +different resources to enrich the features associated with the dataset. During +the final stage of this project, we have created the final end-to-end system +for the primary task using an ensemble of models to revise primary task +performance. Additionally, as part of the final stage, these approaches have +been adapted to the WASSA 2023 Shared Task on Empathy Emotion and Personality +Detection in Interactions, in which the empathic concern, emotion polarity, and +emotion intensity in dyadic text conversations are predicted. + +
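A minimal sketch of the first-stage setup, assuming precomputed sentence embeddings as inputs; the embedding dimension, layer sizes, and score ranges are illustrative placeholders.

```python
# Sketch of a feed-forward regressor over precomputed sentence embeddings that
# predicts empathic concern and personal distress. Sizes are illustrative.
import torch
from torch import nn

class EmpathyRegressor(nn.Module):
    def __init__(self, embed_dim=384):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(embed_dim, 128), nn.ReLU(),
                                 nn.Dropout(0.2), nn.Linear(128, 2))

    def forward(self, x):             # x: (batch, embed_dim) sentence embeddings
        return self.net(x)            # -> (batch, 2): [empathy, distress]

model = EmpathyRegressor()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
embeddings = torch.randn(32, 384)     # stand-in for essay embeddings
targets = torch.rand(32, 2) * 7       # stand-in scores on a 1-7 scale
optimizer.zero_grad()
loss = nn.functional.mse_loss(model(embeddings), targets)
loss.backward()
optimizer.step()
```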
+
+
+
+
+ + ☆ A Role-specific Guided Large Language Model for Ophthalmic Consultation + Based on Stylistic Differentiation + + +
+ Ophthalmology consultations are crucial for diagnosing, treating, and
+preventing eye diseases. However, the growing demand for consultations exceeds
+the availability of ophthalmologists. By leveraging large pre-trained language
+models, we can design effective dialogues for specific scenarios, aiding in
+consultations. Traditional fine-tuning strategies for question-answering tasks
+are impractical due to increasing model size, and they often ignore the
+patient-doctor role function during consultations. In this paper, we propose
+EyeDoctor, an ophthalmic medical questioning large language model that enhances
+accuracy through doctor-patient role-perception guidance and an augmented
+knowledge base with external disease information. Experimental results show
+EyeDoctor achieves higher question-answering precision in ophthalmology
+consultations. Notably, EyeDoctor demonstrated a 7.25% improvement in Rouge-1
+scores and a 10.16% improvement in F1 scores on multi-round datasets compared
+to the second-best model, ChatGPT, highlighting the importance of
+doctor-patient role differentiation and dynamic knowledge base expansion for
+intelligent medical consultations. EyeDoc is also available as a free
+web-based service, and the source code is available at
+https://github.com/sperfu/EyeDoc.
+
+
+
+
+
+ + ☆ Multi-turn Response Selection with Commonsense-enhanced Language Models + + +
+ As a branch of advanced artificial intelligence, dialogue systems are +prospering. Multi-turn response selection is a general research problem in +dialogue systems. With the assistance of background information and pre-trained +language models, the performance of state-of-the-art methods on this problem +gains impressive improvement. However, existing studies neglect the importance +of external commonsense knowledge. Hence, we design a Siamese network where a +pre-trained Language model merges with a Graph neural network (SinLG). SinLG +takes advantage of Pre-trained Language Models (PLMs) to catch the word +correlations in the context and response candidates and utilizes a Graph Neural +Network (GNN) to reason helpful common sense from an external knowledge graph. +The GNN aims to assist the PLM in fine-tuning, and arousing its related +memories to attain better performance. Specifically, we first extract related +concepts as nodes from an external knowledge graph to construct a subgraph with +the context response pair as a super node for each sample. Next, we learn two +representations for the context response pair via both the PLM and GNN. A +similarity loss between the two representations is utilized to transfer the +commonsense knowledge from the GNN to the PLM. Then only the PLM is used to +infer online so that efficiency can be guaranteed. Finally, we conduct +extensive experiments on two variants of the PERSONA-CHAT dataset, which proves +that our solution can not only improve the performance of the PLM but also +achieve an efficient inference. + +
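A minimal sketch of the knowledge-transfer step, assuming both encoders output vectors of the same dimensionality; the cosine-based similarity loss below is one plausible instantiation rather than the paper's exact formulation.

```python
# Sketch of the knowledge-transfer idea: a similarity loss pulls the PLM's
# context-response representation toward the GNN's graph representation, so
# that only the PLM is needed at inference time. Shapes are illustrative.
import torch
import torch.nn.functional as F

def similarity_loss(plm_repr: torch.Tensor, gnn_repr: torch.Tensor) -> torch.Tensor:
    # plm_repr, gnn_repr: (batch, hidden); maximise cosine similarity.
    return (1.0 - F.cosine_similarity(plm_repr, gnn_repr, dim=-1)).mean()

plm_repr = torch.randn(8, 768, requires_grad=True)  # stand-in PLM outputs
gnn_repr = torch.randn(8, 768)                      # stand-in GNN outputs
loss = similarity_loss(plm_repr, gnn_repr)          # added to the selection loss
loss.backward()
```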
+
+
+
+
+ + ☆ Constructing the CORD-19 Vaccine Dataset + + +
+ We introduce a new dataset, 'CORD-19-Vaccination', to cater to scientists
+specifically looking into COVID-19 vaccine-related research. This dataset is
+extracted from the CORD-19 dataset [Wang et al., 2020] and augmented with new
+columns for language detail, author demography, keywords, and topic per paper.
+Facebook's fastText model is used to identify languages [Joulin et al., 2016].
+To establish author demography (author affiliation, lab/institution location,
+and lab/institution country columns), we processed the JSON file for each paper
+and then further enhanced it using Google's search API to determine country
+values. 'Yake' was used to extract keywords from the title, abstract, and body
+of each paper, and the LDA (Latent Dirichlet Allocation) algorithm was used to
+add topic information [Campos et al., 2020, 2018a,b]. To evaluate the dataset,
+we demonstrate a question-answering task like the one used in the CORD-19
+Kaggle challenge [Goldbloom et al., 2022]. For further evaluation, sequential
+sentence classification was performed on each paper's abstract using the model
+from Dernoncourt et al. [2016]. We partially hand-annotated the training
+dataset and used a pre-trained BERT-PubMed layer. 'CORD-19-Vaccination'
+contains 30k research papers and can be immensely valuable for NLP research
+such as text mining, information extraction, and question answering, specific
+to the domain of COVID-19 vaccine research.
+
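As an illustration of two of the enrichment steps (language identification with fastText and keyword extraction with 'Yake'), a minimal sketch is shown below; the model file path and extractor settings are assumptions, and the pretrained lid.176 model must be obtained separately.

```python
# Sketch of enriching a paper with a language label (fastText) and keywords (YAKE).
import fasttext
import yake

lang_model = fasttext.load_model("lid.176.ftz")   # pretrained language-ID model file
kw_extractor = yake.KeywordExtractor(top=10)

def enrich(paper_text: str) -> dict:
    labels, _probs = lang_model.predict(paper_text.replace("\n", " "))
    language = labels[0].replace("__label__", "")
    keywords = [kw for kw, _score in kw_extractor.extract_keywords(paper_text)]
    return {"language": language, "keywords": keywords}

print(enrich("COVID-19 vaccines induce protective antibody responses."))
```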
+
+
+
+
+ + ☆ Enhancing Dysarthric Speech Recognition for Unseen Speakers via + Prototype-Based Adaptation + + +
+ Dysarthric speech recognition (DSR) presents a formidable challenge due to +inherent inter-speaker variability, leading to severe performance degradation +when applying DSR models to new dysarthric speakers. Traditional speaker +adaptation methodologies typically involve fine-tuning models for each speaker, +but this strategy is cost-prohibitive and inconvenient for disabled users, +requiring substantial data collection. To address this issue, we introduce a +prototype-based approach that markedly improves DSR performance for unseen +dysarthric speakers without additional fine-tuning. Our method employs a +feature extractor trained with HuBERT to produce per-word prototypes that +encapsulate the characteristics of previously unseen speakers. These prototypes +serve as the basis for classification. Additionally, we incorporate supervised +contrastive learning to refine feature extraction. By enhancing representation +quality, we further improve DSR performance, enabling effective personalized +DSR. We release our code at https://github.com/NKU-HLT/PB-DSR. + +
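A minimal sketch of the prototype-based classification idea, assuming per-word feature sets already extracted by the (HuBERT-trained) feature extractor; the feature dimensionality and cosine-similarity scoring are illustrative choices.

```python
# Sketch of prototype-based classification: per-word prototypes are the mean of
# training features for that word, and a test utterance is assigned to the word
# with the most similar prototype.
import torch
import torch.nn.functional as F

def build_prototypes(features_by_word: dict) -> tuple:
    words = sorted(features_by_word)
    protos = torch.stack([features_by_word[w].mean(dim=0) for w in words])
    return words, protos

def classify(utterance_feat: torch.Tensor, words: list, protos: torch.Tensor) -> str:
    sims = F.cosine_similarity(utterance_feat.unsqueeze(0), protos, dim=-1)
    return words[int(sims.argmax())]

feats = {"yes": torch.randn(20, 256), "no": torch.randn(15, 256)}  # stand-in features
words, protos = build_prototypes(feats)
print(classify(torch.randn(256), words, protos))
```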
+
+ comment: accepted by Interspeech 2024 +
+
+
+
+
+ + ☆ Fairness Definitions in Language Models Explained + + +
+ Language Models (LMs) have demonstrated exceptional performance across +various Natural Language Processing (NLP) tasks. Despite these advancements, +LMs can inherit and amplify societal biases related to sensitive attributes +such as gender and race, limiting their adoption in real-world applications. +Therefore, fairness has been extensively explored in LMs, leading to the +proposal of various fairness notions. However, the lack of clear agreement on +which fairness definition to apply in specific contexts (\textit{e.g.,} +medium-sized LMs versus large-sized LMs) and the complexity of understanding +the distinctions between these definitions can create confusion and impede +further progress. To this end, this paper proposes a systematic survey that +clarifies the definitions of fairness as they apply to LMs. Specifically, we +begin with a brief introduction to LMs and fairness in LMs, followed by a +comprehensive, up-to-date overview of existing fairness notions in LMs and the +introduction of a novel taxonomy that categorizes these concepts based on their +foundational principles and operational distinctions. We further illustrate +each definition through experiments, showcasing their practical implications +and outcomes. Finally, we discuss current research challenges and open +questions, aiming to foster innovative ideas and advance the field. The +implementation and additional resources are publicly available at +https://github.com/LavinWong/Fairness-in-Large-Language-Models/tree/main/definitions. + +
+
+
+
+
+ + ☆ Guidance-Based Prompt Data Augmentation in Specialized Domains for Named + Entity Recognition + + +
+ While the abundance of rich and vast datasets across numerous fields has
+facilitated the advancement of natural language processing, sectors in need of
+specialized data types continue to struggle with the challenge of finding
+quality data. Our study introduces a novel guidance data augmentation technique
+utilizing abstracted context and sentence structures to produce varied
+sentences while maintaining context-entity relationships, addressing data
+scarcity challenges. By fostering a closer relationship between context,
+sentence structure, and the role of entities, our method enhances the
+effectiveness of data augmentation. Consequently, it yields diversification in
+both entity-related vocabulary and overall sentence structure while
+simultaneously improving the training performance of the named entity
+recognition task.
+
+
+
+
+
+ + ☆ Many-Shot In-Context Learning for Molecular Inverse Design + + +
+ Large Language Models (LLMs) have demonstrated great performance in few-shot +In-Context Learning (ICL) for a variety of generative and discriminative +chemical design tasks. The newly expanded context windows of LLMs can further +improve ICL capabilities for molecular inverse design and lead optimization. To +take full advantage of these capabilities we developed a new semi-supervised +learning method that overcomes the lack of experimental data available for +many-shot ICL. Our approach involves iterative inclusion of LLM generated +molecules with high predicted performance, along with experimental data. We +further integrated our method in a multi-modal LLM which allows for the +interactive modification of generated molecular structures using text +instructions. As we show, the new method greatly improves upon existing ICL +methods for molecular design while being accessible and easy to use for +scientists. + +
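A minimal sketch of the iterative semi-supervised loop described above; `generate_molecules` and `predict_score` are hypothetical placeholders for the many-shot ICL call and the property predictor, and the thresholding scheme is an assumption.

```python
# Sketch of the iterative loop: LLM-generated molecules with high predicted
# scores are added to the in-context example pool alongside experimental data.
def many_shot_inverse_design(experimental_data, generate_molecules, predict_score,
                             n_rounds=3, score_threshold=0.8):
    examples = list(experimental_data)          # (smiles, measured_property) pairs
    for _ in range(n_rounds):
        candidates = generate_molecules(examples)          # many-shot ICL call
        for smiles in candidates:
            score = predict_score(smiles)                  # surrogate property model
            if score >= score_threshold:
                examples.append((smiles, score))           # pseudo-labelled example
    return examples

# toy demo with stand-in functions
demo = many_shot_inverse_design(
    experimental_data=[("CCO", 0.91)],
    generate_molecules=lambda ex: ["CCN", "CCC"],
    predict_score=lambda s: 0.85 if s == "CCN" else 0.4,
)
print(demo)
```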
+
+
+
+
+ + ☆ OfficeBench: Benchmarking Language Agents across Multiple Applications + for Office Automation + + +
+ Office automation significantly enhances human productivity by automatically
+finishing routine tasks in the workflow. Beyond the basic information
+extraction studied in much of the prior document AI literature, office
+automation research should be extended to more realistic office tasks that
+require integrating various information sources in the office system and
+producing outputs through a series of decision-making processes. We introduce
+OfficeBench, one of the first office automation benchmarks for evaluating
+current LLM agents' capability to address office tasks in realistic office
+workflows. OfficeBench requires LLM agents to perform feasible long-horizon
+planning, proficiently switch between applications in a timely manner, and
+accurately ground their actions within a large combined action space, based on
+the contextual demands of the workflow. Applying our customized evaluation
+methods to each task, we find that GPT-4 Omni achieves the highest pass rate of
+47.00%, demonstrating a decent performance in handling office tasks. However,
+this is still far below the human performance and accuracy standards required
+by real-world office workflows. We further observe that most issues are related
+to operation redundancy and hallucinations, as well as limitations in switching
+between multiple applications, which may provide valuable insights for
+developing effective agent frameworks for office automation.
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Optimizing Numerical Estimation and Operational Efficiency in the Legal + Domain through Large Language Models CIKM + + +
+ The legal landscape encompasses a wide array of lawsuit types, presenting +lawyers with challenges in delivering timely and accurate information to +clients, particularly concerning critical aspects like potential imprisonment +duration or financial repercussions. Compounded by the scarcity of legal +experts, there's an urgent need to enhance the efficiency of traditional legal +workflows. Recent advances in deep learning, especially Large Language Models +(LLMs), offer promising solutions to this challenge. Leveraging LLMs' +mathematical reasoning capabilities, we propose a novel approach integrating +LLM-based methodologies with specially designed prompts to address precision +requirements in legal Artificial Intelligence (LegalAI) applications. The +proposed work seeks to bridge the gap between traditional legal practices and +modern technological advancements, paving the way for a more accessible, +efficient, and equitable legal system. To validate this method, we introduce a +curated dataset tailored to precision-oriented LegalAI tasks, serving as a +benchmark for evaluating LLM-based approaches. Extensive experimentation +confirms the efficacy of our methodology in generating accurate numerical +estimates within the legal domain, emphasizing the role of LLMs in streamlining +legal processes and meeting the evolving demands of LegalAI. + +
+
+ comment: The paper has been accepted by the 33rd ACM International Conference + on Information and Knowledge Management (CIKM) in 2024 +
+
+
+
+
+ + ♻ ☆ Recursive Introspection: Teaching Language Model Agents How to + Self-Improve + + +
+ A central piece in enabling intelligent agentic behavior in foundation models +is to make them capable of introspecting upon their behavior, reasoning, and +correcting their mistakes as more computation or interaction is available. Even +the strongest proprietary large language models (LLMs) do not quite exhibit the +ability of continually improving their responses sequentially, even in +scenarios where they are explicitly told that they are making a mistake. In +this paper, we develop RISE: Recursive IntroSpEction, an approach for +fine-tuning LLMs to introduce this capability, despite prior work hypothesizing +that this capability may not be possible to attain. Our approach prescribes an +iterative fine-tuning procedure, which attempts to teach the model how to alter +its response after having executed previously unsuccessful attempts to solve a +hard test-time problem, with optionally additional environment feedback. RISE +poses fine-tuning for a single-turn prompt as solving a multi-turn Markov +decision process (MDP), where the initial state is the prompt. Inspired by +principles in online imitation learning and reinforcement learning, we propose +strategies for multi-turn data collection and training so as to imbue an LLM +with the capability to recursively detect and correct its previous mistakes in +subsequent iterations. Our experiments show that RISE enables Llama2, Llama3, +and Mistral models to improve themselves with more turns on math reasoning +tasks, outperforming several single-turn strategies given an equal amount of +inference-time computation. We also find that RISE scales well, often attaining +larger benefits with more capable models. Our analysis shows that RISE makes +meaningful improvements to responses to arrive at the correct solution for +challenging prompts, without disrupting one-turn abilities as a result of +expressing more complex distributions. + +
+
+
+
+
+ + ♻ ☆ How Well Can a Long Sequence Model Model Long Sequences? Comparing + Architechtural Inductive Biases on Long-Context Abilities + + +
+ Long sequences occur in abundance within real-world scenarios, hence properly
+modelling them opens numerous downstream use cases. Deep neural networks,
+however, have often struggled with these for a variety of reasons. Recent
+advances, both in system engineering as well as model design, have enabled the
+scaling up of models that are purported to support extended context length. In
+particular, the state-space and linear recurrent neural network families of
+models can hypothetically extend to infinite sequence length. However, is this
+too good to be true? We conduct an evaluation to show that while such claims
+may be sound theoretically, there remain large practical gaps that are
+empirically observed. In particular, recurrent models still suffer in the same
+settings as long-context LLMs with attention. We further show that different
+inductive biases have inconsistent extrapolation capabilities, highlighting the
+need to further study such paradigms and investigate why long-context models
+seemingly fail to behave as one might expect.
+
+
+ comment: Work In Progress. 9 pages +
+
+
+
+
+ + ♻ ☆ Distilling Multi-Scale Knowledge for Event Temporal Relation Extraction CIKM 2024 + + +
+ Event Temporal Relation Extraction (ETRE) is paramount but challenging.
+Within a discourse, event pairs are situated at different distances, or the
+so-called proximity bands. The temporal ordering communicated about event pairs
+at more remote (i.e., ``long'') or less remote (i.e., ``short'') proximity
+bands is encoded differently. SOTA models have tended to perform well on events
+situated at either short or long proximity bands, but not both. Nonetheless,
+real-world, natural texts contain all types of temporal event pairs. In this
+paper, we present MulCo: Distilling Multi-Scale Knowledge via Contrastive
+Learning, a knowledge co-distillation approach that shares knowledge across
+multiple event pair proximity bands to improve performance on all types of
+temporal datasets. Our experimental results show that MulCo successfully
+integrates linguistic cues pertaining to temporal reasoning across both short
+and long proximity bands and achieves new state-of-the-art results on several
+ETRE benchmark datasets.
+
+
+ comment: Accepted to CIKM 2024 Full Research Track, camera ready version +
+
+
+
+
+ + ♻ ☆ Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic + + +
+ Recent advancements have significantly enhanced the capabilities of +Multimodal Large Language Models (MLLMs) in generating and understanding +image-to-text content. Despite these successes, progress is predominantly +limited to English due to the scarcity of high quality multimodal resources in +other languages. This limitation impedes the development of competitive models +in languages such as Arabic. To alleviate this situation, we introduce an +efficient Arabic multimodal assistant, dubbed Dallah, that utilizes an advanced +language model based on LLaMA-2 to facilitate multimodal interactions. Dallah +demonstrates state-of-the-art performance in Arabic MLLMs. Through fine-tuning +six Arabic dialects, Dallah showcases its capability to handle complex +dialectal interactions incorporating both textual and visual elements. The +model excels in two benchmark tests: one evaluating its performance on Modern +Standard Arabic (MSA) and another specifically designed to assess dialectal +responses. Beyond its robust performance in multimodal interaction tasks, +Dallah has the potential to pave the way for further development of +dialect-aware Arabic MLLMs. + +
+
+
+
+
+ + ♻ ☆ EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively + Exploring Electronic Health Records ACL 2024 + + +
+ In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL +dataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to +address critical yet underexplored aspects in text-to-SQL parsing: +interactivity, compositionality, and efficiency. To the best of our knowledge, +EHR-SeqSQL is not only the largest but also the first medical text-to-SQL +dataset benchmark to include sequential and contextual questions. We provide a +data split and the new test set designed to assess compositional generalization +ability. Our experiments demonstrate the superiority of a multi-turn approach +over a single-turn approach in learning compositionality. Additionally, our +dataset integrates specially crafted tokens into SQL queries to improve +execution efficiency. With EHR-SeqSQL, we aim to bridge the gap between +practical needs and academic research in the text-to-SQL domain. EHR-SeqSQL is +available \href{https://github.com/seonhee99/EHR-SeqSQL}{at this https URL}. + +
+
+ comment: ACL 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Harnessing the Power of Large Language Models for Empathetic Response + Generation: Empirical Investigations and Improvements EMNLP 2023 + + +
+ Empathetic dialogue is an indispensable part of building harmonious social
+relationships and contributes to the development of a helpful AI. Previous
+approaches are mainly based on fine-tuning small-scale language models. With
+the advent of ChatGPT, the application effect of large language models (LLMs)
+in this field has attracted great attention. This work empirically investigates
+the performance of LLMs in generating empathetic responses and proposes three
+improvement methods: semantically similar in-context learning, two-stage
+interactive generation, and combination with a knowledge base. Extensive
+experiments show that LLMs can significantly benefit from our proposed methods
+and are able to achieve state-of-the-art performance in both automatic and
+human evaluations. Additionally, we explore the possibility of GPT-4 simulating
+human evaluators.
+
+
+ comment: Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Pseudo-Prompt Generating in Pre-trained Vision-Language Models for + Multi-Label Medical Image Classification + + +
+ The task of medical image recognition is notably complicated by the presence
+of varied and multiple pathological indications, presenting a unique challenge
+in multi-label classification with unseen labels. This complexity underlines
+the need for computer-aided diagnosis methods employing multi-label zero-shot
+learning. Recent advancements in pre-trained vision-language models (VLMs) have
+showcased notable zero-shot classification abilities on medical images.
+However, these methods have limitations in leveraging extensive pre-trained
+knowledge from broader image datasets, and often depend on manual prompt
+construction by expert radiologists. By automating the process of prompt
+tuning, prompt learning techniques have emerged as an efficient way to adapt
+VLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in
+performing class-specific prompts on unseen categories, limiting
+generalizability in fine-grained scenarios. To overcome these constraints, we
+introduce a novel prompt generation approach inspired by text generation in
+natural language processing (NLP). Our method, named Pseudo-Prompt Generating
+(PsPG), capitalizes on the prior knowledge of multi-modal features. Featuring
+an RNN-based decoder, PsPG autoregressively generates class-tailored embedding
+vectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label
+chest radiograph datasets affirm the superiority of our approach against
+leading medical vision-language and multi-label prompt learning methods. The
+source code is available at https://github.com/fallingnight/PsPG
+
+
+ comment: Accepted by PRCV 2024 +
+
+
+
+
+ + ♻ ☆ Large Language Model for Table Processing: A Survey + + +
+ Tables, typically two-dimensional and structured to store large amounts of +data, are essential in daily activities like database queries, spreadsheet +manipulations, web table question answering, and image table information +extraction. Automating these table-centric tasks with Large Language Models +(LLMs) or Visual Language Models (VLMs) offers significant public benefits, +garnering interest from academia and industry. This survey provides a +comprehensive overview of table-related tasks, examining both user scenarios +and technical aspects. It covers traditional tasks like table question +answering as well as emerging fields such as spreadsheet manipulation and table +data analysis. We summarize the training techniques for LLMs and VLMs tailored +for table processing. Additionally, we discuss prompt engineering, particularly +the use of LLM-powered agents, for various table-related tasks. Finally, we +highlight several challenges, including processing implicit user intentions and +extracting information from various table sources. + +
+
+
+
+
+ + ♻ ☆ Scaling Laws with Vocabulary: Larger Models Deserve Larger Vocabularies + + +
+ Research on scaling large language models (LLMs) has primarily focused on +model parameters and training data size, overlooking the role of vocabulary +size. We investigate how vocabulary size impacts LLM scaling laws by training +models ranging from 33M to 3B parameters on up to 500B characters with various +vocabulary configurations. We propose three complementary approaches for +predicting the compute-optimal vocabulary size: IsoFLOPs analysis, derivative +estimation, and parametric fit of the loss function. Our approaches converge on +the same result that the optimal vocabulary size depends on the available +compute budget and that larger models deserve larger vocabularies. However, +most LLMs use too small vocabulary sizes. For example, we predict that the +optimal vocabulary size of Llama2-70B should have been at least 216K, 7 times +larger than its vocabulary of 32K. We validate our predictions empirically by +training models with 3B parameters across different FLOPs budgets. Adopting our +predicted optimal vocabulary size consistently improves downstream performance +over commonly used vocabulary sizes. By increasing the vocabulary size from the +conventional 32K to 43K, we improve performance on ARC-Challenge from 29.1 to +32.0 with the same 2.3e21 FLOPs. Our work emphasizes the necessity of jointly +considering model parameters and vocabulary size for efficient scaling. + +
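As a rough illustration of the parametric-fit idea, the sketch below fits a simple surrogate loss curve over vocabulary size (a quadratic in log V, standing in for the paper's actual parameterisation) and selects the vocabulary that minimises predicted loss; all numbers are made-up placeholders.

```python
# Illustrative surrogate fit for choosing a compute-optimal vocabulary size.
import numpy as np
from scipy.optimize import curve_fit

def loss_model(logV, a, b, c):
    # simple convex surrogate in log vocabulary size (not the paper's exact form)
    return a * logV ** 2 + b * logV + c

V_obs = np.array([8e3, 16e3, 32e3, 64e3, 128e3])      # vocabulary sizes tried
loss_obs = np.array([2.31, 2.24, 2.20, 2.18, 2.19])   # stand-in validation losses
params, _ = curve_fit(loss_model, np.log(V_obs), loss_obs)

V_grid = np.linspace(8e3, 256e3, 500)
best_V = V_grid[np.argmin(loss_model(np.log(V_grid), *params))]
print(f"predicted loss-minimising vocabulary size: {best_V:,.0f}")
```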
+
+ comment: 26 pages, 12 figures. Add more related work +
+
+
+
+
+ + ♻ ☆ Examining the Influence of Political Bias on Large Language Model + Performance in Stance Classification + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +executing tasks based on natural language queries. However, these models, +trained on curated datasets, inherently embody biases ranging from racial to +national and gender biases. It remains uncertain whether these biases impact +the performance of LLMs for certain tasks. In this study, we investigate the +political biases of LLMs within the stance classification task, specifically +examining whether these models exhibit a tendency to more accurately classify +politically-charged stances. Utilizing three datasets, seven LLMs, and four +distinct prompting schemes, we analyze the performance of LLMs on politically +oriented statements and targets. Our findings reveal a statistically +significant difference in the performance of LLMs across various politically +oriented stance classification tasks. Furthermore, we observe that this +difference primarily manifests at the dataset level, with models and prompting +schemes showing statistically similar performances across different stance +classification datasets. Lastly, we observe that when there is greater +ambiguity in the target the statement is directed towards, LLMs have poorer +stance classification accuracy. + Code & Dataset: http://doi.org/10.5281/zenodo.12938478 + +
+
+ comment: Accepted at ICWSM 2025 +
+
+
+
+
+ + ♻ ☆ LLMs-in-the-loop Part-1: Expert Small AI Models for Bio-Medical Text + Translation + + +
+ Machine translation is indispensable in healthcare for enabling the global +dissemination of medical knowledge across languages. However, complex medical +terminology poses unique challenges to achieving adequate translation quality +and accuracy. This study introduces a novel "LLMs-in-the-loop" approach to +develop supervised neural machine translation models optimized specifically for +medical texts. While large language models (LLMs) have demonstrated powerful +capabilities, this research shows that small, specialized models trained on +high-quality in-domain (mostly synthetic) data can outperform even vastly +larger LLMs. + Custom parallel corpora in six languages were compiled from scientific +articles, synthetically generated clinical documents, and medical texts. Our +LLMs-in-the-loop methodology employs synthetic data generation, rigorous +evaluation, and agent orchestration to enhance performance. We developed small +medical translation models using the MarianMT base model. We introduce a new +medical translation test dataset to standardize evaluation in this domain. +Assessed using BLEU, METEOR, ROUGE, and BERT scores on this test set, our +MarianMT-based models outperform Google Translate, DeepL, and GPT-4-Turbo. + Results demonstrate that our LLMs-in-the-loop approach, combined with +fine-tuning high-quality, domain-specific data, enables specialized models to +outperform general-purpose and some larger systems. This research, part of a +broader series on expert small models, paves the way for future +healthcare-related AI developments, including deidentification and bio-medical +entity extraction models. Our study underscores the potential of tailored +neural translation models and the LLMs-in-the-loop methodology to advance the +field through improved data generation, evaluation, agent, and modeling +techniques. + +
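A minimal sketch of running translation with a MarianMT checkpoint via Hugging Face transformers; the public Helsinki-NLP model named below is only a stand-in for the specialised medical models described in the paper.

```python
# Translate a medical sentence with a MarianMT checkpoint (stand-in model name).
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

batch = tokenizer(["The patient presented with acute myocardial infarction."],
                  return_tensors="pt", padding=True)
generated = model.generate(**batch, max_new_tokens=64)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```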
+
+ comment: 14 pages, 2 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Exploring Scaling Trends in LLM Robustness + + +
+ Language model capabilities predictably improve from scaling a model's size +and training data. Motivated by this, increasingly large language models have +been trained, yielding an array of impressive capabilities. Yet these models +are vulnerable to adversarial prompts, such as "jailbreaks" that hijack models +to perform undesired behaviors, posing a significant risk of misuse. Prior work +indicates that computer vision models become more robust with model and data +scaling, raising the question: does language model robustness also improve with +scale? We study this question empirically, finding that larger models respond +substantially better to adversarial training, but there is little to no benefit +from model scale in the absence of explicit defenses. + +
+
+ comment: 31 pages; edit fixed metadata typo (author name) +
+
+
+
+
+ + ♻ ☆ Model Composition for Multimodal Large Language Models ACL2024 + + +
+ Recent developments in Multimodal Large Language Models (MLLMs) have shown +rapid progress, moving towards the goal of creating versatile MLLMs that +understand inputs from various modalities. However, existing methods typically +rely on joint training with paired multimodal instruction data, which is +resource-intensive and challenging to extend to new modalities. In this paper, +we propose a new paradigm through the model composition of existing MLLMs to +create a new model that retains the modal understanding capabilities of each +original model. Our basic implementation, NaiveMC, demonstrates the +effectiveness of this paradigm by reusing modality encoders and merging LLM +parameters. Furthermore, we introduce DAMC to address parameter interference +and mismatch issues during the merging process, thereby enhancing the model +performance. To facilitate research in this area, we propose MCUB, a benchmark +for assessing ability of MLLMs to understand inputs from diverse modalities. +Experiments on this benchmark and four other multimodal understanding tasks +show significant improvements over baselines, proving that model composition +can create a versatile model capable of processing inputs from multiple +modalities. + +
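A minimal sketch of the parameter-merging mechanism behind this paradigm: the shared LLM weights of two models are averaged while modality encoders are reused as-is. This is a simplification for illustration; DAMC additionally addresses parameter interference, which is not shown here.

```python
# Average the shared LLM parameters of two models (toy state dicts).
import torch

def merge_state_dicts(sd_a: dict, sd_b: dict) -> dict:
    merged = {}
    for name, tensor_a in sd_a.items():
        tensor_b = sd_b[name]
        merged[name] = (tensor_a + tensor_b) / 2.0   # simple average of LLM weights
    return merged

sd_a = {"layer.weight": torch.ones(4, 4)}
sd_b = {"layer.weight": torch.zeros(4, 4)}
print(merge_state_dicts(sd_a, sd_b)["layer.weight"][0])
```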
+
+ comment: ACL2024 Main Conference; Code is available at + https://github.com/THUNLP-MT/ModelCompose +
+
+
+
+
+ + ♻ ☆ Automatic Speech Recognition Advancements for Indigenous Languages of + the Americas + + +
+ Indigenous languages are a fundamental legacy in the development of human +communication, embodying the unique identity and culture of local communities +in America. The Second AmericasNLP (Americas Natural Language Processing) +Competition Track 1 of NeurIPS (Neural Information Processing Systems) 2022 +proposed the task of training automatic speech recognition (ASR) systems for +five Indigenous languages: Quechua, Guarani, Bribri, Kotiria, and Wa'ikhana. In +this paper, we describe the fine-tuning of a state-of-the-art ASR model for +each target language, using approximately 36.65 h of transcribed speech data +from diverse sources enriched with data augmentation methods. We systematically +investigate, using a Bayesian search, the impact of the different +hyperparameters on the Wav2vec2.0 XLS-R (Cross-Lingual Speech Representations) +variants of 300 M and 1 B parameters. Our findings indicate that data and +detailed hyperparameter tuning significantly affect ASR accuracy, but language +complexity determines the final result. The Quechua model achieved the lowest +character error rate (CER) (12.14), while the Kotiria model, despite having the +most extensive dataset during the fine-tuning phase, showed the highest CER +(36.59). Conversely, with the smallest dataset, the Guarani model achieved a +CER of 15.59, while Bribri and Wa'ikhana obtained, respectively, CERs of 34.70 +and 35.23. Additionally, Sobol' sensitivity analysis highlighted the crucial +roles of freeze fine-tuning updates and dropout rates. We release our best +models for each language, marking the first open ASR models for Wa'ikhana and +Kotiria. This work opens avenues for future research to advance ASR techniques +in preserving minority Indigenous languages + +
+
+
+
+
+ + ♻ ☆ RCAgent: Cloud Root Cause Analysis by Autonomous Agents with + Tool-Augmented Large Language Models + + +
+ Large language model (LLM) applications in cloud root cause analysis (RCA) +have been actively explored recently. However, current methods are still +reliant on manual workflow settings and do not unleash LLMs' decision-making +and environment interaction capabilities. We present RCAgent, a tool-augmented +LLM autonomous agent framework for practical and privacy-aware industrial RCA +usage. Running on an internally deployed model rather than GPT families, +RCAgent is capable of free-form data collection and comprehensive analysis with +tools. Our framework combines a variety of enhancements, including a unique +Self-Consistency for action trajectories, and a suite of methods for context +management, stabilization, and importing domain knowledge. Our experiments show +RCAgent's evident and consistent superiority over ReAct across all aspects of +RCA -- predicting root causes, solutions, evidence, and responsibilities -- and +tasks covered or uncovered by current rules, as validated by both automated +metrics and human evaluations. Furthermore, RCAgent has already been integrated +into the diagnosis and issue discovery workflow of the Real-time Compute +Platform for Apache Flink of Alibaba Cloud. + +
+
+
+
+
+ + ♻ ☆ Customized Retrieval Augmented Generation and Benchmarking for EDA Tool + Documentation QA + + +
+ Retrieval augmented generation (RAG) enhances the accuracy and reliability of
+generative AI models by sourcing factual information from external databases,
+and is extensively employed in document-grounded question-answering (QA) tasks.
+Off-the-shelf RAG flows are well pretrained on general-purpose documents, yet
+they encounter significant challenges when applied to knowledge-intensive
+vertical domains, such as electronic design automation (EDA). This paper
+addresses this issue by proposing a customized RAG framework along with three
+domain-specific techniques for EDA tool documentation QA, including a
+contrastive learning scheme for text embedding model fine-tuning, a reranker
+distilled from a proprietary LLM, and a generative LLM fine-tuned with a
+high-quality domain corpus. Furthermore, we have developed and released a
+documentation QA evaluation benchmark, ORD-QA, for OpenROAD, an advanced
+RTL-to-GDSII design platform. Experimental results demonstrate that our
+proposed RAG flow and techniques achieve superior performance on ORD-QA as well
+as on a commercial tool, compared with the state of the art. The ORD-QA
+benchmark and the training dataset for our customized RAG flow are open-source
+at https://github.com/lesliepy99/RAG-EDA.
+
+
+ comment: Accepted by ICCAD 2024 +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis: Domains, + Methods, and Trends + + +
+ Aspect-based Sentiment Analysis (ABSA) is a fine-grained type of sentiment +analysis that identifies aspects and their associated opinions from a given +text. With the surge of digital opinionated text data, ABSA gained increasing +popularity for its ability to mine more detailed and targeted insights. Many +review papers on ABSA subtasks and solution methodologies exist, however, few +focus on trends over time or systemic issues relating to research application +domains, datasets, and solution approaches. To fill the gap, this paper +presents a Systematic Literature Review (SLR) of ABSA studies with a focus on +trends and high-level relationships among these fundamental components. This +review is one of the largest SLRs on ABSA. To our knowledge, it is also the +first to systematically examine the interrelations among ABSA research and data +distribution across domains, as well as trends in solution paradigms and +approaches. Our sample includes 727 primary studies screened from 8550 search +results without time constraints via an innovative automatic filtering process. +Our quantitative analysis not only identifies trends in nearly two decades of +ABSA research development but also unveils a systemic lack of dataset and +domain diversity as well as domain mismatch that may hinder the development of +future ABSA research. We discuss these findings and their implications and +propose suggestions for future research. + +
+
+
+
+
+ + ♻ ☆ Reactor Mk.1 performances: MMLU, HumanEval and BBH test results + + +
+ The paper presents the performance results of Reactor Mk.1, ARC's flagship
+large language model, through a benchmarking process analysis. The model
+utilizes the Lychee AI engine and possesses less than 100 billion parameters,
+resulting in a combination of efficiency and potency. The Reactor Mk.1
+outperformed models such as GPT-4o, Claude Opus, and Llama 3, achieving scores
+of 92% on the MMLU dataset, 91% on the HumanEval dataset, and 88% on the BBH
+dataset. It excels both at managing difficult tasks and at reasoning,
+establishing itself as a prominent solution among current cutting-edge AI
+technologies.
+
+
+
+
+
+ + ♻ ☆ 3MVRD: Multimodal Multi-task Multi-teacher Visually-Rich Form Document + Understanding ACL 2024 + + +
+ This paper presents a groundbreaking multimodal, multi-task, multi-teacher +joint-grained knowledge distillation model for visually-rich form document +understanding. The model is designed to leverage insights from both +fine-grained and coarse-grained levels by facilitating a nuanced correlation +between token and entity representations, addressing the complexities inherent +in form documents. Additionally, we introduce new inter-grained and +cross-grained loss functions to further refine diverse multi-teacher knowledge +distillation transfer process, presenting distribution gaps and a harmonised +understanding of form documents. Through a comprehensive evaluation across +publicly available form document understanding datasets, our proposed model +consistently outperforms existing baselines, showcasing its efficacy in +handling the intricate structures and content of visually complex form +documents. + +
+
+ comment: Accepted at Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Grounding Language Models for Visual Entity Recognition ECCV 2024 + + +
+ We introduce AutoVER, an Autoregressive model for Visual Entity Recognition. +Our model extends an autoregressive Multi-modal Large Language Model by +employing retrieval augmented constrained generation. It mitigates low +performance on out-of-domain entities while excelling in queries that require +visually-situated reasoning. Our method learns to distinguish similar entities +within a vast label space by contrastively training on hard negative pairs in +parallel with a sequence-to-sequence objective without an external retriever. +During inference, a list of retrieved candidate answers explicitly guides +language generation by removing invalid decoding paths. The proposed method +achieves significant improvements across different dataset splits in the +recently proposed Oven-Wiki benchmark. Accuracy on the Entity seen split rises +from 32.7% to 61.5%. It also demonstrates superior performance on the unseen +and query splits by a substantial double-digit margin. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ AIR-Bench: Benchmarking Large Audio-Language Models via Generative + Comprehension ACL + 2024 + + +
+ Recently, instruction-following audio-language models have received broad +attention for human-audio interaction. However, the absence of benchmarks +capable of evaluating audio-centric interaction capabilities has impeded +advancements in this field. Previous models primarily focus on assessing +different fundamental tasks, such as Automatic Speech Recognition (ASR), and +lack an assessment of the open-ended generative capabilities centered around +audio. Thus, it is challenging to track the progression in the Large +Audio-Language Models (LALMs) domain and to provide guidance for future +improvement. In this paper, we introduce AIR-Bench (\textbf{A}udio +\textbf{I}nst\textbf{R}uction \textbf{Bench}mark), the first benchmark designed +to evaluate the ability of LALMs to understand various types of audio signals +(including human speech, natural sounds, and music), and furthermore, to +interact with humans in the textual format. AIR-Bench encompasses two +dimensions: \textit{foundation} and \textit{chat} benchmarks. The former +consists of 19 tasks with approximately 19k single-choice questions, intending +to inspect the basic single-task ability of LALMs. The latter one contains 2k +instances of open-ended question-and-answer data, directly assessing the +comprehension of the model on complex audio and its capacity to follow +instructions. Both benchmarks require the model to generate hypotheses +directly. We design a unified framework that leverages advanced language +models, such as GPT-4, to evaluate the scores of generated hypotheses given the +meta-information of the audio. Experimental results demonstrate a high level of +consistency between GPT-4-based evaluation and human evaluation. By revealing +the limitations of existing LALMs through evaluation results, AIR-Bench can +provide insights into the direction of future research. + +
+
+ comment: Code and Data: https://github.com/OFA-Sys/AIR-Bench. Accepted by ACL + 2024 +
+
+
+
+
+ + ♻ ☆ The Janus Interface: How Fine-Tuning in Large Language Models Amplifies + the Privacy Risks CCS 2024 + + +
+ The rapid advancements of large language models (LLMs) have raised public +concerns about the privacy leakage of personally identifiable information (PII) +within their extensive training datasets. Recent studies have demonstrated that +an adversary could extract highly sensitive privacy data from the training data +of LLMs with carefully designed prompts. However, these attacks suffer from the +model's tendency to hallucinate and catastrophic forgetting (CF) in the +pre-training stage, rendering the veracity of divulged PIIs negligible. In our +research, we propose a novel attack, Janus, which exploits the fine-tuning +interface to recover forgotten PIIs from the pre-training data in LLMs. We +formalize the privacy leakage problem in LLMs and explain why forgotten PIIs +can be recovered through empirical analysis on open-source language models. +Based upon these insights, we evaluate the performance of Janus on both +open-source language models and two latest LLMs, i.e., GPT-3.5-Turbo and +LLaMA-2-7b. Our experiment results show that Janus amplifies the privacy risks +by over 10 times in comparison with the baseline and significantly outperforms +the state-of-the-art privacy extraction attacks including prefix attacks and +in-context learning (ICL). Furthermore, our analysis validates that existing +fine-tuning APIs provided by OpenAI and Azure AI Studio are susceptible to our +Janus attack, allowing an adversary to conduct such an attack at a low cost. + +
+
+ comment: This work has been accepted by CCS 2024 +
+
+
+
+
+ + ♻ ☆ AutoRE: Document-Level Relation Extraction with Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated exceptional abilities in
+comprehending and generating text, motivating numerous researchers to utilize
+them for Information Extraction (IE) purposes, including Relation Extraction
+(RE). Nonetheless, most existing methods are predominantly designed for
+Sentence-level Relation Extraction (SentRE) tasks, which typically encompass a
+restricted set of relations and triplet facts within a single sentence.
+Furthermore, certain approaches resort to treating relations as candidate
+choices integrated into prompt templates, leading to inefficient processing and
+suboptimal performance when tackling Document-Level Relation Extraction (DocRE)
+tasks, which entail handling multiple relations and triplet facts distributed
+across a given document, posing distinct challenges. To overcome these
+limitations, we introduce AutoRE, an end-to-end DocRE model that adopts a novel
+RE extraction paradigm named RHF (Relation-Head-Facts). Unlike existing
+approaches, AutoRE does not rely on the assumption of known relation options,
+making it more reflective of real-world scenarios. Additionally, we have
+developed an easily extensible RE framework using a Parameter-Efficient
+Fine-Tuning (PEFT) algorithm (QLoRA). Our experiments on the RE-DocRED dataset
+showcase AutoRE's best performance, achieving state-of-the-art results,
+surpassing TAG by 10.03\% and 9.03\% on the dev and test sets, respectively.
+The code is available at https://github.com/THUDM/AutoRE and the demonstration
+video is provided at https://www.youtube.com/watch?v=IhKRsZUAxKk.
+
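A minimal sketch of a QLoRA-style setup with the transformers and peft libraries, loading a base model in 4-bit and attaching LoRA adapters; the base checkpoint and hyperparameters are illustrative assumptions, not the paper's exact configuration.

```python
# Load a base model in 4-bit and attach LoRA adapters (illustrative settings).
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.bfloat16)
base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1",
                                            quantization_config=bnb_config)
lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                         task_type="CAUSAL_LM")
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()   # only the LoRA adapters are trainable
```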
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ PersLLM: A Personified Training Approach for Large Language Models + + +
+ Large language models exhibit aspects of human-level intelligence that
+catalyze their application as human-like agents in domains such as social
+simulations, human-machine interactions, and collaborative multi-agent systems.
+However, the absence of distinct personalities, manifested in ingratiating
+behaviors, inconsistent opinions, and uniform response patterns, diminishes the
+utility of LLMs in practical applications. Addressing this, the development of
+personality traits in LLMs emerges as a crucial area of research to unlock
+their latent potential. Existing methods to personify LLMs generally involve
+strategies like employing stylized training data for instruction tuning or
+using prompt engineering to simulate different personalities. These methods
+only capture superficial linguistic styles instead of the core of personalities
+and are therefore not stable. In this study, we propose PersLLM, integrating
+psychology-grounded principles of personality: social practice, consistency,
+and dynamic development, into a comprehensive training methodology. We
+incorporate personality traits directly into the model parameters, enhancing
+the model's resistance to induction, promoting consistency, and supporting the
+dynamic evolution of personality. Single-agent evaluation validates our
+method's superiority, as it produces responses more aligned with reference
+personalities compared to other approaches. Case studies for multi-agent
+communication highlight its benefits in enhancing opinion consistency within
+individual agents and fostering collaborative creativity among multiple agents
+in dialogue contexts, potentially benefiting human simulation and multi-agent
+cooperation. Additionally, human-agent interaction evaluations indicate that
+our personified models significantly enhance interactive experiences,
+underscoring the practical implications of our research.
+
+
+ comment: 10 pages for main text, 5 figures +
+
+
+
+
+ + ♻ ☆ Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains ICML 2024 + + +
+ Large Language Models (LLMs) have demonstrated remarkable proficiency in +understanding and generating natural language. However, their capabilities wane +in highly specialized domains underrepresented in the pretraining corpus, such +as physical and biomedical sciences. This work explores how to repurpose +general LLMs into effective task solvers for specialized domains. We introduce +a novel, model-agnostic framework for learning custom input tags, which are +parameterized as continuous vectors appended to the LLM's embedding layer, to +condition the LLM. We design two types of input tags: domain tags are used to +delimit specialized representations (e.g., chemical formulas) and provide +domain-relevant context; function tags are used to represent specific functions +(e.g., predicting molecular properties) and compress function-solving +instructions. We develop a three-stage protocol to learn these tags using +auxiliary data and domain knowledge. By explicitly disentangling task domains +from task functions, our method enables zero-shot generalization to unseen +problems through diverse combinations of the input tags. It also boosts LLM's +performance in various specialized domains, such as predicting protein or +chemical properties and modeling drug-target interactions, outperforming expert +models tailored to these tasks. + +
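A minimal sketch of the input-tag mechanism: learnable continuous vectors inserted into the sequence of token embeddings to mark a domain or function; dimensions, initialisation, and the insertion position are illustrative assumptions.

```python
# Learnable tag vectors prepended to the token embedding sequence.
import torch
from torch import nn

class TaggedEmbedding(nn.Module):
    def __init__(self, vocab_size=32000, dim=512, n_tags=2):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, dim)
        self.tags = nn.Parameter(torch.randn(n_tags, dim) * 0.02)  # learned tags

    def forward(self, input_ids: torch.Tensor, tag_id: int) -> torch.Tensor:
        tok = self.token_emb(input_ids)                       # (batch, seq, dim)
        tag = self.tags[tag_id].expand(tok.size(0), 1, -1)    # (batch, 1, dim)
        return torch.cat([tag, tok], dim=1)                   # tag prepended

emb = TaggedEmbedding()
out = emb(torch.randint(0, 32000, (2, 16)), tag_id=0)
print(out.shape)   # torch.Size([2, 17, 512])
```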
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ LLM Platform Security: Applying a Systematic Evaluation Framework to + OpenAI's ChatGPT Plugins AAAI + + +
+ Large language model (LLM) platforms, such as ChatGPT, have recently begun +offering an app ecosystem to interface with third-party services on the +internet. While these apps extend the capabilities of LLM platforms, they are +developed by arbitrary third parties and thus cannot be implicitly trusted. +Apps also interface with LLM platforms and users using natural language, which +can have imprecise interpretations. In this paper, we propose a framework that +lays a foundation for LLM platform designers to analyze and improve the +security, privacy, and safety of current and future third-party integrated LLM +platforms. Our framework is a formulation of an attack taxonomy that is +developed by iteratively exploring how LLM platform stakeholders could leverage +their capabilities and responsibilities to mount attacks against each other. As +part of our iterative process, we apply our framework in the context of +OpenAI's plugin (apps) ecosystem. We uncover plugins that concretely +demonstrate the potential for the types of issues that we outline in our attack +taxonomy. We conclude by discussing novel challenges and by providing +recommendations to improve the security, privacy, and safety of present and +future LLM-based computing platforms. + +
+
+ comment: To appear in the proceedings of the 7th AAAI / ACM Conference on AI, + Ethics, and Society (AIES), October 2024 +
+
+
+
+
+ + ♻ ☆ RAM-EHR: Retrieval Augmentation Meets Clinical Predictions on Electronic + Health Records ACL 2024 + + +
+ We present RAM-EHR, a Retrieval AugMentation pipeline to improve clinical +predictions on Electronic Health Records (EHRs). RAM-EHR first collects +multiple knowledge sources, converts them into text format, and uses dense +retrieval to obtain information related to medical concepts. This strategy +addresses the difficulties associated with complex names for the concepts. +RAM-EHR then augments the local EHR predictive model co-trained with +consistency regularization to capture complementary information from patient +visits and summarized knowledge. Experiments on two EHR datasets show the +efficacy of RAM-EHR over previous knowledge-enhanced baselines (3.4% gain in +AUROC and 7.2% gain in AUPR), emphasizing the effectiveness of the summarized +knowledge from RAM-EHR for clinical prediction tasks. The code will be +published at \url{https://github.com/ritaranx/RAM-EHR}. + +
+
+ comment: ACL 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Cutting through the noise to motivate people: A comprehensive analysis + of COVID-19 social media posts de/motivating vaccination + + +
+ The COVID-19 pandemic exposed significant weaknesses in the healthcare +information system. The overwhelming volume of misinformation on social media +and other socioeconomic factors created extraordinary challenges to motivate +people to take proper precautions and get vaccinated. In this context, our work +explored a novel direction by analyzing an extensive dataset collected over two +years, identifying the topics de/motivating the public about COVID-19 +vaccination. We analyzed these topics based on time, geographic location, and +political orientation. We noticed that while the motivating topics remain the +same over time and geographic location, the demotivating topics change rapidly. +We also identified that intrinsic motivation, rather than external mandate, is +more advantageous to inspire the public. This study addresses scientific +communication and public motivation in social media. It can help public health +officials, policymakers, and social media platforms develop more effective +messaging strategies to cut through the noise of misinformation and educate the +public about scientific findings. + +
+
+ comment: 51 pages, 13 figures, 12 tables. Accepted at Natural Language + Processing Journal +
+
+
+
+
+ + ♻ ☆ Simulating Policy Impacts: Developing a Generative Scenario Writing + Method to Evaluate the Perceived Effects of Regulation AAAI + + +
+ The rapid advancement of AI technologies yields numerous future impacts on +individuals and society. Policymakers are tasked to react quickly and establish +policies that mitigate those impacts. However, anticipating the effectiveness +of policies is a difficult task, as some impacts might only be observable in +the future and respective policies might not be applicable to the future +development of AI. In this work we develop a method for using large language +models (LLMs) to evaluate the efficacy of a given piece of policy at mitigating +specified negative impacts. We do so by using GPT-4 to generate scenarios both +pre- and post-introduction of policy and translating these vivid stories into +metrics based on human perceptions of impacts. We leverage an already +established taxonomy of impacts of generative AI in the media environment to +generate a set of scenario pairs both mitigated and non-mitigated by the +transparency policy in Article 50 of the EU AI Act. We then run a user study +(n=234) to evaluate these scenarios across four risk-assessment dimensions: +severity, plausibility, magnitude, and specificity to vulnerable populations. +We find that this transparency legislation is perceived to be effective at +mitigating harms in areas such as labor and well-being, but largely ineffective +in areas such as social cohesion and security. Through this case study we +demonstrate the efficacy of our method as a tool to iterate on the +effectiveness of policy for mitigating various negative impacts. We expect this +method to be useful to researchers or other stakeholders who want to brainstorm +the potential utility of different pieces of policy or other mitigation +strategies. + +
+
+ comment: To be published in the proceedings of the Seventh AAAI/ACM Conference + on AI, Ethics, and Society +
+
+
+
+
+ + ♻ ☆ Instruction Mining: Instruction Data Selection for Tuning Large Language + Models + + +
+ Large language models (LLMs) are initially pretrained for broad capabilities +and then finetuned with instruction-following datasets to improve their +performance in interacting with humans. Despite advances in finetuning, a +standardized guideline for selecting high-quality datasets to optimize this +process remains elusive. In this paper, we first propose InstructMining, an +innovative method designed for automatically selecting premium +instruction-following data for finetuning LLMs. Specifically, InstructMining +utilizes natural language indicators as a measure of data quality, applying +them to evaluate unseen datasets. During experimentation, we discover that a +double descent phenomenon exists in large language model finetuning. Based on +this observation, we further leverage BlendSearch to help find the best subset +among the entire dataset (i.e., 2,532 out of 100,000). Experimental results show +that InstructMining-7B achieves state-of-the-art performance on two of the most +popular benchmarks: LLM-as-a-judge and Huggingface OpenLLM leaderboard. +
+
+ comment: 24 pages, 7 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 97 + +
+
+
+ + ☆ Floating No More: Object-Ground Reconstruction from a Single Image + + +
+ Recent advancements in 3D object reconstruction from single images have +primarily focused on improving the accuracy of object shapes. Yet, these +techniques often fail to accurately capture the inter-relation between the +object, ground, and camera. As a result, the reconstructed objects often appear +floating or tilted when placed on flat surfaces. This limitation significantly +affects 3D-aware image editing applications like shadow rendering and object +pose manipulation. To address this issue, we introduce ORG (Object +Reconstruction with Ground), a novel task aimed at reconstructing 3D object +geometry in conjunction with the ground surface. Our method uses two compact +pixel-level representations to depict the relationship between camera, object, +and ground. Experiments show that the proposed ORG model can effectively +reconstruct object-ground geometry on unseen data, significantly enhancing the +quality of shadow generation and pose manipulation compared to conventional +single-image 3D reconstruction techniques. + +
+
+ comment: Project Page: https://yunzeman.github.io/ORG/ +
+
+
+
+
+ + ☆ HRP: Human Affordances for Robotic Pre-Training + + +
+ In order to *generalize* to various tasks in the wild, robotic agents will +need a suitable representation (i.e., vision network) that enables the robot to +predict optimal actions given high dimensional vision inputs. However, learning +such a representation requires an extreme amount of diverse training data, +which is prohibitively expensive to collect on a real robot. How can we +overcome this problem? Instead of collecting more robot data, this paper +proposes using internet-scale, human videos to extract "affordances," both at +the environment and agent level, and distill them into a pre-trained +representation. We present a simple framework for pre-training representations +on hand, object, and contact "affordance labels" that highlight relevant +objects in images and how to interact with them. These affordances are +automatically extracted from human video data (with the help of off-the-shelf +computer vision modules) and used to fine-tune existing representations. Our +approach can efficiently fine-tune *any* existing representation, and results +in models with stronger downstream robotic performance across the board. We +experimentally demonstrate (using 3000+ robot trials) that this affordance +pre-training scheme boosts performance by a minimum of 15% on 5 real-world +tasks, which consider three diverse robot morphologies (including a dexterous +hand). Unlike prior works in the space, these representations improve +performance across 3 different camera views. Quantitatively, we find that our +approach leads to higher levels of generalization in out-of-distribution +settings. For code, weights, and data check: https://hrp-robot.github.io + +
+
+ comment: Accepted to Robotics Science and Systems 2024 +
+
+
+
+
+ + ☆ Wolf: Captioning Everything with a World Summarization Framework + + +
+ We propose Wolf, a WOrLd summarization Framework for accurate video +captioning. Wolf is an automated captioning framework that adopts a +mixture-of-experts approach, leveraging complementary strengths of Vision +Language Models (VLMs). By utilizing both image and video models, our framework +captures different levels of information and summarizes them efficiently. Our +approach can be applied to enhance video understanding, auto-labeling, and +captioning. To evaluate caption quality, we introduce CapScore, an LLM-based +metric to assess the similarity and quality of generated captions compared to +the ground truth captions. We further build four human-annotated datasets in +three domains: autonomous driving, general scenes, and robotics, to facilitate +comprehensive comparisons. We show that Wolf achieves superior captioning +performance compared to state-of-the-art approaches from the research community +(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For +instance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise +by 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally, +we establish a benchmark for video captioning and introduce a leaderboard, +aiming to accelerate advancements in video understanding, captioning, and data +alignment. Leaderboard: https://wolfv0.github.io/leaderboard.html. + +
+
+
+
+
+ + ☆ SHIC: Shape-Image Correspondences with no Keypoint Supervision ECCV 2024 + + +
+ Canonical surface mapping generalizes keypoint detection by assigning each +pixel of an object to a corresponding point in a 3D template. Popularised by +DensePose for the analysis of humans, authors have since attempted to apply the +concept to more categories, but with limited success due to the high cost of +manual supervision. In this work, we introduce SHIC, a method to learn +canonical maps without manual supervision which achieves better results than +supervised methods for most categories. Our idea is to leverage foundation +computer vision models such as DINO and Stable Diffusion that are open-ended +and thus possess excellent priors over natural categories. SHIC reduces the +problem of estimating image-to-template correspondences to predicting +image-to-image correspondences using features from the foundation models. The +reduction works by matching images of the object to non-photorealistic renders +of the template, which emulates the process of collecting manual annotations +for this task. These correspondences are then used to supervise high-quality +canonical maps for any object of interest. We also show that image generators +can further improve the realism of the template views, which provide an +additional source of supervision for the model. + +
+
+ comment: ECCV 2024. Project website + https://www.robots.ox.ac.uk/~vgg/research/shic/ +
+
+
+
+
+ + ☆ A Scalable Quantum Non-local Neural Network for Image Classification + + +
+ Non-local operations play a crucial role in computer vision, enabling the +capture of long-range dependencies through weighted sums of features across the +input, surpassing the constraints of traditional convolution operations that +focus solely on local neighborhoods. Non-local operations typically require +computing pairwise relationships between all elements in a set, leading to +quadratic complexity in terms of time and memory. Due to the high computational +and memory demands, scaling non-local neural networks to large-scale problems +can be challenging. This article introduces a hybrid quantum-classical scalable +non-local neural network, referred to as Quantum Non-Local Neural Network +(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on +inherent quantum parallelism to allow the simultaneous processing of a large +number of input features, enabling more efficient computations in a +quantum-enhanced feature space and involving pairwise relationships through +quantum entanglement. We benchmark our proposed QNL-Net against other quantum +counterparts on binary classification with the MNIST and CIFAR-10 datasets. The +simulation findings showcase that our QNL-Net achieves cutting-edge accuracy +levels in binary image classification among quantum classifiers while utilizing +fewer qubits. +
+
+ comment: draft, 13 pages (including references and appendix), 5 figures +
+
+
+
+
+ + ☆ Learn from the Learnt: Source-Free Active Domain Adaptation via + Contrastive Sampling and Visual Persistence ECCV 2024 + + +
+ Domain Adaptation (DA) facilitates knowledge transfer from a source domain to +a related target domain. This paper investigates a practical DA paradigm, +namely Source data-Free Active Domain Adaptation (SFADA), where source data +becomes inaccessible during adaptation, and a minimum amount of annotation +budget is available in the target domain. Without referencing the source data, +new challenges emerge in identifying the most informative target samples for +labeling, establishing cross-domain alignment during adaptation, and ensuring +continuous performance improvements through the iterative query-and-adaptation +process. In response, we present learn from the learnt (LFTL), a novel paradigm +for SFADA to leverage the learnt knowledge from the source pretrained model and +actively iterated models without extra overhead. We propose Contrastive Active +Sampling to learn from the hypotheses of the preceding model, thereby querying +target samples that are both informative to the current model and persistently +challenging throughout active learning. During adaptation, we learn from +features of actively selected anchors obtained from previous intermediate +models, so that the Visual Persistence-guided Adaptation can facilitate feature +distribution alignment and active sample exploitation. Extensive experiments on +three widely-used benchmarks show that our LFTL achieves state-of-the-art +performance, superior computational efficiency and continuous improvements as +the annotation budget increases. Our code is available at +https://github.com/lyumengyao/lftl. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Unifying Visual and Semantic Feature Spaces with Diffusion Models for + Enhanced Cross-Modal Alignment + + +
+ Image classification models often demonstrate unstable performance in +real-world applications due to variations in image information, driven by +differing visual perspectives of subject objects and lighting discrepancies. To +mitigate these challenges, existing studies commonly incorporate additional +modal information matching the visual data to regularize the model's learning +process, enabling the extraction of high-quality visual features from complex +image regions. Specifically, in the realm of multimodal learning, cross-modal +alignment is recognized as an effective strategy, harmonizing different modal +information by learning a domain-consistent latent feature space for visual and +semantic features. However, this approach may face limitations due to the +heterogeneity between multimodal information, such as differences in feature +distribution and structure. To address this issue, we introduce a Multimodal +Alignment and Reconstruction Network (MARNet), designed to enhance the model's +resistance to visual noise. Importantly, MARNet includes a cross-modal +diffusion reconstruction module for smoothly and stably blending information +across different domains. Experiments conducted on two benchmark datasets, +Vireo-Food172 and Ingredient-101, demonstrate that MARNet effectively improves +the quality of image information extracted by the model. It is a plug-and-play +framework that can be rapidly integrated into various image classification +frameworks, boosting model performance. + +
+
+
+
+
+ + ☆ Scalable Group Choreography via Variational Phase Manifold Learning ECCV 2024 + + +
+ Generating group dance motion from the music is a challenging task with +several industrial applications. Although several methods have been proposed to +tackle this problem, most of them prioritize optimizing the fidelity in dancing +movement, constrained by predetermined dancer counts in datasets. This +limitation impedes adaptability to real-world applications. Our study addresses +the scalability problem in group choreography while preserving naturalness and +synchronization. In particular, we propose a phase-based variational generative +model for group dance generation on learning a generative manifold. Our method +achieves high-fidelity group dance motion and enables the generation with an +unlimited number of dancers while consuming only a minimal and constant amount +of memory. The intensive experiments on two public datasets show that our +proposed method outperforms recent state-of-the-art approaches by a large +margin and is scalable to a great number of dancers beyond the training data. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Deep Companion Learning: Enhancing Generalization Through Historical + Consistency ECCV 2024 + + +
+ We propose Deep Companion Learning (DCL), a novel training method for Deep +Neural Networks (DNNs) that enhances generalization by penalizing inconsistent +model predictions compared to its historical performance. To achieve this, we +train a deep-companion model (DCM), by using previous versions of the model to +provide forecasts on new inputs. This companion model deciphers a meaningful +latent semantic structure within the data, thereby providing targeted +supervision that encourages the primary model to address the scenarios it finds +most challenging. We validate our approach through both theoretical analysis +and extensive experimentation, including ablation studies, on a variety of +benchmark datasets (CIFAR-100, Tiny-ImageNet, ImageNet-1K) using diverse +architectural models (ShuffleNetV2, ResNet, Vision Transformer, etc.), +demonstrating state-of-the-art performance. + +
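To make the historical-consistency idea above concrete, the sketch below shows one plausible PyTorch formulation: a frozen snapshot of an earlier checkpoint supplies soft targets, and a KL term penalizes predictions that drift from them. The snapshot schedule, loss weight, and temperature are illustrative assumptions, not the authors' DCL implementation.

```python
import copy
import torch
import torch.nn.functional as F

def companion_consistency_loss(model, companion, inputs, labels, alpha=0.5, temperature=2.0):
    """Cross-entropy on labels plus a KL term tying current predictions to a
    frozen earlier checkpoint (`companion`). `alpha` and `temperature` are
    illustrative hyper-parameters, not values from the paper."""
    logits = model(inputs)
    with torch.no_grad():
        historical_logits = companion(inputs)
    ce = F.cross_entropy(logits, labels)
    kl = F.kl_div(
        F.log_softmax(logits / temperature, dim=1),
        F.softmax(historical_logits / temperature, dim=1),
        reduction="batchmean",
    ) * (temperature ** 2)
    return ce + alpha * kl

# Usage sketch: periodically snapshot the model and reuse the frozen copy as the companion.
# companion = copy.deepcopy(model).eval()
# for p in companion.parameters():
#     p.requires_grad_(False)
```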
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical + Imaging + + +
+ Medical imaging cohorts are often confounded by factors such as acquisition +devices, hospital sites, patient backgrounds, and many more. As a result, deep +learning models tend to learn spurious correlations instead of causally related +features, limiting their generalizability to new and unseen data. This problem +can be addressed by minimizing dependence measures between intermediate +representations of task-related and non-task-related variables. These measures +include mutual information, distance correlation, and the performance of +adversarial classifiers. Here, we benchmark such dependence measures for the +task of preventing shortcut learning. We study a simplified setting using +Morpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our +results provide insights into how to mitigate confounding factors in medical +imaging. + +
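One of the dependence measures named above, distance correlation, has a simple closed-form estimator built from pairwise distances. The NumPy sketch below implements the standard (biased) sample estimator for illustration only; it is not the benchmark's own code.

```python
import numpy as np

def distance_correlation(x, y):
    """Biased sample distance correlation between paired samples x and y."""
    x = np.asarray(x, dtype=float).reshape(len(x), -1)
    y = np.asarray(y, dtype=float).reshape(len(y), -1)
    a = np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)  # pairwise distances in x
    b = np.linalg.norm(y[:, None, :] - y[None, :, :], axis=-1)  # pairwise distances in y
    # Double-centre each distance matrix.
    A = a - a.mean(axis=0, keepdims=True) - a.mean(axis=1, keepdims=True) + a.mean()
    B = b - b.mean(axis=0, keepdims=True) - b.mean(axis=1, keepdims=True) + b.mean()
    dcov2 = max((A * B).mean(), 0.0)
    denom = np.sqrt((A * A).mean() * (B * B).mean())
    return float(np.sqrt(dcov2 / denom)) if denom > 0 else 0.0
```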
+
+ comment: Accepted to the 15th International Workshop on Machine Learning in + Medical Imaging (MLMI 2024) +
+
+
+
+
+ + ☆ BCTR: Bidirectional Conditioning Transformer for Scene Graph Generation + + +
+ Scene Graph Generation (SGG) remains a challenging task due to its +compositional property. Previous approaches improve prediction efficiency by +learning in an end-to-end manner. However, these methods exhibit limited +performance as they assume unidirectional conditioning between entities and +predicates, leading to insufficient information interaction. To address this +limitation, we propose a novel bidirectional conditioning factorization for +SGG, introducing efficient interaction between entities and predicates. +Specifically, we develop an end-to-end scene graph generation model, +Bidirectional Conditioning Transformer (BCTR), to implement our factorization. +BCTR consists of two key modules. First, the Bidirectional Conditioning +Generator (BCG) facilitates multi-stage interactive feature augmentation +between entities and predicates, enabling mutual benefits between the two +predictions. Second, Random Feature Alignment (RFA) regularizes the feature +space by distilling multi-modal knowledge from pre-trained models, enhancing +BCTR's ability on tailed categories without relying on statistical priors. We +conduct a series of experiments on Visual Genome and Open Image V6, +demonstrating that BCTR achieves state-of-the-art performance on both +benchmarks. The code will be available upon acceptance of the paper. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ PIV3CAMS: a multi-camera dataset for multiple computer vision problems + and its application to novel view-point synthesis + + +
+ The modern approaches for computer vision tasks significantly rely on machine +learning, which requires a large number of quality images. While there is a +plethora of image datasets with a single type of images, there is a lack of +datasets collected from multiple cameras. In this thesis, we introduce Paired +Image and Video data from three CAMeraS, namely PIV3CAMS, aimed at multiple +computer vision tasks. The PIV3CAMS dataset consists of 8385 pairs of images +and 82 pairs of videos taken from three different cameras: Canon D5 Mark IV, +Huawei P20, and ZED stereo camera. The dataset includes various indoor and +outdoor scenes from different locations in Zurich (Switzerland) and Cheonan +(South Korea). Some of the computer vision applications that can benefit from +the PIV3CAMS dataset are image/video enhancement, view interpolation, image +matching, and much more. We provide a careful explanation of the data +collection process and detailed analysis of the data. The second part of this +thesis studies the usage of depth information in the view synthesizing task. In +addition to the regeneration of a current state-of-the-art algorithm, we +investigate several proposed alternative models that integrate depth +information geometrically. Through extensive experiments, we show that the +effect of depth is crucial in small view changes. Finally, we apply our model +to the introduced PIV3CAMS dataset to synthesize novel target views as an +example application of PIV3CAMS. + +
+
+
+
+
+ + ☆ Rapid Object Annotation + + +
+ In this report we consider the problem of rapidly annotating a video with +bounding boxes for a novel object. We describe a UI and associated workflow +designed to make this process fast for an arbitrary novel target. + +
+
+
+
+
+ + ☆ A Survey on Cell Nuclei Instance Segmentation and Classification: + Leveraging Context and Attention + + +
+ Manually annotating nuclei from the gigapixel Hematoxylin and Eosin +(H&E)-stained Whole Slide Images (WSIs) is a laborious and costly task, meaning +automated algorithms for cell nuclei instance segmentation and classification +could alleviate the workload of pathologists and clinical researchers and at +the same time facilitate the automatic extraction of clinically interpretable +features. But due to high intra- and inter-class variability of nuclei +morphological and chromatic features, as well as H&E-stains susceptibility to +artefacts, state-of-the-art algorithms cannot correctly detect and classify +instances with the necessary performance. In this work, we hypothesise context +and attention inductive biases in artificial neural networks (ANNs) could +increase the generalization of algorithms for cell nuclei instance segmentation +and classification. We conduct a thorough survey on context and attention +methods for cell nuclei instance segmentation and classification from +H&E-stained microscopy imaging, while providing a comprehensive discussion of +the challenges being tackled with context and attention. Besides, we illustrate +some limitations of current approaches and present ideas for future research. +As a case study, we extend both a general instance segmentation and +classification method (Mask-RCNN) and a tailored cell nuclei instance +segmentation and classification model (HoVer-Net) with context- and +attention-based mechanisms, and do a comparative analysis on a multi-centre +colon nuclei identification and counting dataset. Although pathologists rely on +context at multiple levels while paying attention to specific Regions of +Interest (RoIs) when analysing and annotating WSIs, our findings suggest +translating that domain knowledge into algorithm design is no trivial task, but +to fully exploit these mechanisms, the scientific understanding of these +methods should be addressed. + +
+
+
+
+
+ + ☆ A Labeled Ophthalmic Ultrasound Dataset with Medical Report Generation + Based on Cross-modal Deep Learning + + +
+ Ultrasound imaging reveals eye morphology and aids in diagnosing and treating +eye diseases. However, interpreting diagnostic reports requires specialized +physicians. We present a labeled ophthalmic dataset for the precise analysis +and the automated exploration of medical images along with their associated +reports. It collects three modal data, including the ultrasound images, blood +flow information and examination reports from 2,417 patients at an +ophthalmology hospital in Shenyang, China, during the year 2018, in which the +patient information is de-identified for privacy protection. To the best of our +knowledge, it is the only ophthalmic dataset that contains the three modal +information simultaneously. It incrementally consists of 4,858 images with the +corresponding free-text reports, which describe 15 typical imaging findings of +intraocular diseases and the corresponding anatomical locations. Each image +shows three kinds of blood flow indices at three specific arteries, i.e., nine +parameter values to describe the spectral characteristics of blood flow +distribution. The reports were written by ophthalmologists during the clinical +care. The proposed dataset is applied to generate medical report based on the +cross-modal deep learning model. The experimental results demonstrate that our +dataset is suitable for training supervised models concerning cross-modal +medical data. + +
+
+
+
+
+ + ☆ Local Binary Pattern(LBP) Optimization for Feature Extraction + + +
+ The rapid growth of image data has led to the development of advanced image +processing and computer vision techniques, which are crucial in various +applications such as image classification, image segmentation, and pattern +recognition. Texture is an important feature that has been widely used in many +image processing tasks. Therefore, analyzing and understanding texture plays a +pivotal role in image analysis and understanding. Local binary pattern (LBP) is +a powerful operator that describes the local texture features of images. This +paper provides a novel mathematical representation of the LBP by separating the +operator into three matrices, two of which are always fixed and do not depend +on the input data. These fixed matrices are analyzed in depth, and a new +algorithm is proposed to optimize them for improved classification performance. +The optimization process is based on the singular value decomposition (SVD) +algorithm. As a result, the authors present optimal LBPs that effectively +describe the texture of human face images. Several experimental results presented +in this paper convincingly verify the efficiency and superiority of the +optimized LBPs for face detection and facial expression recognition tasks. +
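For reference, the classical 3x3 LBP operator that the paper decomposes compares each pixel with its eight neighbours and packs the comparison bits into one code per pixel. The NumPy sketch below implements only this baseline operator; the three-matrix decomposition and SVD-based optimization proposed in the paper are not reproduced here.

```python
import numpy as np

def lbp_3x3(image):
    """Classical 8-neighbour LBP codes for a 2D grayscale image."""
    img = np.asarray(image, dtype=float)
    center = img[1:-1, 1:-1]
    # Neighbour offsets in clockwise order starting at the top-left pixel.
    offsets = [(-1, -1), (-1, 0), (-1, 1), (0, 1), (1, 1), (1, 0), (1, -1), (0, -1)]
    codes = np.zeros_like(center, dtype=np.uint8)
    for bit, (dy, dx) in enumerate(offsets):
        neighbour = img[1 + dy:img.shape[0] - 1 + dy, 1 + dx:img.shape[1] - 1 + dx]
        codes |= (neighbour >= center).astype(np.uint8) << bit
    # A histogram of these codes is the usual LBP texture descriptor.
    return codes
```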
+
+
+
+
+ + ☆ Adversarial Robustification via Text-to-Image Diffusion Models + + +
+ Adversarial robustness has been conventionally believed as a challenging +property to encode for neural networks, requiring plenty of training data. In +the recent paradigm of adopting off-the-shelf models, however, access to their +training data is often infeasible or not practical, while most of such models +are not originally trained concerning adversarial robustness. In this paper, we +develop a scalable and model-agnostic solution to achieve adversarial +robustness without using any data. Our intuition is to view recent +text-to-image diffusion models as "adaptable" denoisers that can be optimized +to specify target tasks. Based on this, we propose: (a) to initiate a +denoise-and-classify pipeline that offers provable guarantees against +adversarial attacks, and (b) to leverage a few synthetic reference images +generated from the text-to-image model that enables novel adaptation schemes. +Our experiments show that our data-free scheme applied to the pre-trained CLIP +could improve the (provable) adversarial robustness of its diverse zero-shot +classification derivatives (while maintaining their accuracy), significantly +surpassing prior approaches that utilize the full training data. Not only for +CLIP, we also demonstrate that our framework is easily applicable for +robustifying other visual classifiers efficiently. + +
+
+ comment: Code is available at https://github.com/ChoiDae1/robustify-T2I +
+
+
+
+
+ + ☆ Auto DragGAN: Editing the Generative Image Manifold in an Autoregressive + Manner + + +
+ Pixel-level fine-grained image editing remains an open challenge. Previous +works fail to achieve an ideal trade-off between control granularity and +inference speed. They either fail to achieve pixel-level fine-grained control, +or their inference speed requires optimization. To address this, this paper for +the first time employs a regression-based network to learn the variation +patterns of StyleGAN latent codes during the image dragging process. This +method enables pixel-level precision in dragging editing with little time cost. +Users can specify handle points and their corresponding target points on any +GAN-generated images, and our method will move each handle point to its +corresponding target point. Through experimental analysis, we discover that a +short movement distance from handle points to target points yields a +high-fidelity edited image, as the model only needs to predict the movement of +a small portion of pixels. To achieve this, we decompose the entire movement +process into multiple sub-processes. Specifically, we develop a transformer +encoder-decoder based network named 'Latent Predictor' to predict the latent +code motion trajectories from handle points to target points in an +autoregressive manner. Moreover, to enhance the prediction stability, we +introduce a component named 'Latent Regularizer', aimed at constraining the +latent code motion within the distribution of natural images. Extensive +experiments demonstrate that our method achieves state-of-the-art (SOTA) +inference speed and image editing performance at the pixel-level granularity. + +
+
+ comment: This paper has been accepted as a poster paper for ACM Multimedia + 2024 +
+
+
+
+
+ + ☆ DynamicTrack: Advancing Gigapixel Tracking in Crowded Scenes + + +
+ Tracking in gigapixel scenarios holds numerous potential applications in +video surveillance and pedestrian analysis. Existing algorithms attempt to +perform tracking in crowded scenes by utilizing multiple cameras or group +relationships. However, their performance significantly degrades when +confronted with complex interaction and occlusion inherent in gigapixel images. +In this paper, we introduce DynamicTrack, a dynamic tracking framework designed +to address gigapixel tracking challenges in crowded scenes. In particular, we +propose a dynamic detector that utilizes contrastive learning to jointly detect +the head and body of pedestrians. Building upon this, we design a dynamic +association algorithm that effectively utilizes head and body information for +matching purposes. Extensive experiments show that our tracker achieves +state-of-the-art performance on widely used tracking benchmarks specifically +designed for gigapixel crowded scenes. + +
+
+
+
+
+ + ☆ Every Part Matters: Integrity Verification of Scientific Figures Based + on Multimodal Large Language Models + + +
+ This paper tackles a key issue in the interpretation of scientific figures: +the fine-grained alignment of text and figures. It advances beyond prior +research that primarily dealt with straightforward, data-driven visualizations +such as bar and pie charts and only offered a basic understanding of diagrams +through captioning and classification. We introduce a novel task, Figure +Integrity Verification, designed to evaluate the precision of technologies in +aligning textual knowledge with visual elements in scientific figures. To +support this, we develop a semi-automated method for constructing a large-scale +dataset, Figure-seg, specifically designed for this task. Additionally, we +propose an innovative framework, Every Part Matters (EPM), which leverages +Multimodal Large Language Models (MLLMs) to not only incrementally improve the +alignment and verification of text-figure integrity but also enhance integrity +through analogical reasoning. Our comprehensive experiments show that these +innovations substantially improve upon existing methods, allowing for more +precise and thorough analysis of complex scientific figures. This progress not +only enhances our understanding of multimodal technologies but also stimulates +further research and practical applications across fields requiring the +accurate interpretation of complex visual data. + +
+
+ comment: 28 pages, 11 figures, under review +
+
+
+
+
+ + ☆ MOoSE: Multi-Orientation Sharing Experts for Open-set Scene Text + Recognition ICDAR2024 + + +
+ Open-set text recognition, which aims to address both novel characters and +previously seen ones, is one of the rising subtopics in the text recognition +field. However, current open-set text recognition solutions focus only on +horizontal text and fail to model the real-life challenges posed by the +variety of writing directions in real-world scene text. Multi-orientation text +recognition, in general, faces challenges from the diverse image aspect ratios, +significant imbalance in data amount, and domain gaps between orientations. In +this work, we first propose a Multi-Oriented Open-Set Text Recognition task +(MOOSTR) to model the challenges of both novel characters and writing direction +variety. We then propose a Multi-Orientation Sharing Experts (MOoSE) framework +as a strong baseline solution. MOoSE uses a mixture-of-experts scheme to +alleviate the domain gaps between orientations, while exploiting common +structural knowledge among experts to alleviate the data scarcity that some +experts face. The proposed MOoSE framework is validated by ablative +experiments, and also tested for feasibility on the existing open-set +benchmark. Code, models, and documents are available at: +https://github.com/lancercat/Moose/ +
+
+ comment: Accepted in ICDAR2024 +
+
+
+
+
+ + ☆ LookupForensics: A Large-Scale Multi-Task Dataset for Multi-Phase + Image-Based Fact Verification + + +
+ Amid the proliferation of forged images, notably the tsunami of deepfake +content, extensive research has been conducted on using artificial intelligence +(AI) to identify forged content in the face of continuing advancements in +counterfeiting technologies. We have investigated the use of AI to provide the +original authentic image after deepfake detection, which we believe is a +reliable and persuasive solution. We call this "image-based automated fact +verification," a name that originated from a text-based fact-checking system +used by journalists. We have developed a two-phase open framework that +integrates detection and retrieval components. Additionally, inspired by a +dataset proposed by Meta Fundamental AI Research, we further constructed a +large-scale dataset that is specifically designed for this task. This dataset +simulates real-world conditions and includes both content-preserving and +content-aware manipulations that present a range of difficulty levels and have +potential for ongoing research. This multi-task dataset is fully annotated, +enabling it to be utilized for sub-tasks within the forgery identification and +fact retrieval domains. This paper makes two main contributions: (1) We +introduce a new task, "image-based automated fact verification," and present a +novel two-phase open framework combining "forgery identification" and "fact +retrieval." (2) We present a large-scale dataset tailored for this new task +that features various hand-crafted image edits and machine learning-driven +manipulations, with extensive annotations suitable for various sub-tasks. +Extensive experimental results validate its practicality for fact verification +research and clarify its difficulty levels for various sub-tasks. + +
+
+ comment: Pages 1-13 are the main body of the paper, and pages 14-16 are the + supplementary material +
+
+
+
+
+ + ☆ Dilated Strip Attention Network for Image Restoration + + +
+ Image restoration is a long-standing task that seeks to recover the latent +sharp image from its deteriorated counterpart. Due to the robust capacity of +self-attention to capture long-range dependencies, transformer-based methods or +some attention-based convolutional neural networks have demonstrated promising +results on many image restoration tasks in recent years. However, existing +attention modules either suffer from limited receptive fields or require abundant parameters. +In order to integrate contextual information more effectively and efficiently, +in this paper, we propose a dilated strip attention network (DSAN) for image +restoration. Specifically, to gather more contextual information for each pixel +from its neighboring pixels in the same row or column, a dilated strip +attention (DSA) mechanism is elaborately proposed. By employing the DSA +operation horizontally and vertically, each location can harvest the contextual +information from a much wider region. In addition, we utilize multi-scale +receptive fields across different feature groups in DSA to improve +representation learning. Extensive experiments show that our DSAN outperforms +state-of-the-art algorithms on several image restoration tasks. +
+
+
+
+
+ + ☆ IOVS4NeRF:Incremental Optimal View Selection for Large-Scale NeRFs + + +
+ Urban-level three-dimensional reconstruction for modern applications demands +high rendering fidelity while minimizing computational costs. The advent of +Neural Radiance Fields (NeRF) has enhanced 3D reconstruction, yet it exhibits +artifacts under multiple viewpoints. In this paper, we propose a new NeRF-based +framework to address these issues. Our method uses image content and +pose data to iteratively plan the next best view. A crucial aspect of this +method involves uncertainty estimation, guiding the selection of views with +maximum information gain from a candidate set. This iterative process enhances +rendering quality over time. Simultaneously, we introduce the Voronoi diagram +and threshold sampling together with a flight classifier to boost the efficiency, +while keeping the original NeRF network intact. It can serve as a plug-in tool to +assist in better rendering, outperforming baselines and similar prior works. +
+
+
+
+
+ + ☆ LinguaLinker: Audio-Driven Portraits Animation with Implicit Facial + Control Enhancement + + +
+ This study delves into the intricacies of synchronizing facial dynamics with +multilingual audio inputs, focusing on the creation of visually compelling, +time-synchronized animations through diffusion-based techniques. Diverging from +traditional parametric models for facial animation, our approach, termed +LinguaLinker, adopts a holistic diffusion-based framework that integrates +audio-driven visual synthesis to enhance the synergy between auditory stimuli +and visual responses. We process audio features separately and derive the +corresponding control gates, which implicitly govern the movements in the +mouth, eyes, and head, irrespective of the portrait's origin. The advanced +audio-driven visual synthesis mechanism provides nuanced control but keeps the +compatibility of output video and input audio, allowing for a more tailored and +effective portrayal of distinct personas across different languages. The +significant improvements in the fidelity of animated portraits, the accuracy of +lip-syncing, and the appropriate motion variations achieved by our method +render it a versatile tool for animating any portrait in any language. + +
+
+
+
+
+ + ☆ Content-driven Magnitude-Derivative Spectrum Complementary Learning for + Hyperspectral Image Classification + + +
+ Extracting discriminative information from complex spectral details in +hyperspectral images (HSI) for HSI classification is pivotal. While current +prevailing methods rely on spectral magnitude features, they could cause +confusion in certain classes, resulting in misclassification and decreased +accuracy. We find that the derivative spectrum proves more adept at capturing +concealed information, thereby offering a distinct advantage in separating +these confusion classes. Leveraging the complementarity between spectral +magnitude and derivative features, we propose a Content-driven Spectrum +Complementary Network based on Magnitude-Derivative Dual Encoder, employing +these two features as combined inputs. To fully utilize their complementary +information, we present a Content-adaptive Point-wise Fusion Module, enabling +adaptive fusion of dual-encoder features in a point-wise selective manner, +contingent upon feature representation. To preserve a rich source of +complementary information while extracting more distinguishable features, we +introduce a Hybrid Disparity-enhancing Loss that enhances the differential +expression of the features from the two branches and increases the inter-class +distance. As a result, our method achieves state-of-the-art results on the +extensive WHU-OHS dataset and eight other benchmark datasets. +
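The magnitude/derivative complementarity described above starts from a simple transform: differentiating each pixel's spectrum along the band axis. The short NumPy sketch below illustrates only that feature construction, not the proposed dual-encoder network or its fusion module.

```python
import numpy as np

def magnitude_and_derivative(hsi_cube):
    """Return the raw spectra and their first derivative along the band axis.

    `hsi_cube` is assumed to be shaped (height, width, bands); both outputs can
    be fed to separate encoder branches, mirroring the dual-input idea above.
    """
    magnitude = np.asarray(hsi_cube, dtype=float)
    derivative = np.gradient(magnitude, axis=-1)  # finite-difference spectral derivative
    return magnitude, derivative
```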
+
+ comment: accepted by TGRS +
+
+
+
+
+ + ☆ From 2D to 3D: AISG-SLA Visual Localization Challenge + + +
+ Research in 3D mapping is crucial for smart city applications, yet the cost +of acquiring 3D data often hinders progress. Visual localization, particularly +monocular camera position estimation, offers a solution by determining the +camera's pose solely through visual cues. However, this task is challenging due +to limited data from a single camera. To tackle these challenges, we organized +the AISG-SLA Visual Localization Challenge (VLC) at IJCAI 2023 to explore how +AI can accurately extract camera pose data from 2D images in 3D space. The +challenge attracted over 300 participants worldwide, forming 50+ teams. Winning +teams achieved high accuracy in pose estimation using images from a car-mounted +camera with low frame rates. The VLC dataset is available for research purposes +upon request via vlc-dataset@aisingapore.org. + +
+
+
+
+
+ + ☆ HICEScore: A Hierarchical Metric for Image Captioning Evaluation ACM MM2024 + + +
+ Image captioning evaluation metrics can be divided into two categories, +reference-based metrics and reference-free metrics. However, reference-based +approaches may struggle to evaluate descriptive captions with abundant visual +details produced by advanced multimodal large language models, due to their +heavy reliance on limited human-annotated references. In contrast, previous +reference-free metrics have been proven effective via CLIP cross-modality +similarity. Nonetheless, CLIP-based metrics, constrained by their solution of +global image-text compatibility, often have a deficiency in detecting local +textual hallucinations and are insensitive to small visual objects. Besides, +their single-scale designs are unable to provide an interpretable evaluation +process such as pinpointing the position of caption mistakes and identifying +visual regions that have not been described. To move forward, we propose a +novel reference-free metric for image captioning evaluation, dubbed +Hierarchical Image Captioning Evaluation Score (HICE-S). By detecting local +visual regions and textual phrases, HICE-S builds an interpretable hierarchical +scoring mechanism, breaking through the barriers of the single-scale structure +of existing reference-free metrics. Comprehensive experiments indicate that our +proposed metric achieves the SOTA performance on several benchmarks, +outperforming existing reference-free metrics like CLIP-S and PAC-S, and +reference-based metrics like METEOR and CIDEr. Moreover, several case studies +reveal that the assessment process of HICE-S on detailed captions closely +resembles interpretable human judgments. Our code is available at +https://github.com/joeyz0z/HICE. +
+
+ comment: Accepted by ACM MM2024 +
+
+
+
+
+ + ☆ Learning to Enhance Aperture Phasor Field for Non-Line-of-Sight Imaging + + +
+ This paper aims to facilitate more practical NLOS imaging by reducing the +number of samplings and scan areas. To this end, we introduce a phasor-based +enhancement network that is capable of predicting clean and full measurements +from noisy partial observations. We leverage a denoising autoencoder scheme to +acquire rich and noise-robust representations in the measurement space. Through +this pipeline, our enhancement network is trained to accurately reconstruct +complete measurements from their corrupted and partial counterparts. However, +we observe that the naive application of denoising often yields degraded and +over-smoothed results, caused by unnecessary and spurious frequency signals +present in measurements. To address this issue, we introduce a phasor-based +pipeline designed to limit the spectrum of our network to the frequency range +of interest, where the majority of informative signals are detected. The +phasor wavefronts at the aperture, which are band-limited signals, are employed +as inputs and outputs of the network, guiding our network to learn from the +frequency range of interest and discard unnecessary information. The +experimental results in more practical acquisition scenarios demonstrate that +we can look around the corners with $16\times$ or $64\times$ fewer samplings +and $4\times$ smaller apertures. Our code is available at +https://github.com/join16/LEAP. +
+
+
+
+
+ + ☆ Learning Spectral-Decomposed Tokens for Domain Generalized Semantic + Segmentation + + +
+ The rapid development of Vision Foundation Model (VFM) brings inherent +out-domain generalization for a variety of down-stream tasks. Among them, +domain generalized semantic segmentation (DGSS) holds unique challenges as the +cross-domain images share common pixel-wise content information but vary +greatly in terms of style. In this paper, we present a novel +Spectral-dEcomposed Token (SET) learning framework to advance the frontier. +Going further than the existing fine-tuning-token and frozen-backbone +paradigm, the proposed SET focuses on how style-invariant features can be learned +from these learnable tokens. Particularly, the frozen +VFM features are first decomposed into the phase and amplitude components in +the frequency space, which mainly contain the information of content and style, +respectively, and then separately processed by learnable tokens for +task-specific information extraction. After the decomposition, style variation +primarily impacts the token-based feature enhancement within the amplitude +branch. To address this issue, we further develop an attention optimization +method to bridge the gap between style-affected representation and static +tokens during inference. Extensive cross-domain experiments show its +state-of-the-art performance. +
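The frequency-space decomposition described above can be sketched in a few lines of PyTorch: the 2D FFT of a feature map is split into an amplitude (style-heavy) and a phase (content-heavy) component, which could then be handled by separate learnable tokens. Shapes and normalization are illustrative assumptions; this is only the decomposition step, not the SET framework.

```python
import torch

def split_amplitude_phase(features):
    """Decompose spatial feature maps (B, C, H, W) into FFT amplitude and phase."""
    spectrum = torch.fft.fft2(features, norm="ortho")   # complex-valued spectrum
    return spectrum.abs(), spectrum.angle()

def recompose(amplitude, phase):
    """Invert the decomposition, e.g. after amplitude and phase are processed separately."""
    spectrum = torch.polar(amplitude, phase)            # amplitude * exp(i * phase)
    return torch.fft.ifft2(spectrum, norm="ortho").real
```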
+
+ comment: accepted by ACM MM2024 +
+
+
+
+
+ + ☆ VSSD: Vision Mamba with Non-Causal State Space Duality + + +
+ Vision transformers have significantly advanced the field of computer vision, +offering robust modeling capabilities and global receptive field. However, +their high computational demands limit their applicability in processing long +sequences. To tackle this issue, State Space Models (SSMs) have gained +prominence in vision tasks as they offer linear computational complexity. +Recently, State Space Duality (SSD), an improved variant of SSMs, was +introduced in Mamba2 to enhance model performance and efficiency. However, the +inherent causal nature of SSD/SSMs restricts their applications in non-causal +vision tasks. To address this limitation, we introduce Visual State Space +Duality (VSSD) model, which has a non-causal format of SSD. Specifically, we +propose to discard the magnitude of interactions between the hidden state and +tokens while preserving their relative weights, which relieves the dependencies +of token contribution on previous tokens. Together with the involvement of +multi-scan strategies, we show that the scanning results can be integrated to +achieve non-causality, which not only improves the performance of SSD in vision +tasks but also enhances its efficiency. We conduct extensive experiments on +various benchmarks including image classification, detection, and segmentation, +where VSSD surpasses existing state-of-the-art SSM-based models. Code and +weights are available at \url{https://github.com/YuHengsss/VSSD}. + +
+
+ comment: 16 pages, 5 figures, 7 tables +
+
+
+
+
+ + ☆ How To Segment in 3D Using 2D Models: Automated 3D Segmentation of + Prostate Cancer Metastatic Lesions on PET Volumes Using Multi-Angle Maximum + Intensity Projections and Diffusion Models MICCAI + + +
+ Prostate specific membrane antigen (PSMA) positron emission +tomography/computed tomography (PET/CT) imaging provides a tremendously +exciting frontier in visualization of prostate cancer (PCa) metastatic lesions. +However, accurate segmentation of metastatic lesions is challenging due to low +signal-to-noise ratios and variable sizes, shapes, and locations of the +lesions. This study proposes a novel approach for automated segmentation of +metastatic lesions in PSMA PET/CT 3D volumetric images using 2D denoising +diffusion probabilistic models (DDPMs). Instead of 2D trans-axial slices or 3D +volumes, the proposed approach segments the lesions on generated multi-angle +maximum intensity projections (MA-MIPs) of the PSMA PET images, then obtains +the final 3D segmentation masks from 3D ordered subset expectation maximization +(OSEM) reconstruction of 2D MA-MIPs segmentations. Our proposed method achieved +superior performance compared to state-of-the-art 3D segmentation approaches in +terms of accuracy and robustness in detecting and segmenting small metastatic +PCa lesions. The proposed method has significant potential as a tool for +quantitative analysis of metastatic burden in PCa patients. + +
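The projection step described above, rendering maximum intensity projections of a PET volume at multiple rotation angles, can be sketched with SciPy as follows. The rotation axis, angle grid, and interpolation order are illustrative assumptions; the paper's DDPM-based 2D segmentation and the OSEM reconstruction of the masks are not shown.

```python
import numpy as np
from scipy.ndimage import rotate

def multi_angle_mips(volume, num_angles=12):
    """Maximum intensity projections of a 3D volume at evenly spaced rotation angles.

    `volume` is assumed to be (z, y, x); each rotation is around the z (axial) axis
    and each projection collapses the y axis with a max, giving images of shape (z, x).
    """
    mips = []
    for angle in np.linspace(0.0, 180.0, num_angles, endpoint=False):
        rotated = rotate(volume, angle, axes=(1, 2), reshape=False, order=1)
        mips.append(rotated.max(axis=1))
    return np.stack(mips)  # (num_angles, z, x)
```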
+
+ comment: 11 pages, 2 figures, accepted in the DGM4MICCAI workshop, MICCAI, + 2024 +
+
+
+
+
+ + ☆ Skin Cancer Detection utilizing Deep Learning: Classification of Skin + Lesion Images using a Vision Transformer + + +
+ Skin cancer detection still represents a major challenge in healthcare. +Common detection methods can be lengthy and require human assistance which +falls short in many countries. Previous research demonstrates how convolutional +neural networks (CNNs) can help effectively through both automation and an +accuracy that is comparable to the human level. However, despite the progress +in previous decades, the precision is still limited, leading to substantial +misclassifications that have a serious impact on people's health. Hence, we +employ a Vision Transformer (ViT) that has been developed in recent years based +on the idea of a self-attention mechanism, specifically two configurations of a +pre-trained ViT. We generally find superior metrics for classifying skin +lesions after comparing them to base models such as decision tree classifier +and k-nearest neighbor (KNN) classifier, as well as to CNNs and less complex +ViTs. In particular, we attach greater importance to the performance of +melanoma, which is the most lethal type of skin cancer. The ViT-L32 model +achieves an accuracy of 91.57% and a melanoma recall of 58.54%, while ViT-L16 +achieves an accuracy of 92.79% and a melanoma recall of 56.10%. This offers a +potential tool for faster and more accurate diagnoses and an overall +improvement for the healthcare sector. + +
+
+
+
+
+ + ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
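A minimal PyTorch sketch of the cross-attention fusion idea described above: each modality queries the other, so audio keeps only the video features it finds relevant and vice versa. The dimensions, head count, and symmetric residual design are illustrative assumptions rather than the published AVT-CA architecture.

```python
import torch
import torch.nn as nn

class CrossModalAttention(nn.Module):
    """Bidirectional audio<->video cross attention over per-frame token sequences."""

    def __init__(self, dim=256, num_heads=4):
        super().__init__()
        self.audio_queries_video = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.video_queries_audio = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, audio, video):
        # audio: (batch, audio_frames, dim), video: (batch, video_frames, dim)
        audio_out, _ = self.audio_queries_video(query=audio, key=video, value=video)
        video_out, _ = self.video_queries_audio(query=video, key=audio, value=audio)
        # Residual connections keep each modality's own features alongside the attended ones.
        return audio + audio_out, video + video_out

# Example with random tensors standing in for audio/video encoder outputs:
# audio_tokens, video_tokens = torch.randn(2, 50, 256), torch.randn(2, 30, 256)
# fused_audio, fused_video = CrossModalAttention()(audio_tokens, video_tokens)
```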
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ☆ Boosting Cross-Domain Point Classification via Distilling Relational + Priors from 2D Transformers + + +
+ Semantic pattern of an object point cloud is determined by its topological +configuration of local geometries. Learning discriminative representations can +be challenging due to large shape variations of point sets in local regions and +incomplete surface in a global perspective, which can be made even more severe +in the context of unsupervised domain adaptation (UDA). In specific, +traditional 3D networks mainly focus on local geometric details and ignore the +topological structure between local geometries, which greatly limits their +cross-domain generalization. Recently, the transformer-based models have +achieved impressive performance gain in a range of image-based tasks, +benefiting from its strong generalization capability and scalability stemming +from capturing long range correlation across local patches. Inspired by such +successes of visual transformers, we propose a novel Relational Priors +Distillation (RPD) method to extract relational priors from the well-trained +transformers on massive images, which can significantly empower cross-domain +representations with consistent topological priors of objects. To this end, we +establish a parameter-frozen pre-trained transformer module shared between 2D +teacher and 3D student models, complemented by an online knowledge distillation +strategy for semantically regularizing the 3D student model. Furthermore, we +introduce a novel self-supervised task centered on reconstructing masked point +cloud patches using corresponding masked multi-view image features, thereby +empowering the model with incorporating 3D geometric information. Experiments +on the PointDA-10 and the Sim-to-Real datasets verify that the proposed method +consistently achieves the state-of-the-art performance of UDA for point cloud +classification. The source code of this work is available at +https://github.com/zou-longkun/RPD.git. + +
+
+
+
+
+ + ☆ She Works, He Works: A Curious Exploration of Gender Bias in + AI-Generated Imagery + + +
+ This paper examines gender bias in AI-generated imagery of construction +workers, highlighting discrepancies in the portrayal of male and female +figures. Grounded in Griselda Pollock's theories on visual culture and gender, +the analysis reveals that AI models tend to sexualize female figures while +portraying male figures as more authoritative and competent. These findings +underscore AI's potential to mirror and perpetuate societal biases, emphasizing +the need for critical engagement with AI-generated content. The project +contributes to discussions on the ethical implications of AI in creative +practices and its broader impact on cultural perceptions of gender. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Text-Region Matching for Multi-Label Image Recognition with Missing + Labels ACM MM + + +
+ Recently, large-scale visual language pre-trained (VLP) models have +demonstrated impressive performance across various downstream tasks. Motivated +by these advancements, pioneering efforts have emerged in multi-label image +recognition with missing labels, leveraging VLP prompt-tuning technology. +However, they usually cannot match text and vision features well, due to +complicated semantic gaps and missing labels in a multi-label image. To tackle +this challenge, we propose Text-Region Matching for +optimizing Multi-Label prompt tuning, namely TRM-ML, a novel +method for enhancing meaningful cross-modal matching. Compared to existing +methods, we advocate exploring the information of category-aware regions rather +than the entire image or pixels, which contributes to bridging the semantic gap +between textual and visual representations in a one-to-one matching manner. +Concurrently, we further introduce multimodal contrastive learning to narrow +the semantic gap between textual and visual modalities and establish +intra-class and inter-class relationships. Additionally, to deal with missing +labels, we propose a multimodal category prototype that leverages intra- and +inter-category semantic relationships to estimate unknown labels, facilitating +pseudo-label generation. Extensive experiments on the MS-COCO, PASCAL VOC, +Visual Genome, NUS-WIDE, and CUB-200-2011 benchmark datasets demonstrate that +our proposed framework outperforms the state-of-the-art methods by a +significant margin. Our code is available at +https://github.com/yu-gi-oh-leilei/TRM-ML. +
+
+ comment: Accepted to ACM International Conference on Multimedia (ACM MM) 2024 +
+
+
+
+
+ + ☆ Revisit Event Generation Model: Self-Supervised Learning of + Event-to-Video Reconstruction with Implicit Neural Representations + + +
+ Reconstructing intensity frames from event data while maintaining high +temporal resolution and dynamic range is crucial for bridging the gap between +event-based and frame-based computer vision. Previous approaches have depended +on supervised learning on synthetic data, which lacks interpretability and risks +over-fitting to the setting of the event simulator. Recently, self-supervised +learning (SSL) based methods, which primarily utilize per-frame optical flow to +estimate intensity via photometric constancy, have been actively investigated. +However, they are vulnerable to errors in the case of inaccurate optical flow. +This paper proposes a novel SSL event-to-video reconstruction approach, dubbed +EvINR, which eliminates the need for labeled data or optical flow estimation. +Our core idea is to reconstruct intensity frames by directly addressing the +event generation model, essentially a partial differential equation (PDE) that +describes how events are generated based on the time-varying brightness +signals. Specifically, we utilize an implicit neural representation (INR), +which takes in a spatiotemporal coordinate $(x, y, t)$ and predicts intensity +values, to represent the solution of the event generation equation. The INR, +parameterized as a fully-connected Multi-layer Perceptron (MLP), can be +optimized with its temporal derivatives supervised by events. To make EvINR +feasible for online requisites, we propose several acceleration techniques that +substantially expedite the training process. Comprehensive experiments +demonstrate that our EvINR surpasses previous SSL methods by 38% w.r.t. Mean +Squared Error (MSE) and is comparable or superior to SoTA supervised methods. +Project page: https://vlislab22.github.io/EvINR/. +
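Under simplifying assumptions, the INR formulation described above can be sketched as follows: a coordinate MLP predicts log-intensity at $(x, y, t)$, autograd supplies its temporal derivative, and the event generation model says that derivative should match the polarity-weighted event rate scaled by the contrast threshold. Network size, activations, and the exact loss form are illustrative; this is not the released EvINR code.

```python
import torch
import torch.nn as nn

class IntensityINR(nn.Module):
    """Fully connected MLP mapping a coordinate (x, y, t) to a predicted log-intensity."""

    def __init__(self, hidden=256, layers=4):
        super().__init__()
        dims = [3] + [hidden] * layers + [1]
        blocks = []
        for d_in, d_out in zip(dims[:-1], dims[1:]):
            blocks += [nn.Linear(d_in, d_out), nn.ReLU()]
        self.net = nn.Sequential(*blocks[:-1])  # drop the final activation

    def forward(self, coords):  # coords: (N, 3)
        return self.net(coords)

def event_supervision_loss(model, coords, event_rate, threshold=0.2):
    """Match dL/dt of the INR to the signed event rate times the contrast threshold."""
    coords = coords.clone().requires_grad_(True)
    log_intensity = model(coords)
    grads = torch.autograd.grad(log_intensity.sum(), coords, create_graph=True)[0]
    dL_dt = grads[:, 2:3]  # derivative w.r.t. the time coordinate
    return ((dL_dt - threshold * event_rate) ** 2).mean()
```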
+
+
+
+
+ + ☆ Answerability Fields: Answerable Location Estimation via Diffusion + Models IROS2024 + + +
+ In an era characterized by advancements in artificial intelligence and +robotics, enabling machines to interact with and understand their environment +is a critical research endeavor. In this paper, we propose Answerability +Fields, a novel approach to predicting answerability within complex indoor +environments. Leveraging a 3D question answering dataset, we construct a +comprehensive Answerability Fields dataset, encompassing diverse scenes and +questions from ScanNet. Using a diffusion model, we successfully infer and +evaluate these Answerability Fields, demonstrating the importance of objects +and their locations in answering questions within a scene. Our results showcase +the efficacy of Answerability Fields in guiding scene-understanding tasks, +laying the foundation for their application in enhancing interactions between +intelligent agents and their environments. + +
+
+ comment: IROS2024 +
+
+
+
+
+ + ☆ Neural Modulation Alteration to Positive and Negative Emotions in + Depressed Patients: Insights from fMRI Using Positive/Negative Emotion Atlas + + +
+ Background: Although it has been noticed that depressed patients show
+differences in processing emotions, the precise neural modulation mechanisms of
+positive and negative emotions remain elusive. FMRI is a cutting-edge medical
+imaging technology renowned for its high spatial resolution and dynamic
+temporal information, making it particularly suitable for research on the
+neural dynamics of depression. Methods: To address this gap, our study first
+leveraged fMRI to delineate activated regions associated with positive and
+negative emotions in healthy individuals, resulting in the creation of a
+positive emotion atlas (PEA) and a negative emotion atlas (NEA). Subsequently,
+we examined neuroimaging changes in depression patients using these atlases and
+evaluated their diagnostic performance based on machine learning. Results: Our
+findings demonstrate that the classification accuracy of depressed patients
+based on PEA and NEA exceeded 0.70, a notable improvement compared to the
+whole-brain atlases. Furthermore, ALFF analysis unveiled significant
+differences between depressed patients and healthy controls in eight functional
+clusters during the NEA, focusing on the left cuneus, cingulate gyrus, and
+superior parietal lobule. In contrast, the PEA revealed more pronounced
+differences across fifteen clusters, involving the right fusiform gyrus,
+parahippocampal gyrus, and inferior parietal lobule. Limitations: Due to the
+limited sample size and subtypes of depressed patients, the efficacy may need
+further validation in the future. Conclusions: These findings emphasize the
+complex interplay between emotion modulation and depression, showcasing
+significant alterations in both PEA and NEA among depression patients. This
+research enhances our understanding of emotion modulation in depression, with
+implications for diagnosis and treatment evaluation.
+
+
+
+
+
+ + ☆ SMPISD-MTPNet: Scene Semantic Prior-Assisted Infrared Ship Detection + Using Multi-Task Perception Networks + + +
+ Infrared ship detection (IRSD) has received increasing attention in recent +years due to the robustness of infrared images to adverse weather. However, a +large number of false alarms may occur in complex scenes. To address these +challenges, we propose the Scene Semantic Prior-Assisted Multi-Task Perception +Network (SMPISD-MTPNet), which includes three stages: scene semantic +extraction, deep feature extraction, and prediction. In the scene semantic +extraction stage, we employ a Scene Semantic Extractor (SSE) to guide the +network by the features extracted based on expert knowledge. In the deep +feature extraction stage, a backbone network is employed to extract deep +features. These features are subsequently integrated by a fusion network, +enhancing the detection capabilities across targets of varying sizes. In the +prediction stage, we utilize the Multi-Task Perception Module, which includes +the Gradient-based Module and the Scene Segmentation Module, enabling precise +detection of small and dim targets within complex scenes. For the training +process, we introduce the Soft Fine-tuning training strategy to suppress the +distortion caused by data augmentation. Besides, due to the lack of a publicly +available dataset labelled for scenes, we introduce the Infrared Ship Dataset +with Scene Segmentation (IRSDSS). Finally, we evaluate the network and compare +it with state-of-the-art (SOTA) methods, indicating that SMPISD-MTPNet +outperforms existing approaches. The source code and dataset for this research +can be accessed at https://github.com/greekinRoma/KMNDNet. + +
+
+
+
+
+ + ☆ A Progressive Single-Modality to Multi-Modality Classification Framework + for Alzheimer's Disease Sub-type Diagnosis + + +
+ The current clinical diagnosis framework of Alzheimer's disease (AD) involves
+multiple modalities acquired from multiple diagnosis stages, each with distinct
+usage and cost. Previous AD diagnosis research has predominantly focused on how
+to directly fuse multiple modalities for an end-to-end one-stage diagnosis,
+which practically requires a high cost in data acquisition. Moreover, a
+significant part of these methods diagnose AD without considering clinical
+guidelines and cannot offer accurate sub-type diagnosis. In this paper, by
+exploring inter-correlation among multiple modalities, we propose a novel
+progressive AD sub-type diagnosis framework, aiming to give diagnosis results
+based on easier-to-access modalities in earlier low-cost stages, instead of
+modalities from all stages. Specifically, first, we design 1) a text
+disentanglement network for better processing of tabular data collected in the
+initial stage, and 2) a modality fusion module for fusing multi-modality
+features separately. Second, we align features from modalities acquired in
+earlier low-cost stage(s) with later high-cost stage(s) to give accurate
+diagnosis without actual modality acquisition in later stage(s), saving cost.
+Furthermore, we follow the clinical guideline to align features at each
+stage for achieving sub-type diagnosis. Third, we leverage a progressive
+classifier that can progressively include additional acquired modalities (if
+needed) for diagnosis, to achieve the balance between diagnosis cost and
+diagnosis performance. We evaluate our proposed framework on large diverse
+public and in-home datasets (8280 in total) and achieve superior performance
+over state-of-the-art methods. Our code will be released after acceptance.
+
+
+
+
+
+ + ☆ Lensless fiber endomicroscopic phase imaging with speckle-conditioned + diffusion model + + +
+ The lensless fiber endomicroscope is an emerging tool for in-vivo microscopic
+imaging, where quantitative phase imaging (QPI) can be utilized as a label-free
+method to enhance image contrast. However, existing single-shot phase
+reconstruction methods through the lensless fiber endomicroscope typically
+perform well on simple images but struggle with complex microscopic structures.
+Here, we propose a speckle-conditioned diffusion model (SpecDiffusion), which
+reconstructs phase images directly from speckles captured at the detection side
+of a multi-core fiber (MCF). Unlike conventional neural networks, SpecDiffusion
+employs iterative phase denoising steps for speckle-driven phase
+reconstruction. The iteration scheme allows SpecDiffusion to break down the
+phase reconstruction process into multiple steps, gradually building up to the
+final phase image. This attribute alleviates the computation challenge at each
+step and enables the reconstruction of rich details in complex microscopic
+images. To validate its efficacy, we build an optical system to capture
+speckles from MCF and construct a dataset consisting of 100,000 paired images.
+SpecDiffusion provides high-fidelity phase reconstruction results and shows
+powerful generalization capacity for unseen objects, such as test charts and
+biological tissues, reducing the average mean absolute error of the
+reconstructed tissue images by 7 times. Furthermore, the reconstructed tissue
+images using SpecDiffusion show higher accuracy in zero-shot cell segmentation
+tasks compared to the conventional method, demonstrating the potential for
+further cell morphology analysis through the learning-based lensless fiber
+endomicroscope. SpecDiffusion offers a precise and generalized method for phase
+reconstruction through scattering media, including MCFs, opening new
+perspectives in lensless fiber endomicroscopic imaging.
+
+
+
+
+
+ + ☆ Textile Anomaly Detection: Evaluation of the State-of-the-Art for + Automated Quality Inspection of Carpet + + +
+ In this study, state-of-the-art unsupervised detection models were evaluated
+for the purpose of automated anomaly inspection of wool carpets. A custom
+dataset of four unique types of carpet textures was created to thoroughly test
+the models and their robustness in detecting subtle anomalies in complex
+textures. Due to the requirements of an inline inspection system in a
+manufacturing use case, the metrics of importance in this study were accuracy
+in detecting anomalous areas, the number of false detections, and the inference
+times of each model for real-time performance. Of the evaluated models, the
+student-teacher network-based methods were found on average to yield the
+highest detection accuracy and lowest false detection rates. When trained on a
+multi-class dataset, the models were found to yield comparable, if not better,
+results than single-class training. Finally, in terms of detection speed, with
+the exception of the generative model, all other evaluated models were found to
+have comparable inference times on a GPU, with an average of 0.16s per image.
+On a CPU, most of these models typically produced results between 1.5 and 2
+times the respective GPU inference times.
+
+
+ comment: Accepted at the 2023 Australasian Conference on Robotics and + Automation (ACRA 2023) Publication url + https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184380272&partnerID=40&md5=74fde263f4a24a1bff75d6560b423994 + ISSN: 14482053 Contains 10 pages and three figures +
+
+
+
+
+ + ☆ Towards A Generalizable Pathology Foundation Model via Unified Knowledge + Distillation + + +
+ Foundation models pretrained on large-scale datasets are revolutionizing the
+field of computational pathology (CPath). The generalization ability of
+foundation models is crucial for success in various downstream clinical
+tasks. However, current foundation models have only been evaluated on a limited
+type and number of tasks, leaving their generalization ability and overall
+performance unclear. To address this gap, we established a comprehensive
+benchmark to evaluate the performance of off-the-shelf foundation models across
+six distinct clinical task types, encompassing a total of 39 specific tasks.
+Our findings reveal that existing foundation models excel at certain task types
+but struggle to effectively handle the full breadth of clinical tasks. To
+improve the generalization of pathology foundation models, we propose a unified
+knowledge distillation framework consisting of both expert and self knowledge
+distillation, where the former allows the model to learn from the knowledge of
+multiple expert models, while the latter leverages self-distillation to enable
+image representation learning via local-global alignment. Based on this
+framework, a Generalizable Pathology Foundation Model (GPFM) is pretrained on a
+large-scale dataset consisting of 190 million images from around 86,000 public
+H\&E whole slides across 34 major tissue types. Evaluated on the established
+benchmark, GPFM achieves an impressive average rank of 1.36, with 29 tasks
+ranked 1st, while the second-best model, UNI, attains an average rank of
+2.96, with only 4 tasks ranked 1st. The superior generalization of GPFM
+demonstrates its exceptional modeling capabilities across a wide range of
+clinical tasks, positioning it as a new cornerstone for feature representation
+in CPath.
+
+
+
+
+
+ + ☆ HybridDepth: Robust Depth Fusion for Mobile AR by Leveraging Depth from + Focus and Single-Image Priors + + +
+ We propose HYBRIDDEPTH, a robust depth estimation pipeline that addresses the
+unique challenges of depth estimation for mobile AR, such as scale ambiguity,
+hardware heterogeneity, and generalizability. HYBRIDDEPTH leverages the camera
+features available on mobile devices. It effectively combines the scale
+accuracy inherent in Depth from Focus (DFF) methods with the generalization
+capabilities enabled by strong single-image depth priors. By utilizing the
+focal planes of a mobile camera, our approach accurately captures depth values
+from focused pixels and applies these values to compute scale and shift
+parameters for transforming relative depths into metric depths. We test our
+pipeline as an end-to-end system, with a newly developed mobile client to
+capture focal stacks, which are then sent to a GPU-powered server for depth
+estimation.
+ Through comprehensive quantitative and qualitative analyses, we demonstrate
+that HYBRIDDEPTH not only outperforms state-of-the-art (SOTA) models in common
+datasets (DDFF12, NYU Depth v2) and a real-world AR dataset ARKitScenes but
+also demonstrates strong zero-shot generalization. For example, HYBRIDDEPTH
+trained on NYU Depth v2 achieves performance on DDFF12 comparable to existing
+models trained on DDFF12. It also outperforms all the SOTA models in
+zero-shot performance on the ARKitScenes dataset. Additionally, we conduct a
+qualitative comparison between our model and the ARCore framework,
+demonstrating that our model's output depth maps are significantly more
+accurate in terms of structural details and metric accuracy. The source code of
+this project is available at github.
+
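The scale-and-shift step mentioned above can be illustrated with a least-squares fit: sparse metric depths at in-focus pixels anchor a global affine correction of a relative depth map. This is a minimal sketch under that assumption; the function names, the closed-form solver, and the use of plain least squares are illustrative, not the paper's exact procedure.

```python
# Fit d_metric ≈ s * d_rel + t at focused pixels, then apply (s, t) densely.
import numpy as np

def fit_scale_shift(rel_at_focus: np.ndarray, metric_at_focus: np.ndarray):
    """Least-squares (s, t) such that metric ≈ s * rel + t at the focused pixels."""
    A = np.stack([rel_at_focus, np.ones_like(rel_at_focus)], axis=1)  # (N, 2)
    (s, t), *_ = np.linalg.lstsq(A, metric_at_focus, rcond=None)
    return s, t

def to_metric(rel_depth_map: np.ndarray, s: float, t: float) -> np.ndarray:
    return s * rel_depth_map + t

# toy usage with synthetic data
rng = np.random.default_rng(0)
rel = rng.uniform(0.1, 1.0, size=500)                 # relative depths at focus
metric = 3.0 * rel + 0.5 + rng.normal(0, 0.01, 500)   # sparse metric depths (DFF)
s, t = fit_scale_shift(rel, metric)
dense_metric = to_metric(rng.uniform(0.1, 1.0, size=(480, 640)), s, t)
```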
+
+
+
+
+ + ☆ Mixed Non-linear Quantization for Vision Transformers + + +
+ The majority of quantization methods have been proposed to reduce the model +size of Vision Transformers, yet most of them have overlooked the quantization +of non-linear operations. Only a few works have addressed quantization for +non-linear operations, but they applied a single quantization method across all +non-linear operations. We believe that this can be further improved by +employing a different quantization method for each non-linear operation. +Therefore, to assign the most error-minimizing quantization method from the +known methods to each non-linear layer, we propose a mixed non-linear +quantization that considers layer-wise quantization sensitivity measured by +SQNR difference metric. The results show that our method outperforms I-BERT, +FQ-ViT, and I-ViT in both 8-bit and 6-bit settings for ViT, DeiT, and Swin +models by an average of 0.6%p and 19.6%p, respectively. Our method outperforms +I-BERT and I-ViT by 0.6%p and 20.8%p, respectively, when training time is +limited. We plan to release our code at +https://gitlab.com/ones-ai/mixed-non-linear-quantization. + +
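The selection rule implied by the abstract above can be sketched as follows: for every non-linear layer, measure the SQNR of each candidate quantized approximation against the floating-point output on calibration data, and assign the candidate with the highest SQNR (i.e., lowest error). The candidate set, layer names, and thresholds below are illustrative assumptions, not the paper's implementation.

```python
# SQNR-based assignment of a quantization method per non-linear layer.
import numpy as np

def sqnr_db(reference: np.ndarray, approx: np.ndarray) -> float:
    noise = reference - approx
    return 10.0 * np.log10(np.sum(reference ** 2) / (np.sum(noise ** 2) + 1e-12))

def assign_quantizers(float_outputs: dict, candidates: dict) -> dict:
    """float_outputs: {layer_name: fp32 activation}; candidates: {method: fn}."""
    assignment = {}
    for layer, ref in float_outputs.items():
        scores = {m: sqnr_db(ref, fn(ref)) for m, fn in candidates.items()}
        assignment[layer] = max(scores, key=scores.get)   # least-error method
    return assignment

# toy usage: two hypothetical non-linear layers and two hypothetical 8-bit schemes
calib = {"softmax_0": np.random.rand(64, 197), "gelu_3": np.random.randn(64, 768)}
methods = {
    "uniform8": lambda x: np.round(x * 127) / 127,
    "log2":     lambda x: np.sign(x) * 2.0 ** np.round(np.log2(np.abs(x) + 1e-8)),
}
print(assign_quantizers(calib, methods))
```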
+
+ comment: 16 pages, 4 figures, under review +
+
+
+
+
+ + ♻ ☆ UGG: Unified Generative Grasping ECCV 2024 + + +
+ Dexterous grasping aims to produce diverse grasping postures with a high
+grasping success rate. Regression-based methods that directly predict grasping
+parameters given the object may achieve a high success rate but often lack
+diversity. Generation-based methods that generate grasping postures conditioned
+on the object can often produce diverse grasping, but they are insufficient for
+high grasping success due to a lack of discriminative information. To mitigate
+this, we introduce a unified diffusion-based dexterous grasp generation model,
+dubbed UGG, which operates within the object point cloud and hand parameter
+spaces. Our all-transformer architecture unifies the information from the
+object, the hand, and the contacts, introducing a novel representation of
+contact points for improved contact modeling. The flexibility and quality of
+our model enable the integration of a lightweight discriminator, benefiting
+from simulated discriminative data, which pushes for a high success rate while
+preserving high diversity. Beyond grasp generation, our model can also generate
+objects based on hand information, offering valuable insights into object
+design and studying how the generative model perceives objects. Our model
+achieves state-of-the-art dexterous grasping on the large-scale DexGraspNet
+dataset while facilitating human-centric object design, marking a significant
+advancement in dexterous grasping research. Our project page is
+https://jiaxin-lu.github.io/ugg/.
+
+
+ comment: 17 pages, 14 figures, ECCV 2024 +
+
+
+
+
+ + ♻ ☆ MxT: Mamba x Transformer for Image Inpainting + + +
+ Image inpainting, or image completion, is a crucial task in computer vision
+that aims to restore missing or damaged regions of images with semantically
+coherent content. This technique requires a precise balance of local texture
+replication and global contextual understanding to ensure the restored image
+integrates seamlessly with its surroundings. Traditional methods using
+Convolutional Neural Networks (CNNs) are effective at capturing local patterns
+but often struggle with broader contextual relationships due to their limited
+receptive fields. Recent advancements have incorporated transformers,
+leveraging their ability to understand global interactions. However, these
+methods face computational inefficiencies and struggle to maintain fine-grained
+details. To overcome these challenges, we introduce MxT, composed of the
+proposed Hybrid Module (HM), which combines Mamba with the transformer in a
+synergistic manner. Mamba is adept at efficiently processing long sequences
+with linear computational costs, making it an ideal complement to the
+transformer for handling long-scale data interactions. Our HM facilitates
+dual-level interaction learning at both pixel and patch levels, greatly
+enhancing the model's ability to reconstruct images with high quality and
+contextual accuracy. We evaluate MxT on the widely-used CelebA-HQ and
+Places2-standard datasets, where it consistently outperforms existing
+state-of-the-art methods.
+
+
+
+
+
+ + ♻ ☆ Learning to Visually Connect Actions and their Effects + + +
+ We introduce the novel concept of visually Connecting Actions and Their +Effects (CATE) in video understanding. CATE can have applications in areas like +task planning and learning from demonstration. We identify and explore two +different aspects of the concept of CATE: Action Selection (AS) and +Effect-Affinity Assessment (EAA), where video understanding models connect +actions and effects at semantic and fine-grained levels, respectively. We +design various baseline models for AS and EAA. Despite the intuitive nature of +the task, we observe that models struggle, and humans outperform them by a +large margin. Our experiments show that in solving AS and EAA, models learn +intuitive properties like object tracking and pose encoding without explicit +supervision. We demonstrate that CATE can be an effective self-supervised task +for learning video representations from unlabeled videos. The study aims to +showcase the fundamental nature and versatility of CATE, with the hope of +inspiring advanced formulations and models. + +
+
+
+
+
+ + ♻ ☆ Selective Vision-Language Subspace Projection for Few-shot CLIP + + +
+ Vision-language models such as CLIP are capable of mapping data from different
+modalities into a unified feature space, enabling zero/few-shot inference by
+measuring the similarity of given images and texts. However, most existing
+methods overlook modality gaps in CLIP's encoded features, which manifest as
+text and image features lying far apart from each other, resulting in limited
+classification performance. To tackle this issue, we introduce a method called
+Selective Vision-Language Subspace Projection (SSP), which incorporates local
+image features and utilizes them as a bridge to enhance the alignment between
+image-text pairs. Specifically, our SSP framework comprises two parallel
+modules: a vision projector and a language projector. Both projectors utilize
+local image features to span the respective subspaces for images and texts,
+thereby projecting the image and text features into their respective subspaces
+to achieve alignment. Moreover, our approach entails only training-free matrix
+calculations and can be seamlessly integrated into advanced CLIP-based few-shot
+learning frameworks. Extensive experiments on 11 datasets have demonstrated
+SSP's superior text-image alignment capabilities, outperforming the
+state-of-the-art alignment methods. The code is available at
+https://github.com/zhuhsingyuu/SSP
+
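As a rough illustration of the "training-free matrix calculations" mentioned above, the sketch below builds a projector from local image features and projects both image and text features onto the spanned subspace before comparing them. The choice of basis (per class, selection strategy, etc.) is an assumption for illustration; this is not the authors' released implementation.

```python
# Subspace projection in the spirit of SSP: local features span a subspace,
# global image and text features are projected onto it, then compared.
import numpy as np

def projection_matrix(local_feats: np.ndarray) -> np.ndarray:
    """local_feats: (k, d) local image features; returns a (d, d) projector."""
    Q, _ = np.linalg.qr(local_feats.T)        # orthonormal basis of the span
    return Q @ Q.T

def project(feats: np.ndarray, P: np.ndarray) -> np.ndarray:
    proj = feats @ P
    return proj / (np.linalg.norm(proj, axis=-1, keepdims=True) + 1e-8)

# toy usage: 512-d CLIP-like features, 16 local features spanning the subspace
d, k, n_cls = 512, 16, 10
local = np.random.randn(k, d)
P = projection_matrix(local)
image_feat = project(np.random.randn(1, d), P)
text_feats = project(np.random.randn(n_cls, d), P)
logits = image_feat @ text_feats.T            # cosine similarities after projection
print(int(logits.argmax()))
```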
+
+ comment: Accepted as an Oral Paper at ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion MRI with Machine Learning + + +
+ Diffusion-weighted magnetic resonance imaging (dMRI) offers unique +capabilities including noninvasive probing of brain's tissue microstructure and +structural connectivity. It is widely used for clinical assessment of brain +pathologies and for neuroscience research. Analyzing the dMRI data to extract +useful information for medical and scientific purposes can be challenging. The +dMRI measurements often suffer from strong noise and artifacts, there is +usually high inter-session and inter-scanner variability in the data, and +considerable inter-subject heterogeneity in brain structure. Moreover, the +relationship between measurements and the phenomena of interest can be highly +complex. Recent years have witnessed increasing use of machine learning methods +for dMRI analysis. This manuscript aims to assess these efforts, with a focus +on methods that have addressed data preprocessing and harmonization, +microstructure mapping, tractography, and white matter tract analysis. We study +the main findings, strengths, and weaknesses of the existing methods and +suggest topics for future research. We find that machine learning may be +exceptionally suited to tackle some of the difficult tasks in dMRI analysis. +However, for this to happen, several shortcomings of existing methods and +critical unresolved issues need to be addressed. These include deficient +evaluation practices, lack of rich training datasets and validation benchmarks, +as well as model generalizability, reliability, and explainability concerns. + +
+
+
+
+
+ + ♻ ☆ Metadata-enhanced contrastive learning from retinal optical coherence + tomography images + + +
+ Deep learning has potential to automate screening, monitoring and grading of +disease in medical images. Pretraining with contrastive learning enables models +to extract robust and generalisable features from natural image datasets, +facilitating label-efficient downstream image analysis. However, the direct +application of conventional contrastive methods to medical datasets introduces +two domain-specific issues. Firstly, several image transformations which have +been shown to be crucial for effective contrastive learning do not translate +from the natural image to the medical image domain. Secondly, the assumption +made by conventional methods, that any two images are dissimilar, is +systematically misleading in medical datasets depicting the same anatomy and +disease. This is exacerbated in longitudinal image datasets that repeatedly +image the same patient cohort to monitor their disease progression over time. +In this paper we tackle these issues by extending conventional contrastive +frameworks with a novel metadata-enhanced strategy. Our approach employs widely +available patient metadata to approximate the true set of inter-image +contrastive relationships. To this end we employ records for patient identity, +eye position (i.e. left or right) and time series information. In experiments +using two large longitudinal datasets containing 170,427 retinal OCT images of +7,912 patients with age-related macular degeneration (AMD), we evaluate the +utility of using metadata to incorporate the temporal dynamics of disease +progression into pretraining. Our metadata-enhanced approach outperforms both +standard contrastive methods and a retinal image foundation model in five out +of six image-level downstream tasks related to AMD. Due to its modularity, our +method can be quickly and cost-effectively tested to establish the potential +benefits of including available metadata in contrastive pretraining. + +
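A minimal sketch of the metadata-enhanced idea described above: instead of treating every other image in a batch as a negative, scans sharing patient identity and eye position are marked as positives and fed into a supervised-contrastive-style loss. The metadata fields, mask construction, and loss form are assumptions for illustration, not the paper's exact formulation.

```python
# Metadata-derived positive pairs for contrastive pretraining.
import torch
import torch.nn.functional as F

def metadata_positive_mask(patient_ids, eyes):
    """Boolean (B, B) mask: True where two images share patient id and eye."""
    pid = torch.as_tensor(patient_ids)
    eye = torch.as_tensor(eyes)
    same = (pid[:, None] == pid[None, :]) & (eye[:, None] == eye[None, :])
    return same & ~torch.eye(len(pid), dtype=torch.bool)   # exclude self-pairs

def contrastive_loss(z, pos_mask, temperature=0.1):
    z = F.normalize(z, dim=1)
    logits = z @ z.T / temperature
    logits = logits - 1e9 * torch.eye(len(z))   # never contrast a sample with itself
    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
    pos_counts = pos_mask.sum(1).clamp(min=1)
    return -(log_prob * pos_mask).sum(1).div(pos_counts).mean()

# toy usage: batch of 6 embeddings, two patients, left/right eyes encoded as 0/1
z = torch.randn(6, 128, requires_grad=True)
mask = metadata_positive_mask([1, 1, 1, 2, 2, 2], [0, 0, 1, 0, 0, 1])
loss = contrastive_loss(z, mask)
loss.backward()
```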
+
+
+
+
+ + ♻ ☆ Coupled Laplacian Eigenmaps for Locally-Aware 3D Rigid Point Cloud + Matching CVPR + + +
+ Point cloud matching, a crucial technique in the computer vision, medical, and
+robotics fields, is primarily concerned with finding correspondences between
+pairs of point clouds or voxels. In some practical scenarios, emphasizing local
+differences is crucial for accurately identifying a correct match, thereby
+enhancing the overall robustness and reliability of the matching process.
+Commonly used shape descriptors have several limitations and often fail to
+provide meaningful local insights about the paired geometries. In this work, we
+propose a new technique, based on graph Laplacian eigenmaps, to match point
+clouds by taking into account fine local structures. To deal with the order and
+sign ambiguity of Laplacian eigenmaps, we introduce a new operator, called
+Coupled Laplacian (https://github.com/matteo-bastico/CoupLap), that allows one
+to easily generate aligned eigenspaces for multiple registered geometries. We
+show that the similarity between those aligned high-dimensional spaces provides
+a locally meaningful score to match shapes. We first evaluate the performance
+of the proposed technique in a point-wise manner, focusing on the task of
+object anomaly localization on the MVTec 3D-AD dataset. Additionally, we define
+a new medical task, called automatic Bone Side Estimation (BSE), which we
+address through a global similarity score derived from coupled eigenspaces. In
+order to test it, we propose a benchmark collecting bone surface structures
+from various public datasets. Our matching technique, based on Coupled
+Laplacian, outperforms other methods by reaching an impressive accuracy on both
+tasks.
+
+
+ comment: This paper has been accepted at Computer Vision and Pattern
+  Recognition (CVPR) 2024
+
+
+
+
+ + ♻ ☆ Pseudo-Prompt Generating in Pre-trained Vision-Language Models for + Multi-Label Medical Image Classification + + +
+ The task of medical image recognition is notably complicated by the presence
+of varied and multiple pathological indications, presenting a unique challenge
+in multi-label classification with unseen labels. This complexity underlines
+the need for computer-aided diagnosis methods employing multi-label zero-shot
+learning. Recent advancements in pre-trained vision-language models (VLMs) have
+showcased notable zero-shot classification abilities on medical images.
+However, these methods have limitations in leveraging extensive pre-trained
+knowledge from broader image datasets, and often depend on manual prompt
+construction by expert radiologists. By automating the process of prompt
+tuning, prompt learning techniques have emerged as an efficient way to adapt
+VLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in
+performing class-specific prompts on unseen categories, limiting
+generalizability in fine-grained scenarios. To overcome these constraints, we
+introduce a novel prompt generation approach inspired by text generation in
+natural language processing (NLP). Our method, named Pseudo-Prompt Generating
+(PsPG), capitalizes on the prior knowledge of multi-modal features. Featuring
+an RNN-based decoder, PsPG autoregressively generates class-tailored embedding
+vectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label
+chest radiograph datasets affirm the superiority of our approach against
+leading medical vision-language and multi-label prompt learning methods. The
+source code is available at https://github.com/fallingnight/PsPG
+
+
+ comment: Accepted by PRCV 2024 +
+
+
+
+
+ + ♻ ☆ Unsqueeze [CLS] Bottleneck to Learn Rich Representations ECCV 2024 + + +
+ Distillation-based self-supervised learning typically leads to more +compressed representations due to its radical clustering process and the +implementation of a sharper target distribution. To overcome this limitation +and preserve more information from input, we introduce UDI, conceptualized as +Unsqueezed Distillation-based self-supervised learning (SSL). UDI enriches the +learned representation by encouraging multimodal prediction distilled from a +consolidated profile of local predictions that are derived via stratified +sampling. Our evaluations show that UDI not only promotes semantically +meaningful representations at instance level, delivering superior or +competitive results to state-of-the-art SSL methods in image classification, +but also effectively preserves the nuisance of input, which yields significant +improvement in dense prediction tasks, including object detection and +segmentation. Additionally, UDI performs competitively in low-shot image +classification, improving the scalability of joint-embedding pipelines. Various +visualizations and ablation studies are presented to further elucidate the +mechanisms behind UDI. Our source code is available at +https://github.com/ISL-CV/udi. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Breaking the Global North Stereotype: A Global South-centric Benchmark + Dataset for Auditing and Mitigating Biases in Facial Recognition Systems AAAI + + +
+ Facial Recognition Systems (FRSs) are being developed and deployed globally
+at unprecedented rates. Most platforms are designed in a limited set of
+countries but deployed worldwide, without adequate checkpoints. This is
+especially problematic for Global South countries, which lack strong
+legislation to safeguard persons facing disparate performance of these systems.
+A combination of unavailability of datasets, lack of understanding of FRS
+functionality and low-resource bias mitigation measures accentuate the problem.
+In this work, we propose a new face dataset composed of 6,579 unique male and
+female sportspersons from eight countries around the world. More than 50% of
+the dataset comprises individuals from the Global South countries and is
+demographically diverse. To aid adversarial audits and robust model training,
+each image has four adversarial variants, totaling over 40,000 images. We also
+benchmark five popular FRSs, both commercial and open-source, for the task of
+gender prediction (and country prediction for one of the open-source models as
+an example of red-teaming). Experiments on industrial FRSs reveal accuracies
+ranging from 38.1% to 98.2%, with a large disparity between males and females
+in the Global South (max difference of 38.5%). Biases are also observed in all
+FRSs between females of the Global North and South (max difference of ~50%).
+Grad-CAM analysis identifies the nose, forehead and mouth as the regions of
+interest on one of the open-source FRSs. Utilizing this insight, we design
+simple, low-resource bias mitigation solutions using few-shot and novel
+contrastive learning techniques, significantly improving accuracy, with the
+disparity between males and females reducing from 50% to 1.5% in one of the
+settings. In the red-teaming experiment with the open-source Deepface model,
+contrastive learning proves more effective than simple fine-tuning.
+
+
+ comment: This work has been accepted for publication at AAAI/ACM AIES 2024 +
+
+
+
+
+ + ♻ ☆ Outlier detection by ensembling uncertainty with negative objectness BMVC 2024 + + +
+ Outlier detection is an essential capability in safety-critical applications +of supervised visual recognition. Most of the existing methods deliver best +results by encouraging standard closed-set models to produce low-confidence +predictions in negative training data. However, that approach conflates +prediction uncertainty with recognition of the negative class. We therefore +reconsider direct prediction of K+1 logits that correspond to K groundtruth +classes and one outlier class. This setup allows us to formulate a novel +anomaly score as an ensemble of in-distribution uncertainty and the posterior +of the outlier class which we term negative objectness. Now outliers can be +independently detected due to i) high prediction uncertainty or ii) similarity +with negative data. We embed our method into a dense prediction architecture +with mask-level recognition over K+2 classes. The training procedure encourages +the novel K+2-th class to learn negative objectness at pasted negative +instances. Our models outperform the current state-of-the art on standard +benchmarks for image-wide and pixel-level outlier detection with and without +training on real negative data. + +
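The two-signal anomaly score described above can be sketched as follows: from K+1 logits, combine (i) uncertainty over the K in-distribution classes with (ii) the posterior of the extra "negative objectness" class. Summing the two normalized signals is an assumption for illustration; the authors' exact combination may differ.

```python
# Anomaly score from K in-distribution classes plus one outlier class.
import torch
import torch.nn.functional as F

def anomaly_score(logits: torch.Tensor, K: int) -> torch.Tensor:
    """logits: (..., K+1); returns a per-sample (or per-pixel) anomaly score."""
    probs = F.softmax(logits, dim=-1)
    p_outlier = probs[..., K]                                  # negative objectness
    in_dist = probs[..., :K] / probs[..., :K].sum(-1, keepdim=True)
    entropy = -(in_dist * in_dist.clamp_min(1e-12).log()).sum(-1)
    uncertainty = entropy / torch.log(torch.tensor(float(K)))  # normalize to [0, 1]
    return p_outlier + uncertainty

# toy usage: 4 samples, K = 19 in-distribution classes + 1 outlier class
scores = anomaly_score(torch.randn(4, 20), K=19)
print(scores)
```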
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+ + ♻ ☆ SCB-dataset: A Dataset for Detecting Student Classroom Behavior + + +
+ The use of deep learning methods for automatic detection of students'
+classroom behavior is a promising approach to analyze their class performance
+and enhance teaching effectiveness. However, the lack of publicly available
+datasets on student behavior poses a challenge for researchers in this field.
+To address this issue, we propose a Student Classroom Behavior dataset
+(SCB-dataset) that reflects real-life scenarios. Our dataset includes 11,248
+labels and 4,003 images, with a focus on hand-raising behavior. We evaluated
+the dataset using the YOLOv7 algorithm, achieving a mean average precision
+(mAP) of up to 85.3%. We believe that our dataset can serve as a robust
+foundation for future research in the field of student behavior detection and
+promote further advancements in this area. Our SCB-dataset can be downloaded
+from: https://github.com/Whiffe/SCB-dataset
+
+
+
+
+
+ + ♻ ☆ Relightable Neural Actor with Intrinsic Decomposition and Pose Control ECCV 2024 + + +
+ Creating a controllable and relightable digital avatar from multi-view video
+with fixed illumination is a very challenging problem since humans are highly
+articulated, creating pose-dependent appearance effects, and skin as well as
+clothing require space-varying BRDF modeling. Existing works on creating
+animatable avatars either do not focus on relighting at all, require controlled
+illumination setups, or try to recover a relightable avatar from very low-cost
+setups, i.e. a single RGB video, at the cost of severely limited result
+quality, e.g. shadows not even being modeled. To address this, we propose
+Relightable Neural Actor, a new video-based method for learning a pose-driven
+neural human model that can be relighted, allows appearance editing, and models
+pose-dependent effects such as wrinkles and self-shadows. Importantly, for
+training, our method solely requires a multi-view recording of the human under
+a known, but static lighting condition. To tackle this challenging problem, we
+leverage an implicit geometry representation of the actor with a drivable
+density field that models pose-dependent deformations and derive a dynamic
+mapping between 3D and UV spaces, where normal, visibility, and materials are
+effectively encoded. To evaluate our approach in real-world scenarios, we
+collect a new dataset with four identities recorded under different light
+conditions, indoors and outdoors, providing the first benchmark of its kind for
+human relighting, and demonstrating state-of-the-art relighting results for
+novel human poses.
+
+
+ comment: Accepted to ECCV 2024. Project page: + https://vcai.mpi-inf.mpg.de/projects/RNA/ +
+
+
+
+
+ + ♻ ☆ Frequency Guidance Matters: Skeletal Action Recognition by + Frequency-Aware Mixed Transformer + + +
+ Recently, transformers have demonstrated great potential for modeling
+long-term dependencies from skeleton sequences and thereby gained
+ever-increasing attention in skeleton action recognition. However, the existing
+transformer-based approaches heavily rely on the naive attention mechanism for
+capturing the spatiotemporal features, which falls short in learning
+discriminative representations for actions that exhibit similar motion
+patterns. To address this challenge, we introduce the Frequency-aware Mixed
+Transformer (FreqMixFormer), specifically designed for recognizing similar
+skeletal actions with subtle discriminative motions. First, we introduce a
+frequency-aware attention module to unweave skeleton frequency representations
+by embedding joint features into frequency attention maps, aiming to
+distinguish the discriminative movements based on their frequency coefficients.
+Subsequently, we develop a mixed transformer architecture to incorporate
+spatial features with frequency features to model the comprehensive
+frequency-spatial patterns. Additionally, a temporal transformer is proposed to
+extract the global correlations across frames. Extensive experiments show that
+FreqMixFormer outperforms SOTA methods on three popular skeleton action
+recognition datasets, including NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets.
+
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ AEP$n$P: A Less-constrained EP$n$P Solver for Pose Estimation with + Anisotropic Scaling + + +
+ Perspective-$n$-Point (P$n$P) stands as a fundamental algorithm for pose +estimation in various applications. In this paper, we present a new approach to +the P$n$P problem with relaxed constraints, eliminating the need for precise 3D +coordinates, which is especially suitable for object pose estimation where +corresponding object models may not be available in practice. Built upon the +classical EP$n$P solver, we refer to it as AEP$n$P due to its ability to handle +unknown anisotropic scaling factors in addition to the common 6D +transformation. Through a few algebraic manipulations and a well-chosen frame +of reference, this new problem can be boiled down to a simple linear null-space +problem followed by point registration-based identification of a similarity +transformation. Experimental results on both simulated and real datasets +demonstrate the effectiveness of AEP$n$P as a flexible and practical solution +to object pose estimation. Code: https://github.com/goldoak/AEPnP. + +
+
+
+
+
+
+    ♻ ☆ Embodied Laser Attack: Leveraging Scene Priors to Achieve Agent-based
+  Robust Non-contact Attacks ACM MM 2024
+
+
+ As physical adversarial attacks become extensively applied in unearthing the +potential risk of security-critical scenarios, especially in dynamic scenarios, +their vulnerability to environmental variations has also been brought to light. +The non-robust nature of physical adversarial attack methods brings +less-than-stable performance consequently. Although methods such as EOT have +enhanced the robustness of traditional contact attacks like adversarial +patches, they fall short in practicality and concealment within dynamic +environments such as traffic scenarios. Meanwhile, non-contact laser attacks, +while offering enhanced adaptability, face constraints due to a limited +optimization space for their attributes, rendering EOT less effective. This +limitation underscores the necessity for developing a new strategy to augment +the robustness of such practices. To address these issues, this paper +introduces the Embodied Laser Attack (ELA), a novel framework that leverages +the embodied intelligence paradigm of Perception-Decision-Control to +dynamically tailor non-contact laser attacks. For the perception module, given +the challenge of simulating the victim's view by full-image transformation, ELA +has innovatively developed a local perspective transformation network, based on +the intrinsic prior knowledge of traffic scenes and enables effective and +efficient estimation. For the decision and control module, ELA trains an attack +agent with data-driven reinforcement learning instead of adopting +time-consuming heuristic algorithms, making it capable of instantaneously +determining a valid attack strategy with the perceived information by +well-designed rewards, which is then conducted by a controllable laser emitter. +Experimentally, we apply our framework to diverse traffic scenarios both in the +digital and physical world, verifying the effectiveness of our method under +dynamic successive scenes. + +
+
+ comment: 9 pages, 7 figures, Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Scene Coordinate Reconstruction: Posing of Image Collections via + Incremental Learning of a Relocalizer ECCV 2024 + + +
+ We address the task of estimating camera parameters from a set of images +depicting a scene. Popular feature-based structure-from-motion (SfM) tools +solve this task by incremental reconstruction: they repeat triangulation of +sparse 3D points and registration of more camera views to the sparse point +cloud. We re-interpret incremental structure-from-motion as an iterated +application and refinement of a visual relocalizer, that is, of a method that +registers new views to the current state of the reconstruction. This +perspective allows us to investigate alternative visual relocalizers that are +not rooted in local feature matching. We show that scene coordinate regression, +a learning-based relocalization approach, allows us to build implicit, neural +scene representations from unposed images. Different from other learning-based +reconstruction methods, we do not require pose priors nor sequential inputs, +and we optimize efficiently over thousands of images. In many cases, our +method, ACE0, estimates camera poses with an accuracy close to feature-based +SfM, as demonstrated by novel view synthesis. Project page: +https://nianticlabs.github.io/acezero/ + +
+
+ comment: ECCV 2024, Project page: https://nianticlabs.github.io/acezero/ +
+
+
+
+
+ + ♻ ☆ Quality Assured: Rethinking Annotation Strategies in Imaging AI ECCV 2024 + + +
+ This paper does not describe a novel method. Instead, it studies an essential +foundation for reliable benchmarking and ultimately real-world application of +AI-based image analysis: generating high-quality reference annotations. +Previous research has focused on crowdsourcing as a means of outsourcing +annotations. However, little attention has so far been given to annotation +companies, specifically regarding their internal quality assurance (QA) +processes. Therefore, our aim is to evaluate the influence of QA employed by +annotation companies on annotation quality and devise methodologies for +maximizing data annotation efficacy. Based on a total of 57,648 instance +segmented images obtained from a total of 924 annotators and 34 QA workers from +four annotation companies and Amazon Mechanical Turk (MTurk), we derived the +following insights: (1) Annotation companies perform better both in terms of +quantity and quality compared to the widely used platform MTurk. (2) Annotation +companies' internal QA only provides marginal improvements, if any. However, +improving labeling instructions instead of investing in QA can substantially +boost annotation performance. (3) The benefit of internal QA depends on +specific image characteristics. Our work could enable researchers to derive +substantially more value from a fixed annotation budget and change the way +annotation companies conduct internal QA. + +
+
+ comment: Accepted at ECCV 2024, preprint, Computer Vision, Data Annotation +
+
+
+
+
+ + ♻ ☆ Viewpoint Textual Inversion: Discovering Scene Representations and 3D + View Control in 2D Diffusion Models ECCV 2024 + + +
+ Text-to-image diffusion models generate impressive and realistic images, but +do they learn to represent the 3D world from only 2D supervision? We +demonstrate that yes, certain 3D scene representations are encoded in the text +embedding space of models like Stable Diffusion. Our approach, Viewpoint Neural +Textual Inversion (ViewNeTI), is to discover 3D view tokens; these tokens +control the 3D viewpoint - the rendering pose in a scene - of generated images. +Specifically, we train a small neural mapper to take continuous camera +viewpoint parameters and predict a view token (a word embedding). This token +conditions diffusion generation via cross-attention to produce images with the +desired camera viewpoint. Using ViewNeTI as an evaluation tool, we report two +findings: first, the text latent space has a continuous view-control manifold +for particular 3D scenes; second, we find evidence for a generalized +view-control manifold for all scenes. We conclude that since the view token +controls the 3D `rendering' viewpoint, there is likely a scene representation +embedded in frozen 2D diffusion models. Finally, we exploit the 3D scene +representations for 3D vision tasks, namely, view-controlled text-to-image +generation, and novel view synthesis from a single image, where our approach +sets state-of-the-art for LPIPS. Code available at +https://github.com/jmhb0/view_neti + +
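The "small neural mapper" described above can be pictured as a tiny MLP from continuous camera parameters to a word-embedding-sized view token, which would then be spliced into the text conditioning of a frozen diffusion model. The input parameterization (azimuth, elevation, radius), hidden size, and token dimensionality below are assumptions for illustration only.

```python
# Camera-parameter -> view-token mapper, sketched as a small MLP.
import torch
import torch.nn as nn

class ViewTokenMapper(nn.Module):
    def __init__(self, cam_dim: int = 3, token_dim: int = 768, hidden: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(cam_dim, hidden), nn.SiLU(),
            nn.Linear(hidden, hidden), nn.SiLU(),
            nn.Linear(hidden, token_dim),       # one pseudo-word embedding
        )

    def forward(self, cam_params: torch.Tensor) -> torch.Tensor:
        return self.net(cam_params)

# toy usage: one camera pose (azimuth, elevation, radius) -> a 768-d view token
mapper = ViewTokenMapper()
view_token = mapper(torch.tensor([[0.5, 0.1, 1.2]]))
print(view_token.shape)  # torch.Size([1, 768])
```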
+
+ comment: ECCV 2024 (European Conference on Computer Vision). Project page: + https://jmhb0.github.io/view_neti/ +
+
+
+
+
+ + ♻ ☆ DART: An Automated End-to-End Object Detection Pipeline with Data + Diversification, Open-Vocabulary Bounding Box Annotation, Pseudo-Label + Review, and Model Training + + +
+ Accurate real-time object detection is vital across numerous industrial +applications, from safety monitoring to quality control. Traditional +approaches, however, are hindered by arduous manual annotation and data +collection, struggling to adapt to ever-changing environments and novel target +objects. To address these limitations, this paper presents DART, an innovative +automated end-to-end pipeline that revolutionizes object detection workflows +from data collection to model evaluation. It eliminates the need for laborious +human labeling and extensive data collection while achieving outstanding +accuracy across diverse scenarios. DART encompasses four key stages: (1) Data +Diversification using subject-driven image generation (DreamBooth with SDXL), +(2) Annotation via open-vocabulary object detection (Grounding DINO) to +generate bounding box and class labels (3) Review of generated images and +pseudo-labels by large multimodal models (InternVL-1.5 and GPT-4o) to guarantee +credibility, (4) Training of real-time object detectors (YOLOv8 and YOLOv10) +using the verified data as ground truth. We apply DART to a self-collected +dataset of construction machines named Liebherr Product, which contains over +15K high-quality images across 23 categories. The current instantiation of DART +significantly increases average precision (AP) from 0.064 to 0.832. Its modular +design ensures easy exchangeability and extensibility, allowing for future +algorithm upgrades, seamless integration of new object categories, and +adaptability to customized environments without manual labeling and additional +data collection. The code and dataset are released at +https://github.com/chen-xin-94/DART. + +
+
+
+
+
+ + ♻ ☆ Real Time Multi Organ Classification on Computed Tomography Images + + +
+ Organ segmentation is a fundamental task in medical imaging since it is
+useful for many clinical automation pipelines. However, some tasks do not
+require full segmentation. Instead, a classifier can identify the selected
+organ without segmenting the entire volume. In this study, we demonstrate a
+classifier-based method to obtain organ labels in real time by using a large
+context size with a sparse data sampling strategy. Although our method operates
+as an independent classifier at query locations, it can generate full
+segmentations by querying grid locations at any resolution, offering faster
+performance than segmentation algorithms. We compared our method with existing
+segmentation techniques, demonstrating its superior runtime potential for
+practical applications in medical imaging.
+
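The query-at-grid idea above can be sketched generically: evaluate an organ classifier only at sparse grid locations of a CT volume and upsample the coarse label map to the desired resolution. The `classify_at` callable below is a stand-in for the paper's sparse-sampling classifier and is purely illustrative.

```python
# Dense labels from a point classifier queried on a coarse grid.
import numpy as np

def dense_labels_from_grid(volume: np.ndarray, classify_at, stride: int = 16) -> np.ndarray:
    """Query the classifier every `stride` voxels, then nearest-neighbour upsample."""
    zs = np.arange(0, volume.shape[0], stride)
    ys = np.arange(0, volume.shape[1], stride)
    xs = np.arange(0, volume.shape[2], stride)
    coarse = np.zeros((len(zs), len(ys), len(xs)), dtype=np.int32)
    for i, z in enumerate(zs):
        for j, y in enumerate(ys):
            for k, x in enumerate(xs):
                coarse[i, j, k] = classify_at(volume, (z, y, x))
    # nearest-neighbour upsampling back to the full volume resolution
    return coarse.repeat(stride, 0).repeat(stride, 1).repeat(stride, 2)[
        : volume.shape[0], : volume.shape[1], : volume.shape[2]]

# toy usage with a dummy classifier that thresholds the voxel intensity
vol = np.random.rand(64, 64, 64).astype(np.float32)
dummy = lambda v, c: int(v[c] > 0.5)
labels = dense_labels_from_grid(vol, dummy, stride=16)
print(labels.shape)  # (64, 64, 64)
```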
+
+
+
+
+ + ♻ ☆ Model Composition for Multimodal Large Language Models ACL2024 + + +
+ Recent developments in Multimodal Large Language Models (MLLMs) have shown
+rapid progress, moving towards the goal of creating versatile MLLMs that
+understand inputs from various modalities. However, existing methods typically
+rely on joint training with paired multimodal instruction data, which is
+resource-intensive and challenging to extend to new modalities. In this paper,
+we propose a new paradigm through the model composition of existing MLLMs to
+create a new model that retains the modal understanding capabilities of each
+original model. Our basic implementation, NaiveMC, demonstrates the
+effectiveness of this paradigm by reusing modality encoders and merging LLM
+parameters. Furthermore, we introduce DAMC to address parameter interference
+and mismatch issues during the merging process, thereby enhancing the model
+performance. To facilitate research in this area, we propose MCUB, a benchmark
+for assessing the ability of MLLMs to understand inputs from diverse
+modalities. Experiments on this benchmark and four other multimodal
+understanding tasks show significant improvements over baselines, proving that
+model composition can create a versatile model capable of processing inputs
+from multiple modalities.
+
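The "merging LLM parameters" step can be pictured as a plain element-wise average of the language-model weights from two modality-specific MLLMs, with each original modality encoder reused unchanged. Whether NaiveMC uses exactly uniform averaging is an assumption here, and DAMC's interference handling is not shown; this is a minimal sketch only.

```python
# Element-wise (optionally weighted) averaging of parameter dicts.
import torch

def merge_state_dicts(state_dicts, weights=None):
    """Weighted average of parameter dicts that share identical keys and shapes."""
    weights = weights or [1.0 / len(state_dicts)] * len(state_dicts)
    merged = {}
    for key in state_dicts[0]:
        merged[key] = sum(w * sd[key].float() for w, sd in zip(weights, state_dicts))
    return merged

# toy usage with two tiny "LLMs" holding identically shaped parameters
llm_a = {"layer.weight": torch.randn(4, 4), "layer.bias": torch.randn(4)}
llm_b = {"layer.weight": torch.randn(4, 4), "layer.bias": torch.randn(4)}
merged = merge_state_dicts([llm_a, llm_b])
print(merged["layer.weight"].shape)  # torch.Size([4, 4])
```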
+
+ comment: ACL2024 Main Conference; Code is available at + https://github.com/THUNLP-MT/ModelCompose +
+
+
+
+
+ + ♻ ☆ Geometric Generative Models based on Morphological Equivariant PDEs and + GANs + + +
+ Content and image generation consist of creating or generating data from
+noisy information by extracting specific features such as texture, edges, and
+other thin image structures. We are interested here in generative models, and
+two main problems are addressed. Firstly, improving specific feature extraction
+while accounting for intrinsic geometric features at multiscale levels; and
+secondly, making the network equivariant to reduce its complexity and provide
+geometric interpretability. To proceed, we propose a geometric generative model
+based on an equivariant partial differential equation (PDE) for group
+convolution neural networks (G-CNNs), so-called PDE-G-CNNs, built on morphology
+operators and generative adversarial networks (GANs). Equivariant morphological
+PDE layers are composed of multiscale dilations and erosions formulated in
+Riemannian manifolds, while group symmetries are defined on a Lie group. We
+take advantage of the Lie group structure to properly integrate the
+equivariance in layers, and are able to use the Riemannian metric to solve the
+multiscale morphological operations. Each point of the Lie group is associated
+with a unique point in the manifold, which helps us derive a metric on the
+Riemannian manifold from a tensor field invariant under the Lie group so that
+the induced metric has the same symmetries. The proposed geometric
+morphological GAN (GM-GAN) is obtained by using the proposed morphological
+equivariant convolutions in PDE-G-CNNs to bring nonlinearity into classical
+CNNs. GM-GAN is evaluated on MNIST data and compared with GANs. Preliminary
+results show that the GM-GAN model outperforms classical GANs.
+
+
+
+
+
+ + ♻ ☆ Similarity Distance-Based Label Assignment for Tiny Object Detection IROS 2024 + + +
+ Tiny object detection is becoming one of the most challenging tasks in
+computer vision because of the limited object size and lack of information. The
+label assignment strategy is a key factor affecting the accuracy of object
+detection. Although there are some effective label assignment strategies for
+tiny objects, most of them focus on reducing the sensitivity to the bounding
+boxes to increase the number of positive samples and have some fixed
+hyperparameters that need to be set. However, more positive samples may not
+necessarily lead to better detection results; in fact, excessive positive
+samples may lead to more false positives. In this paper, we introduce a simple
+but effective strategy named the Similarity Distance (SimD) to evaluate the
+similarity between bounding boxes. This proposed strategy not only considers
+both location and shape similarity but also learns hyperparameters adaptively,
+ensuring that it can adapt to different datasets and various object sizes in a
+dataset. Our approach can be simply applied in common anchor-based detectors in
+place of the IoU for label assignment and Non Maximum Suppression (NMS).
+Extensive experiments on four mainstream tiny object detection datasets
+demonstrate the superior performance of our method; in particular, it is 1.8 AP
+points higher overall and 4.1 AP points higher on very tiny objects than the
+state-of-the-art competitors on AI-TOD. Code is available at:
+\url{https://github.com/cszzshi/SimD}.
+
+
+ comment: 8 pages, 4 figures, this paper has been accepted by IEEE/RSJ + International Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ Back to the Color: Learning Depth to Specific Color Transformation for + Unsupervised Depth Estimation + + +
+ Virtual engines can generate dense depth maps for various synthetic scenes, +making them invaluable for training depth estimation models. However, +discrepancies between synthetic and real-world colors pose significant +challenges for depth estimation in real-world scenes, especially in complex and +uncertain environments encountered in unsupervised monocular depth estimation +tasks. To address this issue, we propose Back2Color, a framework that predicts +realistic colors from depth using a model trained on real-world data, thus +transforming synthetic colors into their real-world counterparts. Additionally, +we introduce the Syn-Real CutMix method for joint training with both real-world +unsupervised and synthetic supervised depth samples, enhancing monocular depth +estimation performance in real-world scenes. Furthermore, to mitigate the +impact of non-rigid motions on depth estimation, we present an auto-learning +uncertainty temporal-spatial fusion method (Auto-UTSF), which leverages the +strengths of unsupervised learning in both temporal and spatial dimensions. We +also designed VADepth, based on the Vision Attention Network, which offers +lower computational complexity and higher accuracy than transformers. Our +Back2Color framework achieves state-of-the-art performance on the Kitti +dataset, as evidenced by improvements in performance metrics and the production +of fine-grained details. This is particularly evident on more challenging +datasets such as Cityscapes for unsupervised depth estimation. + +
+
+
+
+
+ + ♻ ☆ Compound Expression Recognition via Multi Model Ensemble for the ABAW7 + Challenge + + +
+ Compound Expression Recognition (CER) is vital for effective interpersonal +interactions. Human emotional expressions are inherently complex due to the +presence of compound expressions, requiring the consideration of both local and +global facial cues for accurate judgment. In this paper, we propose an ensemble +learning-based solution to address this complexity. Our approach involves +training three distinct expression classification models using convolutional +networks, Vision Transformers, and multiscale local attention networks. By +employing late fusion for model ensemble, we combine the outputs of these +models to predict the final results. Our method demonstrates high accuracy on +the RAF-DB datasets and is capable of recognizing expressions in certain +portions of the C-EXPR-DB through zero-shot learning. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2403.12572 by other authors +
+
+
+
+
+ + ♻ ☆ Facial Affect Recognition based on Multi Architecture Encoder and + Feature Fusion for the ABAW7 Challenge + + +
+ In this paper, we present our approach to addressing the challenges of the +7th ABAW competition. The competition comprises three sub-challenges: Valence +Arousal (VA) estimation, Expression (Expr) classification, and Action Unit (AU) +detection. To tackle these challenges, we employ state-of-the-art models to +extract powerful visual features. Subsequently, a Transformer Encoder is +utilized to integrate these features for the VA, Expr, and AU sub-challenges. +To mitigate the impact of varying feature dimensions, we introduce an affine +module to align the features to a common dimension. Overall, our results +significantly outperform the baselines. + +
+
+
+
+
+ + ♻ ☆ Exploring the Effect of Dataset Diversity in Self-Supervised Learning + for Surgical Computer Vision MICCAI2024 + + +
+ Over the past decade, computer vision applications in minimally invasive +surgery have rapidly increased. Despite this growth, the impact of surgical +computer vision remains limited compared to other medical fields like pathology +and radiology, primarily due to the scarcity of representative annotated data. +Whereas transfer learning from large annotated datasets such as ImageNet has +been conventionally the norm to achieve high-performing models, recent +advancements in self-supervised learning (SSL) have demonstrated superior +performance. In medical image analysis, in-domain SSL pretraining has already +been shown to outperform ImageNet-based initialization. Although unlabeled data +in the field of surgical computer vision is abundant, the diversity within this +data is limited. This study investigates the role of dataset diversity in SSL +for surgical computer vision, comparing procedure-specific datasets against a +more heterogeneous general surgical dataset across three different downstream +surgical applications. The obtained results show that using solely +procedure-specific data can lead to substantial improvements of 13.8%, 9.5%, +and 36.8% compared to ImageNet pretraining. However, extending this data with +more heterogeneous surgical data further increases performance by an additional +5.0%, 5.2%, and 2.5%, suggesting that increasing diversity within SSL data is +beneficial for model performance. The code and pretrained model weights are +made publicly available at https://github.com/TimJaspers0801/SurgeNet. + +
+
+ comment: accepted - Data Engineering in Medical Imaging (DEMI) Workshop @ + MICCAI2024 +
+
+
+
+
+ + ♻ ☆ AutoRG-Brain: Grounded Report Generation for Brain MRI + + +
+ Radiologists are tasked with interpreting a large number of images on a
+daily basis, with the responsibility of generating corresponding reports.
+This demanding workload elevates the risk of human error, potentially leading
+to treatment delays, increased healthcare costs, revenue loss, and
+operational inefficiencies. To address these challenges, we initiate a series
+of work on grounded Automatic Report Generation (AutoRG), starting from the
+brain MRI interpretation system, which supports the delineation of brain
+structures, the localization of anomalies, and the generation of
+well-organized findings. We make contributions in the following aspects:
+first, on dataset construction, we release a comprehensive dataset
+encompassing segmentation masks of anomaly regions and manually authored
+reports, termed RadGenome-Brain MRI. This data resource is intended to
+catalyze ongoing research and development in the field of AI-assisted report
+generation systems. Second, on system design, we propose AutoRG-Brain, the
+first brain MRI report generation system with pixel-level grounded visual
+clues. Third, for evaluation, we conduct quantitative assessments and human
+evaluations of brain structure segmentation, anomaly localization, and report
+generation tasks to provide evidence of its reliability and accuracy. This
+system has been integrated into real clinical scenarios, where radiologists
+were instructed to write reports based on our generated findings and anomaly
+segmentation masks. The results demonstrate that our system enhances the
+report-writing skills of junior doctors, aligning their performance more
+closely with senior doctors, thereby boosting overall productivity.
+
+
+
+
+
+
+ + ♻ ☆ Characterizing Continual Learning Scenarios and Strategies for Audio + Analysis + + +
+ Audio analysis is useful in many application scenarios. The state-of-the-art
+audio analysis approaches assume that the data distribution at training and
+deployment time will be the same. However, due to various real-life
+challenges, the data distribution may drift, or new classes may appear over
+time. Thus, a model trained only once might not perform adequately. Continual
+learning (CL) approaches are devised to handle such changes in data
+distribution. There have been a few attempts to use CL approaches for audio
+analysis. Yet, there is a lack of a systematic evaluation framework. In this
+paper, we create a comprehensive CL dataset and characterize CL approaches
+for audio-based monitoring tasks. We have investigated the following CL and
+non-CL approaches: EWC, LwF, SI, GEM, A-GEM, GDumb, Replay, Naive,
+Cumulative, and Joint training. The study is beneficial for researchers and
+practitioners working in the area of audio analysis for developing adaptive
+models. We observed that Replay achieved better results than other methods on
+the DCASE challenge data. It achieved an accuracy of 70.12% for the domain
+incremental scenario and an accuracy of 96.98% for the class incremental
+scenario.
+
+
+
+
+
+
+ + ♻ ☆ Frozen Feature Augmentation for Few-Shot Image Classification CVPR 2024 + + +
+ Training a linear classifier or lightweight model on top of pretrained vision +model outputs, so-called 'frozen features', leads to impressive performance on +a number of downstream few-shot tasks. Currently, frozen features are not +modified during training. On the other hand, when networks are trained directly +on images, data augmentation is a standard recipe that improves performance +with no substantial overhead. In this paper, we conduct an extensive pilot +study on few-shot image classification that explores applying data +augmentations in the frozen feature space, dubbed 'frozen feature augmentation +(FroFA)', covering twenty augmentations in total. Our study demonstrates that +adopting a deceptively simple pointwise FroFA, such as brightness, can improve +few-shot performance consistently across three network architectures, three +large pretraining datasets, and eight transfer datasets. + +
+
+ comment: CVPR 2024 (18 pages, main paper + supplementary material) +
+
+
+
+
+ + ♻ ☆ DenseTrack: Drone-based Crowd Tracking via Density-aware + Motion-appearance Synergy + + +
+ Drone-based crowd tracking faces difficulties in accurately identifying and +monitoring objects from an aerial perspective, largely due to their small size +and close proximity to each other, which complicates both localization and +tracking. To address these challenges, we present the Density-aware Tracking +(DenseTrack) framework. DenseTrack capitalizes on crowd counting to precisely +determine object locations, blending visual and motion cues to improve the +tracking of small-scale objects. It specifically addresses the problem of +cross-frame motion to enhance tracking accuracy and dependability. DenseTrack +employs crowd density estimates as anchors for exact object localization within +video frames. These estimates are merged with motion and position information +from the tracking network, with motion offsets serving as key tracking cues. +Moreover, DenseTrack enhances the ability to distinguish small-scale objects +using insights from the visual-language model, integrating appearance with +motion cues. The framework utilizes the Hungarian algorithm to ensure the +accurate matching of individuals across frames. Demonstrated on DroneCrowd +dataset, our approach exhibits superior performance, confirming its +effectiveness in scenarios captured by drones. + +
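The Hungarian-matching step mentioned above can be sketched with SciPy's assignment solver; the blending of appearance and motion costs into a single matrix (and the weight `alpha`) is an assumption made only for illustration.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_detections(appearance_cost, motion_cost, alpha=0.5):
    """Solve a one-to-one assignment between existing tracks (rows) and new
    detections (columns) over a blended cost matrix."""
    cost = alpha * appearance_cost + (1.0 - alpha) * motion_cost
    track_idx, det_idx = linear_sum_assignment(cost)
    return list(zip(track_idx.tolist(), det_idx.tolist()))
```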
+
+
+
+
+ + ♻ ☆ Coordinate-Aware Thermal Infrared Tracking Via Natural Language Modeling + + +
+ Thermal infrared (TIR) tracking is pivotal in computer vision tasks due to +its all-weather imaging capability. Traditional tracking methods predominantly +rely on hand-crafted features, and while deep learning has introduced +correlation filtering techniques, these are often constrained by rudimentary +correlation operations. Furthermore, transformer-based approaches tend to +overlook temporal and coordinate information, which is critical for TIR +tracking that lacks texture and color information. In this paper, to address +these issues, we apply natural language modeling to TIR tracking and propose a +coordinate-aware thermal infrared tracking model called NLMTrack, which +enhances the utilization of coordinate and temporal information. NLMTrack +applies an encoder that unifies feature extraction and feature fusion, which +simplifies the TIR tracking pipeline. To address the challenge of low detail +and low contrast in TIR images, on the one hand, we design a multi-level +progressive fusion module that enhances the semantic representation and +incorporates multi-scale features. On the other hand, the decoder combines the +TIR features and the coordinate sequence features using a causal transformer to +generate the target sequence step by step. Moreover, we explore an adaptive +loss aimed at elevating tracking accuracy and a simple template update strategy +to accommodate the target's appearance variations. Experiments show that +NLMTrack achieves state-of-the-art performance on multiple benchmarks. The Code +is publicly available at \url{https://github.com/ELOESZHANG/NLMTrack}. + +
+
+
+
+
+ + ♻ ☆ 3MVRD: Multimodal Multi-task Multi-teacher Visually-Rich Form Document + Understanding ACL 2024 + + +
+ This paper presents a groundbreaking multimodal, multi-task, multi-teacher +joint-grained knowledge distillation model for visually-rich form document +understanding. The model is designed to leverage insights from both +fine-grained and coarse-grained levels by facilitating a nuanced correlation +between token and entity representations, addressing the complexities inherent +in form documents. Additionally, we introduce new inter-grained and +cross-grained loss functions to further refine diverse multi-teacher knowledge +distillation transfer process, presenting distribution gaps and a harmonised +understanding of form documents. Through a comprehensive evaluation across +publicly available form document understanding datasets, our proposed model +consistently outperforms existing baselines, showcasing its efficacy in +handling the intricate structures and content of visually complex form +documents. + +
+
+ comment: Accepted at Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Grounding Language Models for Visual Entity Recognition ECCV 2024 + + +
+ We introduce AutoVER, an Autoregressive model for Visual Entity Recognition. +Our model extends an autoregressive Multi-modal Large Language Model by +employing retrieval augmented constrained generation. It mitigates low +performance on out-of-domain entities while excelling in queries that require +visually-situated reasoning. Our method learns to distinguish similar entities +within a vast label space by contrastively training on hard negative pairs in +parallel with a sequence-to-sequence objective without an external retriever. +During inference, a list of retrieved candidate answers explicitly guides +language generation by removing invalid decoding paths. The proposed method +achieves significant improvements across different dataset splits in the +recently proposed Oven-Wiki benchmark. Accuracy on the Entity seen split rises +from 32.7% to 61.5%. It also demonstrates superior performance on the unseen +and query splits by a substantial double-digit margin. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ ChangeMamba: Remote Sensing Change Detection With Spatiotemporal State + Space Model + + +
+ Convolutional neural networks (CNN) and Transformers have made impressive +progress in the field of remote sensing change detection (CD). However, both +architectures have inherent shortcomings: CNN are constrained by a limited +receptive field that may hinder their ability to capture broader spatial +contexts, while Transformers are computationally intensive, making them costly +to train and deploy on large datasets. Recently, the Mamba architecture, based +on state space models, has shown remarkable performance in a series of natural +language processing tasks, which can effectively compensate for the +shortcomings of the above two architectures. In this paper, we explore for the +first time the potential of the Mamba architecture for remote sensing CD tasks. +We tailor the corresponding frameworks, called MambaBCD, MambaSCD, and +MambaBDA, for binary change detection (BCD), semantic change detection (SCD), +and building damage assessment (BDA), respectively. All three frameworks adopt +the cutting-edge Visual Mamba architecture as the encoder, which allows full +learning of global spatial contextual information from the input images. For +the change decoder, which is available in all three architectures, we propose +three spatio-temporal relationship modeling mechanisms, which can be naturally +combined with the Mamba architecture and fully utilize its attribute to achieve +spatio-temporal interaction of multi-temporal features, thereby obtaining +accurate change information. On five benchmark datasets, our proposed +frameworks outperform current CNN- and Transformer-based approaches without +using any complex training strategies or tricks, fully demonstrating the +potential of the Mamba architecture in CD tasks. Further experiments show that +our architecture is quite robust to degraded data. The source code will be +available in https://github.com/ChenHongruixuan/MambaCD + +
+
+ comment: Accepted by IEEE TGRS: https://ieeexplore.ieee.org/document/10565926 +
+
+
+
+
+ + ♻ ☆ Adaptive Self-training Framework for Fine-grained Scene Graph Generation ICLR 2024 + + +
+ Scene graph generation (SGG) models have suffered from inherent problems +regarding the benchmark datasets such as the long-tailed predicate distribution +and missing annotation problems. In this work, we aim to alleviate the +long-tailed problem of SGG by utilizing unannotated triplets. To this end, we +introduce a Self-Training framework for SGG (ST-SGG) that assigns pseudo-labels +for unannotated triplets based on which the SGG models are trained. While there +has been significant progress in self-training for image recognition, designing +a self-training framework for the SGG task is more challenging due to its +inherent nature such as the semantic ambiguity and the long-tailed distribution +of predicate classes. Hence, we propose a novel pseudo-labeling technique for +SGG, called Class-specific Adaptive Thresholding with Momentum (CATM), which is +a model-agnostic framework that can be applied to any existing SGG models. +Furthermore, we devise a graph structure learner (GSL) that is beneficial when +adopting our proposed self-training framework to the state-of-the-art +message-passing neural network (MPNN)-based SGG models. Our extensive +experiments verify the effectiveness of ST-SGG on various SGG models, +particularly in enhancing the performance on fine-grained predicate classes. + +
+
+ comment: 9 pages; ICLR 2024 +
+
+
+
+
+ + ♻ ☆ When, Where, and What? A Novel Benchmark for Accident Anticipation and + Localization with Large Language Models + + +
+ As autonomous driving systems increasingly become part of daily +transportation, the ability to accurately anticipate and mitigate potential +traffic accidents is paramount. Traditional accident anticipation models +primarily utilizing dashcam videos are adept at predicting when an accident may +occur but fall short in localizing the incident and identifying involved +entities. Addressing this gap, this study introduces a novel framework that +integrates Large Language Models (LLMs) to enhance predictive capabilities +across multiple dimensions--what, when, and where accidents might occur. We +develop an innovative chain-based attention mechanism that dynamically adjusts +to prioritize high-risk elements within complex driving scenes. This mechanism +is complemented by a three-stage model that processes outputs from smaller +models into detailed multimodal inputs for LLMs, thus enabling a more nuanced +understanding of traffic dynamics. Empirical validation on the DAD, CCD, and +A3D datasets demonstrates superior performance in Average Precision (AP) and +Mean Time-To-Accident (mTTA), establishing new benchmarks for accident +prediction technology. Our approach not only advances the technological +framework for autonomous driving safety but also enhances human-AI interaction, +making predictive insights generated by autonomous systems more intuitive and +actionable. + +
+
+
+
+
+ + ♻ ☆ Vision language models are blind + + +
+ While large language models with vision capabilities (VLMs), e.g., GPT-4o
+and Gemini 1.5 Pro, are powering various image-text applications and scoring
+high on many vision-understanding benchmarks, we find that they are
+surprisingly still struggling with low-level vision tasks that are easy for
+humans. Specifically, on BlindTest, our suite of 7 very simple tasks such as
+identifying (a) whether two circles overlap; (b) whether two lines intersect;
+(c) which letter is being circled in a word; and (d) counting circles in an
+Olympic-like logo, four state-of-the-art VLMs are only 58.57% accurate on
+average. Claude 3.5 Sonnet performs the best at 74.94% accuracy, but this is
+still far from the expected human accuracy of 100%. Across different image
+resolutions and line widths, VLMs consistently struggle with tasks that
+require precise spatial information and recognizing geometric primitives that
+overlap or are close together. Code and data are available at:
+https://vlmsareblind.github.io
+
+
+
+
+
+
+ + ♻ ☆ Dysca: A Dynamic and Scalable Benchmark for Evaluating Perception + Ability of LVLMs + + +
+ Currently, many benchmarks have been proposed to evaluate the perception
+ability of Large Vision-Language Models (LVLMs). However, most benchmarks
+construct questions by selecting images from existing datasets, resulting in
+potential data leakage. Besides, these benchmarks merely focus on evaluating
+LVLMs on realistic-style images and clean scenarios, leaving multi-stylized
+images and noisy scenarios unexplored. In response to these challenges, we
+propose a dynamic and scalable benchmark named Dysca for evaluating LVLMs by
+leveraging synthesized images. Specifically, we leverage Stable Diffusion and
+design a rule-based method to dynamically generate novel images, questions
+and the corresponding answers. We consider 51 kinds of image styles and
+evaluate the perception capability in 20 subtasks. Moreover, we conduct
+evaluations under 4 scenarios (i.e., Clean, Corruption, Print Attacking and
+Adversarial Attacking) and 3 question types (i.e., Multi-choices,
+True-or-false and Free-form). Thanks to the generative paradigm, Dysca serves
+as a scalable benchmark for easily adding new subtasks and scenarios. A total
+of 8 advanced open-source LVLMs with 10 checkpoints are evaluated on Dysca,
+revealing the drawbacks of current LVLMs. The benchmark is released at
+https://github.com/Benchmark-Dysca/Dysca.
+
+
+
+
+
+
+ + ♻ ☆ Tracking Meets LoRA: Faster Training, Larger Model, Stronger Performance ECCV 2024 + + +
+ Motivated by Parameter-Efficient Fine-Tuning (PEFT) in large language
+models, we propose LoRAT, a method that unveils the power of large ViT models
+for tracking within laboratory-level resources. The essence of our work lies
+in adapting LoRA, a technique that fine-tunes a small subset of model
+parameters without adding inference latency, to the domain of visual
+tracking. However, unique challenges and potential domain gaps make this
+transfer not as straightforward as it first appears. Firstly, a
+transformer-based tracker constructs unshared position embeddings for the
+template and search images. This poses a challenge for the transfer of LoRA,
+which usually requires consistency in design between the pre-trained backbone
+and downstream tasks. Secondly, the inductive bias inherent in convolutional
+heads diminishes the effectiveness of parameter-efficient fine-tuning in
+tracking models. To overcome these limitations, we first decouple the
+position embeddings in transformer-based trackers into shared spatial ones
+and independent type ones. The shared embeddings, which describe the absolute
+coordinates of multi-resolution images (namely, the template and search
+images), are inherited from the pre-trained backbones. In contrast, the
+independent embeddings indicate the sources of each token and are learned
+from scratch. Furthermore, we design an anchor-free head solely based on MLP
+to adapt PETR, enabling better performance with less computational overhead.
+With our design, 1) it becomes practical to train trackers with the ViT-g
+backbone on GPUs with only 25.8 GB of memory (batch size of 16); 2) we reduce
+the training time of the L-224 variant from 35.0 to 10.8 GPU hours; 3) we
+improve the LaSOT SUC score from 0.703 to 0.742 with the L-224 variant; 4) we
+increase the inference speed of the L-224 variant from 52 to 119 FPS. Code
+and models are available at https://github.com/LitingLin/LoRAT.
+
+
+
+ comment: Accepted by ECCV 2024. All experiment results are updated +
+
+
+
+
+ + ♻ ☆ Beyond MOT: Semantic Multi-Object Tracking ECCV2024 + + +
+ Current multi-object tracking (MOT) aims to predict trajectories of targets +(i.e.,"where") in videos. Yet, knowing merely "where" is insufficient in many +crucial applications. In comparison, semantic understanding such as +fine-grained behaviors, interactions, and overall summarized captions (i.e., +"what") from videos, associated with "where", is highly-desired for +comprehensive video analysis. Thus motivated, we introduce Semantic +Multi-Object Tracking (SMOT), that aims to estimate object trajectories and +meanwhile understand semantic details of associated trajectories including +instance captions, instance interactions, and overall video captions, +integrating "where" and "what" for tracking. In order to foster the exploration +of SMOT, we propose BenSMOT, a large-scale Benchmark for Semantic MOT. +Specifically, BenSMOT comprises 3,292 videos with 151K frames, covering various +scenarios for semantic tracking of humans. BenSMOT provides annotations for the +trajectories of targets, along with associated instance captions in natural +language, instance interactions, and overall caption for each video sequence. +To our best knowledge, BenSMOT is the first publicly available benchmark for +SMOT. Besides, to encourage future research, we present a novel tracker named +SMOTer, which is specially designed and end-to-end trained for SMOT, showing +promising performance. By releasing BenSMOT, we expect to go beyond +conventional MOT by predicting "where" and "what" for SMOT, opening up a new +direction in tracking for video understanding. Our BenSMOT and SMOTer will be +released. + +
+
+ comment: Accepted to ECCV2024 +
+
+
+
+
+ + ♻ ☆ Point-DAE: Denoising Autoencoders for Self-supervised Point Cloud + Learning + + +
+ Masked autoencoder has demonstrated its effectiveness in self-supervised
+point cloud learning. Considering that masking is a kind of corruption, in
+this work we explore a more general denoising autoencoder for point cloud
+learning (Point-DAE) by investigating more types of corruptions beyond
+masking. Specifically, we degrade the point cloud with certain corruptions as
+input, and learn an encoder-decoder model to reconstruct the original point
+cloud from its corrupted version. Three corruption families (i.e.,
+density/masking, noise, and affine transformation) and a total of fourteen
+corruption types are investigated with traditional non-Transformer encoders.
+Besides the popular masking corruption, we identify another effective
+corruption family, i.e., affine transformation. The affine transformation
+disturbs all points globally, which is complementary to the masking
+corruption where some local regions are dropped. We also validate the
+effectiveness of affine transformation corruption with the Transformer
+backbones, where we decompose the reconstruction of the complete point cloud
+into the reconstructions of detailed local patches and rough global shape,
+alleviating the position leakage problem in the reconstruction. Extensive
+experiments on tasks of object classification, few-shot learning, robustness
+testing, part segmentation, and 3D object detection validate the
+effectiveness of the proposed method. The codes are available at
+https://github.com/YBZh/Point-DAE.
+
+
+
+ comment: Journal revision; Codes are available at + \url{https://github.com/YBZh/Point-DAE} +
+
+
+
+
+ + ♻ ☆ FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and + High-Quality Localization ACM MM 2024 + + +
+ Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies +directly without access to any known normal or abnormal samples within the +target item categories. Existing approaches typically rely on the robust +generalization capabilities of multimodal pretrained models, computing +similarities between manually crafted textual features representing "normal" or +"abnormal" semantics and image features to detect anomalies and localize +anomalous patches. However, the generic descriptions of "abnormal" often fail +to precisely match diverse types of anomalies across different object +categories. Additionally, computing feature similarities for single patches +struggles to pinpoint specific locations of anomalies with various sizes and +scales. To address these issues, we propose a novel ZSAD method called FiLo, +comprising two components: adaptively learned Fine-Grained Description (FG-Des) +and position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces +fine-grained anomaly descriptions for each category using Large Language Models +(LLMs) and employs adaptively learned textual templates to enhance the accuracy +and interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for +preliminary localization, position-enhanced text prompts, and Multi-scale +Multi-shape Cross-modal Interaction (MMCI) module, facilitates more accurate +localization of anomalies of different sizes and shapes. Experimental results +on datasets like MVTec and VisA demonstrate that FiLo significantly improves +the performance of ZSAD in both detection and localization, achieving +state-of-the-art performance with an image-level AUC of 83.9% and a pixel-level +AUC of 95.9% on the VisA dataset. Code is available at +https://github.com/CASIA-IVA-Lab/FiLo. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Affective Behaviour Analysis via Progressive Learning + + +
+ Affective Behavior Analysis aims to develop emotionally intelligent +technology that can recognize and respond to human emotions. To advance this, +the 7th Affective Behavior Analysis in-the-wild (ABAW) competition establishes +two tracks: i.e., the Multi-task Learning (MTL) Challenge and the Compound +Expression (CE) challenge based on Aff-Wild2 and C-EXPR-DB datasets. In this +paper, we present our methods and experimental results for the two competition +tracks. Specifically, it can be summarized in the following four aspects: 1) To +attain high-quality facial features, we train a Masked-Auto Encoder in a +self-supervised manner. 2) We devise a temporal convergence module to capture +the temporal information between video frames and explore the impact of window +size and sequence length on each sub-task. 3) To facilitate the joint +optimization of various sub-tasks, we explore the impact of sub-task joint +training and feature fusion from individual tasks on each task performance +improvement. 4) We utilize curriculum learning to transition the model from +recognizing single expressions to recognizing compound expressions, thereby +improving the accuracy of compound expression recognition. Extensive +experiments demonstrate the superiority of our designs. + +
+
+ comment: Technical Report for the 7th ABAW Competition
+
+
+
+
+ + ♻ ☆ Disrupting Diffusion: Token-Level Attention Erasure Attack against + Diffusion-based Customization ACM MM2024 + + +
+ With the development of diffusion-based customization methods like
+DreamBooth, individuals can now train models that generate their personalized
+images. Despite the convenience, malicious users have misused these
+techniques to create fake images, thereby triggering a privacy and security
+crisis. In light of this, proactive adversarial attacks are proposed to
+protect users against customization. The adversarial examples are trained to
+distort the customization model's outputs and thus block the misuse. In this
+paper, we propose DisDiff (Disrupting Diffusion), a novel adversarial attack
+method to disrupt the diffusion model outputs. We first delve into the
+intrinsic image-text relationships, well-known as cross-attention, and
+empirically find that the subject-identifier token plays an important role in
+guiding image generation. Thus, we propose the Cross-Attention Erasure module
+to explicitly "erase" the indicated attention maps and disrupt the text
+guidance. Besides, we analyze the influence of the sampling process of the
+diffusion model on the Projected Gradient Descent (PGD) attack and introduce
+a novel Merit Sampling Scheduler to adaptively modulate the perturbation
+updating amplitude in a step-aware manner. Our DisDiff outperforms the
+state-of-the-art methods by 12.75% in FDFR score and 7.25% in ISM score on
+average across two facial benchmarks and two commonly used prompts.
+
+
+
+ comment: Accepted by ACM MM2024 +
+
+
+
+
+ + ♻ ☆ Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian + Splatting ECCV 2024 + + +
+ As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view +synthesis, it is a natural extension to deform a canonical 3DGS to multiple +frames for representing a dynamic scene. However, previous works fail to +accurately reconstruct complex dynamic scenes. We attribute the failure to the +design of the deformation field, which is built as a coordinate-based function. +This approach is problematic because 3DGS is a mixture of multiple fields +centered at the Gaussians, not just a single coordinate-based framework. To +resolve this problem, we define the deformation as a function of per-Gaussian +embeddings and temporal embeddings. Moreover, we decompose deformations as +coarse and fine deformations to model slow and fast movements, respectively. +Also, we introduce a local smoothness regularization for per-Gaussian embedding +to improve the details in dynamic regions. Project page: +https://jeongminb.github.io/e-d3dgs/ + +
+
+ comment: ECCV 2024. Project page: https://jeongminb.github.io/e-d3dgs/ +
+
+
+
+
+ + ♻ ☆ Improving Representation of High-frequency Components for Medical + Foundation Models + + +
+ Foundation models have recently attracted significant attention for their +impressive generalizability across diverse downstream tasks. However, these +models are demonstrated to exhibit great limitations in representing +high-frequency components and fine-grained details. In many medical imaging +tasks, the precise representation of such information is crucial due to the +inherently intricate anatomical structures, sub-visual features, and complex +boundaries involved. Consequently, the limited representation of prevalent +foundation models can result in significant performance degradation or even +failure in these tasks. To address these challenges, we propose a novel +pretraining strategy, named Frequency-advanced Representation Autoencoder +(Frepa). Through high-frequency masking and low-frequency perturbation combined +with adversarial learning, Frepa encourages the encoder to effectively +represent and preserve high-frequency components in the image embeddings. +Additionally, we introduce an innovative histogram-equalized image masking +strategy, extending the Masked Autoencoder approach beyond ViT to other +architectures such as Swin Transformer and convolutional networks. We develop +Frepa across nine medical modalities and validate it on 32 downstream tasks for +both 2D images and 3D volume data. Without fine-tuning, Frepa can outperform +other self-supervised pretraining methods and, in some cases, even surpasses +task-specific trained models. This improvement is particularly significant for +tasks involving fine-grained details, such as achieving up to a +15% increase +in DSC for retina vessel segmentation and a +7% increase in IoU for lung nodule +detection. Further experiments quantitatively reveal that Frepa enables +superior high-frequency representations and preservation in the embeddings, +underscoring its potential for developing more generalized and universal +medical image foundation models. + +
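As a rough sketch of the kind of high-frequency masking the pretraining strategy above builds on, the snippet below zeroes out all components above a radial cutoff in the 2D Fourier domain. The cutoff value and the radial mask shape are illustrative assumptions, not the paper's exact formulation.

```python
import numpy as np

def mask_high_frequencies(image, cutoff=0.25):
    """Remove frequency components above a normalized radial cutoff from a
    2D (H x W) image and return the low-frequency reconstruction."""
    f = np.fft.fftshift(np.fft.fft2(image))
    h, w = image.shape
    yy, xx = np.mgrid[-(h // 2):h - h // 2, -(w // 2):w - w // 2]
    radius = np.sqrt((yy / (h / 2.0)) ** 2 + (xx / (w / 2.0)) ** 2)
    f[radius > cutoff] = 0.0
    return np.real(np.fft.ifft2(np.fft.ifftshift(f)))
```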
+
+
+
+
+ + ♻ ☆ Harnessing Temporal Causality for Advanced Temporal Action Detection + + +
+ As a fundamental task in long-form video understanding, temporal action +detection (TAD) aims to capture inherent temporal relations in untrimmed videos +and identify candidate actions with precise boundaries. Over the years, various +networks, including convolutions, graphs, and transformers, have been explored +for effective temporal modeling for TAD. However, these modules typically treat +past and future information equally, overlooking the crucial fact that changes +in action boundaries are essentially causal events. Inspired by this insight, +we propose leveraging the temporal causality of actions to enhance TAD +representation by restricting the model's access to only past or future +context. We introduce CausalTAD, which combines causal attention and causal +Mamba to achieve state-of-the-art performance on multiple benchmarks. Notably, +with CausalTAD, we ranked 1st in the Action Recognition, Action Detection, and +Audio-Based Interaction Detection tracks at the EPIC-Kitchens Challenge 2024, +as well as 1st in the Moment Queries track at the Ego4D Challenge 2024. Our +code is available at https://github.com/sming256/OpenTAD/. + +
+
+ comment: 1st in Moment Queries track at the Ego4D Challenge 2024; 1st in + Action Recognition, Action Detection, and Audio-Based Interaction Detection + tracks at the EPIC-Kitchens Challenge 2024 +
+
+
+
+
+ + ♻ ☆ The Tug-of-War Between Deepfake Generation and Detection + + +
+ Multimodal generative models are rapidly evolving, leading to a surge in the +generation of realistic video and audio that offers exciting possibilities but +also serious risks. Deepfake videos, which can convincingly impersonate +individuals, have particularly garnered attention due to their potential misuse +in spreading misinformation and creating fraudulent content. This survey paper +examines the dual landscape of deepfake video generation and detection, +emphasizing the need for effective countermeasures against potential abuses. We +provide a comprehensive overview of current deepfake generation techniques, +including face swapping, reenactment, and audio-driven animation, which +leverage cutting-edge technologies like GANs and diffusion models to produce +highly realistic fake videos. Additionally, we analyze various detection +approaches designed to differentiate authentic from altered videos, from +detecting visual artifacts to deploying advanced algorithms that pinpoint +inconsistencies across video and audio signals. + The effectiveness of these detection methods heavily relies on the diversity +and quality of datasets used for training and evaluation. We discuss the +evolution of deepfake datasets, highlighting the importance of robust, diverse, +and frequently updated collections to enhance the detection accuracy and +generalizability. As deepfakes become increasingly indistinguishable from +authentic content, developing advanced detection techniques that can keep pace +with generation technologies is crucial. We advocate for a proactive approach +in the "tug-of-war" between deepfake creators and detectors, emphasizing the +need for continuous research collaboration, standardization of evaluation +metrics, and the creation of comprehensive benchmarks. + +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Do We Really Need Graph Convolution During Training? Light Post-Training + Graph-ODE for Efficient Recommendation CIKM 2024 + + +
+ The efficiency and scalability of graph convolution networks (GCNs) in +training recommender systems (RecSys) have been persistent concerns, hindering +their deployment in real-world applications. This paper presents a critical +examination of the necessity of graph convolutions during the training phase +and introduces an innovative alternative: the Light Post-Training Graph +Ordinary-Differential-Equation (LightGODE). Our investigation reveals that the +benefits of GCNs are more pronounced during testing rather than training. +Motivated by this, LightGODE utilizes a novel post-training graph convolution +method that bypasses the computation-intensive message passing of GCNs and +employs a non-parametric continuous graph ordinary-differential-equation (ODE) +to dynamically model node representations. This approach drastically reduces +training time while achieving fine-grained post-training graph convolution to +avoid the distortion of the original training embedding space, termed the +embedding discrepancy issue. We validate our model across several real-world +datasets of different scales, demonstrating that LightGODE not only outperforms +GCN-based models in terms of efficiency and effectiveness but also +significantly mitigates the embedding discrepancy commonly associated with +deeper graph convolution layers. Our LightGODE challenges the prevailing +paradigms in RecSys training and suggests re-evaluating the role of graph +convolutions, potentially guiding future developments of efficient large-scale +graph-based RecSys. + +
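A minimal sketch of what a non-parametric post-training graph propagation could look like, written as an Euler discretization of dE/dt = A_norm E applied to embeddings trained without graph convolution. The ODE form, step count, and integration time here are assumptions for illustration, not LightGODE's exact formulation.

```python
import numpy as np

def post_training_graph_propagation(embeddings, adj_norm, t=1.0, steps=4):
    """Propagate trained embeddings over the normalized adjacency matrix
    with a few Euler steps of dE/dt = A_norm @ E (no learnable parameters)."""
    e = embeddings.astype(float).copy()
    dt = t / steps
    for _ in range(steps):
        e = e + dt * (adj_norm @ e)
    return e
```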
+
+ comment: Accepted to CIKM 2024 +
+
+
+
+
+ + ☆ A Flexible and Scalable Approach for Collecting Wildlife Advertisements + on the Web + + +
+ Wildlife traffickers are increasingly carrying out their activities in +cyberspace. As they advertise and sell wildlife products in online +marketplaces, they leave digital traces of their activity. This creates a new +opportunity: by analyzing these traces, we can obtain insights into how +trafficking networks work as well as how they can be disrupted. However, +collecting such information is difficult. Online marketplaces sell a very large +number of products and identifying ads that actually involve wildlife is a +complex task that is hard to automate. Furthermore, given that the volume of +data is staggering, we need scalable mechanisms to acquire, filter, and store +the ads, as well as to make them available for analysis. In this paper, we +present a new approach to collect wildlife trafficking data at scale. We +propose a data collection pipeline that combines scoped crawlers for data +discovery and acquisition with foundational models and machine learning +classifiers to identify relevant ads. We describe a dataset we created using +this pipeline which is, to the best of our knowledge, the largest of its kind: +it contains almost a million ads obtained from 41 marketplaces, covering 235 +species and 20 languages. The source code is publicly available at +\url{https://github.com/VIDA-NYU/wildlife_pipeline}. + +
+
+
+
+
+ + ☆ Human-artificial intelligence teaming for scientific information + extraction from data-driven additive manufacturing research using large + language models + + +
+ Data-driven research in Additive Manufacturing (AM) has achieved significant
+success in recent years. This has led to the emergence of a plethora of
+scientific literature. The knowledge in these works consists of AM and
+Artificial Intelligence (AI) contexts that have not been mined and formalized
+in an integrated way. It requires substantial effort and time to extract
+scientific information from these works. AM domain experts have contributed
+over two dozen review papers to summarize these works. However, information
+specific to AM and AI contexts still requires manual effort to extract. The
+recent success of foundation models such as BERT (Bidirectional Encoder
+Representations from Transformers) or GPT (Generative Pre-trained
+Transformers) on textual data has opened the possibility of expediting
+scientific information extraction. We propose a framework that enables
+collaboration between AM and AI experts to continuously extract scientific
+information from data-driven AM literature. A demonstration tool is
+implemented based on the proposed framework, and a case study is conducted to
+extract information relevant to the datasets, modeling, sensing, and AM
+system categories. We show the ability of LLMs (Large Language Models) to
+expedite the extraction of relevant information from data-driven AM
+literature. In the future, the framework can be used to extract information
+from the broader design and manufacturing literature in the engineering
+discipline.
+
+
+
+ comment: 11 pages, 5 Figures, 3 Tables. This paper has been accepted to be + published in the proceedings of IDETC-CIE 2024 +
+
+
+
+
+ + ☆ AutoRDF2GML: Facilitating RDF Integration in Graph Machine Learning ISWC'24 + + +
+ In this paper, we introduce AutoRDF2GML, a framework designed to convert RDF +data into data representations tailored for graph machine learning tasks. +AutoRDF2GML enables, for the first time, the creation of both content-based +features -- i.e., features based on RDF datatype properties -- and +topology-based features -- i.e., features based on RDF object properties. +Characterized by automated feature extraction, AutoRDF2GML makes it possible +even for users less familiar with RDF and SPARQL to generate data +representations ready for graph machine learning tasks, such as link +prediction, node classification, and graph classification. Furthermore, we +present four new benchmark datasets for graph machine learning, created from +large RDF knowledge graphs using our framework. These datasets serve as +valuable resources for evaluating graph machine learning approaches, such as +graph neural networks. Overall, our framework effectively bridges the gap +between the Graph Machine Learning and Semantic Web communities, paving the way +for RDF-based machine learning applications. + +
+
+ comment: accepted at ISWC'24 +
+
+
+
+
+ + ☆ Decoding Knowledge Claims: The Evaluation of Scientific Publication + Contributions through Semantic Analysis + + +
+ The surge in scientific publications challenges the use of publication counts +as a measure of scientific progress, requiring alternative metrics that +emphasize the quality and novelty of scientific contributions rather than sheer +quantity. This paper proposes the use of Relaxed Word Mover's Distance (RWMD), +a semantic text similarity measure, to evaluate the novelty of scientific +papers. We hypothesize that RWMD can more effectively gauge the growth of +scientific knowledge. To test such an assumption, we apply RWMD to evaluate +seminal papers, with Hirsch's H-Index paper as a primary case study. We compare +RWMD results across three groups: 1) H-Index-related papers, 2) scientometric +studies, and 3) unrelated papers, aiming to discern redundant literature and +hype from genuine innovations. Findings suggest that emphasizing knowledge +claims offers a deeper insight into scientific contributions, marking RWMD as a +promising alternative method to traditional citation metrics, thus better +tracking significant scientific breakthroughs. + +
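A compact sketch of the Relaxed Word Mover's Distance used above: each word's mass moves entirely to its nearest counterpart in the other document, and the larger of the two directed costs is kept. Uniform word weights are assumed here; the original formulation uses normalized bag-of-words weights.

```python
import numpy as np

def rwmd(emb_a, emb_b):
    """Relaxed WMD between two documents given word-embedding matrices of
    shape (n_a, d) and (n_b, d)."""
    dists = np.linalg.norm(emb_a[:, None, :] - emb_b[None, :, :], axis=-1)
    cost_ab = dists.min(axis=1).mean()   # words of A to nearest words of B
    cost_ba = dists.min(axis=0).mean()   # words of B to nearest words of A
    return max(cost_ab, cost_ba)
```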
+
+ comment: This paper was submitted to STI 2024 - 28th International Conference + on Science, Technology and Innovation Indicators STI 2024 +
+
+
+
+
+ + ☆ REAPER: Reasoning based Retrieval Planning for Complex RAG Systems + + +
+ Complex dialog systems often use retrieved evidence to facilitate factual +responses. Such RAG (Retrieval Augmented Generation) systems retrieve from +massive heterogeneous data stores that are usually architected as multiple +indexes or APIs instead of a single monolithic source. For a given query, +relevant evidence needs to be retrieved from one or a small subset of possible +retrieval sources. Complex queries can even require multi-step retrieval. For +example, a conversational agent on a retail site answering customer questions +about past orders will need to retrieve the appropriate customer order first +and then the evidence relevant to the customer's question in the context of the +ordered product. Most RAG Agents handle such Chain-of-Thought (CoT) tasks by +interleaving reasoning and retrieval steps. However, each reasoning step +directly adds to the latency of the system. For large models (>100B parameters) +this latency cost is significant -- in the order of multiple seconds. +Multi-agent systems may classify the query to a single Agent associated with a +retrieval source, though this means that a (small) classification model +dictates the performance of a large language model. In this work we present +REAPER (REAsoning-based PlannER) - an LLM based planner to generate retrieval +plans in conversational systems. We show significant gains in latency over +Agent-based systems and are able to scale easily to new and unseen use cases as +compared to classification-based planning. Though our method can be applied to +any RAG system, we show our results in the context of Rufus -- Amazon's +conversational shopping assistant. + +
+
+
+
+
+ + ☆ FedUD: Exploiting Unaligned Data for Cross-Platform Federated + Click-Through Rate Prediction + + +
+ Click-through rate (CTR) prediction plays an important role in online +advertising platforms. Most existing methods use data from the advertising +platform itself for CTR prediction. As user behaviors also exist on many other +platforms, e.g., media platforms, it is beneficial to further exploit such +complementary information for better modeling user interest and for improving +CTR prediction performance. However, due to privacy concerns, data from +different platforms cannot be uploaded to a server for centralized model +training. Vertical federated learning (VFL) provides a possible solution which +is able to keep the raw data on respective participating parties and learn a +collaborative model in a privacy-preserving way. However, traditional VFL +methods only utilize aligned data with common keys across parties, which +strongly restricts their application scope. In this paper, we propose FedUD, +which is able to exploit unaligned data, in addition to aligned data, for more +accurate federated CTR prediction. FedUD contains two steps. In the first step, +FedUD utilizes aligned data across parties like traditional VFL, but it +additionally includes a knowledge distillation module. This module distills +useful knowledge from the guest party's high-level representations and guides +the learning of a representation transfer network. In the second step, FedUD +applies the learned knowledge to enrich the representations of the host party's +unaligned data such that both aligned and unaligned data can contribute to +federated model training. Experiments on two real-world datasets demonstrate +the superior performance of FedUD for federated CTR prediction. + +
+
+
+
+
+ + ☆ Constructing the CORD-19 Vaccine Dataset + + +
+ We introduce a new dataset, 'CORD-19-Vaccination', to cater to scientists
+specifically looking into COVID-19 vaccine-related research. This dataset is
+extracted from the CORD-19 dataset [Wang et al., 2020] and augmented with new
+columns for language detail, author demography, keywords, and topic per
+paper. Facebook's fastText model is used to identify languages [Joulin et
+al., 2016]. To establish author demography (author affiliation,
+lab/institution location, and lab/institution country columns) we processed
+the JSON file for each paper and then further enhanced the data using
+Google's search API to determine country values. 'Yake' was used to extract
+keywords from the title, abstract, and body of each paper, and the LDA
+(Latent Dirichlet Allocation) algorithm was used to add topic information
+[Campos et al., 2020, 2018a,b]. To evaluate the dataset, we demonstrate a
+question-answering task like the one used in the CORD-19 Kaggle challenge
+[Goldbloom et al., 2022]. For further evaluation, sequential sentence
+classification was performed on each paper's abstract using the model from
+Dernoncourt et al. [2016]. We partially hand-annotated the training dataset
+and used a pre-trained BERT-PubMed layer. 'CORD-19-Vaccination' contains 30k
+research papers and can be immensely valuable for NLP research such as text
+mining, information extraction, and question answering, specific to the
+domain of COVID-19 vaccine research.
+
+
+
+
+
+
+ + ☆ Synergizing Knowledge Graphs with Large Language Models: A Comprehensive + Review and Future Prospects + + +
+ Recent advancements have witnessed the ascension of Large Language Models +(LLMs), endowed with prodigious linguistic capabilities, albeit marred by +shortcomings including factual inconsistencies and opacity. Conversely, +Knowledge Graphs (KGs) harbor verifiable knowledge and symbolic reasoning +prowess, thereby complementing LLMs' deficiencies. Against this backdrop, the +synergy between KGs and LLMs emerges as a pivotal research direction. Our +contribution in this paper is a comprehensive dissection of the latest +developments in integrating KGs with LLMs. Through meticulous analysis of their +confluence points and methodologies, we introduce a unifying framework designed +to elucidate and stimulate further exploration among scholars engaged in +cognate disciplines. This framework serves a dual purpose: it consolidates +extant knowledge while simultaneously delineating novel avenues for real-world +deployment, thereby amplifying the translational impact of academic research. + +
+
+
+
+
+ + ☆ Sponsored is the New Organic: Implications of Sponsored Results on + Quality of Search Results in the Amazon Marketplace AAAI + + +
+ Interleaving sponsored results (advertisements) amongst organic results on
+search engine result pages (SERP) has become a common practice across
+multiple digital platforms. Advertisements have catered to consumer
+satisfaction and fostered competition in digital public spaces, making them
+an appealing gateway for businesses to reach their consumers. However,
+especially in the context of digital marketplaces, due to the competitive
+nature of the sponsored results with the organic ones, multiple unwanted
+repercussions have surfaced affecting different stakeholders. From the
+consumers' perspective, the sponsored ads/results may cause degradation of
+search quality and nudge consumers to potentially irrelevant and costlier
+products. The sponsored ads may also affect the level playing field of the
+competition among sellers in the marketplaces. To understand and unravel
+these potential concerns, we analyse the Amazon digital marketplace in four
+different countries by simulating 4,800 search operations. Our analyses over
+SERPs consisting of 2M organic and 638K sponsored results show that items
+with poor organic ranks (beyond the 100th position) appear as sponsored
+results even before the top organic results on the first page of the Amazon
+SERP. Moreover, we also observe that in the majority of cases, these top
+sponsored results are costlier and of poorer quality than the top organic
+results. We believe these observations can motivate researchers to deliberate
+further on bringing more transparency and guard rails into the advertising
+practices followed in digital marketplaces.
+
+
+
+ comment: This work has been accepted as a full paper in AAAI/ACM conference on + Artificial Intelligence, Ethics and Society (AIES) 2024 +
+
+
+
+
+ + ☆ MetaHive: A Cache-Optimized Metadata Management for Heterogeneous + Key-Value Stores + + +
+ Cloud key-value (KV) stores provide businesses with a cost-effective and +adaptive alternative to traditional on-premise data management solutions. KV +stores frequently consist of heterogeneous clusters, characterized by varying +hardware specifications of the deployment nodes, with each node potentially +running a distinct version of the KV store software. This heterogeneity is +accompanied by the diverse metadata that they need to manage. In this study, we +introduce MetaHive, a cache-optimized approach to managing metadata in +heterogeneous KV store clusters. MetaHive disaggregates the original data from +its associated metadata to promote independence between them, while maintaining +their interconnection during usage. This makes the metadata opaque from the +downstream processes and the other KV stores in the cluster. MetaHive also +ensures that the KV and metadata entries are stored in the vicinity of each +other in memory and storage. This allows MetaHive to optimally utilize the +caching mechanism without extra storage read overhead for metadata retrieval. +We deploy MetaHive to ensure data integrity in RocksDB and demonstrate its +rapid data validation with minimal effect on performance. + +
+
+ comment: Cloud Databases +
+
+
+
+
+ + ☆ Modular RAG: Transforming RAG Systems into LEGO-like Reconfigurable + Frameworks + + +
+ Retrieval-augmented Generation (RAG) has markedly enhanced the capabilities +of Large Language Models (LLMs) in tackling knowledge-intensive tasks. The +increasing demands of application scenarios have driven the evolution of RAG, +leading to the integration of advanced retrievers, LLMs and other complementary +technologies, which in turn has amplified the intricacy of RAG systems. +However, the rapid advancements are outpacing the foundational RAG paradigm, +with many methods struggling to be unified under the process of +"retrieve-then-generate". In this context, this paper examines the limitations +of the existing RAG paradigm and introduces the modular RAG framework. By +decomposing complex RAG systems into independent modules and specialized +operators, it facilitates a highly reconfigurable framework. Modular RAG +transcends the traditional linear architecture, embracing a more advanced +design that integrates routing, scheduling, and fusion mechanisms. Drawing on +extensive research, this paper further identifies prevalent RAG +patterns-linear, conditional, branching, and looping-and offers a comprehensive +analysis of their respective implementation nuances. Modular RAG presents +innovative opportunities for the conceptualization and deployment of RAG +systems. Finally, the paper explores the potential emergence of new operators +and paradigms, establishing a solid theoretical foundation and a practical +roadmap for the continued evolution and practical deployment of RAG +technologies. + +
+
+
+
+
+ + ♻ ☆ EHR-SeqSQL : A Sequential Text-to-SQL Dataset For Interactively + Exploring Electronic Health Records ACL 2024 + + +
+ In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL
+dataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed
+to address critical yet underexplored aspects in text-to-SQL parsing:
+interactivity, compositionality, and efficiency. To the best of our
+knowledge, EHR-SeqSQL is not only the largest but also the first medical
+text-to-SQL dataset benchmark to include sequential and contextual questions.
+We provide a data split and a new test set designed to assess compositional
+generalization ability. Our experiments demonstrate the superiority of a
+multi-turn approach over a single-turn approach in learning compositionality.
+Additionally, our dataset integrates specially crafted tokens into SQL
+queries to improve execution efficiency. With EHR-SeqSQL, we aim to bridge
+the gap between practical needs and academic research in the text-to-SQL
+domain. EHR-SeqSQL is available at
+https://github.com/seonhee99/EHR-SeqSQL.
+
+
+
+ comment: ACL 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ AMIR: Automated MisInformation Rebuttal -- A COVID-19 Vaccination + Datasets based Recommendation System + + +
+ Misinformation has emerged as a major societal threat in recent years;
+specifically in the context of the COVID-19 pandemic, it has wreaked havoc,
+for instance, by fuelling vaccine hesitancy. Cost-effective, scalable
+solutions for combating misinformation are the need of the hour. This work
+explored how existing information obtained from social media and augmented
+with more curated fact-checked data repositories can be harnessed to
+facilitate automated rebuttal of misinformation at scale. While the ideas
+herein can be generalized and reapplied in the broader context of
+misinformation mitigation using a multitude of information sources and
+catering to the spectrum of social media platforms, this work serves as a
+proof of concept, and as such, it is confined in its scope to only the
+rebuttal of tweets, and in the specific context of misinformation regarding
+COVID-19. It leverages two publicly available datasets, viz. FaCov
+(fact-checked articles) and misleading (social media Twitter) data on
+COVID-19 Vaccination.
+
+
+
+ comment: Please cite our published paper on IEEE Transactions on Computational + Social Systems +
+
+
+
+
+ + ♻ ☆ RAM-EHR: Retrieval Augmentation Meets Clinical Predictions on Electronic + Health Records ACL 2024 + + +
+ We present RAM-EHR, a Retrieval AugMentation pipeline to improve clinical +predictions on Electronic Health Records (EHRs). RAM-EHR first collects +multiple knowledge sources, converts them into text format, and uses dense +retrieval to obtain information related to medical concepts. This strategy +addresses the difficulties associated with complex names for the concepts. +RAM-EHR then augments the local EHR predictive model co-trained with +consistency regularization to capture complementary information from patient +visits and summarized knowledge. Experiments on two EHR datasets show the +efficacy of RAM-EHR over previous knowledge-enhanced baselines (3.4% gain in +AUROC and 7.2% gain in AUPR), emphasizing the effectiveness of the summarized +knowledge from RAM-EHR for clinical prediction tasks. The code will be +published at \url{https://github.com/ritaranx/RAM-EHR}. + +
+
+ comment: ACL 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Density-based User Representation using Gaussian Process Regression for + Multi-interest Personalized Retrieval + + +
+ Accurate modeling of the diverse and dynamic interests of users remains a
+significant challenge in the design of personalized recommender systems.
+Existing user modeling methods, like single-point and multi-point
+representations, have limitations w.r.t. accuracy, diversity, and
+adaptability. To overcome these deficiencies, we introduce density-based user
+representations (DURs), a novel method that leverages Gaussian process
+regression (GPR) for effective multi-interest recommendation and retrieval.
+Our approach, GPR4DUR, exploits DURs to capture user interest variability
+without manual tuning, incorporates uncertainty-awareness, and scales well to
+large numbers of users. Experiments using real-world offline datasets confirm
+the adaptability and efficiency of GPR4DUR, while online experiments with
+simulated users demonstrate its ability to address the
+exploration-exploitation trade-off by effectively utilizing model
+uncertainty.
+
+
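A sketch of the density-based idea described above, assuming item embeddings are available: a Gaussian process is fit over the embeddings of a user's interaction history and candidates are scored with the predictive mean plus an uncertainty bonus. The kernel, the feedback targets, and the UCB-style bonus are illustrative assumptions, not the paper's exact model.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def score_candidates(history_emb, history_feedback, candidate_emb, beta=0.1):
    """Fit a GP over a user's interaction history (embeddings -> feedback)
    and score candidate items with an exploration-aware upper bound."""
    gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0), alpha=1e-3)
    gp.fit(history_emb, history_feedback)
    mean, std = gp.predict(candidate_emb, return_std=True)
    return mean + beta * std
```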
+
+ comment: 22 pages +
+
+
+
+
+
+
+
+ + Machine Learning 132 + +
+
+
+ + ☆ SOAP-RL: Sequential Option Advantage Propagation for Reinforcement + Learning in POMDP Environments + + +
+ This work compares ways of extending Reinforcement Learning algorithms to +Partially Observed Markov Decision Processes (POMDPs) with options. One view of +options is as temporally extended action, which can be realized as a memory +that allows the agent to retain historical information beyond the policy's +context window. While option assignment could be handled using heuristics and +hand-crafted objectives, learning temporally consistent options and associated +sub-policies without explicit supervision is a challenge. Two algorithms, PPOEM +and SOAP, are proposed and studied in depth to address this problem. PPOEM +applies the forward-backward algorithm (for Hidden Markov Models) to optimize +the expected returns for an option-augmented policy. However, this learning +approach is unstable during on-policy rollouts. It is also unsuited for +learning causal policies without the knowledge of future trajectories, since +option assignments are optimized for offline sequences where the entire episode +is available. As an alternative approach, SOAP evaluates the policy gradient +for an optimal option assignment. It extends the concept of the generalized +advantage estimation (GAE) to propagate option advantages through time, which +is an analytical equivalent to performing temporal back-propagation of option +policy gradients. This option policy is only conditional on the history of the +agent, not future actions. Evaluated against competing baselines, SOAP +exhibited the most robust performance, correctly discovering options for POMDP +corridor environments, as well as on standard benchmarks including Atari and +MuJoCo, outperforming PPOEM, as well as LSTM and Option-Critic baselines. The +open-sourced code is available at https://github.com/shuishida/SoapRL. + +
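SOAP is described as extending generalized advantage estimation (GAE) so that option advantages propagate through time. For readers unfamiliar with the base quantity, here is a minimal sketch of the standard GAE recursion the abstract builds on; the rewards and values are made up, and this is not the SOAP option-advantage variant itself.

# Standard generalized advantage estimation (GAE) recursion (illustrative only).
import numpy as np

def gae(rewards, values, gamma=0.99, lam=0.95):
    """values has length len(rewards) + 1 (bootstrap value appended at the end)."""
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]   # TD residual
        running = delta + gamma * lam * running                  # discounted accumulation
        advantages[t] = running
    return advantages

rewards = np.array([0.0, 0.0, 1.0, 0.0, 2.0])
values = np.array([0.5, 0.6, 0.9, 0.4, 1.0, 0.0])
print(gae(rewards, values))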
+
+
+
+
+ + ☆ Do We Really Need Graph Convolution During Training? Light Post-Training + Graph-ODE for Efficient Recommendation CIKM 2024 + + +
+ The efficiency and scalability of graph convolution networks (GCNs) in +training recommender systems (RecSys) have been persistent concerns, hindering +their deployment in real-world applications. This paper presents a critical +examination of the necessity of graph convolutions during the training phase +and introduces an innovative alternative: the Light Post-Training Graph +Ordinary-Differential-Equation (LightGODE). Our investigation reveals that the +benefits of GCNs are more pronounced during testing rather than training. +Motivated by this, LightGODE utilizes a novel post-training graph convolution +method that bypasses the computation-intensive message passing of GCNs and +employs a non-parametric continuous graph ordinary-differential-equation (ODE) +to dynamically model node representations. This approach drastically reduces +training time while achieving fine-grained post-training graph convolution to +avoid the distortion of the original training embedding space, termed the +embedding discrepancy issue. We validate our model across several real-world +datasets of different scales, demonstrating that LightGODE not only outperforms +GCN-based models in terms of efficiency and effectiveness but also +significantly mitigates the embedding discrepancy commonly associated with +deeper graph convolution layers. Our LightGODE challenges the prevailing +paradigms in RecSys training and suggests re-evaluating the role of graph +convolutions, potentially guiding future developments of efficient large-scale +graph-based RecSys. + +
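The key idea summarized above is to skip message passing during training and instead apply a non-parametric continuous graph ODE to the already-trained embeddings. A hedged sketch of what such a post-training step could look like, assuming a simple ODE of the form de/dt = (A_norm - I)e integrated with Euler steps; the actual LightGODE formulation may differ.

# Sketch: post-training, non-parametric graph ODE over trained embeddings
# (illustrative only; synthetic graph and embeddings).
import numpy as np

def normalized_adjacency(adj):
    deg = adj.sum(axis=1)
    d_inv_sqrt = np.where(deg > 0, deg ** -0.5, 0.0)
    return adj * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]

def post_training_graph_ode(embeddings, adj, t_end=1.0, steps=10):
    a_norm = normalized_adjacency(adj)
    e = embeddings.copy()
    dt = t_end / steps
    for _ in range(steps):
        e = e + dt * (a_norm @ e - e)    # smooth embeddings along the graph structure
    return e

rng = np.random.default_rng(0)
adj = (rng.random((6, 6)) < 0.4).astype(float)
adj = np.maximum(adj, adj.T)             # undirected interaction graph
emb = rng.normal(size=(6, 8))            # embeddings trained WITHOUT graph convolution
print(post_training_graph_ode(emb, adj).shape)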
+
+ comment: Accepted to CIKM 2024 +
+
+
+
+
+ + ☆ Hybrid summary statistics: neural weak lensing inference beyond the + power spectrum + + +
+ In inference problems, we often have domain knowledge which allows us to +define summary statistics that capture most of the information content in a +dataset. In this paper, we present a hybrid approach, where such physics-based +summaries are augmented by a set of compressed neural summary statistics that +are optimised to extract the extra information that is not captured by the +predefined summaries. The resulting statistics are very powerful inputs to +simulation-based or implicit inference of model parameters. We apply this +generalisation of Information Maximising Neural Networks (IMNNs) to parameter +constraints from tomographic weak gravitational lensing convergence maps to +find summary statistics that are explicitly optimised to complement angular +power spectrum estimates. We study several dark matter simulation resolutions +in low- and high-noise regimes. We show that i) the information-update +formalism extracts at least $3\times$ and up to $8\times$ as much information +as the angular power spectrum in all noise regimes, ii) the network summaries +are highly complementary to existing 2-point summaries, and iii) our formalism +allows for networks with smaller, physically-informed architectures to match +much larger regression networks with far fewer simulations needed to obtain +asymptotically optimal inference. + +
+
+ comment: 16 pages, 11 figures. Submitted to JCAP. We provide publicly + available code at https://github.com/tlmakinen/hybridStatsWL +
+
+
+
+
+ + ☆ Wolf: Captioning Everything with a World Summarization Framework + + +
+ We propose Wolf, a WOrLd summarization Framework for accurate video +captioning. Wolf is an automated captioning framework that adopts a +mixture-of-experts approach, leveraging complementary strengths of Vision +Language Models (VLMs). By utilizing both image and video models, our framework +captures different levels of information and summarizes them efficiently. Our +approach can be applied to enhance video understanding, auto-labeling, and +captioning. To evaluate caption quality, we introduce CapScore, an LLM-based +metric to assess the similarity and quality of generated captions compared to +the ground truth captions. We further build four human-annotated datasets in +three domains: autonomous driving, general scenes, and robotics, to facilitate +comprehensive comparisons. We show that Wolf achieves superior captioning +performance compared to state-of-the-art approaches from the research community +(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For +instance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise +by 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally, +we establish a benchmark for video captioning and introduce a leaderboard, +aiming to accelerate advancements in video understanding, captioning, and data +alignment. Leaderboard: https://wolfv0.github.io/leaderboard.html. + +
+
+
+
+
+ + ☆ A Scalable Quantum Non-local Neural Network for Image Classification + + +
+ Non-local operations play a crucial role in computer vision enabling the +capture of long-range dependencies through weighted sums of features across the +input, surpassing the constraints of traditional convolution operations that +focus solely on local neighborhoods. Non-local operations typically require +computing pairwise relationships between all elements in a set, leading to +quadratic complexity in terms of time and memory. Due to the high computational +and memory demands, scaling non-local neural networks to large-scale problems +can be challenging. This article introduces a hybrid quantum-classical scalable +non-local neural network, referred to as Quantum Non-Local Neural Network +(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on +inherent quantum parallelism to allow the simultaneous processing of a large +number of input features enabling more efficient computations in +quantum-enhanced feature space and involving pairwise relationships through +quantum entanglement. We benchmark our proposed QNL-Net with other quantum +counterparts to binary classification with datasets MNIST and CIFAR-10. The +simulation findings showcase our QNL-Net achieves cutting-edge accuracy levels +in binary image classification among quantum classifiers while utilizing fewer +qubits. + +
+
+ comment: draft, 13 pages (including references and appendix), 5 figures +
+
+
+
+
+ + ☆ Lessons from Learning to Spin "Pens" + + +
+ In-hand manipulation of pen-like objects is an important skill in our daily +lives, as many tools such as hammers and screwdrivers are similarly shaped. +However, current learning-based methods struggle with this task due to a lack +of high-quality demonstrations and the significant gap between simulation and +the real world. In this work, we push the boundaries of learning-based in-hand +manipulation systems by demonstrating the capability to spin pen-like objects. +We first use reinforcement learning to train an oracle policy with privileged +information and generate a high-fidelity trajectory dataset in simulation. This +serves two purposes: 1) pre-training a sensorimotor policy in simulation; 2) +conducting open-loop trajectory replay in the real world. We then fine-tune the +sensorimotor policy using these real-world trajectories to adapt it to the real +world dynamics. With less than 50 trajectories, our policy learns to rotate +more than ten pen-like objects with different physical properties for multiple +revolutions. We present a comprehensive analysis of our design choices and +share the lessons learned during development. + +
+
+ comment: Website: https://penspin.github.io/ +
+
+
+
+
+ + ☆ AppWorld: A Controllable World of Apps and People for Benchmarking + Interactive Coding Agents ACL'24 + + +
+ Autonomous agents that address day-to-day digital tasks (e.g., ordering +groceries for a household), must not only operate multiple apps (e.g., notes, +messaging, shopping app) via APIs, but also generate rich code with complex +control flow in an iterative manner based on their interaction with the +environment. However, existing benchmarks for tool use are inadequate, as they +only cover tasks that require a simple sequence of API calls. + To remedy this gap, we built $\textbf{AppWorld Engine}$, a high-quality +execution environment (60K lines of code) of 9 day-to-day apps operable via 457 +APIs and populated with realistic digital activities simulating the lives of +~100 fictitious users. We then created $\textbf{AppWorld Benchmark}$ (40K lines +of code), a suite of 750 natural, diverse, and challenging autonomous agent +tasks requiring rich and interactive code generation. It supports robust +programmatic evaluation with state-based unit tests, allowing for different +ways of completing a task while also checking for unexpected changes, i.e., +collateral damage. The state-of-the-art LLM, GPT-4o, solves only ~49% of our +'normal' tasks and ~30% of 'challenge' tasks, while other models solve at least +16% fewer. This highlights the benchmark's difficulty and AppWorld's potential +to push the frontiers of interactive coding agents. The project website is +available at https://appworld.dev/. + +
+
+ comment: ACL'24 Camera Ready +
+
+
+
+
+ + ☆ Learn from the Learnt: Source-Free Active Domain Adaptation via + Contrastive Sampling and Visual Persistence ECCV 2024 + + +
+ Domain Adaptation (DA) facilitates knowledge transfer from a source domain to +a related target domain. This paper investigates a practical DA paradigm, +namely Source data-Free Active Domain Adaptation (SFADA), where source data +becomes inaccessible during adaptation, and a minimum amount of annotation +budget is available in the target domain. Without referencing the source data, +new challenges emerge in identifying the most informative target samples for +labeling, establishing cross-domain alignment during adaptation, and ensuring +continuous performance improvements through the iterative query-and-adaptation +process. In response, we present learn from the learnt (LFTL), a novel paradigm +for SFADA to leverage the learnt knowledge from the source pretrained model and +actively iterated models without extra overhead. We propose Contrastive Active +Sampling to learn from the hypotheses of the preceding model, thereby querying +target samples that are both informative to the current model and persistently +challenging throughout active learning. During adaptation, we learn from +features of actively selected anchors obtained from previous intermediate +models, so that the Visual Persistence-guided Adaptation can facilitate feature +distribution alignment and active sample exploitation. Extensive experiments on +three widely-used benchmarks show that our LFTL achieves state-of-the-art +performance, superior computational efficiency and continuous improvements as +the annotation budget increases. Our code is available at +https://github.com/lyumengyao/lftl. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Small Molecule Optimization with Large Language Models + + +
+ Recent advancements in large language models have opened new possibilities +for generative molecular drug design. We present Chemlactica and Chemma, two +language models fine-tuned on a novel corpus of 110M molecules with computed +properties, totaling 40B tokens. These models demonstrate strong performance in +generating molecules with specified properties and predicting new molecular +characteristics from limited samples. We introduce a novel optimization +algorithm that leverages our language models to optimize molecules for +arbitrary properties given limited access to a black box oracle. Our approach +combines ideas from genetic algorithms, rejection sampling, and prompt +optimization. It achieves state-of-the-art performance on multiple molecular +optimization benchmarks, including an 8% improvement on Practical Molecular +Optimization compared to previous methods. We publicly release the training +corpus, the language models and the optimization algorithm. + +
+
+
+
+
+ + ☆ On the Pros and Cons of Active Learning for Moral Preference Elicitation + + +
+ Computational preference elicitation methods are tools used to learn people's +preferences quantitatively in a given context. Recent works on preference +elicitation advocate for active learning as an efficient method to iteratively +construct queries (framed as comparisons between context-specific cases) that +are likely to be most informative about an agent's underlying preferences. In +this work, we argue that the use of active learning for moral preference +elicitation relies on certain assumptions about the underlying moral +preferences, which can be violated in practice. Specifically, we highlight the +following common assumptions (a) preferences are stable over time and not +sensitive to the sequence of presented queries, (b) the appropriate hypothesis +class is chosen to model moral preferences, and (c) noise in the agent's +responses is limited. While these assumptions can be appropriate for preference +elicitation in certain domains, prior research on moral psychology suggests +they may not be valid for moral judgments. Through a synthetic simulation of +preferences that violate the above assumptions, we observe that active learning +can have similar or worse performance than a basic random query selection +method in certain settings. Yet, simulation results also demonstrate that +active learning can still be viable if the degree of instability or noise is +relatively small and when the agent's preferences can be approximately +represented with the hypothesis class used for learning. Our study highlights +the nuances associated with effective moral preference elicitation in practice +and advocates for the cautious use of active learning as a methodology to learn +moral preferences. + +
+
+ comment: To appear in AIES 2024 +
+
+
+
+
+ + ☆ Embedding And Clustering Your Data Can Improve Contrastive Pretraining + + +
+ Recent studies of large-scale contrastive pretraining in the text embedding +domain show that using single-source minibatches, rather than mixed-source +minibatches, can substantially improve overall model accuracy. In this work, we +explore extending training data stratification beyond source granularity by +leveraging a pretrained text embedding model and the classic k-means clustering +algorithm to further split training data apart by the semantic clusters within +each source. Experimentally, we observe a notable increase in NDCG@10 when +pretraining a BERT-based text embedding model on query-passage pairs from the +MSMARCO passage retrieval dataset. Additionally, we conceptually connect our +clustering approach to both the Topic Aware Sampling (TAS) aspect of the TAS-B +methodology and the nearest-neighbor-based hard-negative mining aspect of the +ANCE methodology and discuss how this unified view motivates future lines of +research on the organization of contrastive pretraining data. + +
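A hedged sketch of the stratification idea described above: embed the training queries with a pretrained model, cluster them with k-means, and draw each contrastive minibatch from a single cluster. The shapes, cluster count, and batch size below are illustrative assumptions, not values from the paper.

# Sketch: semantic-cluster stratification of contrastive training pairs.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
query_embeddings = rng.normal(size=(10_000, 64))   # stand-in for a pretrained embedder's output
pair_ids = np.arange(10_000)                       # one (query, passage) pair per row

kmeans = KMeans(n_clusters=50, n_init=10, random_state=0).fit(query_embeddings)

def single_cluster_minibatches(labels, batch_size=64):
    # Each yielded batch contains pairs from exactly one semantic cluster.
    for c in np.unique(labels):
        members = pair_ids[labels == c]
        rng.shuffle(members)
        for start in range(0, len(members) - batch_size + 1, batch_size):
            yield members[start:start + batch_size]

batches = list(single_cluster_minibatches(kmeans.labels_))
print(len(batches), "single-cluster minibatches")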
+
+ comment: 16 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Utilizing TTS Synthesized Data for Efficient Development of Keyword + Spotting Model + + +
+ This paper explores the use of TTS synthesized training data for the KWS (keyword spotting) task while minimizing development cost and time. Keyword spotting models require a huge amount of training data to be accurate, and obtaining such training data can be costly. In the current state of the art, TTS models can generate large amounts of natural-sounding data, which can help reduce the cost and time of KWS model development. Still, TTS-generated data can lack diversity compared to real data. To pursue maximizing KWS model accuracy under the constraint of limited resources and current TTS capability, we explored various strategies to mix TTS data and real human speech data, with a focus on minimizing real data use and maximizing diversity of TTS output. Our experimental results indicate that relatively small amounts of real audio data with speaker diversity (100 speakers, 2k utterances) and large amounts of TTS synthesized data can achieve reasonably high accuracy (within 3x error rate of baseline), compared to the baseline (trained with 3.8M real positive utterances).
+
+ comment: to be published in a Workshop at Interspeech 2024, Synthetic Data's + Transformative Role in Foundational Speech Models +
+
+
+
+
+ + ☆ An Accelerated Multi-level Monte Carlo Approach for Average Reward + Reinforcement Learning with General Policy Parametrization + + +
+ In our study, we delve into average-reward reinforcement learning with general policy parametrization. Within this domain, existing methods either provide suboptimal guarantees or demand prior knowledge of the mixing time. To address these issues, we introduce Randomized Accelerated Natural Actor Critic, a method that integrates Multi-level Monte-Carlo and Natural Actor Critic. Our approach is the first to achieve a global convergence rate of $\tilde{\mathcal{O}}(1/\sqrt{T})$ without requiring knowledge of mixing time, significantly surpassing the state-of-the-art bound of $\tilde{\mathcal{O}}(1/T^{1/4})$.
+
+ comment: 28 pages, 1 table +
+
+
+
+
+ + ☆ Generative Adversarial Networks for Imputing Sparse Learning Performance + + +
+ Learning performance data, such as correct or incorrect responses to questions in Intelligent Tutoring Systems (ITSs), is crucial for tracking and assessing the learners' progress and mastery of knowledge. However, the issue of data sparsity, characterized by unexplored questions and missing attempts, hampers accurate assessment and the provision of tailored, personalized instruction within ITSs. This paper proposes using the Generative Adversarial Imputation Networks (GAIN) framework to impute sparse learning performance data, reconstructed into a three-dimensional (3D) tensor representation across the dimensions of learners, questions and attempts. Our customized GAIN-based method imputes sparse data in a 3D tensor space, with its input and output layers significantly enhanced by convolutional neural networks. This adaptation also includes the use of a least squares loss function for optimization and aligns the shapes of the input and output with the dimensions of the questions-attempts matrices along the learners' dimension. Through extensive experiments on six datasets from various ITSs, including AutoTutor, ASSISTments and MATHia, we demonstrate that the GAIN approach generally outperforms existing methods such as tensor factorization and other generative adversarial network (GAN) based approaches in terms of imputation accuracy. This finding enhances comprehensive learning data modeling and analytics in AI-based education.
+
+
+
+
+ + ☆ Downlink CCM Estimation via Representation Learning with Graph + Regularization + + +
+ In this paper, we propose an algorithm for downlink (DL) channel covariance +matrix (CCM) estimation for frequency division duplexing (FDD) massive +multiple-input multiple-output (MIMO) communication systems with base station +(BS) possessing a uniform linear array (ULA) antenna structure. We make use of +the inherent similarity between the uplink (UL) CCM and the DL CCM due to +angular reciprocity. We consider a setting where the UL CCM is mapped to DL CCM +by a mapping function. We first present a theoretical error analysis of +learning a nonlinear embedding by constructing a mapping function, which points +to the importance of the Lipschitz regularity of the mapping function for +achieving high estimation performance. Then, based on the theoretical ground, +we propose a representation learning algorithm as a solution for the estimation +problem, where Gaussian RBF kernel interpolators are chosen to map UL CCMs to +their DL counterparts. The proposed algorithm is based on the optimization of +an objective function that fits a regression model between the DL CCM and UL +CCM samples in the training dataset and preserves the local geometric structure +of the data in the UL CCM space, while explicitly regulating the Lipschitz +continuity of the mapping function in light of our theoretical findings. The +proposed algorithm surpasses benchmark methods in terms of three error metrics +as shown by simulations. + +
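As a rough illustration of the regression step described above, the sketch below fits an RBF-kernel ridge regressor from vectorized UL CCMs to DL CCMs. It deliberately omits the paper's graph regularization and explicit Lipschitz control, and all data here is synthetic.

# Sketch: RBF-kernel regression from uplink to downlink covariance matrices
# (illustrative; not the regularized objective proposed in the paper).
import numpy as np
from sklearn.kernel_ridge import KernelRidge

rng = np.random.default_rng(0)
n_train, n_ant = 200, 8
ul_ccm = rng.normal(size=(n_train, n_ant * n_ant))    # stand-in vectorized UL CCMs
dl_ccm = ul_ccm @ rng.normal(scale=0.1, size=(n_ant * n_ant, n_ant * n_ant))  # synthetic DL targets

model = KernelRidge(kernel="rbf", gamma=1e-2, alpha=1e-3)
model.fit(ul_ccm, dl_ccm)

ul_test = rng.normal(size=(5, n_ant * n_ant))
dl_pred = model.predict(ul_test)                      # one estimated DL CCM per test UL CCM
print(dl_pred.shape)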
+
+
+
+
+ + ☆ Enhancing material property prediction with ensemble deep graph + convolutional networks + + +
+ Machine learning (ML) models have emerged as powerful tools for accelerating materials discovery and design by enabling accurate predictions of properties from compositional and structural data. These capabilities are vital for developing advanced technologies across fields such as energy, electronics, and biomedicine, potentially reducing the time and resources needed for new material exploration and promoting rapid innovation cycles. Recent efforts have focused on employing advanced ML algorithms, including deep learning-based graph neural networks, for property prediction. Additionally, ensemble models have proven to enhance the generalizability and robustness of ML and DL. However, the use of such ensemble strategies in deep graph networks for material property prediction remains underexplored. Our research provides an in-depth evaluation of ensemble strategies in deep learning-based graph neural networks, specifically targeting material property prediction tasks. By testing the Crystal Graph Convolutional Neural Network (CGCNN) and its multitask version, MT-CGCNN, we demonstrated that ensemble techniques, especially prediction averaging, substantially improve precision beyond traditional metrics for key properties like formation energy per atom ($\Delta E^{f}$), band gap ($E_{g}$) and density ($\rho$) in 33,990 stable inorganic materials. These findings support the broader application of ensemble methods to enhance predictive accuracy in the field.
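The prediction-averaging strategy highlighted above is simple to reproduce in miniature. The sketch below uses generic MLP regressors on synthetic data as stand-ins for CGCNN models, purely to show how averaging the outputs of independently trained ensemble members is wired up.

# Sketch: prediction-averaging ensemble with generic regressors (illustrative).
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=2000, n_features=32, noise=5.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

# Train several members that differ only in their random initialization.
members = [MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=s).fit(X_tr, y_tr)
           for s in range(5)]

preds = np.stack([m.predict(X_te) for m in members])
ensemble_pred = preds.mean(axis=0)        # prediction averaging across ensemble members

mae_single = np.abs(preds[0] - y_te).mean()
mae_ensemble = np.abs(ensemble_pred - y_te).mean()
print(f"single MAE {mae_single:.2f} vs ensemble MAE {mae_ensemble:.2f}")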
+
+ comment: 9 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ QT-TDM: Planning with Transformer Dynamics Model and Autoregressive + Q-Learning + + +
+ Inspired by the success of the Transformer architecture in natural language +processing and computer vision, we investigate the use of Transformers in +Reinforcement Learning (RL), specifically in modeling the environment's +dynamics using Transformer Dynamics Models (TDMs). We evaluate the capabilities +of TDMs for continuous control in real-time planning scenarios with Model +Predictive Control (MPC). While Transformers excel in long-horizon prediction, +their tokenization mechanism and autoregressive nature lead to costly planning +over long horizons, especially as the environment's dimensionality increases. +To alleviate this issue, we use a TDM for short-term planning, and learn an +autoregressive discrete Q-function using a separate Q-Transformer (QT) model to +estimate a long-term return beyond the short-horizon planning. Our proposed +method, QT-TDM, integrates the robust predictive capabilities of Transformers +as dynamics models with the efficacy of a model-free Q-Transformer to mitigate +the computational burden associated with real-time planning. Experiments in +diverse state-based continuous control tasks show that QT-TDM is superior in +performance and sample efficiency compared to existing Transformer-based RL +models while achieving fast and computationally efficient inference. + +
+
+
+
+
+ + ☆ The Cross-environment Hyperparameter Setting Benchmark for Reinforcement + Learning + + +
+ This paper introduces a new empirical methodology, the Cross-environment +Hyperparameter Setting Benchmark, that compares RL algorithms across +environments using a single hyperparameter setting, encouraging algorithmic +development which is insensitive to hyperparameters. We demonstrate that this +benchmark is robust to statistical noise and obtains qualitatively similar +results across repeated applications, even when using few samples. This +robustness makes the benchmark computationally cheap to apply, allowing +statistically sound insights at low cost. We demonstrate two example +instantiations of the CHS, on a set of six small control environments (SC-CHS) +and on the entire DM Control suite of 28 environments (DMC-CHS). Finally, to +illustrate the applicability of the CHS to modern RL algorithms on challenging +environments, we conduct a novel empirical study of an open question in the +continuous control literature. We show, with high confidence, that there is no +meaningful difference in performance between Ornstein-Uhlenbeck noise and +uncorrelated Gaussian noise for exploration with the DDPG algorithm on the +DMC-CHS. + +
+
+ comment: Accepted to RLC 2024 +
+
+
+
+
+ + ☆ The Role of Temporal Hierarchy in Spiking Neural Networks + + +
+ Spiking Neural Networks (SNNs) have the potential for rich spatio-temporal +signal processing thanks to exploiting both spatial and temporal parameters. +The temporal dynamics such as time constants of the synapses and neurons and +delays have been recently shown to have computational benefits that help reduce +the overall number of parameters required in the network and increase the +accuracy of the SNNs in solving temporal tasks. Optimizing such temporal +parameters, for example, through gradient descent, gives rise to a temporal +architecture for different problems. As has been shown in machine learning, to +reduce the cost of optimization, architectural biases can be applied, in this +case in the temporal domain. Such inductive biases in temporal parameters have +been found in neuroscience studies, highlighting a hierarchy of temporal +structure and input representation in different layers of the cortex. Motivated +by this, we propose to impose a hierarchy of temporal representation in the +hidden layers of SNNs, highlighting that such an inductive bias improves their +performance. We demonstrate the positive effects of temporal hierarchy in the +time constants of feed-forward SNNs applied to temporal tasks (Multi-Time-Scale +XOR and Keyword Spotting, with a benefit of up to 4.1% in classification +accuracy). Moreover, we show that such architectural biases, i.e. hierarchy of +time constants, naturally emerge when optimizing the time constants through +gradient descent, initialized as homogeneous values. We further pursue this +proposal in temporal convolutional SNNs, by introducing the hierarchical bias +in the size and dilation of temporal kernels, giving rise to competitive +results in popular temporal spike-based datasets. + +
+
+ comment: 16 pages, 9 figures, pre-print +
+
+
+
+
+ + ☆ Deep Companion Learning: Enhancing Generalization Through Historical + Consistency ECCV 2024 + + +
+ We propose Deep Companion Learning (DCL), a novel training method for Deep +Neural Networks (DNNs) that enhances generalization by penalizing inconsistent +model predictions compared to its historical performance. To achieve this, we +train a deep-companion model (DCM), by using previous versions of the model to +provide forecasts on new inputs. This companion model deciphers a meaningful +latent semantic structure within the data, thereby providing targeted +supervision that encourages the primary model to address the scenarios it finds +most challenging. We validate our approach through both theoretical analysis +and extensive experimentation, including ablation studies, on a variety of +benchmark datasets (CIFAR-100, Tiny-ImageNet, ImageNet-1K) using diverse +architectural models (ShuffleNetV2, ResNet, Vision Transformer, etc.), +demonstrating state-of-the-art performance. + +
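A hedged sketch of the general pattern of penalizing disagreement with a historical model: here the companion is simply a frozen copy of an earlier checkpoint and the penalty a KL term between predictive distributions. The paper's deep-companion model is trained rather than copied, so this only approximates the idea.

# Sketch: history-consistency regularization with a frozen earlier checkpoint
# acting as the companion (illustrative; synthetic data and toy model).
import copy
import torch
import torch.nn.functional as F

primary = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))
companion = copy.deepcopy(primary).eval()          # stand-in for the historical model
optimizer = torch.optim.SGD(primary.parameters(), lr=0.1)
lam = 0.5                                          # weight of the consistency term

for step in range(100):
    x = torch.randn(128, 32)
    y = torch.randint(0, 10, (128,))
    logits = primary(x)
    with torch.no_grad():
        companion_logits = companion(x)
    task_loss = F.cross_entropy(logits, y)
    consistency = F.kl_div(F.log_softmax(logits, dim=-1),
                           F.softmax(companion_logits, dim=-1), reduction="batchmean")
    (task_loss + lam * consistency).backward()
    optimizer.step()
    optimizer.zero_grad()
    if step % 25 == 0:                             # periodically refresh the companion
        companion = copy.deepcopy(primary).eval()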
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Online Planning in POMDPs with State-Requests + + +
+ In key real-world problems, full state information is sometimes available but +only at a high cost, like activating precise yet energy-intensive sensors or +consulting humans, thereby compelling the agent to operate under partial +observability. For this scenario, we propose AEMS-SR (Anytime Error +Minimization Search with State Requests), a principled online planning +algorithm tailored for POMDPs with state requests. By representing the search +space as a graph instead of a tree, AEMS-SR avoids the exponential growth of +the search space originating from state requests. Theoretical analysis +demonstrates AEMS-SR's $\varepsilon$-optimality, ensuring solution quality, +while empirical evaluations illustrate its effectiveness compared with AEMS and +POMCP, two SOTA online planning algorithms. AEMS-SR enables efficient planning +in domains characterized by partial observability and costly state requests +offering practical benefits across various applications. + +
+
+
+
+
+ + ☆ Interpreting artificial neural networks to detect genome-wide + association signals for complex traits + + +
+ Investigating the genetic architecture of complex diseases is challenging due +to the highly polygenic and interactive landscape of genetic and environmental +factors. Although genome-wide association studies (GWAS) have identified +thousands of variants for multiple complex phenotypes, conventional statistical +approaches can be limited by simplified assumptions such as linearity and lack +of epistasis models. In this work, we trained artificial neural networks for +predicting complex traits using both simulated and real genotype/phenotype +datasets. We extracted feature importance scores via different post hoc +interpretability methods to identify potentially associated loci (PAL) for the +target phenotype. Simulations we performed with various parameters demonstrated +that associated loci can be detected with good precision using strict selection +criteria, but downstream analyses are required for fine-mapping the exact +variants due to linkage disequilibrium, similarly to conventional GWAS. By +applying our approach to the schizophrenia cohort in the Estonian Biobank, we +were able to detect multiple PAL related to this highly polygenic and heritable +disorder. We also performed enrichment analyses with PAL in genic regions, +which predominantly identified terms associated with brain morphology. With +further improvements in model optimization and confidence measures, artificial +neural networks can enhance the identification of genomic loci associated with +complex diseases, providing a more comprehensive approach for GWAS and serving +as initial screening tools for subsequent functional studies. + Keywords: Deep learning, interpretability, genome-wide association studies, +complex diseases + +
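One common post hoc interpretability method of the kind mentioned above is plain gradient saliency. The sketch below trains a small MLP on synthetic genotype/phenotype data and ranks SNPs by the mean absolute input gradient; it is illustrative only and not the pipeline used in the paper.

# Sketch: gradient-saliency feature importance for a genotype -> phenotype MLP.
import torch

n_samples, n_snps = 512, 1000
genotypes = torch.randint(0, 3, (n_samples, n_snps)).float()   # 0/1/2 allele counts
phenotype = genotypes[:, 10] * 0.8 + genotypes[:, 250] * 0.5 + torch.randn(n_samples) * 0.1

model = torch.nn.Sequential(torch.nn.Linear(n_snps, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
for _ in range(300):
    loss = torch.nn.functional.mse_loss(model(genotypes).squeeze(-1), phenotype)
    opt.zero_grad(); loss.backward(); opt.step()

x = genotypes.clone().requires_grad_(True)
model(x).sum().backward()
importance = x.grad.abs().mean(dim=0)              # saliency score per SNP
print("top candidate loci:", torch.topk(importance, 5).indices.tolist())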
+
+
+
+
+ + ☆ Learning Chaotic Systems and Long-Term Predictions with Neural Jump ODEs + + +
+ The Path-dependent Neural Jump ODE (PD-NJ-ODE) is a model for online +prediction of generic (possibly non-Markovian) stochastic processes with +irregular (in time) and potentially incomplete (with respect to coordinates) +observations. It is a model for which convergence to the $L^2$-optimal +predictor, which is given by the conditional expectation, is established +theoretically. Thereby, the training of the model is solely based on a dataset +of realizations of the underlying stochastic process, without the need of +knowledge of the law of the process. In the case where the underlying process +is deterministic, the conditional expectation coincides with the process +itself. Therefore, this framework can equivalently be used to learn the +dynamics of ODE or PDE systems solely from realizations of the dynamical system +with different initial conditions. We showcase the potential of our method by +applying it to the chaotic system of a double pendulum. When training the +standard PD-NJ-ODE method, we see that the prediction starts to diverge from +the true path after about half of the evaluation time. In this work we enhance +the model with two novel ideas, which independently of each other improve the +performance of our modelling setup. The resulting dynamics match the true +dynamics of the chaotic system very closely. The same enhancements can be used +to provably enable the PD-NJ-ODE to learn long-term predictions for general +stochastic datasets, where the standard model fails. This is verified in +several experiments. + +
+
+
+
+
+ + ☆ Robust Learning in Bayesian Parallel Branching Graph Neural Networks: + The Narrow Width Limit + + +
+ The infinite width limit of random neural networks is known to result in +Neural Networks as Gaussian Process (NNGP) (Lee et al. [2018]), characterized +by task-independent kernels. It is widely accepted that larger network widths +contribute to improved generalization (Park et al. [2019]). However, this work +challenges this notion by investigating the narrow width limit of the Bayesian +Parallel Branching Graph Neural Network (BPB-GNN), an architecture that +resembles residual networks. We demonstrate that when the width of a BPB-GNN is +significantly smaller compared to the number of training examples, each branch +exhibits more robust learning due to a symmetry breaking of branches in kernel +renormalization. Surprisingly, the performance of a BPB-GNN in the narrow width +limit is generally superior or comparable to that achieved in the wide width +limit in bias-limited scenarios. Furthermore, the readout norms of each branch +in the narrow width limit are mostly independent of the architectural +hyperparameters but generally reflective of the nature of the data. Our results +characterize a newly defined narrow-width regime for parallel branching +networks in general. + +
+
+
+
+
+ + ☆ Log-Concave Coupling for Sampling Neural Net Posteriors + + +
+ In this work, we present a sampling algorithm for single hidden layer neural +networks. This algorithm is built upon a recursive series of Bayesian +posteriors using a method we call Greedy Bayes. Sampling of the Bayesian +posterior for neuron weight vectors $w$ of dimension $d$ is challenging because +of its multimodality. Our algorithm to tackle this problem is based on a +coupling of the posterior density for $w$ with an auxiliary random variable +$\xi$. + The resulting reverse conditional $w|\xi$ of neuron weights given auxiliary +random variable is shown to be log concave. In the construction of the +posterior distributions we provide some freedom in the choice of the prior. In +particular, for Gaussian priors on $w$ with suitably small variance, the +resulting marginal density of the auxiliary variable $\xi$ is proven to be +strictly log concave for all dimensions $d$. For a uniform prior on the unit +$\ell_1$ ball, evidence is given that the density of $\xi$ is again strictly +log concave for sufficiently large $d$. + The score of the marginal density of the auxiliary random variable $\xi$ is +determined by an expectation over $w|\xi$ and thus can be computed by various +rapidly mixing Markov Chain Monte Carlo methods. Moreover, the computation of +the score of $\xi$ permits methods of sampling $\xi$ by a stochastic diffusion +(Langevin dynamics) with drift function built from this score. With such +dynamics, information-theoretic methods pioneered by Bakry and Emery show that +accurate sampling of $\xi$ is obtained rapidly when its density is indeed +strictly log-concave. After which, one more draw from $w|\xi$, produces neuron +weights $w$ whose marginal distribution is from the desired posterior. + +
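The final sampling step described above relies on Langevin dynamics driven by the score of the auxiliary variable's density. A minimal sketch of unadjusted Langevin dynamics for a toy one-dimensional log-concave target (a standard normal), assuming access to its score function.

# Sketch: unadjusted Langevin dynamics, x <- x + eta*score(x) + sqrt(2*eta)*noise.
import numpy as np

def score_standard_normal(x):
    return -x                              # d/dx log N(0, 1); stand-in for the score of xi

rng = np.random.default_rng(0)
eta, n_steps = 1e-2, 5000
x = rng.normal()
samples = []
for _ in range(n_steps):
    x = x + eta * score_standard_normal(x) + np.sqrt(2 * eta) * rng.normal()
    samples.append(x)

samples = np.array(samples[1000:])         # drop burn-in
print(f"sample mean {samples.mean():.3f}, sample var {samples.var():.3f}")  # roughly 0 and 1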
+
+ comment: This research was presented at the International Symposium on + Information Theory (ISIT). Athens, Greece, July 11, 2024. The material was + also presented in the 2024 Shannon Lecture +
+
+
+
+
+ + ☆ Benchmarking Dependence Measures to Prevent Shortcut Learning in Medical + Imaging + + +
+ Medical imaging cohorts are often confounded by factors such as acquisition +devices, hospital sites, patient backgrounds, and many more. As a result, deep +learning models tend to learn spurious correlations instead of causally related +features, limiting their generalizability to new and unseen data. This problem +can be addressed by minimizing dependence measures between intermediate +representations of task-related and non-task-related variables. These measures +include mutual information, distance correlation, and the performance of +adversarial classifiers. Here, we benchmark such dependence measures for the +task of preventing shortcut learning. We study a simplified setting using +Morpho-MNIST and a medical imaging task with CheXpert chest radiographs. Our +results provide insights into how to mitigate confounding factors in medical +imaging. + +
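One of the dependence measures mentioned above, distance correlation, can be computed directly from pairwise distance matrices. A hedged sketch on synthetic features follows; it uses the simple biased V-statistic estimator, clipped at zero for numerical safety.

# Sketch: empirical distance correlation between two feature matrices.
import numpy as np

def distance_correlation(x, y):
    def centered_dist(z):
        d = np.sqrt(((z[:, None, :] - z[None, :, :]) ** 2).sum(-1))
        return d - d.mean(0, keepdims=True) - d.mean(1, keepdims=True) + d.mean()
    a, b = centered_dist(x), centered_dist(y)
    dcov2 = max((a * b).mean(), 0.0)
    dvar_x, dvar_y = (a * a).mean(), (b * b).mean()
    return np.sqrt(dcov2 / np.sqrt(dvar_x * dvar_y + 1e-12))

rng = np.random.default_rng(0)
task_features = rng.normal(size=(256, 8))
confound_dependent = task_features[:, :2] + 0.1 * rng.normal(size=(256, 2))
confound_independent = rng.normal(size=(256, 2))
print(distance_correlation(task_features, confound_dependent))    # clearly dependent -> large value
print(distance_correlation(task_features, confound_independent))  # independent -> small value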
+
+ comment: Accepted to the 15th International Workshop on Machine Learning in + Medical Imaging (MLMI 2024) +
+
+
+
+
+ + ☆ Learning production functions for supply chains with graph neural + networks + + +
+ The global economy relies on the flow of goods over supply chain networks, +with nodes as firms and edges as transactions between firms. While we may +observe these external transactions, they are governed by unseen production +functions, which determine how firms internally transform the input products +they receive into output products that they sell. In this setting, it can be +extremely valuable to infer these production functions, to better understand +and improve supply chains, and to forecast future transactions more accurately. +However, existing graph neural networks (GNNs) cannot capture these hidden +relationships between nodes' inputs and outputs. Here, we introduce a new class +of models for this setting, by combining temporal GNNs with a novel inventory +module, which learns production functions via attention weights and a special +loss function. We evaluate our models extensively on real supply chains data, +along with data generated from our new open-source simulator, SupplySim. Our +models successfully infer production functions, with a 6-50% improvement over +baselines, and forecast future transactions on real and synthetic data, +outperforming baselines by 11-62%. + +
+
+
+
+
+ + ☆ Unsupervised Reservoir Computing for Multivariate Denoising of Severely + Contaminated Signals + + +
+ The interdependence and high dimensionality of multivariate signals present +significant challenges for denoising, as conventional univariate methods often +struggle to capture the complex interactions between variables. A successful +approach must consider not only the multivariate dependencies of the desired +signal but also the multivariate dependencies of the interfering noise. In our +previous research, we introduced a method using machine learning to extract the +maximum portion of ``predictable information" from univariate signal. We extend +this approach to multivariate signals, with the key idea being to properly +incorporate the interdependencies of the noise back into the interdependent +reconstruction of the signal. The method works successfully for various +multivariate signals, including chaotic signals and highly oscillating +sinusoidal signals which are corrupted by spatially correlated intensive noise. +It consistently outperforms other existing multivariate denoising methods +across a wide range of scenarios. + +
+
+ comment: 6pages, 2figures, 2tables +
+
+
+
+
+ + ☆ FLUE: Federated Learning with Un-Encrypted model weights + + +
+ Federated Learning enables diverse devices to collaboratively train a shared +model while keeping training data locally stored, avoiding the need for +centralized cloud storage. Despite existing privacy measures, concerns arise +from potential reverse engineering of gradients, even with added noise, +revealing private data. To address this, recent research emphasizes using +encrypted model parameters during training. This paper introduces a novel +federated learning algorithm, leveraging coded local gradients without +encryption, exchanging coded proxies for model parameters, and injecting +surplus noise for enhanced privacy. Two algorithm variants are presented, +showcasing convergence and learning rates adaptable to coding schemes and raw +data characteristics. Two encryption-free implementations with fixed and random +coding matrices are provided, demonstrating promising simulation results from +both federated optimization and machine learning perspectives. + +
+
+
+
+
+ + ☆ FairAIED: Navigating Fairness, Bias, and Ethics in Educational AI + Applications + + +
+ The integration of Artificial Intelligence (AI) into education has +transformative potential, providing tailored learning experiences and creative +instructional approaches. However, the inherent biases in AI algorithms hinder +this improvement by unintentionally perpetuating prejudice against specific +demographics, especially in human-centered applications like education. This +survey delves deeply into the developing topic of algorithmic fairness in +educational contexts, providing a comprehensive evaluation of the diverse +literature on fairness, bias, and ethics in AI-driven educational applications. +It identifies the common forms of biases, such as data-related, algorithmic, +and user-interaction, that fundamentally undermine the accomplishment of +fairness in AI teaching aids. By outlining existing techniques for mitigating +these biases, ranging from varied data gathering to algorithmic fairness +interventions, the survey emphasizes the critical role of ethical +considerations and legal frameworks in shaping a more equitable educational +environment. Furthermore, it guides readers through the complexities of +fairness measurements, methods, and datasets, shedding light on the way to bias +reduction. Despite these gains, this survey highlights long-standing issues, +such as achieving a balance between fairness and accuracy, as well as the need +for diverse datasets. Overcoming these challenges and ensuring the ethical and +fair use of AI's promise in education call for a collaborative, +interdisciplinary approach. + +
+
+
+
+
+ + ☆ AutoRDF2GML: Facilitating RDF Integration in Graph Machine Learning ISWC'24 + + +
+ In this paper, we introduce AutoRDF2GML, a framework designed to convert RDF +data into data representations tailored for graph machine learning tasks. +AutoRDF2GML enables, for the first time, the creation of both content-based +features -- i.e., features based on RDF datatype properties -- and +topology-based features -- i.e., features based on RDF object properties. +Characterized by automated feature extraction, AutoRDF2GML makes it possible +even for users less familiar with RDF and SPARQL to generate data +representations ready for graph machine learning tasks, such as link +prediction, node classification, and graph classification. Furthermore, we +present four new benchmark datasets for graph machine learning, created from +large RDF knowledge graphs using our framework. These datasets serve as +valuable resources for evaluating graph machine learning approaches, such as +graph neural networks. Overall, our framework effectively bridges the gap +between the Graph Machine Learning and Semantic Web communities, paving the way +for RDF-based machine learning applications. + +
+
+ comment: accepted at ISWC'24 +
+
+
+
+
+ + ☆ A Physics-Informed Neural Network-Based Approach for the Spatial + Upsampling of Spherical Microphone Arrays + + +
+ Spherical microphone arrays are convenient tools for capturing the spatial +characteristics of a sound field. However, achieving superior spatial +resolution requires arrays with numerous capsules, consequently leading to +expensive devices. To address this issue, we present a method for spatially +upsampling spherical microphone arrays with a limited number of capsules. Our +approach exploits a physics-informed neural network with Rowdy activation +functions, leveraging physical constraints to provide high-order microphone +array signals, starting from low-order devices. Results show that, within its +domain of application, our approach outperforms a state of the art method based +on signal processing for spherical microphone arrays upsampling. + +
+
+ comment: Accepted for publication at IWAENC 2024 +
+
+
+
+
+ + ☆ LLASP: Fine-tuning Large Language Models for Answer Set Programming + + +
+ Recently, Large Language Models (LLMs) have showcased their potential in +various natural language processing tasks, including code generation. However, +while significant progress has been made in adapting LLMs to generate code for +several imperative programming languages and tasks, there remains a notable gap +in their application to declarative formalisms, such as Answer Set Programming +(ASP). In this paper, we move a step towards exploring the capabilities of LLMs +for ASP code generation. First, we perform a systematic evaluation of several +state-of-the-art LLMs. Despite their power in terms of number of parameters, +training data and computational resources, empirical results demonstrate +inadequate performances in generating correct ASP programs. Therefore, we +propose LLASP, a fine-tuned lightweight model specifically trained to encode +fundamental ASP program patterns. To this aim, we create an ad-hoc dataset +covering a wide variety of fundamental problem specifications that can be +encoded in ASP. Our experiments demonstrate that the quality of ASP programs +generated by LLASP is remarkable. This holds true not only when compared to the +non-fine-tuned counterpart but also when compared to the majority of eager LLM +candidates, particularly from a semantic perspective. All the code and data +used to perform the experiments are publicly available at +https://anonymous.4open.science/r/LLASP-D86C/. + +
+
+
+
+
+ + ☆ Cluster-norm for Unsupervised Probing of Knowledge + + +
+ The deployment of language models brings challenges in generating reliable +information, especially when these models are fine-tuned using human +preferences. To extract encoded knowledge without (potentially) biased human +labels, unsupervised probing techniques like Contrast-Consistent Search (CCS) +have been developed (Burns et al., 2022). However, salient but unrelated +features in a given dataset can mislead these probes (Farquhar et al., 2023). +Addressing this, we propose a cluster normalization method to minimize the +impact of such features by clustering and normalizing activations of contrast +pairs before applying unsupervised probing techniques. While this approach does +not address the issue of differentiating between knowledge in general and +simulated knowledge - a major issue in the literature of latent knowledge +elicitation (Christiano et al., 2021) - it significantly improves the ability +of unsupervised probes to identify the intended knowledge amidst distractions. + +
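A hedged sketch of the cluster-normalization step described above: cluster the contrast-pair activations, then standardize activations within each cluster before handing them to CCS or another unsupervised probe. The activation shapes, the clustering representation, and the cluster count are illustrative assumptions.

# Sketch: cluster-wise normalization of contrast-pair activations before probing.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
acts_pos = rng.normal(size=(2000, 128))    # activations for "statement is true" prompts
acts_neg = rng.normal(size=(2000, 128))    # activations for the negated prompts

# Cluster on the concatenated pair representation (one possible choice).
pair_repr = np.concatenate([acts_pos, acts_neg], axis=1)
labels = KMeans(n_clusters=10, n_init=10, random_state=0).fit_predict(pair_repr)

def cluster_normalize(acts, labels):
    out = np.empty_like(acts)
    for c in np.unique(labels):
        idx = labels == c
        mu, sigma = acts[idx].mean(0), acts[idx].std(0) + 1e-6
        out[idx] = (acts[idx] - mu) / sigma       # remove cluster-specific offsets
    return out

acts_pos_norm = cluster_normalize(acts_pos, labels)
acts_neg_norm = cluster_normalize(acts_neg, labels)
# acts_pos_norm / acts_neg_norm would then be fed to CCS or another unsupervised probe.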
+
+ comment: 34 pages, 35 figures +
+
+
+
+
+ + ☆ Finite Neural Networks as Mixtures of Gaussian Processes: From Provable + Error Bounds to Prior Selection + + +
+ Infinitely wide or deep neural networks (NNs) with independent and +identically distributed (i.i.d.) parameters have been shown to be equivalent to +Gaussian processes. Because of the favorable properties of Gaussian processes, +this equivalence is commonly employed to analyze neural networks and has led to +various breakthroughs over the years. However, neural networks and Gaussian +processes are equivalent only in the limit; in the finite case there are +currently no methods available to approximate a trained neural network with a +Gaussian model with bounds on the approximation error. In this work, we present +an algorithmic framework to approximate a neural network of finite width and +depth, and with not necessarily i.i.d. parameters, with a mixture of Gaussian +processes with error bounds on the approximation error. In particular, we +consider the Wasserstein distance to quantify the closeness between +probabilistic models and, by relying on tools from optimal transport and +Gaussian processes, we iteratively approximate the output distribution of each +layer of the neural network as a mixture of Gaussian processes. Crucially, for +any NN and $\epsilon >0$ our approach is able to return a mixture of Gaussian +processes that is $\epsilon$-close to the NN at a finite set of input points. +Furthermore, we rely on the differentiability of the resulting error bound to +show how our approach can be employed to tune the parameters of a NN to mimic +the functional behavior of a given Gaussian process, e.g., for prior selection +in the context of Bayesian inference. We empirically investigate the +effectiveness of our results on both regression and classification problems +with various neural network architectures. Our experiments highlight how our +results can represent an important step towards understanding neural network +predictions and formally quantifying their uncertainty. + +
+
+
+
+
+ + ☆ Adaptive Contrastive Search: Uncertainty-Guided Decoding for Open-Ended + Text Generation + + +
+ Decoding from the output distributions of large language models to produce high-quality text is a complex challenge in language modeling. Various approaches, such as beam search, sampling with temperature, top-$k$ sampling, nucleus $p$-sampling, typical decoding, contrastive decoding, and contrastive search, have been proposed to address this problem, aiming to improve coherence, diversity, as well as resemblance to human-generated text. In this study, we introduce adaptive contrastive search, a novel decoding strategy extending contrastive search by incorporating an adaptive degeneration penalty, guided by the estimated uncertainty of the model at each generation step. This strategy is designed to enhance both the creativity and diversity of the language modeling process while at the same time producing coherent and high-quality generated text output. Our findings indicate performance enhancement in both aspects, across different model architectures and datasets, underscoring the effectiveness of our method in text generation tasks. Our code base, datasets, and models are publicly available.
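To make the decoding rule concrete, the sketch below scores the top-k candidates with a contrastive-search-style objective, (1 - alpha) * model confidence minus alpha * maximum similarity to the context, and ties alpha to the normalized entropy of the next-token distribution as a stand-in for the paper's uncertainty estimate. All inputs are synthetic and the exact adaptive rule in the paper may differ.

# Sketch: contrastive-search scoring with an entropy-adapted degeneration penalty.
import numpy as np

def select_next_token(probs, candidate_embs, context_embs, k=5):
    top_k = np.argsort(-probs)[:k]
    # Normalized entropy in [0, 1] as a simple model-uncertainty proxy.
    entropy = -(probs * np.log(probs + 1e-12)).sum() / np.log(len(probs))
    alpha = entropy                                   # higher uncertainty -> stronger penalty
    def cos(a, b):
        return a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)
    scores = []
    for tok in top_k:
        degeneration = max(cos(candidate_embs[tok], h) for h in context_embs)
        scores.append((1 - alpha) * probs[tok] - alpha * degeneration)
    return top_k[int(np.argmax(scores))]

rng = np.random.default_rng(0)
vocab, dim = 100, 16
probs = rng.dirichlet(np.ones(vocab))
candidate_embs = rng.normal(size=(vocab, dim))        # hidden states if each candidate were appended
context_embs = rng.normal(size=(12, dim))             # hidden states of tokens generated so far
print("chosen token id:", select_next_token(probs, candidate_embs, context_embs))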
+
+
+
+
+ + ☆ Deep learning for predicting the occurrence of tipping points + + +
+ Tipping points occur in many real-world systems, at which the system shifts suddenly from one state to another. The ability to predict the occurrence of tipping points from time series data remains an outstanding challenge and a major interest in a broad range of research fields. Particularly, the widely used methods based on bifurcation theory are neither reliable in prediction accuracy nor applicable for irregularly-sampled time series which are commonly observed from real-world systems. Here we address this challenge by developing a deep learning algorithm for predicting the occurrence of tipping points in untrained systems, by exploiting information about normal forms. Our algorithm not only outperforms traditional methods for regularly-sampled model time series but also achieves accurate predictions for irregularly-sampled model time series and empirical time series. Our ability to predict tipping points for complex systems paves the way for risk mitigation, prevention of catastrophic failures, and restoration of degraded systems, with broad applications in social science, engineering, and biology.
+
+
+
+
+ + ☆ Graph Neural Networks for Virtual Sensing in Complex Systems: Addressing + Heterogeneous Temporal Dynamics + + +
+ Real-time condition monitoring is crucial for the reliable and efficient operation of complex systems. However, relying solely on physical sensors can be limited due to their cost, placement constraints, or inability to directly measure certain critical parameters. Virtual sensing addresses these limitations by leveraging readily available sensor data and system knowledge to estimate inaccessible parameters or infer system states. The increasing complexity of industrial systems necessitates deployments of sensors with diverse modalities to provide a comprehensive understanding of system states. These sensors capture data at varying frequencies to monitor both rapid and slowly varying system dynamics, as well as local and global state evolutions of the systems. This leads to heterogeneous temporal dynamics, which, particularly under varying operational and environmental conditions, pose a significant challenge for accurate virtual sensing. To address this, we propose a Heterogeneous Temporal Graph Neural Network (HTGNN) framework. HTGNN explicitly models signals from diverse sensors and integrates operating conditions into the model architecture. We evaluate HTGNN using two newly released datasets: a bearing dataset with diverse load conditions for bearing load prediction and a year-long simulated dataset for predicting bridge live loads. Our results demonstrate that HTGNN significantly outperforms established baseline methods in both tasks, particularly under highly varying operating conditions. These results highlight HTGNN's potential as a robust and accurate virtual sensing approach for complex systems, paving the way for improved monitoring, predictive maintenance, and enhanced system performance.
+
+ comment: This paper extends our previous conference paper (Best Paper at + European Conference of the PHM Society 2024, + https://doi.org/10.36001/phme.2024.v8i1.3998) +
+
+
+
+
+ + ☆ Rapid Object Annotation + + +
+ In this report we consider the problem of rapidly annotating a video with +bounding boxes for a novel object. We describe a UI and associated workflow +designed to make this process fast for an arbitrary novel target. + +
+
+
+
+
+ + ☆ Right Now, Wrong Then: Non-Stationary Direct Preference Optimization + under Preference Drift + + +
+ Reinforcement learning from human feedback (RLHF) aligns Large Language Models (LLMs) with human preferences. However, these preferences can often change over time due to external factors (e.g. environment change and societal influence). Consequently, what was wrong then might be right now. Current preference optimization algorithms do not account for temporal preference drift in their modeling, which can lead to severe misalignment. To address this limitation, we use a Dynamic Bradley-Terry model that models preferences via time-dependent reward functions, and propose Non-Stationary Direct Preference Optimisation (NS-DPO). By introducing a discount parameter in the loss function, NS-DPO applies exponential weighting, which proportionally focuses learning on more time-relevant datapoints. We theoretically analyse the convergence of NS-DPO in the offline setting, providing upper bounds on the estimation error caused by non-stationary preferences. Finally, we demonstrate the effectiveness of NS-DPO for fine-tuning LLMs in scenarios with drifting preferences. By simulating preference drift using renowned reward models and modifying popular LLM datasets accordingly, we show that NS-DPO fine-tuned LLMs remain robust under non-stationarity, significantly outperforming baseline algorithms that ignore temporal preference changes, without sacrificing performance in stationary cases.
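A hedged sketch of the discounted loss described above: each preference pair is weighted by gamma raised to its age, so older comparisons contribute less. The log-probability terms below are random placeholders for policy and reference model outputs, and the exact weighting used in NS-DPO may differ.

# Sketch: DPO-style loss with exponential down-weighting of older preference data.
import torch
import torch.nn.functional as F

def ns_dpo_loss(logp_chosen, logp_rejected, ref_chosen, ref_rejected,
                timestamps, t_now, beta=0.1, gamma=0.95):
    margins = beta * ((logp_chosen - ref_chosen) - (logp_rejected - ref_rejected))
    weights = gamma ** (t_now - timestamps).float()   # recent preferences dominate
    per_example = -F.logsigmoid(margins)
    return (weights * per_example).sum() / weights.sum()

n = 64
logp_chosen, logp_rejected = torch.randn(n), torch.randn(n)
ref_chosen, ref_rejected = torch.randn(n), torch.randn(n)
timestamps = torch.randint(0, 10, (n,))               # when each preference was collected
print(ns_dpo_loss(logp_chosen, logp_rejected, ref_chosen, ref_rejected, timestamps, t_now=10))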
+
+ comment: 30 pages, 9 figures +
+
+
+
+
+ + ☆ A dual ensemble classifier used to recognise contaminated multi-channel + EMG and MMG signals in the control of upper limb bioprosthesis + + +
+ Myopotential pattern recognition to decode the intent of the user is the most
+advanced approach to controlling a powered bioprosthesis. Unfortunately, many
+factors make this a difficult problem and achieving acceptable recognition
+quality in real-world conditions is a serious challenge. The aim of the paper is
+to develop a recognition system that will mitigate factors related to
+multimodality and multichannel recording of biosignals and their high
+susceptibility to contamination. The proposed method involves the use of two
+co-operating multiclassifier systems. The first system is composed of one-class
+classifiers related to individual electromyographic (EMG) and mechanomyographic
+(MMG) biosignal recording channels, and its task is to recognise contaminated
+channels. The role of the second system is to recognise the class of movement
+resulting from the patient's intention. The ensemble system consists of base
+classifiers using the representation (extracted features) of biosignals from
+different channels. The system uses a dynamic selection mechanism, eliminating
+those base classifiers that are associated with biosignal channels that are
+recognised by the one-class ensemble system as being contaminated. Experimental
+studies were conducted using signals from an able-bodied person with simulation
+of amputation. The results obtained allow us to reject the null hypothesis that
+the application of the dual ensemble does not lead to improved classification
+quality.
+
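As a rough illustration of the two-ensemble idea (per-channel one-class contamination detectors plus dynamically selected per-channel movement classifiers), here is a sketch using scikit-learn. The synthetic data, the choice of OneClassSVM and logistic regression, and the majority-vote combiner are our assumptions, not the paper's exact design.

```python
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression

# Synthetic stand-in data: per-channel feature vectors and movement-class labels.
rng = np.random.default_rng(0)
n_channels, n_samples, n_feats, n_classes = 8, 300, 12, 5
X_train = [rng.normal(size=(n_samples, n_feats)) for _ in range(n_channels)]
y_train = rng.integers(0, n_classes, size=n_samples)

# First ensemble: a one-class detector per EMG/MMG channel flags contaminated channels.
detectors = [OneClassSVM(nu=0.1).fit(X_train[c]) for c in range(n_channels)]
# Second ensemble: a base movement classifier per channel.
bases = [LogisticRegression(max_iter=1000).fit(X_train[c], y_train) for c in range(n_channels)]

def classify(sample_per_channel):
    # Dynamic selection: drop base classifiers whose channel is flagged as contaminated.
    clean = [c for c in range(n_channels)
             if detectors[c].predict(sample_per_channel[c].reshape(1, -1))[0] == 1]
    used = clean if clean else list(range(n_channels))  # fall back to all channels
    votes = [bases[c].predict(sample_per_channel[c].reshape(1, -1))[0] for c in used]
    return np.bincount(votes, minlength=n_classes).argmax()

print(classify([rng.normal(size=n_feats) for _ in range(n_channels)]))
```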
+
+
+
+
+ + ☆ A Survey on Cell Nuclei Instance Segmentation and Classification: + Leveraging Context and Attention + + +
+ Manually annotating nuclei from the gigapixel Hematoxylin and Eosin
+(H&E)-stained Whole Slide Images (WSIs) is a laborious and costly task, meaning
+automated algorithms for cell nuclei instance segmentation and classification
+could alleviate the workload of pathologists and clinical researchers and at
+the same time facilitate the automatic extraction of clinically interpretable
+features. However, due to the high intra- and inter-class variability of nuclei
+morphological and chromatic features, as well as the susceptibility of H&E
+stains to artefacts, state-of-the-art algorithms cannot correctly detect and
+classify instances with the necessary performance. In this work, we hypothesise
+that context and attention inductive biases in artificial neural networks (ANNs)
+could increase the generalization of algorithms for cell nuclei instance
+segmentation and classification. We conduct a thorough survey on context and
+attention methods for cell nuclei instance segmentation and classification from
+H&E-stained microscopy imaging, while providing a comprehensive discussion of
+the challenges being tackled with context and attention. In addition, we
+illustrate some limitations of current approaches and present ideas for future
+research. As a case study, we extend both a general instance segmentation and
+classification method (Mask-RCNN) and a tailored cell nuclei instance
+segmentation and classification model (HoVer-Net) with context- and
+attention-based mechanisms, and do a comparative analysis on a multi-centre
+colon nuclei identification and counting dataset. Although pathologists rely on
+context at multiple levels while paying attention to specific Regions of
+Interest (RoIs) when analysing and annotating WSIs, our findings suggest that
+translating that domain knowledge into algorithm design is no trivial task, and
+that a deeper scientific understanding of these methods is needed to fully
+exploit these mechanisms.
+
+
+
+
+
+ + ☆ Adversarial Robustification via Text-to-Image Diffusion Models + + +
+ Adversarial robustness has been conventionally believed as a challenging
+property to encode for neural networks, requiring plenty of training data. In
+the recent paradigm of adopting off-the-shelf models, however, access to their
+training data is often infeasible or not practical, while most such models were
+not originally trained with adversarial robustness in mind. In this paper, we
+develop a scalable and model-agnostic solution to achieve adversarial
+robustness without using any data. Our intuition is to view recent
+text-to-image diffusion models as "adaptable" denoisers that can be optimized
+to specify target tasks. Based on this, we propose: (a) to initiate a
+denoise-and-classify pipeline that offers provable guarantees against
+adversarial attacks, and (b) to leverage a few synthetic reference images
+generated from the text-to-image model that enable novel adaptation schemes.
+Our experiments show that our data-free scheme applied to the pre-trained CLIP
+could improve the (provable) adversarial robustness of its diverse zero-shot
+classification derivatives (while maintaining their accuracy), significantly
+surpassing prior approaches that utilize the full training data. Beyond CLIP,
+we also demonstrate that our framework can be easily applied to robustify other
+visual classifiers efficiently.
+
+
+ comment: Code is available at https://github.com/ChoiDae1/robustify-T2I +
+
+
+
+
+ + ☆ Aspects of importance sampling in parameter selection for neural + networks using ridgelet transform + + +
+ The choice of parameters in neural networks is crucial to performance, and an
+oracle distribution derived from the ridgelet transform enables us to obtain
+suitable initial parameters. In other words, the distribution of parameters is
+connected to the integral representation of target functions. The oracle
+distribution allows us to avoid the conventional backpropagation learning
+process; only a linear regression is enough to construct the neural network in
+simple cases. This study provides a new look at oracle distributions and
+ridgelet transforms, namely from the viewpoint of importance sampling. In
+addition, we propose extensions of the parameter sampling methods. We
+demonstrate the importance-sampling viewpoint and the proposed sampling
+algorithms via one-dimensional and high-dimensional examples; the results imply
+that the magnitude of weight parameters could be more crucial than the
+intercept parameters.
+
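The "sample the hidden-layer parameters, then fit only the output layer by linear regression" workflow can be sketched as below. The proposal distribution here is an arbitrary Gaussian rather than the oracle/ridgelet-derived distribution discussed in the paper, so treat it only as an illustration of the workflow.

```python
import numpy as np

# Minimal sketch: draw hidden-layer parameters from a proposal distribution,
# then fit only the output layer by linear regression (no backpropagation).
rng = np.random.default_rng(0)
X = rng.uniform(-1, 1, size=(500, 1))
y = np.sin(3 * X[:, 0])

n_hidden = 200
W = rng.normal(0, 5.0, size=(n_hidden, 1))    # weight magnitudes (proposal, not the oracle)
b = rng.uniform(-1, 1, size=n_hidden)         # intercept parameters
H = np.tanh(X @ W.T + b)                      # hidden-layer activations

coef, *_ = np.linalg.lstsq(H, y, rcond=None)  # linear regression on the output layer
y_hat = H @ coef
print("train MSE:", np.mean((y - y_hat) ** 2))
```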
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Achieving interpretable machine learning by functional decomposition of + black-box models into explainable predictor effects + + +
+ Machine learning (ML) has seen significant growth in both popularity and +importance. The high prediction accuracy of ML models is often achieved through +complex black-box architectures that are difficult to interpret. This +interpretability problem has been hindering the use of ML in fields like +medicine, ecology and insurance, where an understanding of the inner workings +of the model is paramount to ensure user acceptance and fairness. The need for +interpretable ML models has boosted research in the field of interpretable +machine learning (IML). Here we propose a novel approach for the functional +decomposition of black-box predictions, which is considered a core concept of +IML. The idea of our method is to replace the prediction function by a +surrogate model consisting of simpler subfunctions. Similar to additive +regression models, these functions provide insights into the direction and +strength of the main feature contributions and their interactions. Our method +is based on a novel concept termed stacked orthogonality, which ensures that +the main effects capture as much functional behavior as possible and do not +contain information explained by higher-order interactions. Unlike earlier +functional IML approaches, it is neither affected by extrapolation nor by +hidden feature interactions. To compute the subfunctions, we propose an +algorithm based on neural additive modeling and an efficient post-hoc +orthogonalization procedure. + +
+
+
+
+
+ + ☆ Fast and Reliable Probabilistic Reflectometry Inversion with + Prior-Amortized Neural Posterior Estimation + + +
+ Reconstructing the structure of thin films and multilayers from measurements +of scattered X-rays or neutrons is key to progress in physics, chemistry, and +biology. However, finding all structures compatible with reflectometry data is +computationally prohibitive for standard algorithms, which typically results in +unreliable analysis with only a single potential solution identified. We +address this lack of reliability with a probabilistic deep learning method that +identifies all realistic structures in seconds, setting new standards in +reflectometry. Our method, Prior-Amortized Neural Posterior Estimation (PANPE), +combines simulation-based inference with novel adaptive priors that inform the +inference network about known structural properties and controllable +experimental conditions. PANPE networks support key scenarios such as +high-throughput sample characterization, real-time monitoring of evolving +structures, or the co-refinement of several experimental data sets, and can be +adapted to provide fast, reliable, and flexible inference across many other +inverse problems. + +
+
+
+
+
+ + ☆ Contrastive Learning of Asset Embeddings from Financial Time Series + + +
+ Representation learning has emerged as a powerful paradigm for extracting +valuable latent features from complex, high-dimensional data. In financial +domains, learning informative representations for assets can be used for tasks +like sector classification, and risk management. However, the complex and +stochastic nature of financial markets poses unique challenges. We propose a +novel contrastive learning framework to generate asset embeddings from +financial time series data. Our approach leverages the similarity of asset +returns over many subwindows to generate informative positive and negative +samples, using a statistical sampling strategy based on hypothesis testing to +address the noisy nature of financial data. We explore various contrastive loss +functions that capture the relationships between assets in different ways to +learn a discriminative representation space. Experiments on real-world datasets +demonstrate the effectiveness of the learned asset embeddings on benchmark +industry classification and portfolio optimization tasks. In each case our +novel approaches significantly outperform existing baselines highlighting the +potential for contrastive learning to capture meaningful and actionable +relationships in financial data. + +
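One of several contrastive objectives consistent with the description above is a standard InfoNCE loss over embeddings of return subwindows. The shapes and normalization below are our assumptions, and the statistical (hypothesis-testing) positive/negative sampling step is omitted.

```python
import torch
import torch.nn.functional as F

def info_nce(anchor, positive, negatives, temperature=0.1):
    """Generic InfoNCE sketch. anchor/positive: (B, D) embeddings of the same
    asset from different return subwindows; negatives: (B, K, D) embeddings of
    other assets treated as negative samples."""
    anchor = F.normalize(anchor, dim=-1)
    positive = F.normalize(positive, dim=-1)
    negatives = F.normalize(negatives, dim=-1)
    pos_logits = (anchor * positive).sum(-1, keepdim=True) / temperature      # (B, 1)
    neg_logits = torch.einsum("bd,bkd->bk", anchor, negatives) / temperature  # (B, K)
    logits = torch.cat([pos_logits, neg_logits], dim=1)
    labels = torch.zeros(anchor.size(0), dtype=torch.long)  # the positive sits at index 0
    return F.cross_entropy(logits, labels)

loss = info_nce(torch.randn(32, 64), torch.randn(32, 64), torch.randn(32, 8, 64))
```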
+
+ comment: 9 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Vulnerability Detection in Ethereum Smart Contracts via Machine + Learning: A Qualitative Analysis + + +
+ Smart contracts are central to a myriad of critical blockchain applications, +from financial transactions to supply chain management. However, their adoption +is hindered by security vulnerabilities that can result in significant +financial losses. Most vulnerability detection tools and methods available +nowadays leverage either static analysis methods or machine learning. +Unfortunately, as valuable as they are, both approaches suffer from limitations +that make them only partially effective. In this survey, we analyze the state +of the art in machine-learning vulnerability detection for Ethereum smart +contracts, by categorizing existing tools and methodologies, evaluating them, +and highlighting their limitations. Our critical assessment unveils issues such +as restricted vulnerability coverage and dataset construction flaws, providing +us with new metrics to overcome the difficulties that restrain a sound +comparison of existing solutions. Driven by our findings, we discuss best +practices to enhance the accuracy, scope, and efficiency of vulnerability +detection in smart contracts. Our guidelines address the known flaws while at +the same time opening new avenues for research and development. By shedding +light on current challenges and offering novel directions for improvement, we +contribute to the advancement of secure smart contract development and +blockchain technology as a whole. + +
+
+
+
+
+ + ☆ Robust VAEs via Generating Process of Noise Augmented Data + + +
+ Advancing defensive mechanisms against adversarial attacks in generative
+models is a critical research topic in machine learning. Our study focuses on a
+specific type of generative models - Variational Auto-Encoders (VAEs). Contrary
+to common beliefs and existing literature which suggest that noise injection
+into training data can make models more robust, our preliminary experiments
+revealed that naive use of the noise augmentation technique did not
+substantially improve VAE robustness. In fact, it even degraded the quality of
+learned representations, making VAEs more susceptible to adversarial
+perturbations. This paper introduces a novel framework that enhances robustness
+by regularizing the latent space divergence between original and
+noise-augmented data. Through incorporating a paired probabilistic prior into
+the standard variational lower bound, our method significantly boosts defense
+against adversarial attacks. Our empirical evaluations demonstrate that this
+approach, termed Robust Augmented Variational Auto-ENcoder (RAVEN), yields
+superior performance in resisting adversarial inputs on widely-recognized
+benchmark datasets.
+
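A minimal sketch of the kind of regularized objective described above: the standard VAE ELBO plus a term penalizing the divergence between the posterior of a noise-augmented input and that of the original input. Function and argument names are hypothetical, and the exact form of the paper's paired prior may differ.

```python
import torch
from torch.distributions import Normal, kl_divergence

def paired_vae_loss(recon, x, mu_clean, logvar_clean, mu_noisy, logvar_noisy, lam=1.0):
    """Sketch only: VAE ELBO with an extra latent-divergence regularizer that ties
    the noisy-input posterior to the clean-input posterior (direction of the KL
    term is an arbitrary choice here)."""
    q_clean = Normal(mu_clean, (0.5 * logvar_clean).exp())
    q_noisy = Normal(mu_noisy, (0.5 * logvar_noisy).exp())
    prior = Normal(torch.zeros_like(mu_clean), torch.ones_like(mu_clean))
    recon_term = torch.nn.functional.mse_loss(recon, x, reduction="sum")
    kl_prior = kl_divergence(q_clean, prior).sum()
    kl_pair = kl_divergence(q_noisy, q_clean).sum()  # latent-space divergence regularizer
    return recon_term + kl_prior + lam * kl_pair
```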
+
+
+
+
+ + ☆ CardioLab: Laboratory Values Estimation from Electrocardiogram Features + -- An Exploratory Study + + +
+ Introduction: Laboratory values represent a cornerstone of medical
+diagnostics, but they suffer from slow turnaround times and high costs, and
+only provide information about a single point in time. The continuous
+estimation of laboratory values from non-invasive data such as the
+electrocardiogram (ECG) would therefore mark a significant frontier in
+healthcare monitoring. Despite its transformative potential, this domain
+remains relatively underexplored within the medical community.
+ Methods: In this preliminary study, we used a publicly available dataset
+(MIMIC-IV-ECG) to investigate the feasibility of inferring laboratory values
+from ECG features and patient demographics using tree-based models (XGBoost).
+We define the prediction task as a binary problem of predicting whether the lab
+value falls into the low or high abnormal range. The model performance can then
+be assessed using AUROC.
+ Results: Our findings demonstrate promising results in the estimation of
+laboratory values related to different organ systems based on a small yet
+comprehensive set of features. While further research and validation are
+warranted to fully assess the clinical utility and generalizability of
+ECG-based estimation in healthcare monitoring, our findings lay the groundwork
+for future investigations into approaches to laboratory value estimation using
+ECG data. Such advancements hold promise for revolutionizing predictive
+healthcare applications, offering faster, non-invasive, and more affordable
+means of patient monitoring.
+
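The described setup (tree-based binary classification of lab-value abnormality from ECG features and demographics, evaluated by AUROC) maps directly onto a short XGBoost sketch; the synthetic features below merely stand in for real MIMIC-IV-ECG data.

```python
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Synthetic stand-in: ECG-derived features plus demographics -> abnormal vs. normal lab value.
rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 20))  # e.g. intervals, amplitudes, heart rate, age, sex
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.5, size=2000) > 0).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
model = XGBClassifier(n_estimators=200, max_depth=4, eval_metric="logloss")
model.fit(X_tr, y_tr)
print("AUROC:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))
```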
+
+ comment: 5 pages, code under https://github.com/AI4HealthUOL/CardioLab +
+
+
+
+
+ + ☆ Multi-Agent Deep Reinforcement Learning for Energy Efficient Multi-Hop + STAR-RIS-Assisted Transmissions + + +
+ Simultaneously transmitting and reflecting reconfigurable intelligent surface
+(STAR-RIS) provides a promising way to expand coverage in wireless
+communications. However, the limitations of a single STAR-RIS inspire us to
+integrate the concept of multi-hop transmissions, as has been explored for RIS
+in existing research. Therefore, we propose the novel architecture of multi-hop
+STAR-RISs to achieve a wider range of full-plane service coverage. In this
+paper, we aim to jointly solve the active beamforming of the base station and
+the passive beamforming of the STAR-RISs, maximizing the energy efficiency
+under the hardware limitations of STAR-RISs. Furthermore, we investigate the
+impact of the on-off state of STAR-RIS elements on energy efficiency. To tackle
+the complex problem, a Multi-Agent Global and locAl deep Reinforcement learning
+(MAGAR) algorithm is designed. The global agent elevates the collaboration
+among local agents, which focus on individual learning. In numerical results,
+we observe the significant improvement of MAGAR compared to the other
+benchmarks, including Q-learning, multi-agent deep Q network (DQN) with global
+reward, and multi-agent DQN with local rewards. Moreover, the proposed
+architecture of multi-hop STAR-RISs achieves the highest energy efficiency
+compared to mode switching based STAR-RISs, conventional RISs and deployment
+without RISs or STAR-RISs.
+
+
+ comment: Accepted by Proc. IEEE VTC-fall +
+
+
+
+
+ + ☆ Dual-Decoupling Learning and Metric-Adaptive Thresholding for + Semi-Supervised Multi-Label Learning + + +
+ Semi-supervised multi-label learning (SSMLL) is a powerful framework for
+leveraging unlabeled data to reduce the expensive cost of collecting precise
+multi-label annotations. Unlike semi-supervised learning, one cannot select the
+most probable label as the pseudo-label in SSMLL due to multiple semantics
+contained in an instance. To solve this problem, the mainstream method
+developed an effective thresholding strategy to generate accurate
+pseudo-labels. Unfortunately, this method neglects the quality of model
+predictions and its potential impact on pseudo-labeling performance. In this
+paper, we propose a dual-perspective method to generate high-quality
+pseudo-labels. To improve the quality of model predictions, we perform
+dual-decoupling to boost the learning of correlative and discriminative
+features, while refining the generation and utilization of pseudo-labels. To
+obtain proper class-wise thresholds, we propose the metric-adaptive
+thresholding strategy to estimate the thresholds, which maximize the
+pseudo-label performance for a given metric on labeled data. Experiments on
+multiple benchmark datasets show that the proposed method achieves
+state-of-the-art performance and outperforms the comparative methods by a
+significant margin.
+
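The metric-adaptive thresholding step can be sketched as a per-class grid search that maximizes the chosen metric on the labeled data; the use of F1 and the grid below are illustrative assumptions, not the paper's exact procedure.

```python
import numpy as np
from sklearn.metrics import f1_score

def metric_adaptive_thresholds(probs, labels, grid=np.linspace(0.05, 0.95, 19)):
    """Sketch of the idea: pick, per class, the threshold that maximizes the chosen
    metric (here F1) on the labeled set; these thresholds then binarize the model's
    predicted probabilities into pseudo-labels for the unlabeled data."""
    thresholds = []
    for c in range(labels.shape[1]):
        scores = [f1_score(labels[:, c], probs[:, c] >= t, zero_division=0) for t in grid]
        thresholds.append(grid[int(np.argmax(scores))])
    return np.array(thresholds)

rng = np.random.default_rng(0)
probs = rng.uniform(size=(200, 5))                  # model probabilities on labeled data
labels = (rng.uniform(size=(200, 5)) > 0.7).astype(int)
print(metric_adaptive_thresholds(probs, labels))
```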
+
+
+
+
+ + ☆ Denoising Lévy Probabilistic Models + + +
+ Investigating noise distributions beyond the Gaussian in diffusion generative
+models is an open problem. The Gaussian case has seen success experimentally
+and theoretically, fitting a unified SDE framework for score-based and
+denoising formulations. Recent studies suggest heavy-tailed noise distributions
+can address mode collapse and manage datasets with class imbalance, heavy
+tails, or outliers. Yoon et al. (NeurIPS 2023) introduced the L\'evy-Ito model
+(LIM), extending the SDE framework to heavy-tailed SDEs with $\alpha$-stable
+noise. Despite its theoretical elegance and performance gains, LIM's complex
+mathematics may limit its accessibility and broader adoption. This study takes
+a simpler approach by extending the denoising diffusion probabilistic model
+(DDPM) with $\alpha$-stable noise, creating the denoising L\'evy probabilistic
+model (DLPM). Using elementary proof techniques, we show that DLPM reduces to
+running vanilla DDPM with only minor modifications, allowing existing
+implementations to be reused. DLPM and LIM have different training algorithms
+and, unlike the Gaussian case, they admit different backward processes and
+sampling algorithms. Our experiments demonstrate that DLPM achieves better
+coverage of the tails of the data distribution, improved generation of
+unbalanced datasets, and faster computation times with fewer backward steps.
+
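To make the "vanilla DDPM with α-stable noise" idea concrete, here is a toy forward-noising step that simply swaps Gaussian noise for symmetric α-stable noise via SciPy. The scaling of the noise term is a naive placeholder; the paper's exact parameterization may differ.

```python
import numpy as np
from scipy.stats import levy_stable

def forward_noise(x0, alpha=1.8, a_bar_t=0.5, seed=0):
    """Toy DDPM-style forward step with symmetric alpha-stable noise.
    alpha=2 recovers the Gaussian case; the sqrt(1 - a_bar_t) scaling below is a
    naive placeholder, not necessarily the scaling used by DLPM."""
    eps = levy_stable.rvs(alpha, 0.0, size=x0.shape, random_state=seed)
    return np.sqrt(a_bar_t) * x0 + np.sqrt(1.0 - a_bar_t) * eps

xt = forward_noise(np.random.default_rng(0).normal(size=(4, 8)))
```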
+
+
+
+
+ + ☆ Using GPT-4 to guide causal machine learning + + +
+ Since its introduction to the public, ChatGPT has had an unprecedented +impact. While some experts praised AI advancements and highlighted their +potential risks, others have been critical about the accuracy and usefulness of +Large Language Models (LLMs). In this paper, we are interested in the ability +of LLMs to identify causal relationships. We focus on the well-established +GPT-4 (Turbo) and evaluate its performance under the most restrictive +conditions, by isolating its ability to infer causal relationships based solely +on the variable labels without being given any context, demonstrating the +minimum level of effectiveness one can expect when it is provided with +label-only information. We show that questionnaire participants judge the GPT-4 +graphs as the most accurate in the evaluated categories, closely followed by +knowledge graphs constructed by domain experts, with causal Machine Learning +(ML) far behind. We use these results to highlight the important limitation of +causal ML, which often produces causal graphs that violate common sense, +affecting trust in them. However, we show that pairing GPT-4 with causal ML +overcomes this limitation, resulting in graphical structures learnt from real +data that align more closely with those identified by domain experts, compared +to structures learnt by causal ML alone. Overall, our findings suggest that +despite GPT-4 not being explicitly designed to reason causally, it can still be +a valuable tool for causal representation, as it improves the causal discovery +process of causal ML algorithms that are designed to do just that. + +
+
+
+
+
+ + ☆ A data balancing approach designing of an expert system for Heart + Disease Prediction + + +
+ Heart disease is a major global health concern that results in millions of
+deaths annually. Prevention and effective treatment of heart-related problems
+depend heavily on early detection and accurate prediction. Machine learning
+methods have previously predicted heart disease accurately, and this innovative
+development in healthcare has the power to transform preventative care and save
+a great many lives. The study starts with a thorough assessment of the
+literature that covers a wide range of topics, including pre-processing
+techniques, performance evaluation measures, datasets used in heart disease
+research, predictive modeling strategies, diagnostic methodologies, and current
+issues in the field. Building on these fundamental understandings, the
+background section describes the particular actions conducted in this
+investigation, such as the description of the dataset, data pre-treatment
+techniques, label encoding, feature selection methodology, algorithm selection
+tactics, and stringent performance evaluation techniques. The results indicate
+that ensemble methods, particularly random forests, outperformed individual
+classifiers in predicting heart disease. Key predictors identified included
+hypertension, cholesterol levels, smoking status, and physical inactivity. The
+Decision Tree and Random Forest model achieved an accuracy of 99.83%. This work
+demonstrates how machine learning models, particularly ensemble approaches, can
+increase the precision of heart disease prediction. In comparison to
+conventional techniques, the models offer a more reliable risk assessment since
+they integrate a wide range of variables and sophisticated algorithms. The
+results open the door to tailored healthcare treatments that facilitate early
+identification and treatment of cardiac disease.
+
+
+
+
+
+ + ☆ Climbing the Complexity Ladder with Expressive Attention + + +
+ Attention involves comparing query and key vectors in terms of a scalar
+product, $\mathbf{Q}^T\mathbf{K}$, together with a subsequent softmax
+normalization. Classically, parallel/orthogonal/antiparallel queries and keys
+lead to large/intermediate/small attention weights. Here we study expressive
+attention (EA), which is based on $(\mathbf{Q}^T\mathbf{K})^2$, the squared dot
+product. In this case attention is enhanced when query and key are either
+parallel or antiparallel, and suppressed for orthogonal configurations. For a
+series of autoregressive prediction tasks, we find that EA performs at least as
+well as the standard mechanism, dot-product attention (DPA). Increasing task
+complexity, EA is observed to outperform DPA with increasing margins, which
+also holds for multi-task settings. For a given model size, EA manages to
+achieve 100\% performance for a range of complexity levels not accessible to
+DPA.
+
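Because the mechanism is fully specified by the squared score, it is easy to sketch; the 1/d scaling here is our own addition for numerical stability and is not stated in the abstract.

```python
import torch
import torch.nn.functional as F

def expressive_attention(Q, K, V):
    """Expressive attention: softmax over the squared dot product, so parallel and
    antiparallel query/key pairs both receive large weights, while orthogonal
    pairs are suppressed."""
    d = Q.size(-1)
    scores = (Q @ K.transpose(-2, -1)) ** 2 / d  # (Q^T K)^2; the /d scaling is assumed
    return F.softmax(scores, dim=-1) @ V

out = expressive_attention(torch.randn(2, 5, 16), torch.randn(2, 5, 16), torch.randn(2, 5, 16))
```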
+
+
+
+
+ + ☆ Reinforcement Learning for Sustainable Energy: A Survey + + +
+ The transition to sustainable energy is a key challenge of our time, +requiring modifications in the entire pipeline of energy production, storage, +transmission, and consumption. At every stage, new sequential decision-making +challenges emerge, ranging from the operation of wind farms to the management +of electrical grids or the scheduling of electric vehicle charging stations. +All such problems are well suited for reinforcement learning, the branch of +machine learning that learns behavior from data. Therefore, numerous studies +have explored the use of reinforcement learning for sustainable energy. This +paper surveys this literature with the intention of bridging both the +underlying research communities: energy and machine learning. After a brief +introduction of both fields, we systematically list relevant sustainability +challenges, how they can be modeled as a reinforcement learning problem, and +what solution approaches currently exist in the literature. Afterwards, we zoom +out and identify overarching reinforcement learning themes that appear +throughout sustainability, such as multi-agent, offline, and safe reinforcement +learning. Lastly, we also cover standardization of environments, which will be +crucial for connecting both research fields, and highlight potential directions +for future work. In summary, this survey provides an extensive overview of +reinforcement learning methods for sustainable energy, which may play a vital +role in the energy transition. + +
+
+ comment: 22 pages excluding references, 40 pages including references, 7 + images +
+
+
+
+
+ + ☆ PP-TIL: Personalized Planning for Autonomous Driving with Instance-based + Transfer Imitation Learning + + +
+ Personalized motion planning holds significant importance within urban
+automated driving, catering to the unique requirements of individual users.
+Nevertheless, prior endeavors have frequently encountered difficulties in
+simultaneously addressing two crucial aspects: personalized planning within
+intricate urban settings and enhancing planning performance through data
+utilization. The challenge arises from the expensive and limited nature of user
+data, coupled with the scene state space tending towards infinity. These
+factors contribute to overfitting and poor generalization problems during model
+training. Hence, we propose an instance-based transfer imitation learning
+approach. This method facilitates knowledge transfer from extensive expert
+domain data to the user domain, presenting a fundamental resolution to these
+issues. We initially train a pre-trained model using large-scale expert data.
+Subsequently, during the fine-tuning phase, we feed in batches comprising both
+expert and user data. Employing the inverse reinforcement learning technique,
+we extract the style feature distribution from user demonstrations,
+constructing the regularization term for the approximation of user style. In
+our experiments, we conducted extensive evaluations of the proposed method.
+Compared to the baseline methods, our approach mitigates the overfitting issue
+caused by sparse user data. Furthermore, we discovered that integrating the
+driving model with a differentiable nonlinear optimizer as a safety protection
+layer for end-to-end personalized fine-tuning results in superior planning
+performance.
+
+
+
+
+
+ + ☆ Unveiling Privacy Vulnerabilities: Investigating the Role of Structure + in Graph Data KDD'24 + + +
+ The public sharing of user information opens the door for adversaries to +infer private data, leading to privacy breaches and facilitating malicious +activities. While numerous studies have concentrated on privacy leakage via +public user attributes, the threats associated with the exposure of user +relationships, particularly through network structure, are often neglected. +This study aims to fill this critical gap by advancing the understanding and +protection against privacy risks emanating from network structure, moving +beyond direct connections with neighbors to include the broader implications of +indirect network structural patterns. To achieve this, we first investigate the +problem of Graph Privacy Leakage via Structure (GPS), and introduce a novel +measure, the Generalized Homophily Ratio, to quantify the various mechanisms +contributing to privacy breach risks in GPS. Based on this insight, we develop +a novel graph private attribute inference attack, which acts as a pivotal tool +for evaluating the potential for privacy leakage through network structures +under worst-case scenarios. To protect users' private data from such +vulnerabilities, we propose a graph data publishing method incorporating a +learnable graph sampling technique, effectively transforming the original graph +into a privacy-preserving version. Extensive experiments demonstrate that our +attack model poses a significant threat to user privacy, and our graph data +publishing method successfully achieves the optimal privacy-utility trade-off +compared to baselines. + +
+
+ comment: In KDD'24; with full appendix +
+
+
+
+
+ + ☆ Look Globally and Reason: Two-stage Path Reasoning over Sparse Knowledge + Graphs CIKM 2024 + + +
+ Sparse Knowledge Graphs (KGs), frequently encountered in real-world +applications, contain fewer facts in the form of (head entity, relation, tail +entity) compared to more populated KGs. The sparse KG completion task, which +reasons answers for given queries in the form of (head entity, relation, ?) for +sparse KGs, is particularly challenging due to the necessity of reasoning +missing facts based on limited facts. Path-based models, known for excellent +explainability, are often employed for this task. However, existing path-based +models typically rely on external models to fill in missing facts and +subsequently perform path reasoning. This approach introduces unexplainable +factors or necessitates meticulous rule design. In light of this, this paper +proposes an alternative approach by looking inward instead of seeking external +assistance. We introduce a two-stage path reasoning model called LoGRe (Look +Globally and Reason) over sparse KGs. LoGRe constructs a relation-path +reasoning schema by globally analyzing the training data to alleviate the +sparseness problem. Based on this schema, LoGRe then aggregates paths to reason +out answers. Experimental results on five benchmark sparse KG datasets +demonstrate the effectiveness of the proposed LoGRe model. + +
+
+ comment: Accepted to CIKM 2024 +
+
+
+
+
+ + ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
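Cross attention between the two modalities, independent of the rest of the AVT-CA architecture, can be sketched with PyTorch's built-in multi-head attention; the embedding dimension and frame counts below are arbitrary.

```python
import torch
import torch.nn as nn

# Minimal cross-attention sketch (not the AVT-CA architecture itself): audio features
# attend over video features so each audio frame can emphasize relevant video frames.
cross_attn = nn.MultiheadAttention(embed_dim=256, num_heads=4, batch_first=True)
audio = torch.randn(8, 50, 256)  # (batch, audio frames, dim)
video = torch.randn(8, 30, 256)  # (batch, video frames, dim)
audio_attended, attn_weights = cross_attn(query=audio, key=video, value=video)
print(audio_attended.shape)       # (8, 50, 256): audio frames enriched with video context
```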
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ☆ Utilising Explainable Techniques for Quality Prediction in a Complex + Textiles Manufacturing Use Case + + +
+ This paper develops an approach to classify instances of product failure in a +complex textiles manufacturing dataset using explainable techniques. The +dataset used in this study was obtained from a New Zealand manufacturer of +woollen carpets and rugs. In investigating the trade-off between accuracy and +explainability, three different tree-based classification algorithms were +evaluated: a Decision Tree and two ensemble methods, Random Forest and XGBoost. +Additionally, three feature selection methods were also evaluated: the +SelectKBest method, using chi-squared as the scoring function, the Pearson +Correlation Coefficient, and the Boruta algorithm. Not surprisingly, the +ensemble methods typically produced better results than the Decision Tree +model. The Random Forest model yielded the best results overall when combined +with the Boruta feature selection technique. Finally, a tree ensemble +explaining technique was used to extract rule lists to capture necessary and +sufficient conditions for classification by a trained model that could be +easily interpreted by a human. Notably, several features that were in the +extracted rule lists were statistical features and calculated features that +were added to the original dataset. This demonstrates the influence that +bringing in additional information during the data preprocessing stages can +have on the ultimate model performance. + +
+
+
 comment: Accepted at the 2024 IEEE 20th International Conference on Automation
 Science and Engineering (CASE 2024), awaiting publication. Contains seven
 pages and five figures
+
+
+
+
+ + ☆ Constructing Enhanced Mutual Information for Online Class-Incremental + Learning + + +
+ Online Class-Incremental continual Learning (OCIL) addresses the challenge of
+continuously learning from a single-channel data stream, adapting to new tasks
+while mitigating catastrophic forgetting. Recently, Mutual Information
+(MI)-based methods have shown promising performance in OCIL. However, existing
+MI-based methods treat various knowledge components in isolation, ignoring the
+knowledge confusion across tasks. This narrow focus on simple MI knowledge
+alignment may lead to old tasks being easily forgotten with the introduction of
+new tasks, risking the loss of common parts between past and present
+knowledge. To address this, we analyze the MI relationships from the
+perspectives of diversity, representativeness, and separability, and propose an
+Enhanced Mutual Information (EMI) method based on knowledge decoupling. EMI
+consists of Diversity Mutual Information (DMI), Representativeness Mutual
+Information (RMI) and Separability Mutual Information (SMI). DMI diversifies
+intra-class sample features by considering the similarity relationships among
+inter-class sample features to enable the network to learn more general
+knowledge. RMI summarizes representative features for each category and aligns
+sample features with these representative features, making the intra-class
+sample distribution more compact. SMI establishes MI relationships for
+inter-class representative features, enhancing the stability of representative
+features while increasing the distinction between inter-class representative
+features, thus creating clear boundaries between classes. Extensive
+experimental results on widely used benchmark datasets demonstrate the superior
+performance of EMI over state-of-the-art baseline methods.
+
+
+
+
+
+ + ☆ Is larger always better? Evaluating and prompting large language models + for non-generative medical tasks + + +
+ The use of Large Language Models (LLMs) in medicine is growing, but their +ability to handle both structured Electronic Health Record (EHR) data and +unstructured clinical notes is not well-studied. This study benchmarks various +models, including GPT-based LLMs, BERT-based models, and traditional clinical +predictive models, for non-generative medical tasks utilizing renowned +datasets. We assessed 14 language models (9 GPT-based and 5 BERT-based) and 7 +traditional predictive models using the MIMIC dataset (ICU patient records) and +the TJH dataset (early COVID-19 EHR data), focusing on tasks such as mortality +and readmission prediction, disease hierarchy reconstruction, and biomedical +sentence matching, comparing both zero-shot and finetuned performance. Results +indicated that LLMs exhibited robust zero-shot predictive capabilities on +structured EHR data when using well-designed prompting strategies, frequently +surpassing traditional models. However, for unstructured medical texts, LLMs +did not outperform finetuned BERT models, which excelled in both supervised and +unsupervised tasks. Consequently, while LLMs are effective for zero-shot +learning on structured data, finetuned BERT models are more suitable for +unstructured texts, underscoring the importance of selecting models based on +specific task requirements and data characteristics to optimize the application +of NLP technology in healthcare. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2402.01713 +
+
+
+
+
+ + ☆ DTFormer: A Transformer-Based Method for Discrete-Time Dynamic Graph + Representation Learning + + +
+ Discrete-Time Dynamic Graphs (DTDGs), which are prevalent in real-world
+implementations and notable for their ease of data acquisition, have garnered
+considerable attention from both academic researchers and industry
+practitioners. The representation learning of DTDGs has been extensively
+applied to model the dynamics of temporally changing entities and their
+evolving connections. Currently, DTDG representation learning predominantly
+relies on GNN+RNN architectures, which manifest the inherent limitations of
+both Graph Neural Networks (GNNs) and Recurrent Neural Networks (RNNs). GNNs
+suffer from the over-smoothing issue as the model architecture grows deeper,
+while RNNs struggle to capture long-term dependencies effectively. GNN+RNN
+architectures also grapple with scaling to large graph sizes and long
+sequences. Additionally, these methods often compute node representations
+separately and focus solely on individual node characteristics, thereby
+overlooking the behavior intersections between the two nodes whose link is
+being predicted, such as instances where the two nodes appear together in the
+same context or share common neighbors.
+ This paper introduces a novel representation learning method DTFormer for
+DTDGs, pivoting from the traditional GNN+RNN framework to a Transformer-based
+architecture. Our approach exploits the attention mechanism to concurrently
+process topological information within the graph at each timestamp and temporal
+dynamics of graphs along the timestamps, circumventing the aforementioned
+fundamental weakness of both GNNs and RNNs. Moreover, we enhance the model's
+expressive capability by incorporating the intersection relationships among
+nodes and integrating a multi-patching module. Extensive experiments conducted
+on six public dynamic graph benchmark datasets confirm our model's efficacy,
+achieving the SOTA performance.
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ TCGPN: Temporal-Correlation Graph Pre-trained Network for Stock + Forecasting + + +
+ Recently, the incorporation of both temporal features and the correlation
+across time series has become an effective approach in time series prediction.
+Spatio-Temporal Graph Neural Networks (STGNNs) demonstrate good performance on
+many temporal-correlation forecasting problems. However, when applied to tasks
+lacking periodicity, such as stock data prediction, the effectiveness and
+robustness of STGNNs are found to be unsatisfactory. Moreover, STGNNs are
+constrained by memory consumption and therefore cannot handle problems with a
+large number of nodes. In this paper, we propose a novel approach called the
+Temporal-Correlation Graph Pre-trained Network (TCGPN) to address these
+limitations. TCGPN utilizes a temporal-correlation fusion encoder to obtain a
+mixed representation, combined with a pre-training method built on carefully
+designed temporal and correlation pre-training tasks. The entire structure is
+independent of the number and order of nodes, so better results can be obtained
+through various data augmentations. Memory consumption during training can also
+be significantly reduced through multiple sampling. Experiments are conducted
+on real stock market data sets CSI300 and CSI500 that exhibit minimal
+periodicity. We fine-tune a simple MLP in downstream tasks and achieve
+state-of-the-art results, validating the capability to capture more robust
+temporal correlation patterns.
+
+
+
+
+
+ + ☆ WorkR: Occupation Inference for Intelligent Task Assistance + + +
+ Occupation information can be utilized by digital assistants to provide +occupation-specific personalized task support, including interruption +management, task planning, and recommendations. Prior research in the digital +workplace assistant domain requires users to input their occupation information +for effective support. However, as many individuals switch between multiple +occupations daily, current solutions falter without continuous user input. To +address this, this study introduces WorkR, a framework that leverages passive +sensing to capture pervasive signals from various task activities, addressing +three challenges: the lack of a passive sensing architecture, personalization +of occupation characteristics, and discovering latent relationships among +occupation variables. We argue that signals from application usage, movements, +social interactions, and the environment can inform a user's occupation. WorkR +uses a Variational Autoencoder (VAE) to derive latent features for training +models to infer occupations. Our experiments with an anonymized, context-rich +activity and task log dataset demonstrate that our models can accurately infer +occupations with more than 91% accuracy across six ISO occupation categories. + +
+
+
+
+
+ + ☆ The formation of perceptual space in early phonetic acquisition: a + cross-linguistic modeling approach + + +
+ This study investigates how learners organize perceptual space in early +phonetic acquisition by advancing previous studies in two key aspects. Firstly, +it examines the shape of the learned hidden representation as well as its +ability to categorize phonetic categories. Secondly, it explores the impact of +training models on context-free acoustic information, without involving +contextual cues, on phonetic acquisition, closely mimicking the early language +learning stage. Using a cross-linguistic modeling approach, autoencoder models +are trained on English and Mandarin and evaluated in both native and non-native +conditions, following experimental conditions used in infant language +perception studies. The results demonstrate that unsupervised bottom-up +training on context-free acoustic information leads to comparable learned +representations of perceptual space between native and non-native conditions +for both English and Mandarin, resembling the early stage of universal +listening in infants. These findings provide insights into the organization of +perceptual space during early phonetic acquisition and contribute to our +understanding of the formation and representation of phonetic categories. + +
+
+ comment: 51 pages +
+
+
+
+
+ + ☆ Towards More Accurate Prediction of Human Empathy and Emotion in Text + and Multi-turn Conversations by Combining Advanced NLP, Transformers-based + Networks, and Linguistic Methodologies + + +
+ Based on the WASSA 2022 Shared Task on Empathy Detection and Emotion +Classification, we predict the level of empathic concern and personal distress +displayed in essays. For the first stage of this project we implemented a +Feed-Forward Neural Network using sentence-level embeddings as features. We +experimented with four different embedding models for generating the inputs to +the neural network. The subsequent stage builds upon the previous work and we +have implemented three types of revisions. The first revision focuses on the +enhancements to the model architecture and the training approach. The second +revision focuses on handling class imbalance using stratified data sampling. +The third revision focuses on leveraging lexical resources, where we apply four +different resources to enrich the features associated with the dataset. During +the final stage of this project, we have created the final end-to-end system +for the primary task using an ensemble of models to revise primary task +performance. Additionally, as part of the final stage, these approaches have +been adapted to the WASSA 2023 Shared Task on Empathy Emotion and Personality +Detection in Interactions, in which the empathic concern, emotion polarity, and +emotion intensity in dyadic text conversations are predicted. + +
+
+
+
+
+ + ☆ Conversational Dueling Bandits in Generalized Linear Models + + +
+ Conversational recommendation systems elicit user preferences by interacting +with users to obtain their feedback on recommended commodities. Such systems +utilize a multi-armed bandit framework to learn user preferences in an online +manner and have received great success in recent years. However, existing +conversational bandit methods have several limitations. First, they only enable +users to provide explicit binary feedback on the recommended items or +categories, leading to ambiguity in interpretation. In practice, users are +usually faced with more than one choice. Relative feedback, known for its +informativeness, has gained increasing popularity in recommendation system +design. Moreover, current contextual bandit methods mainly work under linear +reward assumptions, ignoring practical non-linear reward structures in +generalized linear models. Therefore, in this paper, we introduce relative +feedback-based conversations into conversational recommendation systems through +the integration of dueling bandits in generalized linear models (GLM) and +propose a novel conversational dueling bandit algorithm called ConDuel. +Theoretical analyses of regret upper bounds and empirical validations on +synthetic and real-world data underscore ConDuel's efficacy. We also +demonstrate the potential to extend our algorithm to multinomial logit bandits +with theoretical and experimental guarantees, which further proves the +applicability of the proposed framework. + +
+
+
+
+
+ + ☆ Practical Attribution Guidance for Rashomon Sets + + +
+ Different prediction models might perform equally well (Rashomon set) in the
+same task, but offer conflicting interpretations and conclusions about the
+data. The Rashomon effect in the context of Explainable AI (XAI) has been
+recognized as a critical factor. Although the Rashomon set has been introduced
+and studied in various contexts, its practical application is in its infancy
+and lacks adequate guidance and evaluation. We study the problem of Rashomon
+set sampling from a practical viewpoint and identify two fundamental axioms -
+generalizability and implementation sparsity - that exploration methods ought
+to satisfy in practical usage. These two axioms are not satisfied by most known
+attribution methods, which we consider to be a fundamental weakness. We use
+these axioms to guide the design of an $\epsilon$-subgradient-based sampling
+method. We apply this method to a fundamental mathematical problem as a proof
+of concept and to a set of practical datasets to demonstrate its ability
+compared with existing sampling methods.
+
+
+
+
+
+ + ☆ Scalable Graph Compressed Convolutions + + +
+ Designing effective graph neural networks (GNNs) with message passing has two
+fundamental challenges, i.e., determining optimal message-passing pathways and
+designing local aggregators. Previous methods of designing optimal pathways are
+limited by information loss on the input features. On the other hand, existing
+local aggregators generally fail to extract multi-scale features and
+approximate diverse operators under limited parameter scales. In contrast to
+these methods, Euclidean convolution has been proven as an expressive
+aggregator, making it a perfect candidate for GNN construction. However, the
+challenges of generalizing Euclidean convolution to graphs arise from the
+irregular structure of graphs. To bridge the gap between Euclidean space and
+graph topology, we propose a differentiable method that applies permutations to
+calibrate input graphs for Euclidean convolution. The permutations constrain
+all nodes in a row regardless of their input order and therefore enable the
+flexible generalization of Euclidean convolution to graphs. Based on the graph
+calibration, we propose the Compressed Convolution Network (CoCN) for
+hierarchical graph representation learning. CoCN follows local feature-learning
+and global parameter-sharing mechanisms of convolution neural networks. The
+whole model can be trained end-to-end, with compressed convolution applied to
+learn individual node features and their corresponding structure features. CoCN
+can further borrow successful practices from Euclidean convolution, including
+residual connection and inception mechanism. We validate CoCN on both
+node-level and graph-level benchmarks. CoCN achieves superior performance over
+competitive GNN baselines. Codes are available at
+https://github.com/sunjss/CoCN.
+
+
+
+
+
+ + ☆ FedUD: Exploiting Unaligned Data for Cross-Platform Federated + Click-Through Rate Prediction + + +
+ Click-through rate (CTR) prediction plays an important role in online +advertising platforms. Most existing methods use data from the advertising +platform itself for CTR prediction. As user behaviors also exist on many other +platforms, e.g., media platforms, it is beneficial to further exploit such +complementary information for better modeling user interest and for improving +CTR prediction performance. However, due to privacy concerns, data from +different platforms cannot be uploaded to a server for centralized model +training. Vertical federated learning (VFL) provides a possible solution which +is able to keep the raw data on respective participating parties and learn a +collaborative model in a privacy-preserving way. However, traditional VFL +methods only utilize aligned data with common keys across parties, which +strongly restricts their application scope. In this paper, we propose FedUD, +which is able to exploit unaligned data, in addition to aligned data, for more +accurate federated CTR prediction. FedUD contains two steps. In the first step, +FedUD utilizes aligned data across parties like traditional VFL, but it +additionally includes a knowledge distillation module. This module distills +useful knowledge from the guest party's high-level representations and guides +the learning of a representation transfer network. In the second step, FedUD +applies the learned knowledge to enrich the representations of the host party's +unaligned data such that both aligned and unaligned data can contribute to +federated model training. Experiments on two real-world datasets demonstrate +the superior performance of FedUD for federated CTR prediction. + +
+
+
+
+
+ + ☆ Constructing the CORD-19 Vaccine Dataset + + +
+ We introduce a new dataset, 'CORD-19-Vaccination', to cater to scientists
+specifically looking into COVID-19 vaccine-related research. This dataset is
+extracted from the CORD-19 dataset [Wang et al., 2020] and augmented with new
+columns for language detail, author demography, keywords, and topic per paper.
+Facebook's fastText model is used to identify languages [Joulin et al., 2016].
+To establish author demography (author affiliation, lab/institution location,
+and lab/institution country columns) we processed the JSON file for each paper
+and then further enhanced the data using Google's search API to determine
+country values. 'Yake' was used to extract keywords from the title, abstract,
+and body of each paper and the LDA (Latent Dirichlet Allocation) algorithm was
+used to add topic information [Campos et al., 2020, 2018a,b]. To evaluate the
+dataset, we demonstrate a question-answering task like the one used in the
+CORD-19 Kaggle challenge [Goldbloom et al., 2022]. For further evaluation,
+sequential sentence classification was performed on each paper's abstract using
+the model from Dernoncourt et al. [2016]. We partially hand annotated the
+training dataset and used a pre-trained BERT-PubMed layer.
+'CORD-19-Vaccination' contains 30k research papers and can be immensely
+valuable for NLP research such as text mining, information extraction, and
+question answering, specific to the domain of COVID-19 vaccine research.
+
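A rough sketch of the per-paper enrichment steps (keyword extraction and topic assignment): the libraries used here (yake and scikit-learn's LDA) match the techniques named above, but the configuration is ours, and the language-detection and author-demography steps are omitted.

```python
import yake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Two toy "papers" standing in for CORD-19 title + abstract + body text.
papers = ["Safety and immunogenicity of a COVID-19 vaccine candidate in adults.",
          "Cold-chain logistics challenges for mRNA vaccine distribution worldwide."]

# Keyword extraction per paper (YAKE).
kw_extractor = yake.KeywordExtractor(top=5)
keywords = [[kw for kw, _score in kw_extractor.extract_keywords(doc)] for doc in papers]

# Topic assignment per paper (LDA over bag-of-words counts).
counts = CountVectorizer(stop_words="english").fit_transform(papers)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
topics = lda.transform(counts).argmax(axis=1)  # dominant topic index per paper
print(keywords, topics)
```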
+
+
+
+
+ + ☆ Diffusion-Driven Semantic Communication for Generative Models with + Bandwidth Constraints + + +
+ Diffusion models have been extensively utilized in AI-generated content
+(AIGC) in recent years, thanks to their superior generation capabilities.
+Combined with semantic communications, diffusion models are used for tasks such
+as denoising, data reconstruction, and content generation. However, existing
+diffusion-based generative models do not consider the stringent bandwidth
+limitation, which limits their application in wireless communication. This
+paper introduces a diffusion-driven semantic communication framework with
+advanced VAE-based compression for bandwidth-constrained generative models. Our
+designed architecture utilizes the diffusion model, where the signal
+transmission process through the wireless channel acts as the forward process
+in diffusion. To reduce bandwidth requirements, we incorporate a downsampling
+module and a paired upsampling module based on a variational auto-encoder with
+reparameterization at the receiver to ensure that the recovered features
+conform to the Gaussian distribution. Furthermore, we derive the loss function
+for our proposed system and evaluate its performance through comprehensive
+experiments. Our experimental results demonstrate significant improvements in
+pixel-level metrics such as peak signal to noise ratio (PSNR) and semantic
+metrics like learned perceptual image patch similarity (LPIPS). These
+enhancements are more profound regarding the compression rates and SNR compared
+to deep joint source-channel coding (DJSCC).
+
+
+ comment: 13 pages, 7 figures, submitted to IEEE for possible publication +
+
+
+
+
+ + ☆ Machine Unlearning using a Multi-GAN based Model + + +
+ This article presents a new machine unlearning approach that utilizes +multiple Generative Adversarial Network (GAN) based models. The proposed method +comprises two phases: i) data reorganization in which synthetic data using the +GAN model is introduced with inverted class labels of the forget datasets, and +ii) fine-tuning the pre-trained model. The GAN models consist of two pairs of +generators and discriminators. The generator discriminator pairs generate +synthetic data for the retain and forget datasets. Then, a pre-trained model is +utilized to get the class labels of the synthetic datasets. The class labels of +synthetic and original forget datasets are inverted. Finally, all combined +datasets are used to fine-tune the pre-trained model to get the unlearned +model. We have performed the experiments on the CIFAR-10 dataset and tested the +unlearned models using Membership Inference Attacks (MIA). The inverted class +labels procedure and synthetically generated data help to acquire valuable +information that enables the model to outperform state-of-the-art models and +other standard unlearning classifiers. + +
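One simple reading of the "inverted class labels" step is to re-label synthetic forget-set samples away from their true class before fine-tuning on the combined data. The sketch below uses random tensors in place of GAN outputs and omits the fine-tuning loop, so it only illustrates the data-reorganization phase under that assumption.

```python
import torch
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader

n_classes = 10
# Stand-ins for GAN-generated samples of the retain and forget sets.
retain_x, retain_y = torch.randn(256, 3, 32, 32), torch.randint(0, n_classes, (256,))
forget_x, forget_y = torch.randn(64, 3, 32, 32), torch.randint(0, n_classes, (64,))

# "Invert" forget labels: shift each label to a different, incorrect class.
inverted_y = (forget_y + torch.randint(1, n_classes, (64,))) % n_classes

combined = ConcatDataset([TensorDataset(retain_x, retain_y),
                          TensorDataset(forget_x, inverted_y)])
loader = DataLoader(combined, batch_size=32, shuffle=True)
# The pre-trained classifier would then be fine-tuned on `loader` with cross-entropy.
```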
+
+
+
+
+ + ☆ MistralBSM: Leveraging Mistral-7B for Vehicular Networks Misbehavior + Detection + + +
+ Vehicular networks are exposed to various threats resulting from malicious +attacks. These threats compromise the security and reliability of +communications among road users, thereby jeopardizing road and traffic safety. +One of the main vectors of these attacks within vehicular networks is +misbehaving vehicles. To address this challenge, we propose deploying a +pretrained Large Language Model (LLM)-empowered Misbehavior Detection System +(MDS) within an edge-cloud detection framework. Specifically, we fine-tune +Mistral-7B, a state-of-the-art LLM, as the edge component to enable real-time +detection, whereas a larger LLM deployed in the cloud can conduct a more +comprehensive analysis. Our experiments conducted on the extended VeReMi +dataset demonstrate Mistral-7B's superior performance, achieving 98\% accuracy +compared to other LLMs such as LLAMA2-7B and RoBERTa. Additionally, we +investigate the impact of window size on computational costs to optimize +deployment efficiency. Leveraging LLMs in MDS shows interesting results in +improving the detection of vehicle misbehavior, consequently strengthening +vehicular network security to ensure the safety of road users. + +
+
+
+
+
+ + ☆ Fairness Definitions in Language Models Explained + + +
+ Language Models (LMs) have demonstrated exceptional performance across +various Natural Language Processing (NLP) tasks. Despite these advancements, +LMs can inherit and amplify societal biases related to sensitive attributes +such as gender and race, limiting their adoption in real-world applications. +Therefore, fairness has been extensively explored in LMs, leading to the +proposal of various fairness notions. However, the lack of clear agreement on +which fairness definition to apply in specific contexts (\textit{e.g.,} +medium-sized LMs versus large-sized LMs) and the complexity of understanding +the distinctions between these definitions can create confusion and impede +further progress. To this end, this paper proposes a systematic survey that +clarifies the definitions of fairness as they apply to LMs. Specifically, we +begin with a brief introduction to LMs and fairness in LMs, followed by a +comprehensive, up-to-date overview of existing fairness notions in LMs and the +introduction of a novel taxonomy that categorizes these concepts based on their +foundational principles and operational distinctions. We further illustrate +each definition through experiments, showcasing their practical implications +and outcomes. Finally, we discuss current research challenges and open +questions, aiming to foster innovative ideas and advance the field. The +implementation and additional resources are publicly available at +https://github.com/LavinWong/Fairness-in-Large-Language-Models/tree/main/definitions. + +
+
+
+
+
+ + ☆ Textile Anomaly Detection: Evaluation of the State-of-the-Art for + Automated Quality Inspection of Carpet + + +
+ In this study, state-of-the-art unsupervised detection models were evaluated
+for the purpose of automated anomaly inspection of wool carpets. A custom
+dataset of four unique types of carpet textures was created to thoroughly test
+the models and their robustness in detecting subtle anomalies in complex
+textures. Due to the requirements of an inline inspection system in a
+manufacturing use case, the metrics of importance in this study were accuracy
+in detecting anomalous areas, the number of false detections, and the inference
+times of each model for real-time performance. Of the evaluated models, the
+student-teacher network-based methods were found on average to yield the
+highest detection accuracy and lowest false detection rates. When trained on a
+multi-class dataset, the models were found to yield comparable, if not better,
+results than with single-class training. Finally, in terms of detection speed,
+with the exception of the generative model, all other evaluated models were
+found to have comparable inference times on a GPU, with an average of 0.16s per
+image. On a CPU, most of these models typically produced results between 1.5
+and 2 times the respective GPU inference times.
+
+
+ comment: Accepted at the 2023 Australasian Conference on Robotics and + Automation (ACRA 2023) Publication url + https://www.scopus.com/inward/record.uri?eid=2-s2.0-85184380272&partnerID=40&md5=74fde263f4a24a1bff75d6560b423994 + ISSN: 14482053 Contains 10 pages and three figures +
+
+
+
+
+ + ☆ Towards A Generalizable Pathology Foundation Model via Unified Knowledge + Distillation + + +
+ Foundation models pretrained on large-scale datasets are revolutionizing the
+field of computational pathology (CPath). The generalization ability of
+foundation models is crucial for success in various downstream clinical tasks.
+However, current foundation models have only been evaluated on a limited number
+and variety of tasks, leaving their generalization ability and overall
+performance unclear. To address this gap, we established the most comprehensive
+benchmark to date for evaluating the performance of off-the-shelf foundation
+models across six distinct clinical task types, encompassing a total of 39
+specific tasks. Our findings reveal that existing foundation models excel at
+certain task types but struggle to effectively handle the full breadth of
+clinical tasks. To improve the generalization of pathology foundation models,
+we propose a unified knowledge distillation framework consisting of both expert
+and self-knowledge distillation, where the former allows the model to learn
+from the knowledge of multiple expert models, while the latter leverages
+self-distillation to enable image representation learning via local-global
+alignment. Based on this framework, a Generalizable Pathology Foundation Model
+(GPFM) is pretrained on a large-scale dataset consisting of 190 million images
+from around 86,000 public H\&E whole slides across 34 major tissue types.
+Evaluated on the established benchmark, GPFM achieves an impressive average
+rank of 1.36, with 29 tasks ranked 1st, while the second-best model, UNI,
+attains an average rank of 2.96, with only 4 tasks ranked 1st. The superior
+generalization of GPFM demonstrates its exceptional modeling capabilities
+across a wide range of clinical tasks, positioning it as a new cornerstone for
+feature representation in CPath.
+
+
+
+
+
+ + ☆ Impact of Recurrent Neural Networks and Deep Learning Frameworks on + Real-time Lightweight Time Series Anomaly Detection ICICS2024 + + +
+ Real-time lightweight time series anomaly detection has become increasingly
+crucial in cybersecurity and many other domains. Its ability to adapt to
+unforeseen pattern changes and swiftly identify anomalies enables prompt
+responses and critical decision-making. While several such anomaly detection
+approaches have been introduced in recent years, they primarily utilize a
+single type of recurrent neural network (RNN) and have been implemented in
+only one deep learning framework. It is unclear how the use of different types
+of RNNs available in various deep learning frameworks affects the performance
+of these anomaly detection approaches due to the absence of comprehensive
+evaluations. Arbitrarily choosing an RNN variant and a deep learning framework
+to implement an anomaly detection approach may not reflect its true performance
+and could potentially mislead users into favoring one approach over another. In
+this paper, we aim to study the influence of various types of RNNs available in
+popular deep learning frameworks on real-time lightweight time series anomaly
+detection. We reviewed several state-of-the-art approaches and implemented a
+representative anomaly detection approach using well-known RNN variants
+supported by three widely recognized deep learning frameworks. A comprehensive
+evaluation is then conducted to analyze the performance of each implementation
+across real-world, open-source time series datasets. The evaluation results
+provide valuable guidance for selecting the appropriate RNN variant and deep
+learning framework for real-time, lightweight time series anomaly detection.
+
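The representative approach evaluated in the paper is not named in the abstract; as a generic illustration of the task, the sketch below forecasts the next value with a small LSTM and flags points whose prediction error is unusually large. The window size, threshold rule, and architecture are assumptions, and PyTorch stands in for the frameworks compared in the study.

```python
# Generic real-time anomaly detection sketch: one-step-ahead forecasting with a
# small LSTM plus a running-error threshold. Illustrative, not the paper's method.
import torch
import torch.nn as nn

class Forecaster(nn.Module):
    def __init__(self, hidden=32):
        super().__init__()
        self.rnn = nn.LSTM(input_size=1, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, window):                     # window: (batch, steps, 1)
        out, _ = self.rnn(window)
        return self.head(out[:, -1])                # predicted next value

@torch.no_grad()
def detect(model, stream, window=32, k=3.0):
    """Flag a point when its error exceeds mean + k * std of past errors."""
    errors, flags = [], []
    for t in range(window, len(stream)):
        w = torch.tensor(stream[t - window:t], dtype=torch.float32).view(1, window, 1)
        err = abs(model(w).item() - stream[t])
        if len(errors) > 10:
            mu = sum(errors) / len(errors)
            sd = (sum((e - mu) ** 2 for e in errors) / len(errors)) ** 0.5
            flags.append(err > mu + k * sd)
        else:
            flags.append(False)                     # warm-up period
        errors.append(err)
    return flags
```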
+
+ comment: 20 pages, 4 figures, 7 tables, The 26th International Conference on + Information and Communications Security, 26-28 August, 2024, Mytilene, + Lesvos, Greece (ICICS2024) +
+
+
+
+
+ + ☆ A Model for Combinatorial Dictionary Learning and Inference + + +
+ We are often interested in decomposing complex, structured data into simple +components that explain the data. The linear version of this problem is +well-studied as dictionary learning and factor analysis. In this work, we +propose a combinatorial model in which to study this question, motivated by the +way objects occlude each other in a scene to form an image. First, we identify +a property we call "well-structuredness" of a set of low-dimensional components +which ensures that no two components in the set are too similar. We show how +well-structuredness is sufficient for learning the set of latent components +comprising a set of sample instances. We then consider the problem: given a set +of components and an instance generated from some unknown subset of them, +identify which parts of the instance arise from which components. We consider +two variants: (1) determine the minimal number of components required to +explain the instance; (2) determine the correct explanation for as many +locations as possible. For the latter goal, we also devise a version that is +robust to adversarial corruptions, with just a slightly stronger assumption on +the components. Finally, we show that the learning problem is computationally +infeasible in the absence of any assumptions. + +
+
+ comment: 31 pages, 3 figures +
+
+
+
+
+ + ☆ Investigating the Privacy Risk of Using Robot Vacuum Cleaners in Smart + Environments ICICS2024 + + +
+ Robot vacuum cleaners have become increasingly popular and are widely used in +various smart environments. To improve user convenience, manufacturers also +introduced smartphone applications that enable users to customize cleaning +settings or access information about their robot vacuum cleaners. While this +integration enhances the interaction between users and their robot vacuum +cleaners, it results in potential privacy concerns because users' personal +information may be exposed. To address these concerns, end-to-end encryption is +implemented between the application, cloud service, and robot vacuum cleaners +to secure the exchanged information. Nevertheless, network header metadata +remains unencrypted and it is still vulnerable to network eavesdropping. In +this paper, we investigate the potential risk of private information exposure +through such metadata. A popular robot vacuum cleaner was deployed in a real +smart environment where passive network eavesdropping was conducted during +several selected cleaning events. Our extensive analysis, based on Association +Rule Learning, demonstrates that it is feasible to identify certain events +using only the captured Internet traffic metadata, thereby potentially exposing +private user information and raising privacy concerns. + +
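The event identification described above is based on Association Rule Learning over captured traffic metadata. As a rough, purely illustrative sketch, the snippet below mines simple feature-to-event rules from discretized header metadata; the feature names, transactions, and thresholds are hypothetical.

```python
# Toy association-rule mining over discretized traffic metadata (hypothetical data).
from itertools import combinations

transactions = [
    {"pkt_count=high", "dst=cloud_api", "avg_size=large", "event=spot_clean"},
    {"pkt_count=high", "dst=cloud_api", "avg_size=large", "event=spot_clean"},
    {"pkt_count=low", "dst=cloud_api", "avg_size=small", "event=idle"},
]

def support(itemset, txns):
    return sum(itemset <= t for t in txns) / len(txns)

def mine_rules(txns, min_support=0.3, min_confidence=0.8):
    items = set().union(*txns)
    features = sorted(i for i in items if not i.startswith("event="))
    events = sorted(i for i in items if i.startswith("event="))
    rules = []
    for size in (1, 2):
        for lhs in combinations(features, size):
            lhs = frozenset(lhs)
            s_lhs = support(lhs, txns)
            if s_lhs < min_support:
                continue
            for event in events:
                s_rule = support(lhs | {event}, txns)
                confidence = s_rule / s_lhs
                if s_rule >= min_support and confidence >= min_confidence:
                    rules.append((set(lhs), event, round(confidence, 2)))
    return rules

# Rules such as ({'pkt_count=high', ...}, 'event=spot_clean', 1.0) indicate that
# header metadata alone can reveal which cleaning event took place.
print(mine_rules(transactions))
```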
+
+ comment: 18 pages, 11 figures, 4 tables, The 26th International Conference on + Information and Communications Security, 26-28 August, 2024, Mytilene, + Lesvos, Greece (ICICS2024) +
+
+
+
+
+ + ♻ ☆ Physics-Guided Actor-Critic Reinforcement Learning for Swimming in + Turbulence + + +
+ Turbulent diffusion causes particles placed in proximity to separate. We +investigate the required swimming efforts to maintain a particle close to its +passively advected counterpart. We explore optimally balancing these efforts +with the intended goal by developing and comparing a novel Physics-Informed +Reinforcement Learning (PIRL) strategy with prescribed control (PC) and +standard physics-agnostic Reinforcement Learning strategies. Our PIRL scheme, +coined the Actor-Physicist, is an adaptation of the Actor-Critic algorithm in +which the Neural Network parameterized Critic is replaced with an analytically +derived physical heuristic function (the physicist). This strategy is then +compared with an analytically computed optimal PC policy derived from a +stochastic optimal control formulation and standard physics-agnostic +Actor-Critic type algorithms. + +
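As a rough sketch of the Actor-Physicist idea summarized above, the snippet below performs an advantage actor-critic style update in which the learned critic is replaced by an analytic value heuristic. The heuristic used here (negative separation distance) is only a placeholder for the paper's derived function, and PyTorch is assumed.

```python
# Placeholder sketch: actor-critic update with an analytic "physicist" in place
# of a learned critic. The value heuristic below is illustrative only.
import torch
import torch.nn as nn

def physicist_value(state):
    # Stand-in physical heuristic: value decreases with the separation distance
    # encoded in the first two state components.
    return -state[..., :2].norm(dim=-1)

class Actor(nn.Module):
    def __init__(self, obs_dim=4, act_dim=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, 64), nn.Tanh(), nn.Linear(64, act_dim))
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def dist(self, obs):
        return torch.distributions.Normal(self.net(obs), self.log_std.exp())

def actor_physicist_step(actor, opt, obs, action, reward, next_obs, gamma=0.99):
    # TD-style advantage computed with the analytic value instead of a critic net.
    with torch.no_grad():
        adv = reward + gamma * physicist_value(next_obs) - physicist_value(obs)
    log_prob = actor.dist(obs).log_prob(action).sum(-1)
    loss = -(log_prob * adv).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()
```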
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Recursive Introspection: Teaching Language Model Agents How to + Self-Improve + + +
+ A central piece in enabling intelligent agentic behavior in foundation models
+is to make them capable of introspecting upon their behavior, reasoning, and
+correcting their mistakes as more computation or interaction is available. Even
+the strongest proprietary large language models (LLMs) do not quite exhibit the
+ability to continually improve their responses sequentially, even in scenarios
+where they are explicitly told that they are making a mistake. In this paper,
+we develop RISE: Recursive IntroSpEction, an approach for fine-tuning LLMs to
+introduce this capability, despite prior work hypothesizing that this
+capability may not be possible to attain. Our approach prescribes an iterative
+fine-tuning procedure, which attempts to teach the model how to alter its
+response after having executed previously unsuccessful attempts to solve a hard
+test-time problem, optionally with additional environment feedback. RISE poses
+fine-tuning for a single-turn prompt as solving a multi-turn Markov decision
+process (MDP), where the initial state is the prompt. Inspired by principles in
+online imitation learning and reinforcement learning, we propose strategies for
+multi-turn data collection and training so as to imbue an LLM with the
+capability to recursively detect and correct its previous mistakes in
+subsequent iterations. Our experiments show that RISE enables Llama2, Llama3,
+and Mistral models to improve themselves with more turns on math reasoning
+tasks, outperforming several single-turn strategies given an equal amount of
+inference-time computation. We also find that RISE scales well, often attaining
+larger benefits with more capable models. Our analysis shows that RISE makes
+meaningful improvements to responses to arrive at the correct solution for
+challenging prompts, without disrupting one-turn abilities as a result of
+expressing more complex distributions.
+
+
+
+
+
+ + ♻ ☆ Regression prediction algorithm for energy consumption regression in + cloud computing based on horned lizard algorithm optimised convolutional + neural network-bidirectional gated recurrent unit + + +
+ For this paper, a prediction study of cloud computing energy consumption was
+conducted by optimising the data regression algorithm based on the horned
+lizard optimisation algorithm for Convolutional Neural Networks-Bi-Directional
+Gated Recurrent Units. Firstly, through Spearman correlation analysis of CPU
+usage, memory usage, network traffic, power consumption, number of instructions
+executed, execution time and energy efficiency, we found that power consumption
+has the highest degree of positive correlation with energy efficiency, while
+CPU usage has the highest degree of negative correlation with energy
+efficiency. In our experiments, we introduced a random forest model and an
+optimisation model based on the horned lizard optimisation algorithm for
+testing, and the results show that the optimisation algorithm has better
+prediction results compared to the random forest model. Specifically, the mean
+square error (MSE) of the optimisation algorithm is 0.01 smaller than that of
+the random forest model, and the mean absolute error (MAE) is 0.01 smaller than
+that of the random forest model. The results of the combined metrics show that
+the optimisation algorithm performs more accurately and reliably in predicting
+energy efficiency. This research provides new ideas and methods to improve the
+energy efficiency of cloud computing systems. It not only expands the scope of
+application in the field of cloud computing, but also provides strong support
+for improving the energy use efficiency of the system.
+
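The two analysis steps mentioned above, Spearman screening of feature correlations with energy efficiency and MSE/MAE comparison of regressors, can be reproduced in outline as follows; the feature names, synthetic data, and the random forest baseline stand in for the paper's actual data and models.

```python
# Illustrative outline of the correlation screening and error-metric comparison.
import numpy as np
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

rng = np.random.default_rng(0)
n = 200
features = {
    "cpu_usage": rng.random(n),
    "memory_usage": rng.random(n),
    "power_consumption": rng.random(n),
}
# Synthetic stand-in target, only to make the example runnable.
energy_efficiency = (0.8 * features["power_consumption"]
                     - 0.5 * features["cpu_usage"]
                     + 0.1 * rng.random(n))

# Step 1: Spearman correlation of each feature with energy efficiency.
for name, values in features.items():
    rho, _ = spearmanr(values, energy_efficiency)
    print(f"{name}: rho={rho:.2f}")

# Step 2: evaluate a baseline regressor (here a random forest) with MSE and MAE.
X = np.column_stack(list(features.values()))
baseline = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, energy_efficiency)
pred = baseline.predict(X)
print("MSE:", mean_squared_error(energy_efficiency, pred))
print("MAE:", mean_absolute_error(energy_efficiency, pred))
```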
+
+
+
+
+ + ♻ ☆ How Well Can a Long Sequence Model Model Long Sequences? Comparing + Architechtural Inductive Biases on Long-Context Abilities + + +
+ Long sequences occur in abundance within real-world scenarios, hence properly
+modelling them opens up numerous downstream use cases. Deep neural networks,
+however, have often struggled with these for a variety of reasons. Recent
+advances, both in system engineering as well as model design, have enabled the
+scaling up of models that are purported to support extended context lengths. In
+particular, the state-space and linear recurrent neural network families of
+models can hypothetically extend to infinite sequence length. However, is this
+too good to be true? We conduct an evaluation to show that while such claims
+may be sound theoretically, there remain large practical gaps that are
+empirically observed. In particular, recurrent models still suffer in the same
+settings as long-context LLMs with attention. We further show that different
+inductive biases have inconsistent extrapolation capabilities, highlighting the
+need to further study such paradigms and investigate why long-context models
+seemingly fail to behave as one might expect.
+
+
+ comment: Work In Progress. 9 pages +
+
+
+
+
+ + ♻ ☆ Variational Inference via Smoothed Particle Hydrodynamics + + +
+ A new variational inference method, SPH-ParVI, based on smoothed particle +hydrodynamics (SPH), is proposed for sampling partially known densities (e.g. +up to a constant) or sampling using gradients. SPH-ParVI simulates the flow of +a fluid under external effects driven by the target density; transient or +steady state of the fluid approximates the target density. The continuum fluid +is modelled as an interacting particle system (IPS) via SPH, where each +particle carries smoothed properties, interacts and evolves as per the +Navier-Stokes equations. This mesh-free, Lagrangian simulation method offers +fast, flexible, scalable and deterministic sampling and inference for a class +of probabilistic models such as those encountered in Bayesian inference and +generative modelling. + +
+
+
+
+
+ + ♻ ☆ Distilling Multi-Scale Knowledge for Event Temporal Relation Extraction CIKM 2024 + + +
+ Event Temporal Relation Extraction (ETRE) is paramount but challenging.
+Within a discourse, event pairs are situated at different distances or the
+so-called proximity bands. The temporal ordering of event pairs at more remote
+(i.e., ``long'') or less remote (i.e., ``short'') proximity bands is encoded
+differently. SOTA models have tended to perform well on events situated at
+either short or long proximity bands, but not both. Nonetheless, real-world,
+natural texts contain all types of temporal event-pairs. In this paper, we
+present MulCo: Distilling Multi-Scale Knowledge via Contrastive Learning, a
+knowledge co-distillation approach that shares knowledge across multiple event
+pair proximity bands to improve performance on all types of temporal datasets.
+Our experimental results show that MulCo successfully integrates linguistic
+cues pertaining to temporal reasoning across both short and long proximity
+bands and achieves new state-of-the-art results on several ETRE benchmark
+datasets.
+
+
+ comment: Accepted to CIKM 2024 Full Research Track, camera ready version +
+
+
+
+
+ + ♻ ☆ Semantic Prototypes: Enhancing Transparency Without Black Boxes CIKM 2024 + + +
+ As machine learning (ML) models and datasets increase in complexity, the +demand for methods that enhance explainability and interpretability becomes +paramount. Prototypes, by encapsulating essential characteristics within data, +offer insights that enable tactical decision-making and enhance transparency. +Traditional prototype methods often rely on sub-symbolic raw data and opaque +latent spaces, reducing explainability and increasing the risk of +misinterpretations. This paper presents a novel framework that utilizes +semantic descriptions to define prototypes and provide clear explanations, +effectively addressing the shortcomings of conventional methods. Our approach +leverages concept-based descriptions to cluster data on the semantic level, +ensuring that prototypes not only represent underlying properties intuitively +but are also straightforward to interpret. Our method simplifies the +interpretative process and effectively bridges the gap between complex data +structures and human cognitive processes, thereby enhancing transparency and +fostering trust. Our approach outperforms existing widely-used prototype +methods in facilitating human understanding and informativeness, as validated +through a user survey. + +
+
+ comment: This paper has been accepted for publication as a full paper at the + 33rd ACM International Conference on Information and Knowledge Management + (CIKM 2024) +
+
+
+
+
+ + ♻ ☆ On TinyML and Cybersecurity: Electric Vehicle Charging Infrastructure + Use Case + + +
+ As technology advances, the use of Machine Learning (ML) in cybersecurity is +becoming increasingly crucial to tackle the growing complexity of cyber +threats. While traditional ML models can enhance cybersecurity, their high +energy and resource demands limit their applications, leading to the emergence +of Tiny Machine Learning (TinyML) as a more suitable solution for +resource-constrained environments. TinyML is widely applied in areas such as +smart homes, healthcare, and industrial automation. TinyML focuses on +optimizing ML algorithms for small, low-power devices, enabling intelligent +data processing directly on edge devices. This paper provides a comprehensive +review of common challenges of TinyML techniques, such as power consumption, +limited memory, and computational constraints; it also explores potential +solutions to these challenges, such as energy harvesting, computational +optimization techniques, and transfer learning for privacy preservation. On the +other hand, this paper discusses TinyML's applications in advancing +cybersecurity for Electric Vehicle Charging Infrastructures (EVCIs) as a +representative use case. It presents an experimental case study that enhances +cybersecurity in EVCI using TinyML, evaluated against traditional ML in terms +of reduced delay and memory usage, with a slight trade-off in accuracy. +Additionally, the study includes a practical setup using the ESP32 +microcontroller in the PlatformIO environment, which provides a hands-on +assessment of TinyML's application in cybersecurity for EVCI. + +
+
+ comment: Accepted and to appear in IEEE Access; Code is available at GitHub + link: https://github.com/Western-OC2-Lab/TinyML_EVCI +
+
+
+
+
+ + ♻ ☆ On The Expressive Power of Knowledge Graph Embedding Methods + + +
+ Knowledge Graph Embedding (KGE) is a popular approach, which aims to
+represent entities and relations of a knowledge graph in latent spaces. Their
+representations are known as embeddings. To measure the plausibility of
+triplets, score functions are defined over embedding spaces. Despite the wide
+use of KGE in various tasks, KGE methods have limitations in reasoning
+abilities. In this paper, we propose a mathematical framework to compare the
+reasoning abilities of KGE methods. We show that STransE has a higher
+capability than TransComplEx, and then present a new method, STransCoRe, which
+improves STransE by combining it with insights from TransCoRe, thereby reducing
+STransE's space complexity.
+
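For readers unfamiliar with KGE score functions, the following small example shows how a translational score function measures triplet plausibility, contrasting TransE with an STransE-style variant that adds relation-specific projection matrices; the dimensions and random initialization are arbitrary.

```python
# Score functions of the TransE family: lower distance means a more plausible triplet.
import numpy as np

dim = 8
rng = np.random.default_rng(0)
h, r, t = rng.normal(size=(3, dim))         # head, relation, tail embeddings
W_h, W_t = rng.normal(size=(2, dim, dim))    # relation-specific projections (STransE-style)

def transe_score(h, r, t):
    # TransE: plausibility of (h, r, t) is -||h + r - t||.
    return -np.linalg.norm(h + r - t)

def stranse_score(h, r, t, W_h, W_t):
    # STransE additionally projects head and tail with relation-specific matrices.
    return -np.linalg.norm(W_h @ h + r - W_t @ t)

print(transe_score(h, r, t), stranse_score(h, r, t, W_h, W_t))
```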
+
+ comment: This paper may involve data that is not readily available to the + public +
+
+
+
+
+ + ♻ ☆ CGGM: A conditional graph generation model with adaptive sparsity for + node anomaly detection in IoT networks + + +
+ Dynamic graphs are extensively employed for detecting anomalous behavior in
+nodes within the Internet of Things (IoT). Generative models are often used to
+address the issue of imbalanced node categories in dynamic graphs.
+Nevertheless, the constraints they face include the monotonicity of adjacency
+relationships, the difficulty in constructing multi-dimensional features for
+nodes, and the lack of a method for end-to-end generation of multiple
+categories of nodes. This paper presents a novel graph generation model, called
+CGGM, designed specifically to generate a larger number of nodes belonging to
+the minority class. Its mechanism for generating an adjacency matrix through
+adaptive sparsity enhances the flexibility of the generated structure. The
+feature generation module, called the multidimensional features generator
+(MFG), generates node features along with topological information. Labels are
+transformed into embedding vectors, serving as conditional constraints to
+control the generation of synthetic data across multiple categories. Using a
+multi-stage loss, the distribution of synthetic data is adjusted to closely
+resemble that of real data. In extensive experiments, we show that CGGM's
+synthetic data outperforms state-of-the-art methods across various metrics. Our
+results demonstrate efficient generation of diverse data categories, robustly
+enhancing multi-category classification model performance.
+
+
+ comment: 13 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ Learning to Visually Connect Actions and their Effects + + +
+ We introduce the novel concept of visually Connecting Actions and Their +Effects (CATE) in video understanding. CATE can have applications in areas like +task planning and learning from demonstration. We identify and explore two +different aspects of the concept of CATE: Action Selection (AS) and +Effect-Affinity Assessment (EAA), where video understanding models connect +actions and effects at semantic and fine-grained levels, respectively. We +design various baseline models for AS and EAA. Despite the intuitive nature of +the task, we observe that models struggle, and humans outperform them by a +large margin. Our experiments show that in solving AS and EAA, models learn +intuitive properties like object tracking and pose encoding without explicit +supervision. We demonstrate that CATE can be an effective self-supervised task +for learning video representations from unlabeled videos. The study aims to +showcase the fundamental nature and versatility of CATE, with the hope of +inspiring advanced formulations and models. + +
+
+
+
+
+ + ♻ ☆ Weyl Calculus and Exactly Solvable Schrödinger Bridges with + Quadratic State Cost + + +
+ Schr\"{o}dinger bridge--a stochastic dynamical generalization of optimal mass +transport--exhibits a learning-control duality. Viewed as a stochastic control +problem, the Schr\"{o}dinger bridge finds an optimal control policy that steers +a given joint state statistics to another while minimizing the total control +effort subject to controlled diffusion and deadline constraints. Viewed as a +stochastic learning problem, the Schr\"{o}dinger bridge finds the most-likely +distribution-valued trajectory connecting endpoint distributional observations, +i.e., solves the two point boundary-constrained maximum likelihood problem over +the manifold of probability distributions. Recent works have shown that solving +the Schr\"{o}dinger bridge problem with state cost requires finding the Markov +kernel associated with a reaction-diffusion PDE where the state cost appears as +a state-dependent reaction rate. We explain how ideas from Weyl calculus in +quantum mechanics, specifically the Weyl operator and the Weyl symbol, can help +determine such Markov kernels. We illustrate these ideas by explicitly finding +the Markov kernel for the case of quadratic state cost via Weyl calculus, +recovering our earlier results but avoiding tedious computation with Hermite +polynomials. + +
+
+
+
+
+ + ♻ ☆ Diffusion MRI with Machine Learning + + +
+ Diffusion-weighted magnetic resonance imaging (dMRI) offers unique +capabilities including noninvasive probing of brain's tissue microstructure and +structural connectivity. It is widely used for clinical assessment of brain +pathologies and for neuroscience research. Analyzing the dMRI data to extract +useful information for medical and scientific purposes can be challenging. The +dMRI measurements often suffer from strong noise and artifacts, there is +usually high inter-session and inter-scanner variability in the data, and +considerable inter-subject heterogeneity in brain structure. Moreover, the +relationship between measurements and the phenomena of interest can be highly +complex. Recent years have witnessed increasing use of machine learning methods +for dMRI analysis. This manuscript aims to assess these efforts, with a focus +on methods that have addressed data preprocessing and harmonization, +microstructure mapping, tractography, and white matter tract analysis. We study +the main findings, strengths, and weaknesses of the existing methods and +suggest topics for future research. We find that machine learning may be +exceptionally suited to tackle some of the difficult tasks in dMRI analysis. +However, for this to happen, several shortcomings of existing methods and +critical unresolved issues need to be addressed. These include deficient +evaluation practices, lack of rich training datasets and validation benchmarks, +as well as model generalizability, reliability, and explainability concerns. + +
+
+
+
+
+ + ♻ ☆ Unsqueeze [CLS] Bottleneck to Learn Rich Representations ECCV 2024 + + +
+ Distillation-based self-supervised learning typically leads to more +compressed representations due to its radical clustering process and the +implementation of a sharper target distribution. To overcome this limitation +and preserve more information from input, we introduce UDI, conceptualized as +Unsqueezed Distillation-based self-supervised learning (SSL). UDI enriches the +learned representation by encouraging multimodal prediction distilled from a +consolidated profile of local predictions that are derived via stratified +sampling. Our evaluations show that UDI not only promotes semantically +meaningful representations at instance level, delivering superior or +competitive results to state-of-the-art SSL methods in image classification, +but also effectively preserves the nuisance of input, which yields significant +improvement in dense prediction tasks, including object detection and +segmentation. Additionally, UDI performs competitively in low-shot image +classification, improving the scalability of joint-embedding pipelines. Various +visualizations and ablation studies are presented to further elucidate the +mechanisms behind UDI. Our source code is available at +https://github.com/ISL-CV/udi. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Attacks on fairness in Federated Learning + + +
+ Federated Learning is an important emerging distributed training paradigm +that keeps data private on clients. It is now well understood that by +controlling only a small subset of FL clients, it is possible to introduce a +backdoor to a federated learning model, in the presence of certain attributes. +In this paper, we present a new type of attack that compromises the fairness of +the trained model. Fairness is understood to be the attribute-level performance +distribution of a trained model. It is particularly salient in domains where, +for example, skewed accuracy discrimination between subpopulations could have +disastrous consequences. We find that by employing a threat model similar to +that of a backdoor attack, an attacker is able to influence the aggregated +model to have an unfair performance distribution between any given set of +attributes. Furthermore, we find that this attack is possible by controlling +only a single client. While combating naturally induced unfairness in FL has +previously been discussed in depth, its artificially induced kind has been +neglected. We show that defending against attacks on fairness should be a +critical consideration in any situation where unfairness in a trained model +could benefit a user who participated in its training. + +
+
+
+
+
+ + ♻ ☆ MUVO: A Multimodal World Model with Spatial Representations for + Autonomous Driving + + +
+ Learning unsupervised world models for autonomous driving has the potential +to improve the reasoning capabilities of today's systems dramatically. However, +most work neglects the physical attributes of the world and focuses on sensor +data alone. We propose MUVO, a MUltimodal World Model with spatial VOxel +representations, to address this challenge. We utilize raw camera and lidar +data to learn a sensor-agnostic geometric representation of the world. We +demonstrate multimodal future predictions and show that our spatial +representation improves the prediction quality of both camera images and lidar +point clouds. + +
+
+ comment: Daniel Bogdoll and Yitian Yang contributed equally +
+
+
+
+
+ + ♻ ☆ Dynamics of Moral Behavior in Heterogeneous Populations of Learning + Agents AAAI + + +
+ Growing concerns about safety and alignment of AI systems highlight the +importance of embedding moral capabilities in artificial agents: a promising +solution is the use of learning from experience, i.e., Reinforcement Learning. +In multi-agent (social) environments, complex population-level phenomena may +emerge from interactions between individual learning agents. Many of the +existing studies rely on simulated social dilemma environments to study the +interactions of independent learning agents; however, they tend to ignore the +moral heterogeneity that is likely to be present in societies of agents in +practice. For example, at different points in time a single learning agent may +face opponents who are consequentialist (i.e., focused on maximizing outcomes +over time), norm-based (i.e., conforming to specific norms), or virtue-based +(i.e., considering a combination of different virtues). The extent to which +agents' co-development may be impacted by such moral heterogeneity in +populations is not well understood. In this paper, we present a study of the +learning dynamics of morally heterogeneous populations interacting in a social +dilemma setting. Using an Iterated Prisoner's Dilemma environment with a +partner selection mechanism, we investigate the extent to which the prevalence +of diverse moral agents in populations affects individual agents' learning +behaviors and emergent population-level outcomes. We observe several types of +non-trivial interactions between pro-social and anti-social agents, and find +that certain types of moral agents are able to steer selfish agents towards +more cooperative behavior. + +
+
+ comment: Accepted at AIES 2024 (7th AAAI/ACM Conference on AI, Ethics, and + Society - San Jose, CA, USA) +
+
+
+
+
+ + ♻ ☆ Outlier detection by ensembling uncertainty with negative objectness BMVC 2024 + + +
+ Outlier detection is an essential capability in safety-critical applications
+of supervised visual recognition. Most of the existing methods deliver the best
+results by encouraging standard closed-set models to produce low-confidence
+predictions in negative training data. However, that approach conflates
+prediction uncertainty with recognition of the negative class. We therefore
+reconsider direct prediction of K+1 logits that correspond to K groundtruth
+classes and one outlier class. This setup allows us to formulate a novel
+anomaly score as an ensemble of in-distribution uncertainty and the posterior
+of the outlier class, which we term negative objectness. Now outliers can be
+independently detected due to i) high prediction uncertainty or ii) similarity
+with negative data. We embed our method into a dense prediction architecture
+with mask-level recognition over K+2 classes. The training procedure encourages
+the novel K+2-th class to learn negative objectness at pasted negative
+instances. Our models outperform the current state of the art on standard
+benchmarks for image-wide and pixel-level outlier detection with and without
+training on real negative data.
+
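The anomaly score described above combines in-distribution uncertainty with the outlier-class posterior. The exact combination used in the paper is not given in the abstract; the sketch below shows one plausible instantiation from K+1 logits, assuming PyTorch and an equal weighting of the two terms.

```python
# One plausible reading of the K+1 anomaly score: normalized entropy over the K
# inlier classes plus the posterior of the outlier class ("negative objectness").
import torch
import torch.nn.functional as F

def anomaly_score(logits):
    """logits: (..., K+1) raw logits; the last index is the outlier class."""
    probs = F.softmax(logits, dim=-1)
    p_out = probs[..., -1]                                      # negative objectness
    p_in = probs[..., :-1]
    p_in = p_in / p_in.sum(dim=-1, keepdim=True)                 # renormalize inlier classes
    entropy = -(p_in * (p_in + 1e-12).log()).sum(dim=-1)
    K = logits.shape[-1] - 1
    uncertainty = entropy / torch.log(torch.tensor(float(K)))    # scale to [0, 1]
    return 0.5 * uncertainty + 0.5 * p_out                       # equal weighting assumed

print(anomaly_score(torch.randn(4, 11)))   # e.g. K = 10 inlier classes
```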
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Using representation balancing to learn conditional-average dose + responses from clustered data + + +
+ Estimating a unit's responses to interventions with an associated dose, the
+"conditional average dose response" (CADR), is relevant in a variety of
+domains, from healthcare to business, economics, and beyond. Such a response
+typically needs to be estimated from observational data, which introduces
+several challenges. That is why the machine learning (ML) community has
+proposed several tailored CADR estimators. Yet, most of these methods require
+strong assumptions on the distribution of the data and the assignment of
+interventions, which go beyond the standard assumptions in causal inference.
+Whereas previous works have so far focused on smooth shifts in covariate
+distributions across doses, in this work, we study estimating CADR from
+clustered data, where different doses are assigned to different segments of a
+population. On a novel benchmarking dataset, we show the impacts of clustered
+data on model performance and propose an estimator, CBRNet, that learns
+cluster-agnostic and hence dose-agnostic covariate representations through
+representation balancing for unbiased CADR inference. We run extensive
+experiments to illustrate the workings of our method and compare it with the
+state of the art in ML for CADR estimation.
+
+
+ comment: 21 pages, 7 figures, v2: updated methodology and experiments +
+
+
+
+
+ + ♻ ☆ MMPolymer: A Multimodal Multitask Pretraining Framework for Polymer + Property Prediction CIKM 2024 + + +
+ Polymers are high-molecular-weight compounds constructed by the covalent +bonding of numerous identical or similar monomers so that their 3D structures +are complex yet exhibit unignorable regularity. Typically, the properties of a +polymer, such as plasticity, conductivity, bio-compatibility, and so on, are +highly correlated with its 3D structure. However, existing polymer property +prediction methods heavily rely on the information learned from polymer SMILES +sequences (P-SMILES strings) while ignoring crucial 3D structural information, +resulting in sub-optimal performance. In this work, we propose MMPolymer, a +novel multimodal multitask pretraining framework incorporating polymer 1D +sequential and 3D structural information to encourage downstream polymer +property prediction tasks. Besides, considering the scarcity of polymer 3D +data, we further introduce the "Star Substitution" strategy to extract 3D +structural information effectively. During pretraining, in addition to +predicting masked tokens and recovering clear 3D coordinates, MMPolymer +achieves the cross-modal alignment of latent representations. Then we further +fine-tune the pretrained MMPolymer for downstream polymer property prediction +tasks in the supervised learning paradigm. Experiments show that MMPolymer +achieves state-of-the-art performance in downstream property prediction tasks. +Moreover, given the pretrained MMPolymer, utilizing merely a single modality in +the fine-tuning phase can also outperform existing methods, showcasing the +exceptional capability of MMPolymer in polymer feature extraction and +utilization. + +
+
+ comment: Accepted by the 33rd ACM International Conference on Information and + Knowledge Management (CIKM 2024) +
+
+
+
+
+ + ♻ ☆ Merit-based Fair Combinatorial Semi-Bandit with Unrestricted Feedback + Delays ECAI 2024 + + +
+ We study the stochastic combinatorial semi-bandit problem with unrestricted
+feedback delays under merit-based fairness constraints. This is motivated by
+applications such as crowdsourcing and online advertising, where feedback is
+not immediately available and fairness among different choices (or arms) is
+crucial. We consider two types of unrestricted feedback delays:
+reward-independent delays, where the feedback delays are independent of the
+rewards, and reward-dependent delays, where the feedback delays are correlated
+with the rewards. Furthermore, we introduce merit-based fairness constraints to
+ensure a fair selection of the arms. We define the reward regret and the
+fairness regret and present new bandit algorithms to select arms under
+unrestricted feedback delays based on their merits. We prove that our
+algorithms all achieve sublinear expected reward regret and expected fairness
+regret, with a dependence on the quantiles of the delay distribution. We also
+conduct extensive experiments using synthetic and real-world data and show that
+our algorithms can fairly select arms with different feedback delays.
+
+
+ comment: 28 pages, 9 figures, accepted for 27th European Conference on + Artificial Intelligence (ECAI 2024), Source code added +
+
+
+
+
+ + ♻ ☆ Target Specific De Novo Design of Drug Candidate Molecules with Graph + Transformer-based Generative Adversarial Networks + + +
+ Discovering novel drug candidate molecules is one of the most fundamental and +critical steps in drug development. Generative deep learning models, which +create synthetic data given a probability distribution, offer a high potential +for designing de novo molecules. However, for them to be useful in real-life +drug development pipelines, these models should be able to design drug-like and +target-centric molecules. In this study, we propose an end-to-end generative +system, DrugGEN, for the de novo design of drug candidate molecules that +interact with intended target proteins. The proposed method represents +molecules as graphs and processes them via a generative adversarial network +comprising graph transformer layers. The system is trained using a large +dataset of drug-like compounds and target-specific bioactive molecules to +design effective inhibitory molecules against the AKT1 protein, which is +critically important in developing treatments for various types of cancer. We +conducted molecular docking and dynamics to assess the target-centric +generation performance of the model, as well as attention score visualisation +to examine model interpretability. Results indicate that our de novo molecules +have a high potential for interacting with the AKT1 protein at the level of its +native ligands. Using the open-access DrugGEN codebase, it is possible to +easily train models for other druggable proteins, given a dataset of +experimentally known bioactive molecules. + +
+
+
+
+
+ + ♻ ☆ Exploring Scaling Trends in LLM Robustness + + +
+ Language model capabilities predictably improve from scaling a model's size +and training data. Motivated by this, increasingly large language models have +been trained, yielding an array of impressive capabilities. Yet these models +are vulnerable to adversarial prompts, such as "jailbreaks" that hijack models +to perform undesired behaviors, posing a significant risk of misuse. Prior work +indicates that computer vision models become more robust with model and data +scaling, raising the question: does language model robustness also improve with +scale? We study this question empirically, finding that larger models respond +substantially better to adversarial training, but there is little to no benefit +from model scale in the absence of explicit defenses. + +
+
+ comment: 31 pages; edit fixed metadata typo (author name) +
+
+
+
+
+ + ♻ ☆ Quality Assured: Rethinking Annotation Strategies in Imaging AI ECCV 2024 + + +
+ This paper does not describe a novel method. Instead, it studies an essential +foundation for reliable benchmarking and ultimately real-world application of +AI-based image analysis: generating high-quality reference annotations. +Previous research has focused on crowdsourcing as a means of outsourcing +annotations. However, little attention has so far been given to annotation +companies, specifically regarding their internal quality assurance (QA) +processes. Therefore, our aim is to evaluate the influence of QA employed by +annotation companies on annotation quality and devise methodologies for +maximizing data annotation efficacy. Based on a total of 57,648 instance +segmented images obtained from a total of 924 annotators and 34 QA workers from +four annotation companies and Amazon Mechanical Turk (MTurk), we derived the +following insights: (1) Annotation companies perform better both in terms of +quantity and quality compared to the widely used platform MTurk. (2) Annotation +companies' internal QA only provides marginal improvements, if any. However, +improving labeling instructions instead of investing in QA can substantially +boost annotation performance. (3) The benefit of internal QA depends on +specific image characteristics. Our work could enable researchers to derive +substantially more value from a fixed annotation budget and change the way +annotation companies conduct internal QA. + +
+
+ comment: Accepted at ECCV 2024, preprint, Computer Vision, Data Annotation +
+
+
+
+
+ + ♻ ☆ Viewpoint Textual Inversion: Discovering Scene Representations and 3D + View Control in 2D Diffusion Models ECCV 2024 + + +
+ Text-to-image diffusion models generate impressive and realistic images, but +do they learn to represent the 3D world from only 2D supervision? We +demonstrate that yes, certain 3D scene representations are encoded in the text +embedding space of models like Stable Diffusion. Our approach, Viewpoint Neural +Textual Inversion (ViewNeTI), is to discover 3D view tokens; these tokens +control the 3D viewpoint - the rendering pose in a scene - of generated images. +Specifically, we train a small neural mapper to take continuous camera +viewpoint parameters and predict a view token (a word embedding). This token +conditions diffusion generation via cross-attention to produce images with the +desired camera viewpoint. Using ViewNeTI as an evaluation tool, we report two +findings: first, the text latent space has a continuous view-control manifold +for particular 3D scenes; second, we find evidence for a generalized +view-control manifold for all scenes. We conclude that since the view token +controls the 3D `rendering' viewpoint, there is likely a scene representation +embedded in frozen 2D diffusion models. Finally, we exploit the 3D scene +representations for 3D vision tasks, namely, view-controlled text-to-image +generation, and novel view synthesis from a single image, where our approach +sets state-of-the-art for LPIPS. Code available at +https://github.com/jmhb0/view_neti + +
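A minimal sketch of the neural mapper idea described above: a small MLP maps continuous camera viewpoint parameters to a token embedding, which would then be injected alongside the prompt tokens that condition a frozen diffusion model via cross-attention. The embedding size, camera parameterization, and network width are assumptions for illustration.

```python
# Hypothetical view-token mapper: camera parameters -> word-embedding-sized token.
import torch
import torch.nn as nn

class ViewTokenMapper(nn.Module):
    def __init__(self, cam_dim=6, hidden=128, token_dim=768):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(cam_dim, hidden), nn.SiLU(),
            nn.Linear(hidden, token_dim),
        )

    def forward(self, cam_params):         # e.g. (batch, 6): rotation + translation
        return self.mlp(cam_params)         # predicted view token

mapper = ViewTokenMapper()
view_token = mapper(torch.randn(1, 6))
# In the full pipeline, this token would be appended to the prompt's token
# embeddings so that cross-attention conditions generation on the camera pose.
```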
+
+ comment: ECCV 2024 (European Conference on Computer Vision). Project page: + https://jmhb0.github.io/view_neti/ +
+
+
+
+
+ + ♻ ☆ The SkipSponge Attack: Sponge Weight Poisoning of Deep Neural Networks + + +
+ Sponge attacks aim to increase the energy consumption and computation time of +neural networks deployed on hardware accelerators. Existing sponge attacks can +be performed during inference via sponge examples or during training via Sponge +Poisoning. Sponge examples leverage perturbations added to the model's input to +increase energy and latency, while Sponge Poisoning alters the objective +function of a model to induce inference-time energy effects. In this work, we +propose a novel sponge attack called SkipSponge. SkipSponge is the first sponge +attack that is performed directly on the parameters of a pre-trained model +using only a few data samples. Our experiments show that SkipSponge can +successfully increase the energy consumption of image classification models, +GANs, and autoencoders with fewer samples required than Sponge Poisoning. We +show that poisoning defenses are ineffective if not adjusted specifically for +the defense against SkipSponge (i.e., they decrease target layer bias values). +Our work shows that SkipSponge is more effective on the GANs and the +autoencoders than the state-of-the-art. Additionally, SkipSponge is stealthier +than the previous Sponge Poisoning attack as it does not require significant +changes in the victim model's weights. Our experiments indicate that the +SkipSponge attack can be performed even when an attacker has access to only 1% +of the entire dataset and reaches up to 13% energy increase. + +
+
+
+
+
+ + ♻ ☆ Real Time Multi Organ Classification on Computed Tomography Images + + +
+ Organ segmentation is a fundamental task in medical imaging since it is
+useful for many clinical automation pipelines. However, some tasks do not
+require full segmentation. Instead, a classifier can identify the selected
+organ without segmenting the entire volume. In this study, we demonstrate a
+classifier-based method to obtain organ labels in real time by using a large
+context size with a sparse data sampling strategy. Although our method operates
+as an independent classifier at query locations, it can generate full
+segmentations by querying grid locations at any resolution, offering faster
+performance than segmentation algorithms. We compared our method with existing
+segmentation techniques, demonstrating its superior runtime potential for
+practical applications in medical imaging.
+
+
+
+
+
+ + ♻ ☆ DistriBlock: Identifying adversarial audio samples by leveraging + characteristics of the output distribution + + +
+ Adversarial attacks can mislead automatic speech recognition (ASR) systems +into predicting an arbitrary target text, thus posing a clear security threat. +To prevent such attacks, we propose DistriBlock, an efficient detection +strategy applicable to any ASR system that predicts a probability distribution +over output tokens in each time step. We measure a set of characteristics of +this distribution: the median, maximum, and minimum over the output +probabilities, the entropy of the distribution, as well as the Kullback-Leibler +and the Jensen-Shannon divergence with respect to the distributions of the +subsequent time step. Then, by leveraging the characteristics observed for both +benign and adversarial data, we apply binary classifiers, including simple +threshold-based classification, ensembles of such classifiers, and neural +networks. Through extensive analysis across different state-of-the-art ASR +systems and language data sets, we demonstrate the supreme performance of this +approach, with a mean area under the receiver operating characteristic curve +for distinguishing target adversarial examples against clean and noisy data of +99% and 97%, respectively. To assess the robustness of our method, we show that +adaptive adversarial examples that can circumvent DistriBlock are much noisier, +which makes them easier to detect through filtering and creates another avenue +for preserving the system's robustness. + +
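The per-time-step characteristics listed above (median, maximum, minimum, entropy, and KL/JS divergence to the next step's distribution) can be computed directly from the ASR model's output distributions; the snippet below sketches only this feature-extraction step, with random distributions standing in for real model outputs, and leaves the downstream binary classifier out.

```python
# Feature extraction over per-step output distributions, as used for detection.
import numpy as np

def step_features(p_t, p_next, eps=1e-12):
    """p_t, p_next: probability vectors over output tokens at steps t and t+1."""
    p, q = p_t + eps, p_next + eps
    m = 0.5 * (p + q)
    kl = float(np.sum(p * np.log(p / q)))
    js = 0.5 * float(np.sum(p * np.log(p / m))) + 0.5 * float(np.sum(q * np.log(q / m)))
    return {
        "median": float(np.median(p_t)),
        "max": float(np.max(p_t)),
        "min": float(np.min(p_t)),
        "entropy": float(-np.sum(p * np.log(p))),
        "kl_to_next": kl,
        "js_to_next": js,
    }

# Example with random distributions over a 100-token vocabulary.
rng = np.random.default_rng(0)
p_t, p_next = rng.dirichlet(np.ones(100)), rng.dirichlet(np.ones(100))
print(step_features(p_t, p_next))
```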
+
+
+
+
+ + ♻ ☆ When Meta-Learning Meets Online and Continual Learning: A Survey + + +
+ Over the past decade, deep neural networks have demonstrated significant +success using the training scheme that involves mini-batch stochastic gradient +descent on extensive datasets. Expanding upon this accomplishment, there has +been a surge in research exploring the application of neural networks in other +learning scenarios. One notable framework that has garnered significant +attention is meta-learning. Often described as "learning to learn," +meta-learning is a data-driven approach to optimize the learning algorithm. +Other branches of interest are continual learning and online learning, both of +which involve incrementally updating a model with streaming data. While these +frameworks were initially developed independently, recent works have started +investigating their combinations, proposing novel problem settings and learning +algorithms. However, due to the elevated complexity and lack of unified +terminology, discerning differences between the learning frameworks can be +challenging even for experienced researchers. To facilitate a clear +understanding, this paper provides a comprehensive survey that organizes +various problem settings using consistent terminology and formal descriptions. +By offering an overview of these learning paradigms, our work aims to foster +further advancements in this promising area of research. + +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis: Domains, + Methods, and Trends + + +
+ Aspect-based Sentiment Analysis (ABSA) is a fine-grained type of sentiment +analysis that identifies aspects and their associated opinions from a given +text. With the surge of digital opinionated text data, ABSA gained increasing +popularity for its ability to mine more detailed and targeted insights. Many +review papers on ABSA subtasks and solution methodologies exist, however, few +focus on trends over time or systemic issues relating to research application +domains, datasets, and solution approaches. To fill the gap, this paper +presents a Systematic Literature Review (SLR) of ABSA studies with a focus on +trends and high-level relationships among these fundamental components. This +review is one of the largest SLRs on ABSA. To our knowledge, it is also the +first to systematically examine the interrelations among ABSA research and data +distribution across domains, as well as trends in solution paradigms and +approaches. Our sample includes 727 primary studies screened from 8550 search +results without time constraints via an innovative automatic filtering process. +Our quantitative analysis not only identifies trends in nearly two decades of +ABSA research development but also unveils a systemic lack of dataset and +domain diversity as well as domain mismatch that may hinder the development of +future ABSA research. We discuss these findings and their implications and +propose suggestions for future research. + +
+
+
+
+
+ + ♻ ☆ MDS-ED: Multimodal Decision Support in the Emergency Department -- a + Benchmark Dataset for Diagnoses and Deterioration Prediction in Emergency + Medicine + + +
+ Background: Benchmarking medical decision support algorithms often struggles +due to limited access to datasets, narrow prediction tasks, and restricted +input modalities. These limitations affect their clinical relevance and +performance in high-stakes areas like emergency care, complicating replication, +validation, and improvement of benchmarks. + Methods: We introduce a dataset based on MIMIC-IV, benchmarking protocol, and +initial results for evaluating multimodal decision support in the emergency +department (ED). We use diverse data modalities from the first 1.5 hours of +patient arrival, including demographics, biometrics, vital signs, lab values, +and electrocardiogram waveforms. We analyze 1443 clinical labels across two +contexts: predicting diagnoses with ICD-10 codes and forecasting patient +deterioration. + Results: Our multimodal diagnostic model achieves an AUROC score over 0.8 in +a statistically significant manner for 357 out of 1428 conditions, including +cardiac issues like myocardial infarction and non-cardiac conditions such as +renal disease and diabetes. The deterioration model scores above 0.8 in a +statistically significant manner for 13 out of 15 targets, including critical +events like cardiac arrest and mechanical ventilation, ICU admission as well as +short- and long-term mortality. Incorporating raw waveform data significantly +improves model performance, which represents one of the first robust +demonstrations of this effect. + Conclusions: This study highlights the uniqueness of our dataset, which +encompasses a wide range of clinical tasks and utilizes a comprehensive set of +features collected early during the emergency after arriving at the ED. The +strong performance, as evidenced by high AUROC scores across diagnostic and +deterioration targets, underscores the potential of our approach to +revolutionize decision-making in acute and emergency medicine. + +
+
+ comment: 14 pages, 1 figure, code available under + https://github.com/AI4HealthUOL/MDS-ED +
+
+
+
+
+ + ♻ ☆ Characterizing Continual Learning Scenarios and Strategies for Audio + Analysis + + +
+ Audio analysis is useful in many application scenarios. State-of-the-art
+audio analysis approaches assume that the data distribution at training and
+deployment time will be the same. However, due to various real-life challenges,
+the data distribution may drift, or new classes may appear over time. Thus, a
+model trained only once might not perform adequately. Continual learning (CL)
+approaches are devised to handle such changes in data distribution. There have
+been a few attempts to use CL approaches for audio analysis. Yet, there is a
+lack of a systematic evaluation framework. In this paper, we create a
+comprehensive CL dataset and characterize CL approaches for audio-based
+monitoring tasks. We have investigated the following CL and non-CL approaches:
+EWC, LwF, SI, GEM, A-GEM, GDumb, Replay, Naive, Cumulative, and Joint training.
+The study is beneficial for researchers and practitioners working in the area
+of audio analysis who are developing adaptive models. We observed that Replay
+achieved better results than other methods on the DCASE challenge data. It
+achieved an accuracy of 70.12% for the domain incremental scenario and an
+accuracy of 96.98% for the class incremental scenario.
+
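For context on the Replay strategy that performed best here, the sketch below shows a reservoir-style replay buffer mixed into each training batch, assuming PyTorch; the buffer capacity and mixing ratio are illustrative choices rather than the study's settings.

```python
# Minimal replay-based continual learning sketch (illustrative hyperparameters).
import random
import torch
import torch.nn.functional as F

class ReplayBuffer:
    def __init__(self, capacity=1000):
        self.capacity, self.data, self.seen = capacity, [], 0

    def add(self, x, y):
        # Reservoir sampling keeps a uniform sample of everything seen so far.
        self.seen += 1
        if len(self.data) < self.capacity:
            self.data.append((x, y))
        else:
            j = random.randrange(self.seen)
            if j < self.capacity:
                self.data[j] = (x, y)

    def sample(self, n):
        batch = random.sample(self.data, min(n, len(self.data)))
        xs, ys = zip(*batch)
        return torch.stack(xs), torch.stack(ys)

def train_step(model, opt, buffer, x_new, y_new, replay_n=32):
    # Mix replayed examples into every batch of new data to limit forgetting.
    if buffer.data:
        x_old, y_old = buffer.sample(replay_n)
        x, y = torch.cat([x_new, x_old]), torch.cat([y_new, y_old])
    else:
        x, y = x_new, y_new
    opt.zero_grad()
    F.cross_entropy(model(x), y).backward()
    opt.step()
    for xi, yi in zip(x_new, y_new):
        buffer.add(xi, yi)
```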
+
+
+
+
+ + ♻ ☆ Machine learning for structure-guided materials and process design + + +
+ In recent years, there has been a growing interest in accelerated materials +innovation in the context of the process-structure-property chain. In this +regard, it is essential to take into account manufacturing processes and tailor +materials design approaches to support downstream process design approaches. As +a major step into this direction, we present a holistic optimization approach +that covers the entire process-structure-property chain in materials +engineering. Our approach specifically employs machine learning to address two +critical identification problems: a materials design problem, which involves +identifying near-optimal material structures that exhibit desired properties, +and a process design problem that is to find an optimal processing path to +manufacture these structures. Both identification problems are typically +ill-posed, which presents a significant challenge for solution approaches. +However, the non-unique nature of these problems offers an important advantage +for processing: By having several target structures that perform similarly +well, processes can be efficiently guided towards manufacturing the best +reachable structure. The functionality of the approach will be demonstrated +manufacturing crystallographic textures with desired properties in a metal +forming process. + +
+
+
+
+
+ + ♻ ☆ Enhancing Solutions for Complex PDEs: Introducing Complementary + Convolution and Equivariant Attention in Fourier Neural Operators + + +
+ Neural operators improve conventional neural networks by expanding their
+capabilities of functional mappings between different function spaces to solve
+partial differential equations (PDEs). One of the most notable methods is the
+Fourier Neural Operator (FNO), which draws inspiration from Green's function
+method and directly approximates operator kernels in the frequency domain.
+However, after empirical observation followed by theoretical validation, we
+demonstrate that the FNO approximates kernels primarily in a relatively
+low-frequency domain. This suggests a limited capability in solving complex
+PDEs, particularly those characterized by rapid coefficient changes and
+oscillations in the solution space. Such cases are crucial in specific
+scenarios, like atmospheric convection and ocean circulation. To address this
+challenge, inspired by the translation equivariance of the convolution kernel,
+we propose a novel hierarchical Fourier neural operator along with
+convolution-residual layers and attention mechanisms to make them complementary
+in the frequency domain to solve complex PDEs. We perform experiments on
+forward and inverse problems of multiscale elliptic equations, Navier-Stokes
+equations, and other physical scenarios, and find that the proposed method
+achieves superior performance in these PDE benchmarks, especially for equations
+characterized by rapid coefficient variations.
+
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Socially Integrated Navigation: A Social Acting Robot with Deep + Reinforcement Learning IROS + + +
+ Mobile robots are being used on a large scale in various crowded situations
+and are becoming part of our society. Socially acceptable navigation behavior
+of a mobile robot with individual human consideration is an essential
+requirement for scalable applications and human acceptance. Deep Reinforcement
+Learning (DRL) approaches have recently been used to learn a robot's navigation
+policy and to model the complex interactions between robots and humans. We
+propose to divide existing DRL-based navigation approaches based on the robot's
+exhibited social behavior and distinguish between social collision avoidance
+with a lack of social behavior and socially aware approaches with explicit
+predefined social behavior. In addition, we propose a novel socially integrated
+navigation approach where the robot's social behavior is adaptive and emerges
+from the interaction with humans. The formulation of our approach is derived
+from a sociological definition, which states that social acting is oriented
+toward the acting of others. The DRL policy is trained in an environment where
+other agents interact in a socially integrated manner and reward the robot's
+behavior individually. The simulation results indicate that the proposed
+socially integrated navigation approach outperforms a socially aware approach
+in terms of ego navigation performance while significantly reducing the
+negative impact on all agents within the environment.
+
+
+ comment: Accepted at 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS) +
+
+
+
+
+ + ♻ ☆ AIR-Bench: Benchmarking Large Audio-Language Models via Generative + Comprehension ACL + 2024 + + +
+ Recently, instruction-following audio-language models have received broad +attention for human-audio interaction. However, the absence of benchmarks +capable of evaluating audio-centric interaction capabilities has impeded +advancements in this field. Previous models primarily focus on assessing +different fundamental tasks, such as Automatic Speech Recognition (ASR), and +lack an assessment of the open-ended generative capabilities centered around +audio. Thus, it is challenging to track the progression in the Large +Audio-Language Models (LALMs) domain and to provide guidance for future +improvement. In this paper, we introduce AIR-Bench (\textbf{A}udio +\textbf{I}nst\textbf{R}uction \textbf{Bench}mark), the first benchmark designed +to evaluate the ability of LALMs to understand various types of audio signals +(including human speech, natural sounds, and music), and furthermore, to +interact with humans in the textual format. AIR-Bench encompasses two +dimensions: \textit{foundation} and \textit{chat} benchmarks. The former +consists of 19 tasks with approximately 19k single-choice questions, intending +to inspect the basic single-task ability of LALMs. The latter one contains 2k +instances of open-ended question-and-answer data, directly assessing the +comprehension of the model on complex audio and its capacity to follow +instructions. Both benchmarks require the model to generate hypotheses +directly. We design a unified framework that leverages advanced language +models, such as GPT-4, to evaluate the scores of generated hypotheses given the +meta-information of the audio. Experimental results demonstrate a high level of +consistency between GPT-4-based evaluation and human evaluation. By revealing +the limitations of existing LALMs through evaluation results, AIR-Bench can +provide insights into the direction of future research. + +
+
+ comment: Code and Data: https://github.com/OFA-Sys/AIR-Bench. Accepted by ACL + 2024 +
+
+
+
+
+ + ♻ ☆ Credit Card Fraud Detection Using Advanced Transformer Model + + +
+ With the proliferation of various online and mobile payment systems, credit
+card fraud has emerged as a significant threat to financial security. This
+study focuses on innovative applications of the latest Transformer models for
+more robust and precise fraud detection. To ensure the reliability of the data,
+we meticulously processed the data sources and balanced the dataset to
+substantially mitigate data sparsity. We also selected highly correlated
+vectors to strengthen the training process. To guarantee the reliability and
+practicality of the new Transformer model, we conducted performance comparisons
+with several widely adopted models, including Support Vector Machine (SVM),
+Random Forest, Neural Network, and Logistic Regression. We rigorously compared
+these models using metrics such as Precision, Recall, and F1 Score. Through
+these detailed analyses and comparisons, we present to the readers a highly
+efficient and powerful anti-fraud mechanism with promising prospects. The
+results demonstrate that the Transformer model not only excels in traditional
+applications but also shows great potential in niche areas like fraud
+detection, offering a substantial advancement in the field.
+
+
+ comment: This paper has been received by https://ieee-metacom.org/
+
+
+
+
+ 
          ♻ ☆ Advanced Payment Security System: XGBoost, LightGBM and SMOTE Integrated
 
+ With the rise of various online and mobile payment systems, transaction fraud +has become a significant threat to financial security. This study explores the +application of advanced machine learning models, specifically based on XGBoost +and LightGBM, for developing a more accurate and robust Payment Security +Protection Model. To enhance data reliability, we meticulously processed the +data sources and applied SMOTE (Synthetic Minority Over-sampling Technique) to +address class imbalance and improve data representation. By selecting highly +correlated features, we aimed to strengthen the training process and boost +model performance. We conducted thorough performance evaluations of our +proposed models, comparing them against traditional methods including Random +Forest, Neural Network, and Logistic Regression. Using metrics such as +Precision, Recall, and F1 Score, we rigorously assessed their effectiveness. +Our detailed analyses and comparisons reveal that the combination of SMOTE with +XGBoost and LightGBM offers a highly efficient and powerful mechanism for +payment security protection. Moreover, the integration of XGBoost and LightGBM +in a Local Ensemble model further demonstrated outstanding performance. After +incorporating SMOTE, the new combined model achieved a significant improvement +of nearly 6\% over traditional models and around 5\% over its sub-models, +showcasing remarkable results. + +
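+ As a rough, self-contained sketch of the recipe described above (the synthetic
+data, hyperparameters, and ensembling rule are placeholders, not the authors'
+setup), the training split can be rebalanced with SMOTE and the probabilities of
+XGBoost and LightGBM averaged into a simple soft-voting ensemble:
+
+ from imblearn.over_sampling import SMOTE
+ from lightgbm import LGBMClassifier
+ from sklearn.datasets import make_classification
+ from sklearn.metrics import f1_score
+ from sklearn.model_selection import train_test_split
+ from xgboost import XGBClassifier
+
+ # Imbalanced toy data standing in for a transaction table (97% legitimate).
+ X, y = make_classification(n_samples=5000, n_features=20, weights=[0.97], random_state=0)
+ X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
+
+ # Rebalance only the training split, never the test split.
+ X_res, y_res = SMOTE(random_state=0).fit_resample(X_train, y_train)
+
+ # Soft-voting ensemble of the two gradient-boosting models.
+ xgb = XGBClassifier(n_estimators=300, learning_rate=0.1).fit(X_res, y_res)
+ lgbm = LGBMClassifier(n_estimators=300, learning_rate=0.1).fit(X_res, y_res)
+ proba = (xgb.predict_proba(X_test)[:, 1] + lgbm.predict_proba(X_test)[:, 1]) / 2
+ print("F1:", f1_score(y_test, proba > 0.5))
+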
+
+ comment: This paper has been received by https://ieee-metacom.org
+
+
+
+
+ + ♻ ☆ YZS-model: A Predictive Model for Organic Drug Solubility Based on Graph + Convolutional Networks and Transformer-Attention + + +
+ Accurate prediction of drug molecule solubility is crucial for therapeutic +effectiveness and safety. Traditional methods often miss complex molecular +structures, leading to inaccuracies. We introduce the YZS-Model, a deep +learning framework integrating Graph Convolutional Networks (GCN), Transformer +architectures, and Long Short-Term Memory (LSTM) networks to enhance prediction +precision. GCNs excel at capturing intricate molecular topologies by modeling +the relationships between atoms and bonds. Transformers, with their +self-attention mechanisms, effectively identify long-range dependencies within +molecules, capturing global interactions. LSTMs process sequential data, +preserving long-term dependencies and integrating temporal information within +molecular sequences. This multifaceted approach leverages the strengths of each +component, resulting in a model that comprehensively understands and predicts +molecular properties. Trained on 9,943 compounds and tested on an anticancer +dataset, the YZS-Model achieved an $R^2$ of 0.59 and an RMSE of 0.57, +outperforming benchmark models ($R^2$ of 0.52 and RMSE of 0.61). In an +independent test, it demonstrated an RMSE of 1.05, improving accuracy by 45.9%. +The integration of these deep learning techniques allows the YZS-Model to learn +valuable features from complex data without predefined parameters, handle large +datasets efficiently, and adapt to various molecular types. This comprehensive +capability significantly improves predictive accuracy and model +generalizability. Its precision in solubility predictions can expedite drug +development by optimizing candidate selection, reducing costs, and enhancing +efficiency. Our research underscores deep learning's transformative potential +in pharmaceutical science, particularly for solubility prediction and drug +design. + +
+
+ comment: 23 pages, 16 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ A Correlation-induced Finite Difference Estimator + + +
+ Estimating stochastic gradients is pivotal in fields like service systems
+within operations research. The classical method for this estimation is the
+finite difference approximation, which entails generating samples at perturbed
+inputs. Nonetheless, practical challenges persist in determining the
+perturbation and obtaining an optimal finite difference estimator in the sense
+of possessing the smallest mean squared error (MSE). To tackle this problem, we
+propose a double sample-recycling approach in this paper. First, pilot samples
+are recycled to estimate the optimal perturbation. Second, recycling these
+pilot samples again and generating new samples at the estimated perturbation
+leads to an efficient finite difference estimator. We analyze its bias,
+variance and MSE. Our analyses demonstrate a reduction in asymptotic variance,
+and in some cases, a decrease in asymptotic bias, compared to the optimal
+finite difference estimator. Therefore, our proposed estimator consistently
+coincides with, or even outperforms, the optimal finite difference estimator.
+In numerical experiments, we apply the estimator in several examples, and the
+numerical results demonstrate its robustness, as well as agreement with the
+theory presented, especially in the case of small sample sizes.
+
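+ A toy stand-in for the general idea (not the paper's estimator): a small pilot
+run picks the perturbation of a central finite-difference estimator by trading
+off sampling variance against a crude empirical bias proxy, and the gradient is
+then re-estimated at the chosen perturbation.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ noisy_f = lambda x: np.sin(x) + rng.normal(scale=0.1)  # stochastic simulator
+
+ def central_fd(x, h, n):
+     g = np.array([(noisy_f(x + h) - noisy_f(x - h)) / (2 * h) for _ in range(n)])
+     return g.mean(), g.var() / n  # estimate and variance of the mean
+
+ # Pilot stage: the smallest-h pilot estimate serves as a rough bias reference.
+ hs = [0.5, 0.2, 0.1, 0.05, 0.02]
+ pilot = {h: central_fd(1.0, h, 50) for h in hs}
+ ref = pilot[min(hs)][0]
+ h_star = min(hs, key=lambda h: pilot[h][1] + (pilot[h][0] - ref) ** 2)
+
+ # Final stage: re-estimate at the selected perturbation.
+ print("h* =", h_star, "estimate =", central_fd(1.0, h_star, 500)[0], "truth =", np.cos(1.0))
+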
+
+
+
+
+ + ♻ ☆ Enhancing Peak Assignment in 13C NMR Spectroscopy: A Novel Approach + Using Multimodal Alignment + + +
+ Nuclear magnetic resonance (NMR) spectroscopy plays an essential role in +deciphering molecular structure and dynamic behaviors. While AI-enhanced NMR +prediction models hold promise, challenges still persist in tasks such as +molecular retrieval, isomer recognition, and peak assignment. In response, this +paper introduces a novel solution, Multi-Level Multimodal Alignment with +Knowledge-Guided Instance-Wise Discrimination (K-M3AID), which establishes +correspondences between two heterogeneous modalities: molecular graphs and NMR +spectra. K-M3AID employs a dual-coordinated contrastive learning architecture +with three key modules: a graph-level alignment module, a node-level alignment +module, and a communication channel. Notably, K-M3AID introduces +knowledge-guided instance-wise discrimination into contrastive learning within +the node-level alignment module. In addition, K-M3AID demonstrates that skills +acquired during node-level alignment have a positive impact on graph-level +alignment, acknowledging meta-learning as an inherent property. Empirical +validation underscores K-M3AID's effectiveness in multiple zero-shot tasks. + +
+
+
+
+
+ + ♻ ☆ Online Differentially Private Synthetic Data Generation + + +
+ We present a polynomial-time algorithm for online differentially private +synthetic data generation. For a data stream within the hypercube $[0,1]^d$ and +an infinite time horizon, we develop an online algorithm that generates a +differentially private synthetic dataset at each time $t$. This algorithm +achieves a near-optimal accuracy bound of $O(\log(t)t^{-1/d})$ for $d\geq 2$ +and $O(\log^{4.5}(t)t^{-1})$ for $d=1$ in the 1-Wasserstein distance. This +result extends the previous work on the continual release model for counting +queries to Lipschitz queries. Compared to the offline case, where the entire +dataset is available at once, our approach requires only an extra polylog +factor in the accuracy bound. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ FiLo: Zero-Shot Anomaly Detection by Fine-Grained Description and + High-Quality Localization ACM MM 2024 + + +
+ Zero-shot anomaly detection (ZSAD) methods entail detecting anomalies +directly without access to any known normal or abnormal samples within the +target item categories. Existing approaches typically rely on the robust +generalization capabilities of multimodal pretrained models, computing +similarities between manually crafted textual features representing "normal" or +"abnormal" semantics and image features to detect anomalies and localize +anomalous patches. However, the generic descriptions of "abnormal" often fail +to precisely match diverse types of anomalies across different object +categories. Additionally, computing feature similarities for single patches +struggles to pinpoint specific locations of anomalies with various sizes and +scales. To address these issues, we propose a novel ZSAD method called FiLo, +comprising two components: adaptively learned Fine-Grained Description (FG-Des) +and position-enhanced High-Quality Localization (HQ-Loc). FG-Des introduces +fine-grained anomaly descriptions for each category using Large Language Models +(LLMs) and employs adaptively learned textual templates to enhance the accuracy +and interpretability of anomaly detection. HQ-Loc, utilizing Grounding DINO for +preliminary localization, position-enhanced text prompts, and Multi-scale +Multi-shape Cross-modal Interaction (MMCI) module, facilitates more accurate +localization of anomalies of different sizes and shapes. Experimental results +on datasets like MVTec and VisA demonstrate that FiLo significantly improves +the performance of ZSAD in both detection and localization, achieving +state-of-the-art performance with an image-level AUC of 83.9% and a pixel-level +AUC of 95.9% on the VisA dataset. Code is available at +https://github.com/CASIA-IVA-Lab/FiLo. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Longhorn: State Space Models are Amortized Online Learners + + +
+ The most fundamental capability of modern AI methods such as Large Language
+Models (LLMs) is the ability to predict the next token in a long sequence of
+tokens, known as "sequence modeling." Although the Transformer model is
+currently the dominant approach to sequence modeling, its quadratic
+computational cost with respect to sequence length is a significant drawback.
+State-space models (SSMs) offer a promising alternative due to their linear
+decoding efficiency and high parallelizability during training. However,
+existing SSMs often rely on seemingly ad hoc linear recurrence designs. In this
+work, we explore SSM design through the lens of online learning,
+conceptualizing SSMs as meta-modules for specific online learning problems.
+This approach links SSM design to formulating precise online learning
+objectives, with state transition rules derived from optimizing these
+objectives. Based on this insight, we introduce a novel deep SSM architecture
+based on the implicit update for optimizing an online regression objective. Our
+experimental results show that our models outperform state-of-the-art SSMs,
+including the Mamba model, on standard sequence modeling benchmarks and
+language modeling tasks.
+
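+ A toy illustration of the online-learning reading of state-space layers (this
+is not the Longhorn architecture; sizes and the learning rate are arbitrary): a
+delta-rule update that solves an online regression objective is itself a linear
+recurrence over a state matrix, which is what an SSM computes across the
+sequence.
+
+ import torch
+
+ d_k, d_v, T = 8, 4, 32
+ S = torch.zeros(d_v, d_k)                    # recurrent state
+ keys, values = torch.randn(T, d_k), torch.randn(T, d_v)
+ lr = 0.1
+ for k, v in zip(keys, values):
+     pred = S @ k                             # read-out with the current state
+     S = S + lr * torch.outer(v - pred, k)    # online regression (delta-rule) step
+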
+
+
+
+
+ + ♻ ☆ Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains ICML 2024 + + +
+ Large Language Models (LLMs) have demonstrated remarkable proficiency in +understanding and generating natural language. However, their capabilities wane +in highly specialized domains underrepresented in the pretraining corpus, such +as physical and biomedical sciences. This work explores how to repurpose +general LLMs into effective task solvers for specialized domains. We introduce +a novel, model-agnostic framework for learning custom input tags, which are +parameterized as continuous vectors appended to the LLM's embedding layer, to +condition the LLM. We design two types of input tags: domain tags are used to +delimit specialized representations (e.g., chemical formulas) and provide +domain-relevant context; function tags are used to represent specific functions +(e.g., predicting molecular properties) and compress function-solving +instructions. We develop a three-stage protocol to learn these tags using +auxiliary data and domain knowledge. By explicitly disentangling task domains +from task functions, our method enables zero-shot generalization to unseen +problems through diverse combinations of the input tags. It also boosts LLM's +performance in various specialized domains, such as predicting protein or +chemical properties and modeling drug-target interactions, outperforming expert +models tailored to these tasks. + +
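+ One plausible minimal realization of the input-tag idea (module and parameter
+names are assumptions, not the released implementation): tags are trainable
+vectors prepended to the token embeddings before the transformer body, so a
+domain tag and a function tag can be combined freely at inference time.
+
+ import torch
+ import torch.nn as nn
+
+ class InputTags(nn.Module):
+     def __init__(self, n_tags, d_model):
+         super().__init__()
+         self.tags = nn.Parameter(torch.randn(n_tags, d_model) * 0.02)
+
+     def forward(self, token_embeds, tag_ids):
+         # token_embeds: (batch, seq, d); tag_ids: indices of tags to prepend
+         tag_embeds = self.tags[tag_ids].unsqueeze(0).expand(token_embeds.size(0), -1, -1)
+         return torch.cat([tag_embeds, token_embeds], dim=1)
+
+ tags = InputTags(n_tags=4, d_model=64)
+ out = tags(torch.randn(2, 10, 64), tag_ids=[0, 3])  # e.g. a domain tag plus a function tag
+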
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Exploring the Limitations of Kolmogorov-Arnold Networks in + Classification: Insights to Software Training and Hardware Implementation + + +
+ Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have
+recently gained popularity and attention due to their ability to substitute
+multi-layer perceptrons (MLPs) in artificial intelligence (AI) with higher
+accuracy and interpretability. However, KAN assessment is still limited and
+cannot provide an in-depth analysis of a specific domain. Furthermore, no study
+has been conducted on the implementation of KANs in hardware design, which
+would directly demonstrate whether KANs are truly superior to MLPs in practical
+applications. As a result, in this paper, we focus on verifying KANs for
+classification issues, which are a common but significant topic in AI, using
+four different types of datasets. Furthermore, the corresponding hardware
+implementation is considered using the Vitis high-level synthesis (HLS) tool.
+To the best of our knowledge, this is the first article to implement hardware
+for KAN. The results indicate that KANs cannot achieve more accuracy than MLPs
+on highly complex datasets while utilizing substantially higher hardware
+resources. Therefore, MLP remains an effective approach for achieving accuracy
+and efficiency in software and hardware implementation.
+
+
+ comment: 6 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ HMM for Discovering Decision-Making Dynamics Using Reinforcement + Learning Experiments + + +
+ Major depressive disorder (MDD) presents challenges in diagnosis and
+treatment due to its complex and heterogeneous nature. Emerging evidence
+indicates that reward processing abnormalities may serve as a behavioral marker
+for MDD. To measure reward processing, patients perform computer-based
+behavioral tasks that involve making choices or responding to stimuli that are
+associated with different outcomes. Reinforcement learning (RL) models are
+fitted to extract parameters that measure various aspects of reward processing
+to characterize how patients make decisions in behavioral tasks. Recent
+findings suggest the inadequacy of characterizing reward learning solely based
+on a single RL model; instead, there may be a switching of decision-making
+processes between multiple strategies. An important scientific question is how
+the dynamics of learning strategies in decision-making affect the reward
+learning ability of individuals with MDD. Motivated by the probabilistic reward
+task (PRT) within the EMBARC study, we propose a novel RL-HMM framework for
+analyzing reward-based decision-making. Our model accommodates learning
+strategy switching between two distinct approaches under a hidden Markov model
+(HMM): subjects making decisions based on the RL model or opting for random
+choices. We account for continuous RL state space and allow time-varying
+transition probabilities in the HMM. We introduce a computationally efficient
+EM algorithm for parameter estimation and employ a nonparametric bootstrap for
+inference. We apply our approach to the EMBARC study to show that MDD patients
+are less engaged in RL compared to the healthy controls, and engagement is
+associated with brain activities in the negative affect circuitry during an
+emotional conflict task.
+
+
+
+
+
+ + ♻ ☆ LOLA: LLM-Assisted Online Learning Algorithm for Content Experiments + + +
+ In the rapidly evolving digital content landscape, media firms and news +publishers require automated and efficient methods to enhance user engagement. +This paper introduces the LLM-Assisted Online Learning Algorithm (LOLA), a +novel framework that integrates Large Language Models (LLMs) with adaptive +experimentation to optimize content delivery. Leveraging a large-scale dataset +from Upworthy, which includes 17,681 headline A/B tests, we first investigate +three pure-LLM approaches: prompt-based methods, embedding-based classification +models, and fine-tuned open-source LLMs. We find that prompt-based approaches +perform poorly, achieving no more than 65\% accuracy in identifying the +catchier headline. In contrast, both OpenAI-embedding-based classification +models and fine-tuned Llama-3 with 8 billion parameters achieve an accuracy of +around 82-84\%. We then introduce LOLA, which combines the best pure-LLM +approach with the Upper Confidence Bound algorithm to allocate traffic and +maximize clicks adaptively. Our numerical experiments on Upworthy data show +that LOLA outperforms the standard A/B test method (the current status quo at +Upworthy), pure bandit algorithms, and pure-LLM approaches, particularly in +scenarios with limited experimental traffic. Our approach is scalable and +applicable to content experiments across various settings where firms seek to +optimize user engagement, including digital advertising and social media +recommendations. + +
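+ An illustrative sketch of the allocation step only (the LLM prior and click
+feedback are simulated stand-ins, not Upworthy data): an Upper Confidence Bound
+rule warm-started with LLM-derived scores treated as pseudo-observations for
+each headline.
+
+ import math, random
+
+ def lola_ucb(llm_scores, true_ctr, horizon=10000, c=2.0, prior_weight=50):
+     K = len(llm_scores)
+     clicks = [s * prior_weight for s in llm_scores]  # LLM prior as pseudo-clicks
+     pulls = [prior_weight] * K
+     for t in range(1, horizon + 1):
+         ucb = [clicks[k] / pulls[k] + c * math.sqrt(math.log(t) / pulls[k])
+                for k in range(K)]
+         k = max(range(K), key=lambda i: ucb[i])
+         clicks[k] += int(random.random() < true_ctr[k])  # simulated click feedback
+         pulls[k] += 1
+     return pulls  # traffic allocated to each headline
+
+ print(lola_ucb(llm_scores=[0.02, 0.05, 0.03], true_ctr=[0.02, 0.06, 0.03]))
+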
+
+
+
+
+ + ♻ ☆ On Convergence Analysis of Policy Iteration Algorithms for + Entropy-Regularized Stochastic Control Problems + + +
+ In this paper we investigate issues regarding the convergence of the Policy
+Iteration Algorithm (PIA) for a class of general continuous-time
+entropy-regularized stochastic control problems. In particular, instead of
+employing sophisticated PDE estimates for the iterative PDEs involved in the
+PIA (see, e.g., Huang-Wang-Zhou (2023)), we shall provide a simple proof from
+scratch for the convergence of the PIA. Our approach builds on probabilistic
+representation formulae for solutions of PDEs and their derivatives. Moreover,
+in the infinite horizon model with a large discount factor and in the finite
+horizon model, similar arguments lead to an exponential rate of convergence of
+the PIA without tears. Finally, with some extra effort we show that our
+approach can also be extended to the case when the diffusion contains the
+control, in the one-dimensional setting but without much extra constraint on
+the coefficients. We believe that these results are new in the literature.
+
+
+ comment: In this version, we have added results on convergence and rate of + convergence for the diffusion control problem in the scalar case +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Every Part Matters: Integrity Verification of Scientific Figures Based + on Multimodal Large Language Models + + +
+ This paper tackles a key issue in the interpretation of scientific figures: +the fine-grained alignment of text and figures. It advances beyond prior +research that primarily dealt with straightforward, data-driven visualizations +such as bar and pie charts and only offered a basic understanding of diagrams +through captioning and classification. We introduce a novel task, Figure +Integrity Verification, designed to evaluate the precision of technologies in +aligning textual knowledge with visual elements in scientific figures. To +support this, we develop a semi-automated method for constructing a large-scale +dataset, Figure-seg, specifically designed for this task. Additionally, we +propose an innovative framework, Every Part Matters (EPM), which leverages +Multimodal Large Language Models (MLLMs) to not only incrementally improve the +alignment and verification of text-figure integrity but also enhance integrity +through analogical reasoning. Our comprehensive experiments show that these +innovations substantially improve upon existing methods, allowing for more +precise and thorough analysis of complex scientific figures. This progress not +only enhances our understanding of multimodal technologies but also stimulates +further research and practical applications across fields requiring the +accurate interpretation of complex visual data. + +
+
+ comment: 28 pages, 11 figures, under review +
+
+
+
+
+ + ☆ LookupForensics: A Large-Scale Multi-Task Dataset for Multi-Phase + Image-Based Fact Verification + + +
+ Amid the proliferation of forged images, notably the tsunami of deepfake +content, extensive research has been conducted on using artificial intelligence +(AI) to identify forged content in the face of continuing advancements in +counterfeiting technologies. We have investigated the use of AI to provide the +original authentic image after deepfake detection, which we believe is a +reliable and persuasive solution. We call this "image-based automated fact +verification," a name that originated from a text-based fact-checking system +used by journalists. We have developed a two-phase open framework that +integrates detection and retrieval components. Additionally, inspired by a +dataset proposed by Meta Fundamental AI Research, we further constructed a +large-scale dataset that is specifically designed for this task. This dataset +simulates real-world conditions and includes both content-preserving and +content-aware manipulations that present a range of difficulty levels and have +potential for ongoing research. This multi-task dataset is fully annotated, +enabling it to be utilized for sub-tasks within the forgery identification and +fact retrieval domains. This paper makes two main contributions: (1) We +introduce a new task, "image-based automated fact verification," and present a +novel two-phase open framework combining "forgery identification" and "fact +retrieval." (2) We present a large-scale dataset tailored for this new task +that features various hand-crafted image edits and machine learning-driven +manipulations, with extensive annotations suitable for various sub-tasks. +Extensive experimental results validate its practicality for fact verification +research and clarify its difficulty levels for various sub-tasks. + +
+
+ comment: Pages 1-13 are the main body of the paper, and pages 14-16 are the + supplementary material +
+
+
+
+
+ + ☆ Multimodal Emotion Recognition using Audio-Video Transformer Fusion with + Cross Attention + + +
+ Understanding emotions is a fundamental aspect of human communication. +Integrating audio and video signals offers a more comprehensive understanding +of emotional states compared to traditional methods that rely on a single data +source, such as speech or facial expressions. Despite its potential, multimodal +emotion recognition faces significant challenges, particularly in +synchronization, feature extraction, and fusion of diverse data sources. To +address these issues, this paper introduces a novel transformer-based model +named Audio-Video Transformer Fusion with Cross Attention (AVT-CA). The AVT-CA +model employs a transformer fusion approach to effectively capture and +synchronize interlinked features from both audio and video inputs, thereby +resolving synchronization problems. Additionally, the Cross Attention mechanism +within AVT-CA selectively extracts and emphasizes critical features while +discarding irrelevant ones from both modalities, addressing feature extraction +and fusion challenges. Extensive experimental analysis conducted on the +CMU-MOSEI, RAVDESS and CREMA-D datasets demonstrates the efficacy of the +proposed model. The results underscore the importance of AVT-CA in developing +precise and reliable multimodal emotion recognition systems for practical +applications. + +
+
+ comment: 38 Pages, 9 Tables, 12 Figures +
+
+
+
+
+ + ☆ MangaUB: A Manga Understanding Benchmark for Large Multimodal Models + + +
+ Manga is a popular medium that combines stylized drawings and text to convey
+stories. As manga panels differ from natural images, computational systems
+traditionally had to be designed specifically for manga. Recently, the adaptive
+nature of modern large multimodal models (LMMs) shows possibilities for more
+general approaches. To analyze the current capabilities of LMMs for manga
+understanding tasks and to identify areas for their improvement, we design and
+evaluate MangaUB, a novel manga understanding benchmark for LMMs. MangaUB is
+designed to assess the recognition and understanding of content shown in a
+single panel as well as conveyed across multiple panels, allowing for a
+fine-grained analysis of a model's various capabilities required for manga
+understanding. Our results show strong performance on the recognition of image
+content, while understanding the emotion and information conveyed across
+multiple panels is still challenging, highlighting future work towards LMMs for
+manga understanding.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ SWIFT: Semantic Watermarking for Image Forgery Thwarting + + +
+ This paper proposes a novel approach towards image authentication and
+tampering detection by using watermarking as a communication channel for
+semantic information. We modify the HiDDeN deep-learning watermarking
+architecture to embed and extract high-dimensional real vectors representing
+image captions. Our method significantly improves robustness to both malign and
+benign edits. We also introduce a local confidence metric correlated with
+Message Recovery Rate, enhancing the method's practical applicability. This
+approach bridges the gap between traditional watermarking and passive forensic
+methods, offering a robust solution for image integrity verification.
+
+
+ comment: Code will be released +
+
+
+
+
+ + ♻ ☆ Selective Vision-Language Subspace Projection for Few-shot CLIP + + +
+ Vision-language models such as CLIP are capable of mapping different modality
+data into a unified feature space, enabling zero/few-shot inference by
+measuring the similarity of given images and texts. However, most existing
+methods overlook the modality gap in CLIP's encoded features, which manifests
+as the text and image features lying far apart from each other and results in
+limited classification performance. To tackle this issue, we introduce a method
+called Selective Vision-Language Subspace Projection (SSP), which incorporates
+local image features and utilizes them as a bridge to enhance the alignment
+between image-text pairs. Specifically, our SSP framework comprises two
+parallel modules: a vision projector and a language projector. Both projectors
+utilize local image features to span the respective subspaces for images and
+texts, thereby projecting the image and text features into their respective
+subspaces to achieve alignment. Moreover, our approach entails only
+training-free matrix calculations and can be seamlessly integrated into
+advanced CLIP-based few-shot learning frameworks. Extensive experiments on 11
+datasets have demonstrated SSP's superior text-image alignment capabilities,
+outperforming the state-of-the-art alignment methods. The code is available at
+https://github.com/zhuhsingyuu/SSP
+
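+ A hedged sketch of the core computation (dimensions and feature sources are
+placeholders; this is not the released SSP code): image and text embeddings are
+projected onto the subspace spanned by a set of local image features and
+compared within that shared subspace.
+
+ import torch
+ import torch.nn.functional as F
+
+ def subspace_projector(local_feats):
+     # local_feats: (n, d) local image features whose span defines the subspace
+     u, _, _ = torch.linalg.svd(local_feats.T, full_matrices=False)  # (d, n) basis
+     return u @ u.T  # (d, d) orthogonal projector onto the span
+
+ d = 512
+ P = subspace_projector(torch.randn(16, d))
+ img = F.normalize(torch.randn(d), dim=0)
+ txt = F.normalize(torch.randn(d), dim=0)
+ score = F.cosine_similarity((P @ img)[None], (P @ txt)[None])  # training-free alignment score
+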
+
+ comment: Accepted as an Oral Paper at ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ Multimodal Unlearnable Examples: Protecting Data against Multimodal + Contrastive Learning ACM MM2024 + + +
+ Multimodal contrastive learning (MCL) has shown remarkable advances in
+zero-shot classification by learning from millions of image-caption pairs
+crawled from the Internet. However, this reliance poses privacy risks, as
+hackers may exploit image-text data for model training without authorization,
+potentially including personal and privacy-sensitive information. Recent works
+propose generating unlearnable examples by adding imperceptible perturbations
+to training images to build shortcuts for protection. However, they are
+designed for unimodal classification, and their use in MCL remains largely
+unexplored. We first explore this context by evaluating the performance of
+existing methods on image-caption pairs, finding that they do not generalize
+effectively to multimodal data and have limited impact on building shortcuts
+due to the lack of labels and the dispersion of pairs in MCL. In this paper, we
+propose Multi-step Error Minimization (MEM), a novel optimization process for
+generating multimodal unlearnable examples. It extends the Error-Minimization
+(EM) framework to optimize both image noise and an additional text trigger,
+thereby enlarging the optimized space and effectively misleading the model to
+learn the shortcut between the noise features and the text trigger.
+Specifically, we adopt projected gradient descent to solve the noise
+minimization problem and use HotFlip to approximate the gradient and replace
+words to find the optimal text trigger. Extensive experiments demonstrate the
+effectiveness of MEM, with post-protection retrieval results dropping to nearly
+half of random guessing, and its high transferability across different models.
+Our code is available at
+https://github.com/thinwayliu/Multimodal-Unlearnable-Examples
+
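+ As a hedged illustration of the image-noise half of this recipe only (the
+HotFlip search for the text trigger is omitted; the model, loss, and step sizes
+are assumptions): one projected-gradient step that minimizes, rather than
+maximizes, the loss while keeping the perturbation inside an L-infinity ball.
+
+ import torch
+
+ def error_minimizing_step(model, loss_fn, images, targets, delta, alpha=2/255, eps=8/255):
+     delta = delta.clone().detach().requires_grad_(True)
+     loss = loss_fn(model(images + delta), targets)
+     loss.backward()
+     with torch.no_grad():
+         delta = delta - alpha * delta.grad.sign()  # descend: make the examples "easy"
+         delta = delta.clamp(-eps, eps)             # project back into the allowed ball
+     return delta
+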
+
+ comment: ACM MM2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 87 + +
+
+
+ + ☆ Self-Training with Direct Preference Optimization Improves + Chain-of-Thought Reasoning ACL 2024 + + +
+ Effective training of language models (LMs) for mathematical reasoning tasks +demands high-quality supervised fine-tuning data. Besides obtaining annotations +from human experts, a common alternative is sampling from larger and more +powerful LMs. However, this knowledge distillation approach can be costly and +unstable, particularly when relying on closed-source, proprietary LMs like +GPT-4, whose behaviors are often unpredictable. In this work, we demonstrate +that the reasoning abilities of small-scale LMs can be enhanced through +self-training, a process where models learn from their own outputs. We also +show that the conventional self-training can be further augmented by a +preference learning algorithm called Direct Preference Optimization (DPO). By +integrating DPO into self-training, we leverage preference data to guide LMs +towards more accurate and diverse chain-of-thought reasoning. We evaluate our +method across various mathematical reasoning tasks using different base models. +Our experiments show that this approach not only improves LMs' reasoning +performance but also offers a more cost-effective and scalable solution +compared to relying on large proprietary LMs. + +
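+ For reference, a minimal sketch of the DPO objective that this kind of
+preference-guided self-training plugs in (the summed token log-probabilities of
+preferred and dispreferred reasoning chains are assumed to be precomputed):
+
+ import torch.nn.functional as F
+
+ def dpo_loss(policy_chosen_logps, policy_rejected_logps,
+              ref_chosen_logps, ref_rejected_logps, beta=0.1):
+     # Reward margins are implicit log-ratios against a frozen reference model.
+     chosen_ratio = policy_chosen_logps - ref_chosen_logps
+     rejected_ratio = policy_rejected_logps - ref_rejected_logps
+     return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()
+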
+
+ comment: ACL 2024. Code and data are available at + https://github.com/TianduoWang/DPO-ST +
+
+
+
+
+ + ☆ LoRA-Pro: Are Low-Rank Adapters Properly Optimized? + + +
+ Low-Rank Adaptation, also known as LoRA, has emerged as a prominent method
+for parameter-efficient fine-tuning of foundation models by re-parameterizing
+the original matrix into the product of two low-rank matrices. Despite its
+efficiency, LoRA often yields inferior performance compared to full
+fine-tuning. In this paper, we propose LoRA-Pro to bridge this performance gap.
+Firstly, we delve into the optimization processes in LoRA and full fine-tuning.
+We reveal that while LoRA employs low-rank approximation, it neglects to
+approximate the optimization process of full fine-tuning. To address this, we
+introduce a novel concept called the "equivalent gradient." This virtual
+gradient describes, on the original weight matrix, the optimization step
+implied by LoRA's re-parameterization, and it can be used to quantify the
+differences between LoRA and full fine-tuning. The equivalent gradient is
+derived from the gradients of matrices $A$ and $B$. To narrow the performance
+gap, our approach minimizes the differences between the equivalent gradient and
+the gradient obtained from full fine-tuning during the optimization process. By
+solving this objective, we derive optimal closed-form solutions for updating
+matrices $A$ and $B$. Our method constrains the optimization process, shrinking
+the performance gap between LoRA and full fine-tuning. Extensive experiments on
+natural language processing tasks validate the effectiveness of our method.
+
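+ One way to read the "equivalent gradient" idea in plain PyTorch (the final
+formula is our reading of the construction, not the paper's exact derivation):
+the gradients of the low-rank factors induce a gradient on the merged weight
+matrix that can then be compared with the full fine-tuning gradient.
+
+ import torch
+
+ d_out, d_in, r = 64, 32, 4
+ W0 = torch.randn(d_out, d_in)                 # frozen pretrained weight
+ A = torch.randn(r, d_in, requires_grad=True)  # low-rank factors
+ B = torch.zeros(d_out, r, requires_grad=True)
+
+ x = torch.randn(8, d_in)
+ loss = (x @ (W0 + B @ A).T).pow(2).mean()     # LoRA re-parameterized forward pass
+ loss.backward()
+
+ # First-order effect of updating A and B on the merged matrix W = W0 + B @ A:
+ equivalent_grad = B.grad @ A.detach() + B.detach() @ A.grad
+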
+
+
+
+
+ + ☆ Recursive Introspection: Teaching Language Model Agents How to + Self-Improve + + +
+ A central piece in enabling intelligent agentic behavior in foundation models +is to make them capable of introspecting upon their behavior, reasoning, and +correcting their mistakes as more computation or interaction is available. Even +the strongest proprietary large language models (LLMs) do not quite exhibit the +ability of continually improving their responses sequentially, even in +scenarios where they are explicitly told that they are making a mistake. In +this paper, we develop RISE: Recursive IntroSpEction, an approach for +fine-tuning LLMs to introduce this capability, despite prior work hypothesizing +that this capability may not be possible to attain. Our approach prescribes an +iterative fine-tuning procedure, which attempts to teach the model how to alter +its response after having executed previously unsuccessful attempts to solve a +hard test-time problem, with optionally additional environment feedback. RISE +poses fine-tuning for a single-turn prompt as solving a multi-turn Markov +decision process (MDP), where the initial state is the prompt. Inspired by +principles in online imitation learning and reinforcement learning, we propose +strategies for multi-turn data collection and training so as to imbue an LLM +with the capability to recursively detect and correct its previous mistakes in +subsequent iterations. Our experiments show that RISE enables Llama2, Llama3, +and Mistral models to improve themselves with more turns on math reasoning +tasks, outperforming several single-turn strategies given an equal amount of +inference-time computation. We also find that RISE scales well, often attaining +larger benefits with more capable models. Our analysis shows that RISE makes +meaningful improvements to responses to arrive at the correct solution for +challenging prompts, without disrupting one-turn abilities as a result of +expressing more complex distributions. + +
+
+
+
+
+ + ☆ Exploring Scaling Trends in LLM Robustness + + +
+ Language model capabilities predictably improve from scaling a model's size +and training data. Motivated by this, increasingly large language models have +been trained, yielding an array of impressive capabilities. Yet these models +are vulnerable to adversarial prompts, such as "jailbreaks" that hijack models +to perform undesired behaviors, posing a significant risk of misuse. Prior work +indicates that computer vision models become more robust with model and data +scaling, raising the question: does language model robustness also improve with +scale? We study this question empirically, finding that larger models respond +substantially better to adversarial training, but there is little to no benefit +from model scale in the absence of explicit defenses. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ☆ The FIGNEWS Shared Task on News Media Narratives ACL 2024 + + +
+ We present an overview of the FIGNEWS shared task, organized as part of the +ArabicNLP 2024 conference co-located with ACL 2024. The shared task addresses +bias and propaganda annotation in multilingual news posts. We focus on the +early days of the Israel War on Gaza as a case study. The task aims to foster +collaboration in developing annotation guidelines for subjective tasks by +creating frameworks for analyzing diverse narratives highlighting potential +bias and propaganda. In a spirit of fostering and encouraging diversity, we +address the problem from a multilingual perspective, namely within five +languages: English, French, Arabic, Hebrew, and Hindi. A total of 17 teams +participated in two annotation subtasks: bias (16 teams) and propaganda (6 +teams). The teams competed in four evaluation tracks: guidelines development, +annotation quality, annotation quantity, and consistency. Collectively, the +teams produced 129,800 data points. Key findings and implications for the field +are discussed. + +
+
+ comment: 18 pages, 10 tables, 1 figure, accepted to ArabicNLP 2024 co-located + with ACL 2024 +
+
+
+
+
+ + ☆ Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic + + +
+ Recent advancements have significantly enhanced the capabilities of
+Multimodal Large Language Models (MLLMs) in generating and understanding
+image-to-text content. Despite these successes, progress is predominantly
+limited to English due to the scarcity of high-quality multimodal resources in
+other languages. This limitation impedes the development of competitive models
+in languages such as Arabic. To alleviate this situation, we introduce an
+efficient Arabic multimodal assistant, dubbed Dallah, that utilizes an advanced
+language model based on LLaMA-2 to facilitate multimodal interactions. Dallah
+demonstrates state-of-the-art performance among Arabic MLLMs. Through
+fine-tuning on six Arabic dialects, Dallah showcases its capability to handle
+complex dialectal interactions incorporating both textual and visual elements.
+The model excels in two benchmark tests: one evaluating its performance on
+Modern Standard Arabic (MSA) and another specifically designed to assess
+dialectal responses. Beyond its robust performance in multimodal interaction
+tasks, Dallah has the potential to pave the way for further development of
+dialect-aware Arabic MLLMs.
+
+
+
+
+
+ + ☆ Tracking linguistic information in transformer-based sentence embeddings + through targeted sparsification RepL4NLP 2024 + + +
+ Analyses of transformer-based models have shown that they encode a variety of
+linguistic information from their textual input. While these analyses have shed
+light on the relation between linguistic information on one side, and internal
+architecture and parameters on the other, a question remains unanswered: how is
+this linguistic information reflected in sentence embeddings? Using datasets
+consisting of sentences with known structure, we test to what degree
+information about chunks (in particular noun, verb or prepositional phrases),
+such as grammatical number or semantic role, can be localized in sentence
+embeddings. Our results show that such information is not distributed over the
+entire sentence embedding, but rather is encoded in specific regions.
+Understanding how the information from an input text is compressed into
+sentence embeddings helps us understand current transformer models and build
+future explainable neural models.
+
+
+ comment: 12 pages, 9 figures, 1 table, published in RepL4NLP 2024 +
+
+
+
+
+ + ☆ PEFT-U: Parameter-Efficient Fine-Tuning for User Personalization + + +
+ The recent emergence of Large Language Models (LLMs) has heralded a new era
+of human-AI interaction. These sophisticated models, exemplified by Chat-GPT
+and its successors, have exhibited remarkable capabilities in language
+understanding. However, as these LLMs have undergone exponential growth, a
+crucial dimension that remains understudied is the personalization of these
+models. Large foundation models such as GPT-3 focus on creating a universal
+model that serves a broad range of tasks and users. This approach emphasizes
+the model's generalization capabilities, treating users as a collective rather
+than as distinct individuals. While practical for many common applications,
+this one-size-fits-all approach often fails to address the rich tapestry of
+human diversity and individual needs. To explore this issue we introduce the
+PEFT-U Benchmark: a new dataset for building and evaluating NLP models for user
+personalization. PEFT-U consists of a series of user-centered tasks containing
+diverse and individualized expressions where the preferences of users can
+potentially differ for the same input. Using PEFT-U, we explore the challenge
+of efficiently personalizing LLMs to accommodate user-specific preferences in
+the context of diverse user-centered tasks.
+
+
+
+
+
+ + ☆ Difficulty Estimation and Simplification of French Text Using LLMs + + +
+ We leverage generative large language models for language learning +applications, focusing on estimating the difficulty of foreign language texts +and simplifying them to lower difficulty levels. We frame both tasks as +prediction problems and develop a difficulty classification model using labeled +examples, transfer learning, and large language models, demonstrating superior +accuracy compared to previous approaches. For simplification, we evaluate the +trade-off between simplification quality and meaning preservation, comparing +zero-shot and fine-tuned performances of large language models. We show that +meaningful text simplifications can be obtained with limited fine-tuning. Our +experiments are conducted on French texts, but our methods are +language-agnostic and directly applicable to other foreign languages. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ I can listen but cannot read: An evaluation of two-tower multimodal + systems for instrument recognition + + +
+ Music two-tower multimodal systems integrate audio and text modalities into a +joint audio-text space, enabling direct comparison between songs and their +corresponding labels. These systems enable new approaches for classification +and retrieval, leveraging both modalities. Despite the promising results they +have shown for zero-shot classification and retrieval tasks, closer inspection +of the embeddings is needed. This paper evaluates the inherent zero-shot +properties of joint audio-text spaces for the case-study of instrument +recognition. We present an evaluation and analysis of two-tower systems for +zero-shot instrument recognition and a detailed analysis of the properties of +the pre-joint and joint embeddings spaces. Our findings suggest that audio +encoders alone demonstrate good quality, while challenges remain within the +text encoder or joint space projection. Specifically, two-tower systems exhibit +sensitivity towards specific words, favoring generic prompts over musically +informed ones. Despite the large size of textual encoders, they do not yet +leverage additional textual context or infer instruments accurately from their +descriptions. Lastly, a novel approach for quantifying the semantic +meaningfulness of the textual space leveraging an instrument ontology is +proposed. This method reveals deficiencies in the systems' understanding of +instruments and provides evidence of the need for fine-tuning text encoders on +musical data. + +
+
+ comment: Accepted to ISMIR 2024 +
+
+
+
+
+ + ☆ RestoreAgent: Autonomous Image Restoration Agent via Multimodal Large + Language Models + + +
+ Natural images captured by mobile devices often suffer from multiple types of
+degradation, such as noise, blur, and low light. Traditional image restoration
+methods require manual selection of specific tasks, algorithms, and execution
+sequences, which is time-consuming and may yield suboptimal results. All-in-one
+models, though capable of handling multiple tasks, typically support only a
+limited range and often produce overly smooth, low-fidelity outcomes due to
+their broad data distribution fitting. To address these challenges, we first
+define a new pipeline for restoring images with multiple degradations, and then
+introduce RestoreAgent, an intelligent image restoration system leveraging
+multimodal large language models. RestoreAgent autonomously assesses the type
+and extent of degradation in input images and performs restoration through (1)
+determining the appropriate restoration tasks, (2) optimizing the task
+sequence, (3) selecting the most suitable models, and (4) executing the
+restoration. Experimental results demonstrate the superior performance of
+RestoreAgent in handling complex degradation, surpassing human experts.
+Furthermore, the system's modular design facilitates the fast integration of
+new tasks and models, enhancing its flexibility and scalability for various
+applications.
+
+
+
+
+
+ + ☆ GermanPartiesQA: Benchmarking Commercial Large Language Models for + Political Bias and Sycophancy + + +
+ LLMs are changing the way humans create and interact with content,
+potentially affecting citizens' political opinions and voting decisions. As
+LLMs increasingly shape our digital information ecosystems, auditing to
+evaluate biases, sycophancy, or steerability has emerged as an active field of
+research. In this paper, we evaluate and compare the alignment of six LLMs by
+OpenAI, Anthropic, and Cohere with German party positions and evaluate
+sycophancy based on a prompt experiment. We contribute to evaluating political
+bias and sycophancy in multi-party systems across major commercial LLMs. First,
+we develop the benchmark dataset GermanPartiesQA based on the Voting Advice
+Application Wahl-o-Mat, covering 10 state and 1 national elections between 2021
+and 2023. In our study, we find a left-green tendency across all examined LLMs.
+We then conduct our prompt experiment, for which we use the benchmark and
+sociodemographic data of leading German parliamentarians to evaluate changes in
+LLM responses. To differentiate between sycophancy and steerability, we use 'I
+am [politician X], ...' and 'You are [politician X], ...' prompts. Against our
+expectations, we do not observe notable differences between prompting 'I am'
+and 'You are'. While our findings underscore that LLM responses can be
+ideologically steered with political personas, they suggest that observed
+changes in LLM outputs could be better described as personalization to the
+given context rather than sycophancy.
+
+
+ comment: 12 pages +
+
+
+
+
+ 
          ♻ ☆ Keep the Cost Down: A Review on Methods to Optimize LLM's KV-Cache
 Consumption
 
+ Large Language Models (LLMs), epitomized by ChatGPT's release in late 2022,
+have revolutionized various industries with their advanced language
+comprehension. However, their efficiency is challenged by the Transformer
+architecture's struggle with handling long texts. KV-Cache has emerged as a
+pivotal solution to this issue, converting the time complexity of token
+generation from quadratic to linear, albeit with increased GPU memory overhead
+proportional to conversation length. With the development of the LLM community
+and academia, various KV-Cache compression methods have been proposed. In this
+review, we dissect the various properties of KV-Cache and elaborate on various
+methods currently used to optimize the KV-Cache space usage of LLMs. These
+methods span the pre-training phase, deployment phase, and inference phase, and
+we summarize the commonalities and differences among them. Additionally, we
+list some metrics for evaluating the long-text capabilities of large language
+models, from both efficiency and capability perspectives. Our review thus sheds
+light on the evolving landscape of LLM optimization, offering insights into
+future advancements in this dynamic field.
+
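+ A minimal sketch of the mechanism being optimized (single head, no batching;
+a toy stand-in rather than any production implementation): the cache stores the
+keys and values of past tokens, so each decoding step only computes attention
+for the newest token and cost grows linearly instead of quadratically.
+
+ import torch
+
+ def decode_step(q_t, k_t, v_t, cache):
+     # q_t, k_t, v_t: (1, d) projections of the newly generated token
+     cache["k"] = torch.cat([cache["k"], k_t], dim=0)  # cache grows with the sequence
+     cache["v"] = torch.cat([cache["v"], v_t], dim=0)
+     attn = torch.softmax(q_t @ cache["k"].T / cache["k"].shape[-1] ** 0.5, dim=-1)
+     return attn @ cache["v"]                          # (1, d) context vector
+
+ d = 16
+ cache = {"k": torch.empty(0, d), "v": torch.empty(0, d)}
+ for _ in range(5):  # five decoding steps reuse the cached keys/values
+     out = decode_step(torch.randn(1, d), torch.randn(1, d), torch.randn(1, d), cache)
+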
+
+ comment: to be published in CoLM 2024 +
+
+
+
+
+ + ☆ On the Effect of Purely Synthetic Training Data for Different Automatic + Speech Recognition Architectures + + +
+ In this work we evaluate the utility of synthetic data for training automatic +speech recognition (ASR). We use the ASR training data to train a +text-to-speech (TTS) system similar to FastSpeech-2. With this TTS we reproduce +the original training data, training ASR systems solely on synthetic data. For +ASR, we use three different architectures, attention-based encoder-decoder, +hybrid deep neural network hidden Markov model and a Gaussian mixture hidden +Markov model, showing the different sensitivity of the models to synthetic data +generation. In order to extend previous work, we present a number of ablation +studies on the effectiveness of synthetic vs. real training data for ASR. In +particular we focus on how the gap between training on synthetic and real data +changes by varying the speaker embedding or by scaling the model size. For the +latter we show that the TTS models generalize well, even when training scores +indicate overfitting. + +
+
+ comment: Accepted at the SynData4GenAI 2024 workshop +
+
+
+
+
+ + ☆ What does Kiki look like? Cross-modal associations between speech sounds + and visual shapes in vision-and-language models + + +
+ Humans have clear cross-modal preferences when matching certain novel words
+to visual shapes. Evidence suggests that these preferences play a prominent
+role in our linguistic processing, language learning, and the origins of
+signal-meaning mappings. With the rise of multimodal models in AI, such as
+vision-and-language models (VLMs), it becomes increasingly important to uncover
+the kinds of visio-linguistic associations these models encode and whether they
+align with human representations. Informed by experiments with humans, we probe
+and compare four VLMs for a well-known human cross-modal preference, the
+bouba-kiki effect. We do not find conclusive evidence for this effect but
+suggest that results may depend on features of the models, such as architecture
+design, model size, and training details. Our findings inform discussions on
+the origins of the bouba-kiki effect in human cognition and future developments
+of VLMs that align well with human cross-modal associations.
+
+
+ comment: Appeared at the 13th edition of the Workshop on Cognitive Modeling + and Computational Linguistics (CMCL 2024) +
+
+
+
+
+ + ☆ The Curious Case of Representational Alignment: Unravelling + Visio-Linguistic Tasks in Emergent Communication + + +
+ Natural language has the universal properties of being compositional and +grounded in reality. The emergence of linguistic properties is often +investigated through simulations of emergent communication in referential +games. However, these experiments have yielded mixed results compared to +similar experiments addressing linguistic properties of human language. Here we +address representational alignment as a potential contributing factor to these +results. Specifically, we assess the representational alignment between agent +image representations and between agent representations and input images. Doing +so, we confirm that the emergent language does not appear to encode human-like +conceptual visual features, since agent image representations drift away from +inputs whilst inter-agent alignment increases. We moreover identify a strong +relationship between inter-agent alignment and topographic similarity, a common +metric for compositionality, and address its consequences. To address these +issues, we introduce an alignment penalty that prevents representational drift +but interestingly does not improve performance on a compositional +discrimination task. Together, our findings emphasise the key role +representational alignment plays in simulations of language emergence. + +
+
+ comment: Appeared at the 13th edition of the Workshop on Cognitive Modeling + and Computational Linguistics (CMCL 2024) +
+
+
+
+
+ + ☆ Positive Text Reframing under Multi-strategy Optimization + + +
+ Differing from sentiment transfer, positive reframing seeks to substitute +negative perspectives with positive expressions while preserving the original +meaning. With the emergence of pre-trained language models (PLMs), it is +possible to achieve acceptable results by fine-tuning PLMs. Nevertheless, +generating fluent, diverse and task-constrained reframing text remains a +significant challenge. To tackle this issue, a \textbf{m}ulti-\textbf{s}trategy +\textbf{o}ptimization \textbf{f}ramework (MSOF) is proposed in this paper. +Starting from the objective of positive reframing, we first design positive +sentiment reward and content preservation reward to encourage the model to +transform the negative expressions of the original text while ensuring the +integrity and consistency of the semantics. Then, different decoding +optimization approaches are introduced to improve the quality of text +generation. Finally, based on the modeling formula of positive reframing, we +propose a multi-dimensional re-ranking method that further selects candidate +sentences from three dimensions: strategy consistency, text similarity and +fluency. Extensive experiments on two Seq2Seq PLMs, BART and T5, demonstrate +our framework achieves significant improvements on unconstrained and controlled +positive reframing tasks. + +
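As a rough illustration of the final re-ranking step, the sketch below scores candidate reframings on the three dimensions named above and sorts by a weighted sum. The three scorers are crude stand-ins (Jaccard overlap, keyword matching, a length heuristic), not the metrics used in MSOF.

```python
# Minimal, runnable sketch of multi-dimensional re-ranking over candidate
# reframings. The scorers are toy proxies, NOT the paper's actual metrics.

def score_similarity(candidate: str, source: str) -> float:
    a, b = set(candidate.lower().split()), set(source.lower().split())
    return len(a & b) / max(len(a | b), 1)           # Jaccard overlap as a proxy

def score_strategy(candidate: str, strategy_keywords: list[str]) -> float:
    text = candidate.lower()
    return sum(k in text for k in strategy_keywords) / max(len(strategy_keywords), 1)

def score_fluency(candidate: str) -> float:
    n = len(candidate.split())
    return 1.0 / (1.0 + abs(n - 15))                 # crude preference for ~15-word outputs

def rerank(candidates, source, strategy_keywords, weights=(1.0, 1.0, 1.0)):
    w_strat, w_sim, w_flu = weights
    def total(c):
        return (w_strat * score_strategy(c, strategy_keywords)
                + w_sim * score_similarity(c, source)
                + w_flu * score_fluency(c))
    return sorted(candidates, key=total, reverse=True)

print(rerank(["I can learn from this setback and try again.", "This is bad."],
             source="I failed the exam and feel terrible.",
             strategy_keywords=["learn", "try"]))
```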
+
+
+
+
+ + ☆ Modelling Multimodal Integration in Human Concept Processing with + Vision-and-Language Models + + +
+ Representations from deep neural networks (DNNs) have proven remarkably +predictive of neural activity involved in both visual and linguistic +processing. Despite these successes, most studies to date concern unimodal +DNNs, encoding either visual or textual input but not both. Yet, there is +growing evidence that human meaning representations integrate linguistic and +sensory-motor information. Here we investigate whether the integration of +multimodal information operated by current vision-and-language DNN models +(VLMs) leads to representations that are more aligned with human brain activity +than those obtained by language-only and vision-only DNNs. We focus on fMRI +responses recorded while participants read concept words in the context of +either a full sentence or an accompanying picture. Our results reveal that VLM +representations correlate more strongly than language- and vision-only DNNs +with activations in brain areas functionally related to language processing. A +comparison between different types of visuo-linguistic architectures shows that +recent generative VLMs tend to be less brain-aligned than previous +architectures with lower performance on downstream applications. Moreover, +through an additional analysis comparing brain vs. behavioural alignment across +multiple VLMs, we show that -- with one remarkable exception -- representations +that strongly align with behavioural judgments do not correlate highly with +brain responses. This indicates that brain similarity does not go hand in hand +with behavioural similarity, and vice versa. + +
+
+
+
+
+ + ☆ The Power of Combining Data and Knowledge: GPT-4o is an Effective + Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of + Lung Cancer + + +
+ Lymph node metastasis (LNM) is a crucial factor in determining the initial +treatment for patients with lung cancer, yet accurate preoperative diagnosis of +LNM remains challenging. Recently, large language models (LLMs) have garnered +significant attention due to their remarkable text generation capabilities. +Leveraging the extensive medical knowledge learned from vast corpora, LLMs can +estimate probabilities for clinical problems, though their performance has +historically been inferior to data-driven machine learning models. In this +paper, we propose a novel ensemble method that combines the medical knowledge +acquired by LLMs with the latent patterns identified by machine learning models +to enhance LNM prediction performance. Initially, we developed machine learning +models using patient data. We then designed a prompt template to integrate the +patient data with the predicted probability from the machine learning model. +Subsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI, +to estimate the likelihood of LNM based on patient data and then adjust the +estimate using the machine learning output. Finally, we collected three outputs +from the GPT-4o using the same prompt and ensembled these results as the final +prediction. Using the proposed method, our models achieved an AUC value of +0.765 and an AP value of 0.415 for LNM prediction, significantly improving +predictive performance compared to baseline machine learning models. The +experimental results indicate that GPT-4o can effectively leverage its medical +knowledge and the probabilities predicted by machine learning models to achieve +more accurate LNM predictions. These findings demonstrate that LLMs can perform +well in clinical risk prediction tasks, offering a new paradigm for integrating +medical knowledge and patient data in clinical predictions. + +
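The prompt-and-ensemble procedure described above can be pictured as follows. `call_llm` is a hypothetical placeholder for an actual GPT-4o API call, and the patient fields, prompt wording, and returned value are illustrative only.

```python
# Sketch of combining an ML model's probability with an LLM's estimate.
# `call_llm` is a hypothetical stand-in for a real GPT-4o query.

def build_prompt(patient: dict, ml_probability: float) -> str:
    facts = "; ".join(f"{k}: {v}" for k, v in patient.items())
    return (f"Patient data: {facts}. A machine learning model estimates the "
            f"probability of lymph node metastasis as {ml_probability:.2f}. "
            "Using your medical knowledge, adjust this estimate and reply "
            "with a single probability between 0 and 1.")

def call_llm(prompt: str) -> float:
    # Placeholder: a real implementation would query GPT-4o and parse the reply.
    return 0.4

def ensemble_prediction(patient: dict, ml_probability: float, n_samples: int = 3) -> float:
    prompt = build_prompt(patient, ml_probability)
    samples = [call_llm(prompt) for _ in range(n_samples)]
    return sum(samples) / len(samples)   # average the three LLM outputs

print(ensemble_prediction({"age": 63, "tumor_size_cm": 2.1}, ml_probability=0.35))
```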
+
+
+
+
+ + ☆ A Large-Scale Sensitivity Analysis on Latent Embeddings and + Dimensionality Reductions for Text Spatializations IEEE VIS 2024 + + +
+ The semantic similarity between documents of a text corpus can be visualized +using map-like metaphors based on two-dimensional scatterplot layouts. These +layouts result from a dimensionality reduction on the document-term matrix or a +representation within a latent embedding, including topic models. The +resulting layout therefore depends on the input data and the hyperparameters of the +dimensionality reduction, and changes in either alter the layout. However, such changes to the +layout require additional cognitive effort from the user. In this work, we +present a sensitivity study that analyzes the stability of these layouts +concerning (1) changes in the text corpora, (2) changes in the hyperparameters, +and (3) randomness in the initialization. Our approach has two stages: data +measurement and data analysis. First, we derived layouts for the combination of +three text corpora and six text embeddings and a grid-search-inspired +hyperparameter selection of the dimensionality reductions. Afterward, we +quantified the similarity of the layouts through ten metrics, concerning local +and global structures and class separation. Second, we analyzed the resulting +42,817 tabular data points in a descriptive statistical analysis. From this, we +derived guidelines for informed decisions on the layout algorithm and highlighted +specific hyperparameter settings. We provide our implementation as a Git +repository at +https://github.com/hpicgs/Topic-Models-and-Dimensionality-Reduction-Sensitivity-Study +and results as Zenodo archive at https://doi.org/10.5281/zenodo.12772898. + +
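The measurement stage can be sketched with a small grid over dimensionality-reduction hyperparameters and random seeds, scoring each layout with a structure-preservation metric. The grid values and random "documents" below are illustrative; the study uses real corpora, several embeddings and reduction methods, and ten metrics.

```python
# Sketch of the data-measurement stage: derive layouts for a small grid of
# hyperparameters and seeds, then quantify local structure preservation.
import numpy as np
from sklearn.manifold import TSNE, trustworthiness

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 20))          # stand-in for document embeddings

for perplexity in (5, 30):              # hyperparameter grid (illustrative)
    for seed in (0, 1):                 # randomness of the initialization
        layout = TSNE(perplexity=perplexity, random_state=seed).fit_transform(X)
        t = trustworthiness(X, layout, n_neighbors=10)
        print(f"perplexity={perplexity} seed={seed} trustworthiness={t:.3f}")
```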
+
+ comment: To be published at IEEE VIS 2024 conference +
+
+
+
+
+ + ☆ Improving Domain-Specific ASR with LLM-Generated Contextual Descriptions INTERSPEECH 2024 + + +
+ End-to-end automatic speech recognition (E2E ASR) systems have significantly +improved speech recognition through training on extensive datasets. Despite +these advancements, they still struggle to accurately recognize domain-specific +words, such as proper nouns and technical terms. To address this +problem, we propose a method to utilize the state-of-the-art Whisper without +modifying its architecture, preserving its generalization performance while +enabling it to leverage descriptions effectively. Moreover, we propose two +additional training techniques to improve domain-specific ASR: decoder +fine-tuning and context perturbation. We also propose a method to use a Large +Language Model (LLM) to generate descriptions with simple metadata, when +descriptions are unavailable. Our experiments demonstrate that the proposed methods +notably enhance domain-specific ASR accuracy on real-life datasets, with +LLM-generated descriptions outperforming human-crafted ones in effectiveness. + +
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ☆ Is the Digital Forensics and Incident Response Pipeline Ready for + Text-Based Threats in LLM Era? + + +
+ In the era of generative AI, the widespread adoption of Neural Text +Generators (NTGs) presents new cybersecurity challenges, particularly within +the realms of Digital Forensics and Incident Response (DFIR). These challenges +primarily involve the detection and attribution of sources behind advanced +attacks like spearphishing and disinformation campaigns. As NTGs evolve, the +task of distinguishing between human and NTG-authored texts becomes critically +complex. This paper rigorously evaluates the DFIR pipeline tailored for +text-based security systems, specifically focusing on the challenges of +detecting and attributing authorship of NTG-authored texts. By introducing a +novel human-NTG co-authorship text attack, termed CS-ACT, our study uncovers +significant vulnerabilities in traditional DFIR methodologies, highlighting +discrepancies between ideal scenarios and real-world conditions. Utilizing 14 +diverse datasets and 43 unique NTGs, up to the latest GPT-4, our research +identifies substantial vulnerabilities in the forensic profiling phase, +particularly in attributing authorship to NTGs. Our comprehensive evaluation +points to factors such as model sophistication and the lack of distinctive +style within NTGs as significant contributors for these vulnerabilities. Our +findings underscore the necessity for more sophisticated and adaptable +strategies, such as incorporating adversarial learning, stylizing NTGs, and +implementing hierarchical attribution through the mapping of NTG lineages to +enhance source attribution. This sets the stage for future research and the +development of more resilient text-based security systems. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Financial Statement Analysis with Large Language Models + + +
+ We investigate whether an LLM can successfully perform financial statement +analysis in a way similar to a professional human analyst. We provide +standardized and anonymous financial statements to GPT4 and instruct the model +to analyze them to determine the direction of future earnings. Even without any +narrative or industry-specific information, the LLM outperforms financial +analysts in its ability to predict earnings changes. The LLM exhibits a +relative advantage over human analysts in situations when the analysts tend to +struggle. Furthermore, we find that the prediction accuracy of the LLM is on +par with the performance of a narrowly trained state-of-the-art ML model. LLM +prediction does not stem from its training memory. Instead, we find that the +LLM generates useful narrative insights about a company's future performance. +Lastly, our trading strategies based on GPT's predictions yield a higher Sharpe +ratio and alphas than strategies based on other models. Taken together, our +results suggest that LLMs may take a central role in decision-making. + +
+
+ comment: Previously posted on SSRN (May 21, 2024). See + http://papers.ssrn.com/sol3/papers.cfm?abstract_id=4835311 +
+
+
+
+
+ + ☆ factgenie: A Framework for Span-based Evaluation of Generated Texts + + +
+ We present factgenie: a framework for annotating and visualizing word spans +in textual model outputs. Annotations can capture various span-based phenomena +such as semantic inaccuracies or irrelevant text. With factgenie, the +annotations can be collected both from human crowdworkers and large language +models. Our framework consists of a web interface for data visualization and +gathering text annotations, powered by an easily extensible codebase. + +
+
+ comment: Accepted to INLG 2024 (System Demonstrations) +
+
+
+
+
+ + ☆ Exploring Description-Augmented Dataless Intent Classification ACL + 2024 + + +
+ In this work, we introduce several schemes to leverage description-augmented +embedding similarity for dataless intent classification using current +state-of-the-art (SOTA) text embedding models. We report results of our methods +on four commonly used intent classification datasets and compare against +previous works of a similar nature. Our work shows promising results for +dataless classification scaling to a large number of unseen intents. We show +competitive results and significant improvements (+6.12\% Avg.) over strong +zero-shot baselines, all without training on labelled or task-specific data. +Furthermore, we provide qualitative error analysis of the shortfalls of this +methodology to help guide future research in this area. + +
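The core of description-augmented dataless classification is nearest-intent retrieval by embedding similarity between the utterance and each intent description. A minimal sketch follows, with a trivial bag-of-words `embed` stand-in rather than the SOTA embedding models evaluated in the paper; the intents and descriptions are illustrative.

```python
# Minimal sketch of dataless intent classification: embed each intent's
# description, embed the utterance, and pick the nearest intent by cosine
# similarity. `embed` is a toy bag-of-words stand-in, not a SOTA text encoder.
import numpy as np

VOCAB = {}
def embed(text: str) -> np.ndarray:
    vec = np.zeros(512)
    for tok in text.lower().split():
        idx = VOCAB.setdefault(tok, len(VOCAB) % 512)
        vec[idx] += 1.0
    return vec / (np.linalg.norm(vec) + 1e-9)

intent_descriptions = {
    "book_flight": "the user wants to reserve a plane ticket to a destination",
    "check_weather": "the user asks about the weather forecast for a place",
}

def classify(utterance: str) -> str:
    u = embed(utterance)
    sims = {name: float(u @ embed(desc)) for name, desc in intent_descriptions.items()}
    return max(sims, key=sims.get)

print(classify("what will the weather be like in Paris tomorrow"))
```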
+
+ comment: Accepted to the 6th NLP for Conversational AI Workshop at ACL + 2024(NLP4ConvAI) +
+
+
+
+
+ + ☆ Shapley Value-based Contrastive Alignment for Multimodal Information + Extraction + + +
+ The rise of social media and the exponential growth of multimodal +communication necessitates advanced techniques for Multimodal Information +Extraction (MIE). However, existing methodologies primarily rely on direct +Image-Text interactions, a paradigm that often faces significant challenges due +to semantic and modality gaps between images and text. In this paper, we +introduce a new paradigm of Image-Context-Text interaction, where large +multimodal models (LMMs) are utilized to generate descriptive textual context +to bridge these gaps. In line with this paradigm, we propose a novel Shapley +Value-based Contrastive Alignment (Shap-CA) method, which aligns both +context-text and context-image pairs. Shap-CA initially applies the Shapley +value concept from cooperative game theory to assess the individual +contribution of each element in the set of contexts, texts and images towards +total semantic and modality overlaps. Following this quantitative evaluation, a +contrastive learning strategy is employed to enhance the interactive +contribution within context-text/image pairs, while minimizing the influence +across these pairs. Furthermore, we design an adaptive fusion module for +selective cross-modal fusion. Extensive experiments across four MIE datasets +demonstrate that our method significantly outperforms existing state-of-the-art +methods. + +
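The Shapley value computation at the heart of Shap-CA can be approximated with standard Monte Carlo permutation sampling; the runnable sketch below uses a toy additive value function, not the paper's semantic/modality overlap measure.

```python
# Generic Monte Carlo estimate of Shapley values for elements of a set, the
# game-theoretic quantity Shap-CA builds on. `value` is a toy coalition score.
import random

def value(subset):
    return sum(subset)                   # toy: coalition value is sum of weights

def shapley_estimate(elements, n_permutations=2000, seed=0):
    random.seed(seed)
    contrib = {i: 0.0 for i in range(len(elements))}
    for _ in range(n_permutations):
        order = list(range(len(elements)))
        random.shuffle(order)
        coalition, prev = [], 0.0
        for i in order:
            coalition.append(elements[i])
            cur = value(coalition)
            contrib[i] += cur - prev     # marginal contribution of element i
            prev = cur
    return {i: c / n_permutations for i, c in contrib.items()}

print(shapley_estimate([1.0, 2.0, 3.0]))  # ~ the weights themselves (additive game)
```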
+
+ comment: Accepted at ACM Multimedia 2024 +
+
+
+
+
+ + ☆ Scaling A Simple Approach to Zero-Shot Speech Recognition + + +
+ Despite rapid progress in increasing the language coverage of automatic +speech recognition, the field is still far from covering all languages with a +known writing script. Recent work showed promising results with a zero-shot +approach requiring only a small amount of text data; however, accuracy heavily +depends on the quality of the phonemizer used, which is often weak for unseen +languages. In this paper, we present MMS Zero-shot, a conceptually simpler +approach based on romanization and an acoustic model trained on data in 1,078 +different languages, or three orders of magnitude more than prior art. MMS +Zero-shot reduces the average character error rate by a relative 46% over 100 +unseen languages compared to the best previous work. Moreover, the error rate +of our approach is only 2.5x higher than that of in-domain supervised baselines, +while our approach uses no labeled data for the evaluation languages at all. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease + Classification: A Systematic Review + + +
+ Parkinson's disease (PD), the second most prevalent neurodegenerative +disorder worldwide, frequently presents with early-stage speech impairments. +Recent advancements in Artificial Intelligence (AI), particularly deep learning +(DL), have significantly enhanced PD diagnosis through the analysis of speech +data. Nevertheless, the progress of research is restricted by the limited +availability of publicly accessible speech-based PD datasets, primarily due to +privacy and ethical concerns. This review covers the latest DL-based AI +approaches for speech-based PD classification, focusing on performance, +available resources and associated challenges of 33 scientific works published +between 2020 and March 2024. These DL approaches are categorized into +end-to-end (E2E) learning, transfer learning (TL) and deep acoustic features +(DAF) extraction. Among E2E approaches, Convolutional Neural Networks (CNNs) +are prevalent, though Transformers are increasingly popular. E2E approaches +face challenges such as limited data and computational resources, especially +with Transformers. TL addresses these issues by providing more robust PD +diagnosis and better generalizability across languages. DAF extraction aims to +improve the explainability and interpretability of results by examining the +specific effects of deep features on both other DL approaches and more +traditional machine learning (ML) methods. However, it often underperforms +compared to E2E and TL approaches. This review also discusses unresolved issues +related to bias, explainability and privacy, highlighting the need for future +research. + +
+
+ comment: Submitted in Applied Sciences - peer reviewed Open Access journal. + This research was funded by the NWO research programme AiNed Fellowship + Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant + number NGF.1607.22.013 +
+
+
+
+
+ + ☆ Unified Lexical Representation for Interpretable Visual-Language + Alignment + + +
+ Visual-Language Alignment (VLA) has gained a lot of attention since CLIP's +groundbreaking work. Although CLIP performs well, the typical direct latent +feature alignment lacks clarity in its representation and similarity scores. On +the other hand, a lexical representation, a vector whose elements represent the +similarity between the sample and a word from the vocabulary, is naturally +sparse and interpretable, providing exact matches for individual +words. However, lexical representations are difficult to learn due to the lack of +ground-truth supervision and false-discovery issues, and thus require complex +designs to train effectively. In this paper, we introduce LexVLA, a more +interpretable VLA framework that learns a unified lexical representation for +both modalities without complex design. We use DINOv2 as our visual model for +its local-inclined features and Llama 2, a generative language model, to +leverage its in-context lexical prediction ability. To avoid false +discovery, we propose an overuse penalty that discourages the lexical representation +from falsely and frequently activating meaningless words. We demonstrate that these +two pre-trained uni-modal models can be well aligned by fine-tuning on a modest +multi-modal dataset, avoiding intricate training configurations. On cross-modal +retrieval benchmarks, LexVLA, trained on the CC-12M multi-modal dataset, +outperforms baselines fine-tuned on larger datasets (e.g., YFCC15M) and those +trained from scratch on even bigger datasets (e.g., 1.1B data, including +CC-12M). We conduct extensive experiments to analyze LexVLA. + +
+
+
+
+
+ + ☆ Demystifying Verbatim Memorization in Large Language Models + + +
+ Large Language Models (LLMs) frequently memorize long sequences verbatim, +often with serious legal and privacy implications. Much prior work has studied +such verbatim memorization using observational data. To complement such work, +we develop a framework to study verbatim memorization in a controlled setting +by continuing pre-training from Pythia checkpoints with injected sequences. We +find that (1) non-trivial amounts of repetition are necessary for verbatim +memorization to happen; (2) later (and presumably better) checkpoints are more +likely to verbatim memorize sequences, even for out-of-distribution sequences; +(3) the generation of memorized sequences is triggered by distributed model +states that encode high-level features and makes important use of general +language modeling capabilities. Guided by these insights, we develop stress +tests to evaluate unlearning methods and find they often fail to remove the +verbatim memorized information, while also degrading the LM. Overall, these +findings challenge the hypothesis that verbatim memorization stems from +specific model weights or mechanisms. Rather, verbatim memorization is +intertwined with the LM's general capabilities and thus will be very difficult +to isolate and suppress without degrading model quality. + +
+
+
+
+
+ + ☆ KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models + + +
+ This paper investigates visual analogical reasoning in large multimodal +models (LMMs) compared to human adults and children. A "visual analogy" is an +abstract rule inferred from one image and applied to another. While benchmarks +exist for testing visual reasoning in LMMs, they require advanced skills and +omit basic visual analogies that even young children can make. Inspired by +developmental psychology, we propose a new benchmark of 1,400 visual +transformations of everyday objects to test LMMs on visual analogical reasoning +and compare them to children and adults. We structure the evaluation into three +stages: identifying what changed (e.g., color, number, etc.), how it changed +(e.g., added one object), and applying the rule to new scenarios. Our findings +show that while models like GPT-4V, LLaVA-1.5, and MANTIS identify the "what" +effectively, they struggle with quantifying the "how" and extrapolating this +rule to new objects. In contrast, children and adults exhibit much stronger +analogical reasoning at all three stages. Additionally, the strongest tested +model, GPT-4V, performs better in tasks involving simple visual attributes like +color and size, correlating with quicker human adult response times. +Conversely, more complex tasks such as number, rotation, and reflection, which +necessitate extensive cognitive processing and understanding of the 3D physical +world, present more significant challenges. Altogether, these findings +highlight the limitations of training models on data that primarily consists of +2D images and text. + +
+
+ comment: 9 pages. For the KiVA benchmark, see https://github.com/ey242/KiVA +
+
+
+
+
+ + ☆ ERIT Lightweight Multimodal Dataset for Elderly Emotion Recognition and + Multimodal Fusion Evaluation + + +
+ ERIT is a novel multimodal dataset designed to facilitate research in +lightweight multimodal fusion. It contains text and image data collected from +videos of elderly individuals reacting to various situations, as well as seven +emotion labels for each data sample. Because it contains labeled images of +elderly users reacting emotionally, it also facilitates research on visual emotion +recognition for an age group underrepresented in machine learning. The dataset is +validated through comprehensive experiments +indicating its importance in neural multimodal fusion research. + +
+
+
+
+
+ + ☆ Banyan: Improved Representation Learning with Explicit Structure + + +
+ We present Banyan, an improved model to learn semantic representations by +inducing explicit structure over data. In contrast to prior approaches using +structure spanning single sentences, Banyan learns by resolving multiple +constituent structures into a shared one explicitly incorporating global +context. Combined with an improved message-passing scheme inspired by Griffin, +Banyan learns significantly better representations, avoids spurious false +negatives with contrastive learning, and drastically improves memory efficiency +in such explicit-structured models. Using the Self-StrAE framework, we show +that Banyan (a) outperforms baselines using sentential structure across various +settings (b) matches or outperforms unstructured baselines like GloVe +(+augmentations) and a RoBERTa medium (+simcse) pre-trained on 100M tokens, +despite having just a handful of (non-embedding) parameters, and (c) also +learns effective representations across several low resource (Asian and +African) languages as measured on SemRel tasks. + +
+
+ comment: First Draft +
+
+
+
+
+ + ☆ BotEval: Facilitating Interactive Human Evaluation ACL 2024 + + +
+ Following the rapid progress in natural language processing (NLP) models, +language models are applied to increasingly more complex interactive tasks such +as negotiations and conversation moderations. Having human evaluators directly +interact with these NLP models is essential for adequately evaluating the +performance on such interactive tasks. We develop BotEval, an easily +customizable, open-source, evaluation toolkit that focuses on enabling +human-bot interactions as part of the evaluation process, as opposed to human +evaluators making judgements for a static input. BotEval balances flexibility +for customization and user-friendliness by providing templates for common use +cases that span various degrees of complexity and built-in compatibility with +popular crowdsourcing platforms. We showcase the numerous useful features of +BotEval through a study that evaluates the performance of various chatbots on +their effectiveness for conversational moderation and discuss how BotEval +differs from other annotation tools. + +
+
+ comment: ACL 2024 SDT, 10 pages +
+
+
+
+
+ + ☆ Beyond Entity Alignment: Towards Complete Knowledge Graph Alignment via + Entity-Relation Synergy + + +
+ Knowledge Graph Alignment (KGA) aims to integrate knowledge from multiple +sources to address the limitations of individual Knowledge Graphs (KGs) in +terms of coverage and depth. However, current KGA models fall short in +achieving a ``complete'' knowledge graph alignment. Existing models primarily +emphasize the linkage of cross-graph entities but overlook aligning relations +across KGs, thereby providing only a partial solution to KGA. The semantic +correlations embedded in relations are largely overlooked, potentially +restricting a comprehensive understanding of cross-KG signals. In this paper, +we propose to conceptualize relation alignment as an independent task and +conduct KGA by decomposing it into two distinct but highly correlated +sub-tasks: entity alignment and relation alignment. To capture the mutually +reinforcing correlations between these objectives, we propose a novel +Expectation-Maximization-based model, EREM, which iteratively optimizes both +sub-tasks. Experimental results on real-world datasets demonstrate that EREM +consistently outperforms state-of-the-art models in both entity alignment and +relation alignment tasks. + +
+
+
+
+
+ + ☆ Cost-effective Instruction Learning for Pathology Vision and Language + Analysis + + +
+ The advent of vision-language models fosters interactive conversations +between AI-enabled models and humans. Yet applying these models in clinics +involves daunting challenges around large-scale training data and financial +and computational resources. Here we propose a cost-effective instruction +learning framework for conversational pathology named CLOVER. CLOVER only +trains a lightweight module and uses instruction tuning while freezing the +parameters of the large language model. Instead of using costly GPT-4, we +propose well-designed prompts on GPT-3.5 for building generation-based +instructions, emphasizing the utility of pathological knowledge derived from +Internet sources. To augment the use of instructions, we construct a +high-quality set of template-based instructions in the context of digital +pathology. From two benchmark datasets, our findings reveal the strength of +hybrid-form instructions for visual question answering in pathology. Extensive +results show the cost-effectiveness of CLOVER in answering both open-ended and +closed-ended questions, where CLOVER outperforms strong baselines that possess +37 times more training parameters and use instruction data generated from +GPT-4. Through instruction tuning, CLOVER exhibits robust few-shot +learning on an external clinical dataset. These findings demonstrate that +cost-effective modeling with CLOVER could accelerate the adoption of rapid +conversational applications in the landscape of digital pathology. + +
+
+
+
+
+ + ☆ Are Large Language Models Possible to Conduct Cognitive Behavioral + Therapy? + + +
+ In contemporary society, the issue of psychological health has become +increasingly prominent, characterized by the diversification, complexity, and +universality of mental disorders. Cognitive Behavioral Therapy (CBT), currently +the most influential and clinically effective psychological treatment method +with no side effects, has limited coverage and poor quality in most countries. +In recent years, research on the recognition and intervention of emotional +disorders using large language models (LLMs) has been validated, providing new +possibilities for psychological assistance therapy. However, can LLMs truly +conduct cognitive behavioral therapy? Many concerns have been +raised by mental health experts regarding the use of LLMs for therapy. Seeking +to answer this question, we collected a real CBT corpus from online video +websites and designed and conducted a targeted automatic evaluation framework +covering the emotion tendency of generated text, structured +dialogue patterns, and proactive inquiry ability. For emotion tendency, we +calculate the emotion tendency score of the CBT dialogue text generated by each +model. For structured dialogue patterns, we use a diverse range of automatic +evaluation metrics to compare speaking style, the ability to maintain +topic consistency, and the use of CBT techniques across different models. As for +inquiring to guide the patient, we utilize the PQA (Proactive Questioning +Ability) metric. We also evaluated the CBT ability of the LLM after integrating +a CBT knowledge base to explore whether introducing additional knowledge +enhances the model's CBT counseling ability. Four LLM variants with excellent +performance on natural language processing are evaluated, and the experimental +results show the great potential of LLMs in the psychological counseling realm, +especially when combined with other technological means. + +
+
+
+
+
+ + ☆ Describe Where You Are: Improving Noise-Robustness for Speech Emotion + Recognition with Text Description of the Environment + + +
+ Speech emotion recognition (SER) systems often struggle in real-world +environments, where ambient noise severely degrades their performance. This +paper explores a novel approach that exploits prior knowledge of testing +environments to maximize SER performance under noisy conditions. To address +this task, we propose a text-guided, environment-aware training where an SER +model is trained with contaminated speech samples and their paired noise +description. We use a pre-trained text encoder to extract the text-based +environment embedding and then fuse it to a transformer-based SER model during +training and inference. We demonstrate the effectiveness of our approach +through our experiment with the MSP-Podcast corpus and real-world additive +noise samples collected from the Freesound repository. Our experiment indicates +that the text-based environment descriptions processed by a large language +model (LLM) produce representations that improve the noise-robustness of the +SER system. In addition, our proposed approach with an LLM yields better +performance than our environment-agnostic baselines, especially in low +signal-to-noise ratio (SNR) conditions. When testing at -5dB SNR level, our +proposed method shows better performance than our best baseline model by 31.8 % +(arousal), 23.5% (dominance), and 9.5% (valence). + +
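One plausible way to fuse a text-based environment embedding into a transformer SER model is to project it and prepend it as an extra token, as in the PyTorch sketch below; the dimensions and the concatenate-then-encode fusion are assumptions for illustration, not the paper's exact architecture.

```python
# Sketch (PyTorch) of fusing a text-based environment embedding with acoustic
# features before an SER head predicting arousal, dominance, and valence.
import torch
import torch.nn as nn

class EnvAwareSER(nn.Module):
    def __init__(self, acoustic_dim=512, text_dim=768, d_model=256, n_outputs=3):
        super().__init__()
        self.acoustic_proj = nn.Linear(acoustic_dim, d_model)
        self.env_proj = nn.Linear(text_dim, d_model)
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.head = nn.Linear(d_model, n_outputs)

    def forward(self, acoustic_feats, env_text_emb):
        # acoustic_feats: (batch, time, acoustic_dim); env_text_emb: (batch, text_dim)
        x = self.acoustic_proj(acoustic_feats)
        env = self.env_proj(env_text_emb).unsqueeze(1)   # prepend as an extra token
        x = self.encoder(torch.cat([env, x], dim=1))
        return self.head(x.mean(dim=1))

model = EnvAwareSER()
print(model(torch.randn(2, 100, 512), torch.randn(2, 768)).shape)  # torch.Size([2, 3])
```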
+
+
+
+
+ + ☆ Enhancing Agent Learning through World Dynamics Modeling + + +
+ While large language models (LLMs) have been increasingly deployed across +tasks in language understanding and interactive decision-making, their +impressive performance is largely due to the comprehensive and in-depth domain +knowledge embedded within them. However, the extent of this knowledge can vary +across different domains. Existing methods often assume that LLMs already +possess such comprehensive and in-depth knowledge of their environment, +overlooking potential gaps in their understanding of actual world dynamics. To +address this gap, we introduce Discover, Verify, and Evolve (DiVE), a framework +that discovers world dynamics from a small number of demonstrations, verifies +the correctness of these dynamics, and evolves new, advanced dynamics tailored +to the current situation. Through extensive evaluations, we analyze the impact +of each component on performance and compare the automatically generated +dynamics from DiVE with human-annotated world dynamics. Our results demonstrate +that LLMs guided by DiVE can make better decisions, achieving rewards +comparable to human players in the Crafter environment. + +
+
+
+
+
+ + ☆ Examining the Influence of Political Bias on Large Language Model + Performance in Stance Classification + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +executing tasks based on natural language queries. However, these models, +trained on curated datasets, inherently embody biases ranging from racial to +national and gender biases. It remains uncertain whether these biases impact +the performance of LLMs for certain tasks. In this study, we investigate the +political biases of LLMs within the stance classification task, specifically +examining whether these models exhibit a tendency to more accurately classify +politically-charged stances. Utilizing three datasets, seven LLMs, and four +distinct prompting schemes, we analyze the performance of LLMs on politically +oriented statements and targets. Our findings reveal a statistically +significant difference in the performance of LLMs across various politically +oriented stance classification tasks. Furthermore, we observe that this +difference primarily manifests at the dataset level, with models and prompting +schemes showing statistically similar performances across different stance +classification datasets. Lastly, we observe that when there is greater +ambiguity in the target the statement is directed towards, LLMs have poorer +stance classification accuracy. + +
+
+ comment: Accepted at ICWSM 2025 +
+
+
+
+
+ + ☆ Transformers on Markov Data: Constant Depth Suffices + + +
+ Attention-based transformers have been remarkably successful at modeling +generative processes across various domains and modalities. In this paper, we +study the behavior of transformers on data drawn from $k^{\text{th}}$-order Markov processes, +where the conditional distribution of the next symbol in a sequence depends on +the previous $k$ symbols observed. We observe a surprising phenomenon +empirically which contradicts previous findings: when trained for sufficiently +long, a transformer with a fixed depth and $1$ head per layer is able to +achieve low test loss on sequences drawn from $k^{\text{th}}$-order Markov sources, even as $k$ +grows. Furthermore, this low test loss is achieved by the transformer's ability +to represent and learn the in-context conditional empirical distribution. On +the theoretical side, our main result is that a transformer with a single head +and three layers can represent the in-context conditional empirical +distribution for $k^{\text{th}}$-order Markov sources, concurring with our empirical +observations. Along the way, we prove that \textit{attention-only} transformers +with $O(\log_2(k))$ layers can represent the in-context conditional empirical +distribution by composing induction heads to track the previous $k$ symbols in +the sequence. These results provide more insight into our current understanding +of the mechanisms by which transformers learn to capture context, by +understanding their behavior on Markov sources. + +
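The in-context conditional empirical distribution discussed above can be made concrete with a small simulation: sample from a random $k^{\text{th}}$-order Markov source, then estimate the next-symbol distribution for a context from counts within the sequence. The alphabet size, order, and sequence length below are illustrative.

```python
# Sketch: sample from a random k-th order Markov source over a small alphabet,
# then compute the in-context conditional empirical distribution of the next
# symbol given the previous k symbols.
import numpy as np
from collections import defaultdict

rng = np.random.default_rng(0)
V, k, T = 3, 2, 2000                              # vocab size, Markov order, length
table = rng.dirichlet(np.ones(V), size=V ** k)    # P(next | previous k symbols)

def ctx_index(ctx):
    idx = 0
    for s in ctx:
        idx = idx * V + int(s)
    return idx

seq = list(rng.integers(0, V, size=k))
for _ in range(T - k):
    p = table[ctx_index(seq[-k:])]
    seq.append(int(rng.choice(V, p=p)))

counts = defaultdict(lambda: np.zeros(V))         # empirical next-symbol counts per context
for t in range(k, len(seq)):
    counts[tuple(seq[t - k:t])][seq[t]] += 1

ctx = max(counts, key=lambda c: counts[c].sum())  # most frequent context in the sequence
empirical = counts[ctx] / counts[ctx].sum()
print("context", ctx, "empirical", np.round(empirical, 3),
      "true", np.round(table[ctx_index(ctx)], 3))
```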
+
+ comment: 29 pages, 10 figures +
+
+
+
+
+ + ☆ Efficient LLM Training and Serving with Heterogeneous Context Sharding + among Attention Heads + + +
+ Existing LLM training and inference frameworks struggle to boost +efficiency with sparsity while maintaining the integrity of context and model +architecture. Inspired by the sharding concept in databases and the fact that +attention parallelizes over heads on accelerators, we propose Sparsely-Sharded +(S2) Attention, an attention algorithm that allocates heterogeneous context +partitions for different attention heads to divide and conquer. S2-Attention +restricts each attention head to attend only to a partition of the context +following a strided sparsity pattern, while the full context is preserved as +the union of all the shards. As attention heads are processed in separate +thread blocks, the context reduction for each head can thus produce end-to-end +speed-up and memory reduction. At inference, LLMs trained with S2-Attention can +then enjoy the KV cache reduction for free with guaranteed preservation of model +quality. In experiments, we show that S2-Attention can provide as much as (1) 25.3X +wall-clock attention speed-up over FlashAttention-2, resulting in a 6X reduction +in end-to-end training time and a 10X reduction in inference latency, (2) on-par model +training quality compared to default attention, and (3) perfect needle retrieval +accuracy over a 32K context window. On top of the algorithm, we build DKernel, an +LLM training and inference kernel library that allows users to customize +sparsity patterns for their own models. We open-source DKernel and make it +compatible with Megatron, PyTorch, and vLLM. + +
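One plausible instantiation of strided, per-head context sharding (not the exact pattern or kernels used by S2-Attention/DKernel) assigns head $h$ the causal positions whose index is congruent to $h$ modulo the number of heads, so the union of the shards covers the full causal context:

```python
# Sketch of heterogeneous, strided per-head attention masks.
import numpy as np

def strided_head_masks(seq_len: int, n_heads: int) -> np.ndarray:
    """Return boolean masks of shape (n_heads, seq_len, seq_len)."""
    causal = np.tril(np.ones((seq_len, seq_len), dtype=bool))
    masks = np.zeros((n_heads, seq_len, seq_len), dtype=bool)
    for h in range(n_heads):
        keep_cols = (np.arange(seq_len) % n_heads) == h   # this head's context shard
        masks[h] = causal & keep_cols[None, :]
        np.fill_diagonal(masks[h], True)                  # every head can see itself
    return masks

masks = strided_head_masks(seq_len=8, n_heads=4)
print(masks.sum(axis=(1, 2)))                        # positions attended per head
print(np.logical_or.reduce(masks, axis=0).sum())     # union equals the full causal mask
```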
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Self-Directed Synthetic Dialogues and Revisions Technical Report + + +
+ Synthetic data has become an important tool in the fine-tuning of language +models to follow instructions and solve complex problems. Nevertheless, the +majority of open datasets to date lack multi-turn data and are collected from +closed models, limiting progress on advancing open fine-tuning methods. We +introduce Self-Directed Synthetic Dialogues (SDSD), an experimental dataset +consisting of guided conversations of language models talking to themselves. +The dataset consists of multi-turn conversations generated with DBRX, Llama 2 +70B, and Mistral Large, all instructed to follow a conversation plan generated +prior to the conversation. We also explore including principles from +Constitutional AI and other related works to create synthetic preference data +via revisions to the final conversation turn. We hope this work encourages +further exploration in multi-turn data and the use of open models for expanding +the impact of synthetic data. + +
+
+ comment: 25 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ The Art of Refusal: A Survey of Abstention in Large Language Models + + +
+ Abstention, the refusal of large language models (LLMs) to provide an answer, +is increasingly recognized for its potential to mitigate hallucinations and +enhance safety in building LLM systems. In this survey, we introduce a +framework to examine abstention behavior from three perspectives: the query, +the model, and human values. We review the literature on abstention methods +(categorized based on the development stages of LLMs), benchmarks, and +evaluation metrics, and discuss the merits and limitations of prior work. We +further identify and motivate areas for future research, such as encouraging +the study of abstention as a meta-capability across tasks and customizing +abstention abilities based on context. In doing so, we aim to broaden the scope +and impact of abstention methodologies in AI systems. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ PersonaGym: Evaluating Persona Agents and LLMs + + +
+ Persona agents, which are LLM agents that act according to an assigned +persona, have demonstrated impressive contextual response capabilities across +various applications. These persona agents offer significant enhancements +across diverse sectors, such as education, healthcare, and entertainment, where +model developers can align agent responses to different user requirements, +thereby broadening the scope of agent applications. However, evaluating persona +agent performance is incredibly challenging due to the complexity of assessing +persona adherence in free-form interactions across various environments that +are relevant to each persona agent. We introduce PersonaGym, the first dynamic +evaluation framework for assessing persona agents, and PersonaScore, the first +automated human-aligned metric grounded in decision theory for comprehensive +large-scale evaluation of persona agents. Our evaluation of 6 open and +closed-source LLMs, using a benchmark encompassing 200 personas and 10,000 +questions, reveals significant opportunities for advancement in persona agent +capabilities across state-of-the-art models. For example, Claude 3.5 Sonnet +achieves only a 2.97% relative improvement in PersonaScore over GPT-3.5 despite +being a much more advanced model. Importantly, we find that increased model +size and complexity do not necessarily imply enhanced persona agent +capabilities, thereby highlighting the pressing need for algorithmic and +architectural invention towards faithful and performant persona agents. + +
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ Exploring Bengali Religious Dialect Biases in Large Language Models with + Evaluation Perspectives + + +
+ While Large Language Models (LLMs) have created a massive technological impact +in the past decade, allowing for human-enabled applications, they can produce +output that contains stereotypes and biases, especially when using low-resource +languages. This can be of great ethical concern when dealing with sensitive +topics such as religion. As a means toward making LLMs fairer, we explore +bias from a religious perspective in Bengali, focusing specifically on two main +religious dialects: Hindu and Muslim-majority dialects. Here, we perform +different experiments and audits, presenting a comparative analysis of different +sentences using three commonly used LLMs: ChatGPT, Gemini, and Microsoft +Copilot, pertaining to the Hindu and Muslim dialects of specific words and +showcasing which ones capture social biases and which do not. Furthermore, we +analyze our findings and relate them to potential reasons and evaluation +perspectives, considering the language's global impact with over 300 million speakers +worldwide. With this work, we hope to establish the rigor needed to create more +fairness in LLMs, as these are widely used as creative writing agents. + +
+
+ comment: 10 Pages, 4 Figures. Accepted to the 1st Human-centered Evaluation + and Auditing of Language Models Workshop at CHI 2024 (Workshop website: + https://heal-workshop.github.io/#:~:text=Exploring%20Bengali%20Religious%20Dialect%20Biases%20in%20Large%20Language%20Models%20with%20Evaluation%20Perspectives) +
+
+
+
+
+ + ☆ Trust or Escalate: LLM Judges with Provable Guarantees for Human + Agreement + + +
+ We present a principled approach to provide LLM-based evaluation with a +rigorous guarantee of human agreement. We first propose that a reliable +evaluation method should not uncritically rely on model preferences for +pairwise evaluation, but rather assess the confidence of judge models and +selectively decide when to trust its judgement. We then show that under this +selective evaluation framework, human agreement can be provably guaranteed -- +such that the model evaluation aligns with that of humans to a user-specified +agreement level. As part of our framework, we also introduce Simulated +Annotators, a novel confidence estimation method that significantly improves +judge calibration and thus enables high coverage of evaluated instances. +Finally, we propose Cascaded Selective Evaluation, where we use cheaper models +as initial judges and escalate to stronger models only when necessary -- again, +while still providing a provable guarantee of human agreement. Experimental +results show that Cascaded Selective Evaluation guarantees strong alignment +with humans, far beyond what LLM judges could achieve without selective +evaluation. For example, on a subset of Chatbot Arena where GPT-4 almost never +achieves 80% human agreement, our method, even while employing substantially +cost-effective models such as Mistral-7B, guarantees over 80% human agreement +with almost 80% test coverage. + +
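The cascade logic can be sketched as follows; the judge functions and thresholds are placeholders, not the paper's calibrated Simulated-Annotators confidences.

```python
# Sketch of cascaded selective evaluation: trust a cheap judge only when its
# confidence clears a threshold, escalate to a stronger judge otherwise, and
# abstain (defer to humans) if no judge is confident enough.

def cheap_judge(prompt, resp_a, resp_b):
    return "A", 0.62            # (preferred response, confidence), stand-in

def strong_judge(prompt, resp_a, resp_b):
    return "B", 0.91            # stand-in

def cascaded_selective_eval(prompt, resp_a, resp_b,
                            judges=(cheap_judge, strong_judge),
                            thresholds=(0.80, 0.85)):
    for judge, tau in zip(judges, thresholds):
        verdict, confidence = judge(prompt, resp_a, resp_b)
        if confidence >= tau:    # trust this judge's verdict
            return verdict, judge.__name__
    return None, "abstain"       # defer to human annotation

print(cascaded_selective_eval("Which reply is more helpful?", "reply A", "reply B"))
```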
+
+
+
+
+ + ☆ Robust Claim Verification Through Fact Detection + + +
+ Claim verification can be a challenging task. In this paper, we present a +method to enhance the robustness and reasoning capabilities of automated claim +verification through the extraction of short facts from evidence. Our novel +approach, FactDetect, leverages Large Language Models (LLMs) to generate +concise factual statements from evidence and label these facts based on their +semantic relevance to the claim and evidence. The generated facts are then +combined with the claim and evidence. To train a lightweight supervised model, +we incorporate a fact-detection task into the claim verification process as a +multitasking approach to improve both performance and explainability. We also +show that augmenting FactDetect in the claim verification prompt enhances +performance in zero-shot claim verification using LLMs. Our method demonstrates +competitive results, improving the supervised claim verification model by 15% in F1 +score when evaluated on challenging scientific claim verification datasets. We +also demonstrate that FactDetect can be augmented with claim and evidence for +zero-shot prompting (AugFactDetect) in LLMs for verdict prediction. We show +that AugFactDetect outperforms the baselines with statistical significance on +three challenging scientific claim verification datasets, with an average +performance gain of 17.3% over the best-performing baselines. + +
+
+
+
+
+ + ♻ ☆ Block Verification Accelerates Speculative Decoding + + +
+ Speculative decoding is an effective method for lossless acceleration of +large language models during inference. It uses a fast model to draft a block +of tokens which are then verified in parallel by the target model, and provides +a guarantee that the output is distributed identically to a sample from the +target model. In prior works, draft verification is performed independently +token-by-token. Surprisingly, we show that this approach is not optimal. We +propose Block Verification, a simple draft verification algorithm that verifies +the entire block jointly and provides additional wall-clock speedup. We prove +that the proposed mechanism is optimal in the expected number of tokens +produced each iteration and specifically is never worse than the standard +token-level verification. Empirically, block verification provides modest but +consistent wall-clock speedups over the standard token verification algorithm +of 5%-8% in a range of tasks and datasets. Given that block verification does +not increase code complexity, maintains the strong lossless guarantee of the +standard speculative decoding verification algorithm, cannot deteriorate +performance, and, in fact, consistently improves it, it can be used as a good +default in speculative decoding implementations. + +
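For reference, the standard token-by-token verification rule that Block Verification improves on accepts a draft token $x$ with probability $\min(1, p_{\text{target}}(x)/p_{\text{draft}}(x))$ and otherwise resamples from the residual distribution. The sketch below implements only this baseline; the joint block rule itself is not spelled out in the abstract and is not reproduced here.

```python
# Standard (token-level) verification step of speculative decoding.
import numpy as np

rng = np.random.default_rng(0)

def verify_tokens(draft_tokens, p_draft, p_target):
    """p_draft / p_target: per-position next-token distributions, shape (block, vocab)."""
    accepted = []
    for t, x in enumerate(draft_tokens):
        if rng.random() < min(1.0, p_target[t, x] / p_draft[t, x]):
            accepted.append(x)                         # accept the draft token
        else:
            residual = np.maximum(p_target[t] - p_draft[t], 0.0)
            residual /= residual.sum()                 # resample from the residual
            accepted.append(int(rng.choice(len(residual), p=residual)))
            break                                      # stop after the first rejection
    return accepted

vocab, block = 5, 3
pd = rng.dirichlet(np.ones(vocab), size=block)
pt = rng.dirichlet(np.ones(vocab), size=block)
print(verify_tokens([0, 1, 2], pd, pt))
```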
+
+
+
+
+ + ♻ ☆ ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training + Multiplication-Less Reparameterization + + +
+ Large language models (LLMs) have shown impressive performance on language +tasks but face challenges when deployed on resource-constrained devices due to +their extensive parameters and reliance on dense multiplications, resulting in +high memory demands and latency bottlenecks. Shift-and-add reparameterization +offers a promising solution by replacing costly multiplications with +hardware-friendly primitives in both the attention and multi-layer perceptron +(MLP) layers of an LLM. However, current reparameterization techniques require +training from scratch or full parameter fine-tuning to restore accuracy, which +is resource-intensive for LLMs. To address this, we propose accelerating +pretrained LLMs through post-training shift-and-add reparameterization, +creating efficient multiplication-free models, dubbed ShiftAddLLM. +Specifically, we quantize each weight matrix into binary matrices paired with +group-wise scaling factors. The associated multiplications are reparameterized +into (1) shifts between activations and scaling factors and (2) queries and +adds according to the binary matrices. To reduce accuracy loss, we present a +multi-objective optimization method to minimize both weight and output +activation reparameterization errors. Additionally, based on varying +sensitivity across layers to reparameterization, we develop an automated bit +allocation strategy to further reduce memory usage and latency. Experiments on +five LLM families and eight tasks consistently validate the effectiveness of +ShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points +at comparable or lower latency compared to the most competitive quantized LLMs +at 3 and 2 bits, respectively, and more than 80% memory and energy reductions +over the original LLMs. Codes and models are available at +https://github.com/GATECH-EIC/ShiftAddLLM. + +
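The basic building block, a binary matrix paired with group-wise scaling factors, can be sketched in a few lines. The group size and the simple mean-absolute-value scale are illustrative; ShiftAddLLM additionally optimizes the reparameterization error and maps the scales onto shift-and-add primitives.

```python
# Sketch: approximate a weight matrix by a sign (binary) matrix times
# group-wise scaling factors, then measure the reconstruction error.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(8, 16))
group_size = 4                                   # columns per scaling group

B = np.sign(W)                                   # binary matrix in {-1, +1}
W_hat = np.empty_like(W)
for g in range(0, W.shape[1], group_size):
    block = W[:, g:g + group_size]
    scale = np.abs(block).mean(axis=1, keepdims=True)   # one scale per row, per column group
    W_hat[:, g:g + group_size] = B[:, g:g + group_size] * scale

err = np.linalg.norm(W - W_hat) / np.linalg.norm(W)
print(f"relative reconstruction error: {err:.3f}")
```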
+
+
+
+
+ + ♻ ☆ When Linear Attention Meets Autoregressive Decoding: Towards More + Effective and Efficient Linearized Large Language Models ICML 2024 + + +
+ Autoregressive Large Language Models (LLMs) have achieved impressive +performance in language tasks but face two significant bottlenecks: (1) +quadratic complexity in the attention module as the number of tokens increases, +and (2) limited efficiency due to the sequential processing nature of +autoregressive LLMs during generation. While linear attention and speculative +decoding offer potential solutions, their applicability and synergistic +potential for enhancing autoregressive LLMs remain uncertain. We conduct the +first comprehensive study on the efficacy of existing linear attention methods +for autoregressive LLMs, integrating them with speculative decoding. We +introduce an augmentation technique for linear attention that ensures +compatibility with speculative decoding, enabling more efficient training and +serving of LLMs. Extensive experiments and ablation studies involving seven +existing linear attention models and five encoder/decoder-based LLMs +consistently validate the effectiveness of our augmented linearized LLMs. +Notably, our approach achieves up to a 6.67 reduction in perplexity on the +LLaMA model and up to a 2$\times$ speedup during generation compared to prior +linear attention methods. Codes and models are available at +https://github.com/GATECH-EIC/Linearized-LLM. + +
+
+ comment: Accepted by ICML 2024; 17 pages; 10 figures; 16 tables +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Model Editing ACL 2024 + + +
+ ROME and MEMIT are largely believed to be two different model editing +algorithms, with the major difference between them being the ability to perform +batched edits. In this paper, we unify these two algorithms under a single +conceptual umbrella, optimizing for the same goal, which we call the +preservation-memorization objective. ROME uses an equality constraint to +optimize this objective to perform one edit at a time, whereas MEMIT employs a +more flexible least-square constraint that allows for batched edits. We +generalize ROME and enable batched editing with equality constraint in the form +of EMMET - an Equality-constrained Mass Model Editing algorithm for +Transformers, a new batched memory-editing algorithm. EMMET can perform +batched-edits up to a batch-size of 10,000, with very similar performance to +MEMIT across multiple dimensions. With the introduction of EMMET, we truly +unify ROME and MEMIT and show that both algorithms are equivalent in terms of +their optimization objective, their abilities (singular and batched editing), +their model editing performance and their limitations. + +
+
+ comment: Under review. To appear as poster at KnowledgeableLM Workshop + co-located with ACL 2024 +
+
+
+
+
+ + ♻ ☆ Regurgitative Training: The Value of Real Data in Training Large + Language Models + + +
+ What happens if we train a new Large Language Model (LLM) using data that are +at least partially generated by other LLMs? The explosive success of LLMs means +that a substantial amount of content online will be generated by LLMs rather +than humans, which will inevitably enter the training datasets of +next-generation LLMs. We evaluate the implications of such "regurgitative +training" on LLM performance. Through fine-tuning GPT-3.5 with data generated +either by itself or by other LLMs in a machine translation task, we find strong +evidence that regurgitative training clearly handicaps the performance of LLMs. +The same performance loss of regurgitative training is observed on transformer +models that we train from scratch. We find suggestive evidence that the +performance disadvantage of regurgitative training can be attributed to at +least two mechanisms: (1) higher error rates and (2) lower lexical diversity in +LLM-generated data as compared to real data. Based on these mechanisms, we +propose and evaluate three different strategies to mitigate the performance +loss of regurgitative training. First, we devise data-driven metrics to gauge +the quality of each LLM-generated data instance, and then carry out an ordered +training process where high-quality data are added before low-quality ones. +Second, we combine data generated by multiple different LLMs (as an attempt to +increase lexical diversity). Third, we train an AI detection classifier to +differentiate between LLM- and human-generated data, and include LLM-generated +data in the order of resemblance to human-generated data. All three strategies +can improve the performance of regurgitative training to some extent but are +not always able to fully close the gap from training with real data. Our +results highlight the value of real, human-generated data in training LLMs, +which cannot be easily substituted by synthetic, LLM-generated data. + +
+
+
+
+
+ + ♻ ☆ Machine Translation Hallucination Detection for Low and High Resource + Languages using Large Language Models + + +
+ Recent advancements in massively multilingual machine translation systems +have significantly enhanced translation accuracy; however, even the best +performing systems still generate hallucinations, severely impacting user +trust. Detecting hallucinations in Machine Translation (MT) remains a critical +challenge, particularly since existing methods excel with High-Resource +Languages (HRLs) but exhibit substantial limitations when applied to +Low-Resource Languages (LRLs). This paper evaluates hallucination detection +approaches using Large Language Models (LLMs) and semantic similarity within +massively multilingual embeddings. Our study spans 16 language directions, +covering HRLs, LRLs, with diverse scripts. We find that the choice of model is +essential for performance. On average, for HRLs, Llama3-70B outperforms the +previous state of the art by as much as 0.16 MCC (Matthews Correlation +Coefficient). However, for LRLs we observe that Claude Sonnet outperforms other +LLMs on average by 0.03 MCC. The key takeaway from our study is that LLMs can +achieve performance comparable or even better than previously proposed models, +despite not being explicitly trained for any machine translation task. However, +their advantage is less significant for LRLs. + +
+
+ comment: Authors Kenza Benkirane and Laura Gongas contributed equally to this + work +
+
+
+
+
+ + ♻ ☆ Harmonic LLMs are Trustworthy + + +
+ We introduce an intuitive method to test the robustness (stability and +explainability) of any black-box LLM in real time via its local deviation from +harmonicity, denoted as $\gamma$. To the best of our knowledge, this is the +first completely model-agnostic and unsupervised method of measuring the +robustness of any given response from an LLM, based upon the model itself +conforming to a purely mathematical standard. To show general application and +immediacy of results, we measure $\gamma$ in 10 popular LLMs (ChatGPT, +Claude-2.1, Claude3.0, GPT-4, GPT-4o, Smaug-72B, Mixtral-8x7B, Llama2-7B, +Mistral-7B and MPT-7B) across thousands of queries in three objective domains: +WebQA, ProgrammingQA, and TruthfulQA. Across all models and domains tested, +human annotation confirms that $\gamma \to 0$ indicates trustworthiness, and +conversely, searching for higher values of $\gamma$ easily exposes examples of +hallucination, a fact that enables efficient adversarial prompt generation +through stochastic gradient ascent in $\gamma$. The low-$\gamma$ leaders among +the models in the respective domains are GPT-4o, GPT-4, and Smaug-72B, +providing evidence that mid-size open-source models can win out against large +commercial models. + +
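Reading $\gamma$ through the mean-value property of harmonic functions gives one plausible form for a local deviation from harmonicity; this is a hedged sketch, not necessarily the paper's exact definition:

$$\gamma(x_0) \;\approx\; \Big\| \, f(x_0) \;-\; \frac{1}{n}\sum_{i=1}^{n} f\!\left(x_0 + \epsilon\, e_i\right) \Big\|,$$

where $f$ maps a prompt (or its embedding $x_0$) to a response embedding, the $e_i$ are small perturbation directions of magnitude $\epsilon$, and $\gamma \to 0$ recovers the harmonic mean-value property.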
+
+ comment: 15 pages, 2 figures, 16 tables; added Claude-3.0, GPT-4o, Mistral-7B, + Mixtral-8x7B, and more annotation for other models +
+
+
+
+
+ + ♻ ☆ Improving Stance Detection by Leveraging Measurement Knowledge from + Social Sciences: A Case Study of Dutch Political Tweets and Traditional + Gender Role Division + + +
+ Stance detection (SD) concerns automatically determining the viewpoint (i.e., +in favour of, against, or neutral) of a text's author towards a target. SD has +been applied to many research topics, among which the detection of stances +behind political tweets is an important one. In this paper, we apply SD to a +dataset of tweets from official party accounts in the Netherlands between 2017 +and 2021, with a focus on stances towards traditional gender role division, a +dividing issue between (some) Dutch political parties. To implement and improve +SD of traditional gender role division, we propose to leverage an established +survey instrument from social sciences, which has been validated for the +purpose of measuring attitudes towards traditional gender role division. Based +on our experiments, we show that using such a validated survey instrument helps +to improve SD performance. + +
+
+
+
+
+ + ♻ ☆ PATCH! Psychometrics-AssisTed benCHmarking of Large Language Models: A + Case Study of Proficiency in 8th Grade Mathematics + + +
+ Many existing benchmarks of large (multimodal) language models (LLMs) focus on measuring LLMs' academic proficiency, often also with an interest in comparing model performance with that of human test takers. While these benchmarks have proven key to the development of LLMs, they suffer from several limitations, including questionable measurement quality (e.g., Do they measure what they are supposed to in a reliable way?), lack of quality assessment on the item level (e.g., Are some items more important or difficult than others?), and unclear human population reference (e.g., To whom can the model be compared?). In response to these challenges, we propose leveraging knowledge from psychometrics - a field dedicated to the measurement of latent variables like academic proficiency - in LLM benchmarking. We make three primary contributions. First, we introduce PATCH: a novel framework for {P}sychometrics-{A}ssis{T}ed ben{CH}marking of LLMs. PATCH addresses the aforementioned limitations, presenting a new direction for LLM benchmark research. Second, we implement PATCH by measuring GPT-4 and Gemini-Pro-Vision's proficiency in 8th grade mathematics against 56 human populations. We show that adopting a psychometrics-based approach yields evaluation outcomes that diverge from those based on existing benchmarking practices. Third, we release 4 high-quality datasets to support measuring and comparing LLM proficiency in grade school mathematics and science against human populations.
+
+
+
+
+ + ♻ ☆ Resolving Discrepancies in Compute-Optimal Scaling of Language Models + + +
+ Kaplan et al. and Hoffmann et al. developed influential scaling laws for the +optimal model size as a function of the compute budget, but these laws yield +substantially different predictions. We explain the discrepancy by reproducing +the Kaplan scaling law on two datasets (OpenWebText2 and RefinedWeb) and +identifying three factors causing the difference: last layer computational +cost, warmup duration, and scale-dependent optimizer tuning. With these factors +corrected, we obtain excellent agreement with the Hoffmann et al. (i.e., +"Chinchilla") scaling law. Counter to a hypothesis of Hoffmann et al., we find +that careful learning rate decay is not essential for the validity of their +scaling law. As a secondary result, we derive scaling laws for the optimal +learning rate and batch size, finding that tuning the AdamW $\beta_2$ parameter +is essential at lower batch sizes. + +
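+ For readers unfamiliar with how such scaling laws are compared, the snippet below shows the standard recipe: fit $N_{opt} = a \cdot C^{b}$ by linear regression in log-log space. The data points are synthetic placeholders, not measurements from the paper.

```python
# Minimal sketch of how compute-optimal scaling laws are typically compared:
# fit N_opt = a * C^b in log-log space. The data points below are synthetic
# placeholders, not the paper's measurements.

import numpy as np

compute = np.array([1e17, 1e18, 1e19, 1e20, 1e21])        # FLOPs (synthetic)
optimal_params = np.array([4e7, 1.3e8, 4e8, 1.3e9, 4e9])  # params (synthetic)

# Linear regression in log space: log N = b * log C + log a
b, log_a = np.polyfit(np.log(compute), np.log(optimal_params), deg=1)
a = np.exp(log_a)
print(f"fitted exponent b = {b:.3f}  (Chinchilla-style laws report b close to 0.5)")

# Predict the compute-optimal model size for a new budget.
new_budget = 1e22
print(f"predicted N_opt at 1e22 FLOPs: {a * new_budget ** b:.3e}")
```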
+
+ comment: Fixing bug in small models with tuned LR +
+
+
+
+
+ + ♻ ☆ The Larger the Better? Improved LLM Code-Generation via Budget + Reallocation + + +
+ It is a common belief that large language models (LLMs) are better than +smaller-sized ones. However, larger models also require significantly more time +and compute during inference. This begs the question: what happens when both +models operate under the same budget? (e.g., compute, run-time). To address +this question, we analyze code generation LLMs of various sizes and make +comparisons such as running a 70B model once vs. generating five outputs from a +13B model. We consider a standard unit-test setup, which can be used to select +the correct output from the smaller model. Our findings reveal that the +repeated use of smaller models can yield consistent improvements, with gains of +up to 15% across five tasks. On the other hand, in scenarios where unit-tests +are unavailable, a ranking-based selection of candidates from the smaller model +falls short of the performance of a single output from larger ones. Our results +highlight the potential of using smaller models instead of larger ones, and the +importance of studying approaches for ranking LLM outputs. + +
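+ A schematic version of the unit-test selection setup described above: sample several candidate programs from a smaller code model and keep one that passes the provided tests. `generate_candidates` is a placeholder for sampling from a real model.

```python
# Sketch of unit-test based selection under a fixed budget: draw several
# samples from a smaller code LLM and keep one that passes the tests.
# `generate_candidates` is a placeholder for sampling from a real model.

from typing import Callable, List, Optional

def generate_candidates(prompt: str, k: int) -> List[str]:
    # Placeholder: in practice, sample k completions from a smaller code model.
    return ["def add(a, b):\n    return a - b",       # buggy candidate
            "def add(a, b):\n    return a + b"] * k   # correct candidate

def passes_tests(program: str, tests: Callable[[dict], bool]) -> bool:
    namespace: dict = {}
    try:
        exec(program, namespace)   # never exec untrusted code without sandboxing
        return tests(namespace)
    except Exception:
        return False

def select(prompt: str, tests: Callable[[dict], bool], k: int = 5) -> Optional[str]:
    for candidate in generate_candidates(prompt, k):
        if passes_tests(candidate, tests):
            return candidate
    return None

unit_tests = lambda ns: ns["add"](2, 3) == 5 and ns["add"](-1, 1) == 0
print(select("Write add(a, b).", unit_tests))
```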
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ AutoRE: Document-Level Relation Extraction with Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated exceptional abilities in comprehending and generating text, motivating numerous researchers to utilize them for Information Extraction (IE) purposes, including Relation Extraction (RE). Nonetheless, most existing methods are predominantly designed for Sentence-level Relation Extraction (SentRE) tasks, which typically encompass a restricted set of relations and triplet facts within a single sentence. Furthermore, certain approaches resort to treating relations as candidate choices integrated into prompt templates, leading to inefficient processing and suboptimal performance when tackling Document-Level Relation Extraction (DocRE) tasks, which entail handling multiple relations and triplet facts distributed across a given document and pose distinct challenges. To overcome these limitations, we introduce AutoRE, an end-to-end DocRE model that adopts a novel RE extraction paradigm named RHF (Relation-Head-Facts). Unlike existing approaches, AutoRE does not rely on the assumption of known relation options, making it more reflective of real-world scenarios. Additionally, we have developed an easily extensible RE framework using a Parameter-Efficient Fine-Tuning (PEFT) algorithm (QLoRA). Our experiments on the RE-DocRED dataset showcase AutoRE's best performance, achieving state-of-the-art results and surpassing TAG by 10.03\% and 9.03\% on the dev and test sets, respectively. The code is available at \url{https://github.com/THUDM/AutoRE} and a demonstration video is provided at https://www.youtube.com/watch?v=IhKRsZUAxKk
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Large Language Models Understand Layout + + +
+ Large language models (LLMs) demonstrate extraordinary abilities in a wide range of natural language processing (NLP) tasks. In this paper, we show that, beyond text understanding capability, LLMs are capable of processing text layouts that are denoted by spatial markers. They are able to answer questions that require explicit spatial perception and reasoning, while a drastic performance drop is observed when the spatial markers from the original data are excluded. We perform a series of experiments with the GPT-3.5, Baichuan2, Llama2 and ChatGLM3 models on various types of layout-sensitive datasets for further analysis. The experimental results reveal that the layout understanding ability of LLMs is mainly introduced by the coding data used for pretraining and is further enhanced at the instruction-tuning stage. In addition, layout understanding can be enhanced by integrating low-cost, automatically generated data constructed through a novel text game. Finally, we show that layout understanding ability is beneficial for building efficient visual question-answering (VQA) systems.
+
+
+
+
+ + ♻ ☆ KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache ICML2024 + + +
+ Efficiently serving large language models (LLMs) requires batching of many +requests to reduce the cost per request. Yet, with larger batch sizes and +longer context lengths, the key-value (KV) cache, which stores attention keys +and values to avoid re-computations, significantly increases memory demands and +becomes the new bottleneck in speed and memory usage. Additionally, the loading +of the KV cache causes the computational core to be idle, which limits the +inference speed. A straightforward and effective solution to reduce KV cache +size is quantization, which decreases the total bytes taken by KV cache. +However, there is a lack of in-depth studies that explore the element +distribution of KV cache to understand the hardness and limitation of KV cache +quantization. To fill the gap, we conducted a comprehensive study on the +element distribution in KV cache of popular LLMs. Our findings indicate that +the key cache should be quantized per-channel, i.e., group elements along the +channel dimension and quantize them together. In contrast, the value cache +should be quantized per-token. From this analysis, we developed a tuning-free +2bit KV cache quantization algorithm named KIVI. With hardware-friendly +implementation, KIVI can enable Llama, Falcon, and Mistral models to maintain +almost the same quality while using $\mathbf{2.6\times}$ less peak memory +(including model weight). This reduction in memory usage enables up to +$\mathbf{4\times}$ larger batch size, bringing $\mathbf{2.35\times \sim +3.47\times}$ throughput on real LLM inference workload. The source code is +available at https://github.com/jy-yuan/KIVI. + +
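+ The grouping idea can be illustrated with a few lines of numpy: quantize the key cache per-channel (statistics shared across tokens) and the value cache per-token (statistics shared across channels), each with an asymmetric 2-bit scheme. This only shows the layout described in the abstract, not KIVI's fused, hardware-friendly kernels.

```python
# Toy numpy illustration of the asymmetric layout described above:
# key cache quantized per-channel, value cache per-token, both to 2 bits
# with an asymmetric (scale + zero-point) scheme.

import numpy as np

def quantize_2bit(x: np.ndarray, axis: int):
    xmin = x.min(axis=axis, keepdims=True)
    xmax = x.max(axis=axis, keepdims=True)
    scale = np.maximum(xmax - xmin, 1e-8) / 3.0      # 2 bits -> 4 levels (0..3)
    q = np.clip(np.round((x - xmin) / scale), 0, 3).astype(np.uint8)
    return q, scale, xmin

def dequantize(q, scale, zero_point):
    return q.astype(np.float32) * scale + zero_point

rng = np.random.default_rng(0)
keys = rng.normal(size=(128, 64))     # (tokens, channels)
values = rng.normal(size=(128, 64))

# Per-channel for keys: statistics shared across tokens (reduce over axis 0).
qk, sk, zk = quantize_2bit(keys, axis=0)
# Per-token for values: statistics shared across channels (reduce over axis 1).
qv, sv, zv = quantize_2bit(values, axis=1)

print("key reconstruction error:  ", np.abs(dequantize(qk, sk, zk) - keys).mean())
print("value reconstruction error:", np.abs(dequantize(qv, sv, zv) - values).mean())
```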
+
+ comment: ICML2024 +
+
+
+
+
+ + ♻ ☆ Identifying Semantic Induction Heads to Understand In-Context Learning + + +
+ Although large language models (LLMs) have demonstrated remarkable +performance, the lack of transparency in their inference logic raises concerns +about their trustworthiness. To gain a better understanding of LLMs, we conduct +a detailed analysis of the operations of attention heads and aim to better +understand the in-context learning of LLMs. Specifically, we investigate +whether attention heads encode two types of relationships between tokens +present in natural languages: the syntactic dependency parsed from sentences +and the relation within knowledge graphs. We find that certain attention heads +exhibit a pattern where, when attending to head tokens, they recall tail tokens +and increase the output logits of those tail tokens. More crucially, the +formulation of such semantic induction heads has a close correlation with the +emergence of the in-context learning ability of language models. The study of +semantic attention heads advances our understanding of the intricate operations +of attention heads in transformers, and further provides new insights into the +in-context learning of LLMs. + +
+
+
+
+
+ + ♻ ☆ SAFETY-J: Evaluating Safety with Critique + + +
+ The deployment of Large Language Models (LLMs) in content generation raises +significant safety concerns, particularly regarding the transparency and +interpretability of content evaluations. Current methods, primarily focused on +binary safety classifications, lack mechanisms for detailed critique, limiting +their utility for model improvement and user trust. To address these +limitations, we introduce SAFETY-J, a bilingual generative safety evaluator for +English and Chinese with critique-based judgment. SAFETY-J utilizes a robust +training dataset that includes diverse dialogues and augmented query-response +pairs to assess safety across various scenarios comprehensively. We establish +an automated meta-evaluation benchmark that objectively assesses the quality of +critiques with minimal human intervention, facilitating scalable and continuous +improvement. Additionally, SAFETY-J employs an iterative preference learning +technique to dynamically refine safety assessments based on meta-evaluations +and critiques. Our evaluations demonstrate that SAFETY-J provides more nuanced +and accurate safety evaluations, thereby enhancing both critique quality and +predictive reliability in complex content scenarios. To facilitate further +research and application, we open-source SAFETY-J's training protocols, +datasets, and code at \url{https://github.com/GAIR-NLP/Safety-J}. + +
+
+
+
+
+ + ♻ ☆ Behavioral Testing: Can Large Language Models Implicitly Resolve + Ambiguous Entities? + + +
+ One of the major aspects contributing to the striking performance of large language models (LLMs) is the vast amount of factual knowledge accumulated during pre-training. Yet, many LLMs suffer from self-inconsistency, which raises doubts about their trustworthiness and reliability. In this paper, we focus on entity type ambiguity and analyze current state-of-the-art LLMs for their proficiency and consistency in applying their factual knowledge when prompted for entities under ambiguity. To do so, we propose an evaluation protocol that disentangles knowing from applying knowledge, and test state-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform poorly with ambiguous prompts, achieving only 80% accuracy. Our results further demonstrate systematic discrepancies in LLM behavior and their failure to consistently apply information, indicating that the models can exhibit knowledge without being able to utilize it, show significant biases toward preferred readings, and display self-inconsistencies. Our study highlights the importance of handling entity ambiguity in the future for more trustworthy LLMs.
+
+
+
+
+ + ♻ ☆ Brand Network Booster: A new system for improving brand connectivity + + +
+ This paper presents a new decision support system for the in-depth analysis of semantic networks, which can provide insights for a better exploration of a brand's image and the improvement of its connectivity. In terms of network analysis, we show that this goal is achieved by solving an extended version of the Maximum Betweenness Improvement problem, which includes the possibility of considering adversarial nodes, constrained budgets, and weighted networks - where connectivity improvement can be obtained by adding links or increasing the weight of existing connections. Our contribution includes a new algorithmic framework and the integration of this framework into a software system called Brand Network Booster (BNB), which supports brand connectivity evaluation and improvement. We present this new system together with three case studies, and we also discuss its performance. Our tool and approach are valuable both to network scholars and to marketing and communication managers across various sectors, public or private, as they facilitate strategic decision-making processes.
+
+
+
+
+ + ♻ ☆ Automatic Textual Normalization for Hate Speech Detection + + +
+ Social media data is a valuable resource for research, yet it contains a wide +range of non-standard words (NSW). These irregularities hinder the effective +operation of NLP tools. Current state-of-the-art methods for the Vietnamese +language address this issue as a problem of lexical normalization, involving +the creation of manual rules or the implementation of multi-staged deep +learning frameworks, which necessitate extensive efforts to craft intricate +rules. In contrast, our approach is straightforward, employing solely a +sequence-to-sequence (Seq2Seq) model. In this research, we provide a dataset +for textual normalization, comprising 2,181 human-annotated comments with an +inter-annotator agreement of 0.9014. By leveraging the Seq2Seq model for +textual normalization, our results reveal that the accuracy achieved falls +slightly short of 70%. Nevertheless, textual normalization enhances the +accuracy of the Hate Speech Detection (HSD) task by approximately 2%, +demonstrating its potential to improve the performance of complex NLP tasks. +Our dataset is accessible for research purposes. + +
+
+ comment: 2023 International Conference on Intelligent Systems Design and + Applications (ISDA2023) +
+
+
+
+
+ + ♻ ☆ LyricWhiz: Robust Multilingual Zero-shot Lyrics Transcription by + Whispering to ChatGPT + + +
+ We introduce LyricWhiz, a robust, multilingual, and zero-shot automatic +lyrics transcription method achieving state-of-the-art performance on various +lyrics transcription datasets, even in challenging genres such as rock and +metal. Our novel, training-free approach utilizes Whisper, a weakly supervised +robust speech recognition model, and GPT-4, today's most performant chat-based +large language model. In the proposed method, Whisper functions as the "ear" by +transcribing the audio, while GPT-4 serves as the "brain," acting as an +annotator with a strong performance for contextualized output selection and +correction. Our experiments show that LyricWhiz significantly reduces Word +Error Rate compared to existing methods in English and can effectively +transcribe lyrics across multiple languages. Furthermore, we use LyricWhiz to +create the first publicly available, large-scale, multilingual lyrics +transcription dataset with a CC-BY-NC-SA copyright license, based on +MTG-Jamendo, and offer a human-annotated subset for noise level estimation and +evaluation. We anticipate that our proposed method and dataset will advance the +development of multilingual lyrics transcription, a challenging and emerging +task. + +
+
+ comment: 9 pages, 2 figures, 5 tables, accepted by ISMIR 2023 +
+
+
+
+
+ + ♻ ☆ CIBench: Evaluating Your LLMs with a Code Interpreter Plugin + + +
+ While LLM-Based agents, which use external tools to solve complex problems, +have made significant progress, benchmarking their ability is challenging, +thereby hindering a clear understanding of their limitations. In this paper, we +propose an interactive evaluation framework, named CIBench, to comprehensively +assess LLMs' ability to utilize code interpreters for data science tasks. Our +evaluation framework includes an evaluation dataset and two evaluation modes. +The evaluation dataset is constructed using an LLM-human cooperative approach +and simulates an authentic workflow by leveraging consecutive and interactive +IPython sessions. The two evaluation modes assess LLMs' ability with and +without human assistance. We conduct extensive experiments to analyze the +ability of 24 LLMs on CIBench and provide valuable insights for future LLMs in +code interpreter utilization. + +
+
+ comment: Under review. The first three authors contribute equally, and + Songyang Zhang is the project leader +
+
+
+
+
+ + ♻ ☆ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 + Language Variants ACL 2024 + + +
+ We present Belebele, a multiple-choice machine reading comprehension (MRC) +dataset spanning 122 language variants. Significantly expanding the language +coverage of natural language understanding (NLU) benchmarks, this dataset +enables the evaluation of text models in high-, medium-, and low-resource +languages. Each question is based on a short passage from the Flores-200 +dataset and has four multiple-choice answers. The questions were carefully +curated to discriminate between models with different levels of general +language comprehension. The English dataset on its own proves difficult enough +to challenge state-of-the-art language models. Being fully parallel, this +dataset enables direct comparison of model performance across all languages. We +use this dataset to evaluate the capabilities of multilingual masked language +models (MLMs) and large language models (LLMs). We present extensive results +and find that despite significant cross-lingual transfer in English-centric +LLMs, much smaller MLMs pretrained on balanced multilingual data still +understand far more languages. We also observe that larger vocabulary size and +conscious vocabulary construction correlate with better performance on +low-resource languages. Overall, Belebele opens up new avenues for evaluating +and analyzing the multilingual capabilities of NLP systems. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ CCoE: A Compact LLM with Collaboration of Experts + + +
+ Large Language Models (LLMs) demonstrate significant capabilities in natural language understanding and generation. With the growing need to apply LLMs to various domains, it is an open research question how to efficiently train and build a model that has expertise in different domains at a low training cost. We propose the CCoE architecture, a framework that easily couples multiple strong domain experts into one large LLM and provides a collective way of utilizing the different domain-expert LLMs. Moreover, jointly training a large collaboration of multiple expert LLMs places high demands on training resources. CCoE bypasses this problem by isolating the experts and training each expert separately. The design of CCoE assembles multiple expert LLMs through the CoE (Collaboration of Experts) layer. Each CoE layer can hold one or more expert LLMs. The expert LLMs have different numbers of layers and have been well trained for different domain tasks. Each expert is fine-tuned to achieve results comparable to SOTA domain LLMs. We start from five experts in the domains of Code, Math, Law, text-to-SQL, and Medicine. The results indicate that our CCoE framework can easily and efficiently boost performance by nearly 10%-20% over the original base model in different domains, while using fewer resources for training as well as inference.
+
+
+
+
+ + ♻ ☆ Automatically Extracting Numerical Results from Randomized Controlled + Trials with Large Language Models + + +
+ Meta-analyses statistically aggregate the findings of different randomized +controlled trials (RCTs) to assess treatment effectiveness. Because this yields +robust estimates of treatment effectiveness, results from meta-analyses are +considered the strongest form of evidence. However, rigorous evidence syntheses +are time-consuming and labor-intensive, requiring manual extraction of data +from individual trials to be synthesized. Ideally, language technologies would +permit fully automatic meta-analysis, on demand. This requires accurately +extracting numerical results from individual trials, which has been beyond the +capabilities of natural language processing (NLP) models to date. In this work, +we evaluate whether modern large language models (LLMs) can reliably perform +this task. We annotate (and release) a modest but granular evaluation dataset +of clinical trial reports with numerical findings attached to interventions, +comparators, and outcomes. Using this dataset, we evaluate the performance of +seven LLMs applied zero-shot for the task of conditionally extracting numerical +findings from trial reports. We find that massive LLMs that can accommodate +lengthy inputs are tantalizingly close to realizing fully automatic +meta-analysis, especially for dichotomous (binary) outcomes (e.g., mortality). +However, LLMs -- including ones trained on biomedical texts -- perform poorly +when the outcome measures are complex and tallying the results requires +inference. This work charts a path toward fully automatic meta-analysis of RCTs +via LLMs, while also highlighting the limitations of existing models for this +aim. + +
+
+ comment: 25 pages, 7 figures, 6 tables, MLHC 2024 +
+
+
+
+
+ + ♻ ☆ Towards the Law of Capacity Gap in Distilling Language Models + + +
+ Language model (LM) distillation is a trending area that aims to distil the knowledge residing in a large teacher LM into a small student one. While various methods have been proposed to maximize the effectiveness of the distillation, significant challenges persist, particularly when there is a substantial capacity gap between the teacher and student LMs. This issue, often referred to as the \textit{curse} of capacity gap, suggests that a larger teacher does not necessarily result in a superior student compared to one distilled from a smaller teacher. In other words, there is likely an optimal teacher yielding the best student along the scaling course of the teacher. However, the curse of capacity gap cannot be tackled without notable compute overhead, as indicated in previous studies. In the context of large LMs (LLMs), previously viable approaches become much less meaningful, since distilling an expected student from an optimal teacher with a small compute overhead forms an impossible triangle. Fortunately, this impossible triangle can become possible given an induced \textit{law} of capacity gap. In this paper, we take the spirit of scaling laws and reveal that the optimal teacher scale almost consistently follows a linear scaling with the student scale across different model architectures and data scales. The law then guides us to distil a 3B student LM (termed \textsc{MiniMA}) from LLaMA2-7B. \textsc{MiniMA} is demonstrated to outperform a wide range of 3B competitors and could even compete with several 7B models.
+
+ comment: 32 pages, 10 figures, 15 tables, work in progress. Code and + checkpoints are available at https://github.com/GeneZC/MiniMA +
+
+
+
+
+ + ♻ ☆ Adapting Large Language Models to Domains via Reading Comprehension ICLR 2024 + + +
+ We explore how continued pre-training on domain-specific corpora influences large language models, revealing that training on the raw corpora endows the model with domain knowledge, but drastically hurts its prompting ability for question answering. Taking inspiration from human learning via reading comprehension--practice after reading improves the ability to answer questions based on the learned knowledge--we propose a simple method for transforming raw corpora into reading comprehension texts. Each raw text is enriched with a series of tasks related to its content. Our method, highly scalable and applicable to any pre-training corpora, consistently enhances performance across various tasks in three different domains: biomedicine, finance, and law. Notably, our 7B language model achieves competitive performance with domain-specific models of much larger scales, such as BloombergGPT-50B. Furthermore, we demonstrate that domain-specific reading comprehension texts can improve the model's performance even on general benchmarks, showing the potential to develop a general model across even more domains. Our model, code, and data are available at https://github.com/microsoft/LMOps.
+
+ comment: ICLR 2024 Conference +
+
+
+
+
+ + ♻ ☆ Chain-of-Layer: Iteratively Prompting Large Language Models for Taxonomy + Induction from Limited Examples + + +
+ Automatic taxonomy induction is crucial for web search, recommendation systems, and question answering. Manual curation of taxonomies is expensive in terms of human effort, making automatic taxonomy construction highly desirable. In this work, we introduce Chain-of-Layer, an in-context learning framework designed to induce taxonomies from a given set of entities. Chain-of-Layer breaks down the task into selecting relevant candidate entities in each layer and gradually building the taxonomy from top to bottom. To minimize errors, we introduce the Ensemble-based Ranking Filter to reduce the hallucinated content generated at each iteration. Through extensive experiments, we demonstrate that Chain-of-Layer achieves state-of-the-art performance on four real-world benchmarks.
+
+
+
+
+ + ♻ ☆ JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large + Language and Vision-Language Models + + +
+ The rapid evolution of artificial intelligence (AI) through developments in +Large Language Models (LLMs) and Vision-Language Models (VLMs) has brought +significant advancements across various technological domains. While these +models enhance capabilities in natural language processing and visual +interactive tasks, their growing adoption raises critical concerns regarding +security and ethical alignment. This survey provides an extensive review of the +emerging field of jailbreaking--deliberately circumventing the ethical and +operational boundaries of LLMs and VLMs--and the consequent development of +defense mechanisms. Our study categorizes jailbreaks into seven distinct types +and elaborates on defense strategies that address these vulnerabilities. +Through this comprehensive examination, we identify research gaps and propose +directions for future studies to enhance the security frameworks of LLMs and +VLMs. Our findings underscore the necessity for a unified perspective that +integrates both jailbreak strategies and defensive solutions to foster a +robust, secure, and reliable environment for the next generation of language +models. More details can be found on our website: +\url{https://chonghan-chen.com/llm-jailbreak-zoo-survey/}. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ♻ ☆ Exploring Semantic Perturbations on Grover + + +
+ With news and information being as easy to access as they currently are, it is more important than ever to ensure that people are not misled by what they read. Recently, the rise of neural fake news (AI-generated fake news) and its demonstrated effectiveness at fooling humans has prompted the development of models to detect it. One such model is the Grover model, which can both detect neural fake news to prevent it and generate it to demonstrate how a model could be misused to fool human readers. In this work, we explore the Grover model's fake news detection capabilities by performing targeted attacks through perturbations on input news articles. Through this we test Grover's resilience to these adversarial attacks and expose some potential vulnerabilities which should be addressed in further iterations to ensure it can detect all types of fake news accurately.
+
+
+
+
+ + ♻ ☆ Exploiting All Samples in Low-Resource Sentence Classification: Early + Stopping and Initialization Parameters + + +
+ To improve deep-learning performance in low-resource settings, many +researchers have redesigned model architectures or applied additional data +(e.g., external resources, unlabeled samples). However, there have been +relatively few discussions on how to make good use of small amounts of labeled +samples, although it is potentially beneficial and should be done before +applying additional data or redesigning models. In this study, we assume a +low-resource setting in which only a few labeled samples (i.e., 30-100 per +class) are available, and we discuss how to exploit them without additional +data or model redesigns. We explore possible approaches in the following three +aspects: training-validation splitting, early stopping, and weight +initialization. Extensive experiments are conducted on six public sentence +classification datasets. Performance on various evaluation metrics (e.g., +accuracy, loss, and calibration error) significantly varied depending on the +approaches that were combined in the three aspects. Based on the results, we +propose an integrated method, which is to initialize the model with a weight +averaging method and use a non-validation stop method to train all samples. +This simple integrated method consistently outperforms the competitive methods; +e.g., the average accuracy of six datasets of this method was 1.8% higher than +those of conventional validation-based methods. In addition, the integrated +method further improves the performance when adapted to several +state-of-the-art models that use additional data or redesign the network +architecture (e.g., self-training and enhanced structural models). Our results +highlight the importance of the training strategy and suggest that the +integrated method can be the first step in the low-resource setting. This study +provides empirical knowledge that will be helpful when dealing with +low-resource data in future efforts. + +
+
+ comment: 15 pages, 8 figures, published in IEEE Access +
+
+
+
+
+ + ♻ ☆ Debating with More Persuasive LLMs Leads to More Truthful Answers + + +
+ Common methods for aligning large language models (LLMs) with desired +behaviour heavily rely on human-labelled data. However, as models grow +increasingly sophisticated, they will surpass human expertise, and the role of +human evaluation will evolve into non-experts overseeing experts. In +anticipation of this, we ask: can weaker models assess the correctness of +stronger models? We investigate this question in an analogous setting, where +stronger models (experts) possess the necessary information to answer questions +and weaker models (non-experts) lack this information. The method we evaluate +is debate, where two LLM experts each argue for a different answer, and a +non-expert selects the answer. We find that debate consistently helps both +non-expert models and humans answer questions, achieving 76% and 88% accuracy +respectively (naive baselines obtain 48% and 60%). Furthermore, optimising +expert debaters for persuasiveness in an unsupervised manner improves +non-expert ability to identify the truth in debates. Our results provide +encouraging empirical evidence for the viability of aligning models with debate +in the absence of ground truth. + +
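+ The debate protocol itself is easy to sketch: two expert models argue for opposing answers over a few rounds, and a weaker judge picks an answer from the transcript alone. In the toy version below, `call_model` is a placeholder for any LLM API and the prompts are heavily simplified.

```python
# Schematic version of the debate protocol: two "expert" debaters argue for
# opposing answers over a few rounds, and a weaker "non-expert" judge picks
# an answer from the transcript alone. `call_model` is a placeholder for any
# LLM API; prompts are simplified.

from typing import List

def call_model(role: str, prompt: str) -> str:
    # Placeholder: replace with a real LLM call (experts see the evidence,
    # the judge does not).
    return f"[{role}] argument based on: {prompt[:40]}..."

def debate(question: str, answer_a: str, answer_b: str, rounds: int = 3) -> str:
    transcript: List[str] = []
    for r in range(rounds):
        for side, answer in (("Debater A", answer_a), ("Debater B", answer_b)):
            context = "\n".join(transcript)
            prompt = (f"Question: {question}\nDefend the answer: {answer}\n"
                      f"Debate so far:\n{context}\nRound {r + 1} argument:")
            transcript.append(call_model(side, prompt))
    judge_prompt = (f"Question: {question}\nTranscript:\n" + "\n".join(transcript)
                    + f"\nChoose the better-supported answer: '{answer_a}' or '{answer_b}'.")
    return call_model("Judge", judge_prompt)

print(debate("Who wrote the letter?", "The butler", "The gardener"))
```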
+
+ comment: For code please check: https://github.com/ucl-dark/llm_debate +
+
+
+
+
+ + ♻ ☆ SafeDecoding: Defending against Jailbreak Attacks via Safety-Aware + Decoding ACL 2024 + + +
+ As large language models (LLMs) become increasingly integrated into real-world applications such as code generation and chatbot assistance, extensive efforts have been made to align LLM behavior with human values, including safety. Jailbreak attacks, which aim to provoke unintended and unsafe behaviors from LLMs, remain a leading LLM safety threat. In this paper, we aim to defend LLMs against jailbreak attacks by introducing SafeDecoding, a safety-aware decoding strategy for LLMs to generate helpful and harmless responses to user queries. Our insight in developing SafeDecoding is based on the observation that, even though the probabilities of tokens representing harmful content outweigh those of tokens representing harmless responses, safety disclaimers still appear among the top tokens after sorting tokens by probability in descending order. This allows us to mitigate jailbreak attacks by identifying safety disclaimers and amplifying their token probabilities, while simultaneously attenuating the probabilities of token sequences that are aligned with the objectives of jailbreak attacks. We perform extensive experiments on five LLMs using six state-of-the-art jailbreak attacks and four benchmark datasets. Our results show that SafeDecoding significantly reduces the attack success rate and harmfulness of jailbreak attacks without compromising the helpfulness of responses to benign user queries. SafeDecoding outperforms six defense methods.
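+ The decoding-time idea can be illustrated with a toy example: boost the probability mass of tokens that begin safety disclaimers and attenuate tokens aligned with the jailbreak objective, then renormalize. The vocabulary, token sets, and weights below are hypothetical; SafeDecoding identifies disclaimers with the help of a safety expert model rather than a fixed word list.

```python
# Toy illustration of the decoding idea: amplify tokens that start safety
# disclaimers, attenuate jailbreak-aligned tokens, then renormalize.
# Vocabulary, token sets, and weights are hypothetical.

import numpy as np

vocab = ["Sure", "Here", "I", "cannot", "Sorry", "steps"]
logits = np.array([3.0, 2.5, 1.0, 0.5, 0.4, 2.0])   # pretend next-token logits

safety_tokens = {"I", "cannot", "Sorry"}
unsafe_tokens = {"Sure", "Here", "steps"}
ALPHA = 2.0   # amplification for safety disclaimers (assumed value)
BETA = 0.5    # attenuation for jailbreak-aligned tokens (assumed value)

probs = np.exp(logits - logits.max())
probs /= probs.sum()
for i, tok in enumerate(vocab):
    if tok in safety_tokens:
        probs[i] *= ALPHA
    elif tok in unsafe_tokens:
        probs[i] *= BETA
probs /= probs.sum()

for tok, p in sorted(zip(vocab, probs), key=lambda t: -t[1]):
    print(f"{tok:>8s}  {p:.3f}")
```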
+
+ comment: To appear in ACL 2024 +
+
+
+
+
+ + ♻ ☆ Bilingual Adaptation of Monolingual Foundation Models + + +
+ We present an efficient method for adapting a monolingual Large Language +Model (LLM) to another language, addressing challenges of catastrophic +forgetting and tokenizer limitations. We focus this study on adapting Llama 2 +to Arabic. Our two-stage approach begins with expanding the vocabulary and +training only the embeddings matrix, followed by full model continual +pre-training on a bilingual corpus. By continually pre-training on a mix of +Arabic and English corpora, the model retains its proficiency in English while +acquiring capabilities in Arabic. Our approach results in significant +improvements in Arabic and slight enhancements in English, demonstrating +cost-effective cross-lingual transfer. We perform ablations on embedding +initialization techniques, data mix ratios, and learning rates and release a +detailed training recipe. To demonstrate generalizability of this approach we +also adapted Llama 3 8B to Arabic and Llama 2 13B to Hindi. + +
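+ A sketch of the first adaptation stage, under the assumption of a Hugging Face Llama-style checkpoint: extend the tokenizer with new-language tokens, resize the embedding matrix, and train only the embeddings (and LM head) before the full continual pre-training stage. The model name and the added tokens are placeholders.

```python
# Sketch of the first adaptation stage under stated assumptions: expand the
# vocabulary, resize the embedding matrix, and train only the embedding and
# LM-head parameters. Base model and added tokens are placeholders.

from transformers import AutoModelForCausalLM, AutoTokenizer

BASE = "meta-llama/Llama-2-7b-hf"      # assumed base model (gated, large)
tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForCausalLM.from_pretrained(BASE)

new_tokens = ["كتاب", "مدرسة", "جامعة"]          # illustrative Arabic tokens
num_added = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
print(f"added {num_added} tokens; new vocab size {len(tokenizer)}")

# Stage 1: freeze everything except the embeddings / LM head.
for name, param in model.named_parameters():
    param.requires_grad = ("embed_tokens" in name) or ("lm_head" in name)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters in stage 1: {trainable:,}")
# Stage 2 (not shown): unfreeze all parameters and continually pre-train
# on a mixed Arabic/English corpus.
```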
+
+
+
+
+ + ♻ ☆ Human-Interpretable Adversarial Prompt Attack on Large Language Models + with Situational Context + + +
+ Previous research on testing the vulnerabilities in Large Language Models +(LLMs) using adversarial attacks has primarily focused on nonsensical prompt +injections, which are easily detected upon manual or automated review (e.g., +via byte entropy). However, the exploration of innocuous human-understandable +malicious prompts augmented with adversarial injections remains limited. In +this research, we explore converting a nonsensical suffix attack into a +sensible prompt via a situation-driven contextual re-writing. This allows us to +show suffix conversion without any gradients, using only LLMs to perform the +attacks, and thus better understand the scope of possible risks. We combine an +independent, meaningful adversarial insertion and situations derived from +movies to check if this can trick an LLM. The situations are extracted from the +IMDB dataset, and prompts are defined following a few-shot chain-of-thought +prompting. Our approach demonstrates that a successful situation-driven attack +can be executed on both open-source and proprietary LLMs. We find that across +many LLMs, as few as 1 attempt produces an attack and that these attacks +transfer between LLMs. + +
+
+
+
+
+ + ♻ ☆ A Review of Large Language Models and Autonomous Agents in Chemistry + + +
+ Large language models (LLMs) have emerged as powerful tools in chemistry, +significantly impacting molecule design, property prediction, and synthesis +optimization. This review highlights LLM capabilities in these domains and +their potential to accelerate scientific discovery through automation. We also +review LLM-based autonomous agents: LLMs with a broader set of tools to +interact with their surrounding environment. These agents perform diverse tasks +such as paper scraping, interfacing with automated laboratories, and synthesis +planning. As agents are an emerging topic, we extend the scope of our review of +agents beyond chemistry and discuss across any scientific domains. This review +covers the recent history, current capabilities, and design of LLMs and +autonomous agents, addressing specific challenges, opportunities, and future +directions in chemistry. Key challenges include data quality and integration, +model interpretability, and the need for standard benchmarks, while future +directions point towards more sophisticated multi-modal agents and enhanced +collaboration between agents and experimental methods. Due to the quick pace of +this field, a repository has been built to keep track of the latest studies: +https://github.com/ur-whitelab/LLMs-in-science. + +
+
+
+
+
+ + ♻ ☆ MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal + LLMs + + +
+ We introduce MIA-Bench, a new benchmark designed to evaluate multimodal large +language models (MLLMs) on their ability to strictly adhere to complex +instructions. Our benchmark comprises a diverse set of 400 image-prompt pairs, +each crafted to challenge the models' compliance with layered instructions in +generating accurate responses that satisfy specific requested patterns. +Evaluation results from a wide array of state-of-the-art MLLMs reveal +significant variations in performance, highlighting areas for improvement in +instruction fidelity. Additionally, we create extra training data and explore +supervised fine-tuning to enhance the models' ability to strictly follow +instructions without compromising performance on other tasks. We hope this +benchmark not only serves as a tool for measuring MLLM adherence to +instructions, but also guides future developments in MLLM training methods. + +
+
+
+
+
+ + ♻ ☆ Exploring Continual Learning of Compositional Generalization in NLI + + +
+ Compositional Natural Language Inference has been explored to assess the true +abilities of neural models to perform NLI. Yet, current evaluations assume +models to have full access to all primitive inferences in advance, in contrast +to humans that continuously acquire inference knowledge. In this paper, we +introduce the Continual Compositional Generalization in Inference (C2Gen NLI) +challenge, where a model continuously acquires knowledge of constituting +primitive inference tasks as a basis for compositional inferences. We explore +how continual learning affects compositional generalization in NLI, by +designing a continual learning setup for compositional NLI inference tasks. Our +experiments demonstrate that models fail to compositionally generalize in a +continual scenario. To address this problem, we first benchmark various +continual learning algorithms and verify their efficacy. We then further +analyze C2Gen, focusing on how to order primitives and compositional inference +types and examining correlations between subtasks. Our analyses show that by +learning subtasks continuously while observing their dependencies and +increasing degrees of difficulty, continual learning can enhance composition +generalization ability. + +
+
+
+
+
+ + ♻ ☆ Measuring and Controlling Instruction (In)Stability in Language Model + Dialogs + + +
+ System-prompting is a standard tool for customizing language-model chatbots, +enabling them to follow a specific instruction. An implicit assumption in the +use of system prompts is that they will be stable, so the chatbot will continue +to generate text according to the stipulated instructions for the duration of a +conversation. We propose a quantitative benchmark to test this assumption, +evaluating instruction stability via self-chats between two instructed +chatbots. Testing popular models like LLaMA2-chat-70B and GPT-3.5, we reveal a +significant instruction drift within eight rounds of conversations. An +empirical and theoretical analysis of this phenomenon suggests the transformer +attention mechanism plays a role, due to attention decay over long exchanges. +To combat attention decay and instruction drift, we propose a lightweight +method called split-softmax, which compares favorably against two strong +baselines. + +
+
+ comment: COLM 2024; Code and data: https://github.com/likenneth/persona_drift +
+
+
+
+
+ + ♻ ☆ UNIQORN: Unified Question Answering over RDF Knowledge Graphs and + Natural Language Text + + +
+ Question answering over RDF data like knowledge graphs has been greatly +advanced, with a number of good systems providing crisp answers for natural +language questions or telegraphic queries. Some of these systems incorporate +textual sources as additional evidence for the answering process, but cannot +compute answers that are present in text alone. Conversely, the IR and NLP +communities have addressed QA over text, but such systems barely utilize +semantic data and knowledge. This paper presents a method for complex questions +that can seamlessly operate over a mixture of RDF datasets and text corpora, or +individual sources, in a unified framework. Our method, called UNIQORN, builds +a context graph on-the-fly, by retrieving question-relevant evidences from the +RDF data and/or a text corpus, using fine-tuned BERT models. The resulting +graph typically contains all question-relevant evidences but also a lot of +noise. UNIQORN copes with this input by a graph algorithm for Group Steiner +Trees, that identifies the best answer candidates in the context graph. +Experimental results on several benchmarks of complex questions with multiple +entities and relations, show that UNIQORN significantly outperforms +state-of-the-art methods for heterogeneous QA -- in a full training mode, as +well as in zero-shot settings. The graph-based methodology provides +user-interpretable evidence for the complete answering process. + +
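+ A small networkx example conveys the graph step: build a weighted context graph over question-relevant evidence and connect the question terminals with a Steiner tree, whose inner nodes become answer candidates. networkx only offers an approximation for the classic Steiner tree, whereas UNIQORN solves the more general Group Steiner Tree problem, so the nodes and weights below are purely illustrative.

```python
# Simplified sketch of the answering step: weighted context graph plus an
# approximate Steiner tree connecting the question terminals. Nodes and
# weights are made up; UNIQORN uses Group Steiner Trees, a generalization.

import networkx as nx
from networkx.algorithms.approximation import steiner_tree

G = nx.Graph()
edges = [
    ("Christopher Nolan", "Oppenheimer", 1.0),
    ("Oppenheimer", "Cillian Murphy", 1.0),
    ("Christopher Nolan", "Inception", 1.5),
    ("Inception", "Leonardo DiCaprio", 1.0),
    ("Cillian Murphy", "Peaky Blinders", 2.0),
]
for u, v, w in edges:
    G.add_edge(u, v, weight=w)

# Terminals: nodes matched by the question "Which Nolan film stars Cillian Murphy?"
terminals = ["Christopher Nolan", "Cillian Murphy"]
tree = steiner_tree(G, terminals, weight="weight")
print("answer-candidate subgraph:", list(tree.edges()))
# Inner (non-terminal) nodes of the tree, e.g. "Oppenheimer", are answer candidates.
```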
+
+ comment: 27 pages +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 154 + +
+
+
+ + ☆ Sparse vs Contiguous Adversarial Pixel Perturbations in Multimodal + Models: An Empirical Analysis + + +
+ Assessing the robustness of multimodal models against adversarial examples is +an important aspect for the safety of its users. We craft L0-norm perturbation +attacks on the preprocessed input images. We launch them in a black-box setup +against four multimodal models and two unimodal DNNs, considering both targeted +and untargeted misclassification. Our attacks target less than 0.04% of +perturbed image area and integrate different spatial positioning of perturbed +pixels: sparse positioning and pixels arranged in different contiguous shapes +(row, column, diagonal, and patch). To the best of our knowledge, we are the +first to assess the robustness of three state-of-the-art multimodal models +(ALIGN, AltCLIP, GroupViT) against different sparse and contiguous pixel +distribution perturbations. The obtained results indicate that unimodal DNNs +are more robust than multimodal models. Furthermore, models using CNN-based +Image Encoder are more vulnerable than models with ViT - for untargeted +attacks, we obtain a 99% success rate by perturbing less than 0.02% of the +image area. + +
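+ The perturbation layouts studied above are easy to reproduce: fix a pixel budget equal to a tiny fraction of the image area and place it either sparsely or as a contiguous row, column, diagonal, or patch. The sketch below only builds the masks; the black-box optimization of the perturbed pixel values is omitted.

```python
# Sketch of L0-constrained perturbation masks with the spatial layouts
# mentioned above. The budget fraction mirrors the "< 0.04% of image area"
# regime; the attack loop that optimizes pixel values is omitted.

import numpy as np

def perturbation_mask(h: int, w: int, budget_frac: float, shape: str, seed: int = 0):
    rng = np.random.default_rng(seed)
    budget = max(1, int(round(budget_frac * h * w)))
    mask = np.zeros((h, w), dtype=bool)
    if shape == "sparse":
        idx = rng.choice(h * w, size=budget, replace=False)
        mask.flat[idx] = True
    elif shape == "row":
        r = rng.integers(h)
        mask[r, :budget] = True
    elif shape == "column":
        c = rng.integers(w)
        mask[:budget, c] = True
    elif shape == "diagonal":
        n = min(budget, h, w)
        mask[np.arange(n), np.arange(n)] = True
    elif shape == "patch":
        side = max(1, int(np.sqrt(budget)))
        r, c = rng.integers(h - side + 1), rng.integers(w - side + 1)
        mask[r:r + side, c:c + side] = True
    return mask

for s in ["sparse", "row", "column", "diagonal", "patch"]:
    m = perturbation_mask(224, 224, budget_frac=0.0004, shape=s)
    print(f"{s:>8s}: {m.sum()} perturbed pixels ({100 * m.mean():.3f}% of area)")
```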
+
+
+
+
+ + ☆ Trajectory-aligned Space-time Tokens for Few-shot Action Recognition ECCV 2024 + + +
+ We propose a simple yet effective approach for few-shot action recognition, +emphasizing the disentanglement of motion and appearance representations. By +harnessing recent progress in tracking, specifically point trajectories and +self-supervised representation learning, we build trajectory-aligned tokens +(TATs) that capture motion and appearance information. This approach +significantly reduces the data requirements while retaining essential +information. To process these representations, we use a Masked Space-time +Transformer that effectively learns to aggregate information to facilitate +few-shot action recognition. We demonstrate state-of-the-art results on +few-shot action recognition across multiple datasets. Our project page is +available at https://www.cs.umd.edu/~pulkit/tats + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ RegionDrag: Fast Region-Based Image Editing with Diffusion Models ECCV 2024 + + +
+ Point-drag-based image editing methods, like DragDiffusion, have attracted +significant attention. However, point-drag-based approaches suffer from +computational overhead and misinterpretation of user intentions due to the +sparsity of point-based editing instructions. In this paper, we propose a +region-based copy-and-paste dragging method, RegionDrag, to overcome these +limitations. RegionDrag allows users to express their editing instructions in +the form of handle and target regions, enabling more precise control and +alleviating ambiguity. In addition, region-based operations complete editing in +one iteration and are much faster than point-drag-based methods. We also +incorporate the attention-swapping technique for enhanced stability during +editing. To validate our approach, we extend existing point-drag-based datasets +with region-based dragging instructions. Experimental results demonstrate that +RegionDrag outperforms existing point-drag-based approaches in terms of speed, +accuracy, and alignment with user intentions. Remarkably, RegionDrag completes +the edit on an image with a resolution of 512x512 in less than 2 seconds, which +is more than 100x faster than DragDiffusion, while achieving better +performance. Project page: https://visual-ai.github.io/regiondrag. + +
+
+ comment: ECCV 2024, Project page: https://visual-ai.github.io/regiondrag +
+
+
+
+
+ + ☆ VGGHeads: A Large-Scale Synthetic Dataset for 3D Human Heads + + +
+ Human head detection, keypoint estimation, and 3D head model fitting are +important tasks with many applications. However, traditional real-world +datasets often suffer from bias, privacy, and ethical concerns, and they have +been recorded in laboratory environments, which makes it difficult for trained +models to generalize. Here, we introduce VGGHeads -- a large scale synthetic +dataset generated with diffusion models for human head detection and 3D mesh +estimation. Our dataset comprises over 1 million high-resolution images, each +annotated with detailed 3D head meshes, facial landmarks, and bounding boxes. +Using this dataset we introduce a new model architecture capable of +simultaneous heads detection and head meshes reconstruction from a single image +in a single step. Through extensive experimental evaluations, we demonstrate +that models trained on our synthetic data achieve strong performance on real +images. Furthermore, the versatility of our dataset makes it applicable across +a broad spectrum of tasks, offering a general and comprehensive representation +of human heads. Additionally, we provide detailed information about the +synthetic data generation pipeline, enabling it to be re-used for other tasks +and domains. + +
+
+
+
+
+ + ☆ RefMask3D: Language-Guided Transformer for 3D Referring Segmentation ACM MM 2024 + + +
+ 3D referring segmentation is an emerging and challenging vision-language task that aims to segment the object described by a natural language expression in a point cloud scene. The key challenge behind this task is vision-language feature fusion and alignment. In this work, we propose RefMask3D to explore comprehensive multi-modal feature interaction and understanding. First, we propose a Geometry-Enhanced Group-Word Attention to integrate language with geometrically coherent sub-clouds through cross-modal group-word attention, which effectively addresses the challenges posed by the sparse and irregular nature of point clouds. Then, we introduce a Linguistic Primitives Construction to produce semantic primitives representing distinct semantic attributes, which greatly enhance vision-language understanding at the decoding stage. Furthermore, we introduce an Object Cluster Module that analyzes the interrelationships among linguistic primitives to consolidate their insights and pinpoint common characteristics, helping to capture holistic information and enhance the precision of target identification. The proposed RefMask3D achieves new state-of-the-art performance on 3D referring segmentation, 3D visual grounding, and also 2D referring image segmentation. In particular, RefMask3D outperforms the previous state-of-the-art method by a large margin of 3.16% mIoU on the challenging ScanRefer dataset. Code is available at https://github.com/heshuting555/RefMask3D.
+
+ comment: ACM MM 2024, Code: https://github.com/heshuting555/RefMask3D +
+
+
+
+
+ + ☆ BIV-Priv-Seg: Locating Private Content in Images Taken by People With + Visual Impairments + + +
+ Individuals who are blind or have low vision (BLV) are at a heightened risk +of sharing private information if they share photographs they have taken. To +facilitate developing technologies that can help preserve privacy, we introduce +BIV-Priv-Seg, the first localization dataset originating from people with +visual impairments that shows private content. It contains 1,028 images with +segmentation annotations for 16 private object categories. We first +characterize BIV-Priv-Seg and then evaluate modern models' performance for +locating private content in the dataset. We find modern models struggle most +with locating private objects that are not salient, small, and lack text as +well as recognizing when private content is absent from an image. We facilitate +future extensions by sharing our new dataset with the evaluation server at +https://vizwiz.org/tasks-and-datasets/object-localization. + +
+
+
+
+
+ + ☆ CodedVO: Coded Visual Odometry + + +
+ Autonomous robots often rely on monocular cameras for odometry estimation and +navigation. However, the scale ambiguity problem presents a critical barrier to +effective monocular visual odometry. In this paper, we present CodedVO, a novel +monocular visual odometry method that overcomes the scale ambiguity problem by +employing custom optics to physically encode metric depth information into +imagery. By incorporating this information into our odometry pipeline, we +achieve state-of-the-art performance in monocular visual odometry with a known +scale. We evaluate our method in diverse indoor environments and demonstrate +its robustness and adaptability. We achieve a 0.08m average trajectory error in +odometry evaluation on the ICL-NUIM indoor odometry dataset. + +
+
+ comment: 7 pages, 4 figures, IEEE ROBOTICS AND AUTOMATION LETTERS +
+
+
+
+
+ + ☆ LION: Linear Group RNN for 3D Object Detection in Point Clouds + + +
+ The benefit of transformers in large-scale 3D point cloud perception tasks, +such as 3D object detection, is limited by their quadratic computation cost +when modeling long-range relationships. In contrast, linear RNNs have low +computational complexity and are suitable for long-range modeling. Toward this +goal, we propose a simple and effective window-based framework built on LInear +grOup RNN (i.e., perform linear RNN for grouped features) for accurate 3D +object detection, called LION. The key property is to allow sufficient feature +interaction in a much larger group than transformer-based methods. However, +effectively applying linear group RNN to 3D object detection in highly sparse +point clouds is not trivial due to its limitation in handling spatial modeling. +To tackle this problem, we simply introduce a 3D spatial feature descriptor and +integrate it into the linear group RNN operators to enhance their spatial +features rather than blindly increasing the number of scanning orders for voxel +features. To further address the challenge in highly sparse point clouds, we +propose a 3D voxel generation strategy to densify foreground features thanks to +linear group RNN as a natural property of auto-regressive models. Extensive +experiments verify the effectiveness of the proposed components and the +generalization of our LION on different linear group RNN operators including +Mamba, RWKV, and RetNet. Furthermore, it is worth mentioning that our +LION-Mamba achieves state-of-the-art on Waymo, nuScenes, Argoverse V2, and ONCE +dataset. Last but not least, our method supports kinds of advanced linear RNN +operators (e.g., RetNet, RWKV, Mamba, xLSTM and TTT) on small but popular KITTI +dataset for a quick experience with our linear RNN-based framework. + +
+
+ comment: Project page: https://happinesslz.github.io/projects/LION/ +
+
+
+
+
+ + ☆ Geometry Fidelity for Spherical Images ECCV 2024 + + +
+ Spherical or omni-directional images offer an immersive visual format +appealing to a wide range of computer vision applications. However, geometric +properties of spherical images pose a major challenge for models and metrics +designed for ordinary 2D images. Here, we show that direct application of +Fr\'echet Inception Distance (FID) is insufficient for quantifying geometric +fidelity in spherical images. We introduce two quantitative metrics accounting +for geometric constraints, namely Omnidirectional FID (OmniFID) and +Discontinuity Score (DS). OmniFID is an extension of FID tailored to +additionally capture field-of-view requirements of the spherical format by +leveraging cubemap projections. DS is a kernel-based seam alignment score of +continuity across borders of 2D representations of spherical images. In +experiments, OmniFID and DS quantify geometry fidelity issues that are +undetected by FID. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ PianoMime: Learning a Generalist, Dexterous Piano Player from Internet + Demonstrations + + +
+ In this work, we introduce PianoMime, a framework for training a piano-playing agent using internet demonstrations. The internet is a promising source of large-scale demonstrations for training robot agents. In particular, for the case of piano playing, YouTube is full of videos of professional pianists playing a wide variety of songs. In our work, we leverage these demonstrations to learn a generalist piano-playing agent capable of playing any arbitrary song. Our framework is divided into three parts: a data preparation phase to extract informative features from the YouTube videos, a policy learning phase to train song-specific expert policies from the demonstrations, and a policy distillation phase to distil the policies into a single generalist agent. We explore different policy designs to represent the agent and evaluate the influence of the amount of training data on the agent's ability to generalize to novel songs not available in the dataset. We show that we are able to learn a policy with up to 56\% F1 score on unseen songs.
+
+
+
+
+ + ☆ Quasar-ViT: Hardware-Oriented Quantization-Aware Architecture Search for + Vision Transformers + + +
+ Vision transformers (ViTs) have demonstrated their superior accuracy for computer vision tasks compared to convolutional neural networks (CNNs). However, ViT models are often too computation-intensive for efficient deployment on resource-limited edge devices. This work proposes Quasar-ViT, a hardware-oriented quantization-aware architecture search framework for ViTs, to design efficient ViT models for hardware implementation while preserving accuracy. First, Quasar-ViT trains a supernet using our row-wise flexible mixed-precision quantization scheme, mixed-precision weight entanglement, and supernet layer scaling techniques. Then, it applies an efficient hardware-oriented search algorithm, integrated with hardware latency and resource modeling, to determine a series of optimal subnets from the supernet under different inference latency targets. Finally, we propose a series of model-adaptive designs on the FPGA platform to support the architecture search and mitigate the gap between the theoretical computation reduction and the practical inference speedup. Our searched models achieve 101.5, 159.6, and 251.6 frames-per-second (FPS) inference speed on the AMD/Xilinx ZCU102 FPGA with 80.4%, 78.6%, and 74.9% top-1 accuracy, respectively, for the ImageNet dataset, consistently outperforming prior works.
+
+
+ comment: Accepted by ICS 2024 +
+
+
+
+
+ + ☆ Taxonomy-Aware Continual Semantic Segmentation in Hyperbolic Spaces for + Open-World Perception + + +
+ Semantic segmentation models are typically trained on a fixed set of classes, +limiting their applicability in open-world scenarios. Class-incremental +semantic segmentation aims to update models with emerging new classes while +preventing catastrophic forgetting of previously learned ones. However, +existing methods impose strict rigidity on old classes, reducing their +effectiveness in learning new incremental classes. In this work, we propose +Taxonomy-Oriented Poincar\'e-regularized Incremental-Class Segmentation +(TOPICS) that learns feature embeddings in hyperbolic space following explicit +taxonomy-tree structures. This supervision provides plasticity for old classes, +updating ancestors based on new classes while integrating new classes at +fitting positions. Additionally, we maintain implicit class relational +constraints on the geometric basis of the Poincar\'e ball. This ensures that +the latent space can continuously adapt to new constraints while maintaining a +robust structure to combat catastrophic forgetting. We also establish eight +realistic incremental learning protocols for autonomous driving scenarios, +where novel classes can originate from known classes or the background. +Extensive evaluations of TOPICS on the Cityscapes and Mapillary Vistas 2.0 +benchmarks demonstrate that it achieves state-of-the-art performance. We make +the code and trained models publicly available at +http://topics.cs.uni-freiburg.de. + +
+
+
+
+
+ + ☆ XS-VID: An Extremely Small Video Object Detection Dataset + + +
+ Small Video Object Detection (SVOD) is a crucial subfield in modern computer +vision, essential for early object discovery and detection. However, existing +SVOD datasets are scarce and suffer from issues such as insufficiently small +objects, limited object categories, and lack of scene diversity, leading to +unitary application scenarios for corresponding methods. To address this gap, +we develop the XS-VID dataset, which comprises aerial data from various periods +and scenes, and annotates eight major object categories. To further evaluate +existing methods for detecting extremely small objects, XS-VID extensively +collects three types of objects with smaller pixel areas: extremely small +(\textit{es}, $0\sim12^2$), relatively small (\textit{rs}, $12^2\sim20^2$), and +generally small (\textit{gs}, $20^2\sim32^2$). XS-VID offers unprecedented +breadth and depth in covering and quantifying minuscule objects, significantly +enriching the scene and object diversity in the dataset. Extensive validations +on XS-VID and the publicly available VisDrone2019VID dataset show that existing +methods struggle with small object detection and significantly underperform +compared to general object detectors. Leveraging the strengths of previous +methods and addressing their weaknesses, we propose YOLOFT, which enhances +local feature associations and integrates temporal motion features, +significantly improving the accuracy and stability of SVOD. Our datasets and +benchmarks are available at \url{https://gjhhust.github.io/XS-VID/}. + +
+
+
+
+
+ + $\mathbb{X}$-Sample Contrastive Loss: Improving Contrastive Learning + with Sample Similarity Graphs + + +
+ Learning good representations involves capturing the diverse ways in which +data samples relate. Contrastive loss - an objective matching related samples - +underlies methods from self-supervised to multimodal learning. Contrastive +losses, however, can be viewed more broadly as modifying a similarity graph to +indicate how samples should relate in the embedding space. This view reveals a +shortcoming in contrastive learning: the similarity graph is binary, as only +one sample is the related positive sample. Crucially, similarities +\textit{across} samples are ignored. Based on this observation, we revise the +standard contrastive loss to explicitly encode how a sample relates to others. +We experiment with this new objective, called $\mathbb{X}$-Sample Contrastive, +to train vision models based on similarities in class or text caption +descriptions. Our study spans three scales: ImageNet-1k with 1 million, CC3M +with 3 million, and CC12M with 12 million samples. The representations learned +via our objective outperform both contrastive self-supervised and +vision-language models trained on the same data across a range of tasks. When +training on CC12M, we outperform CLIP by $0.6\%$ on both ImageNet and ImageNet +Real. Our objective appears to work particularly well in lower-data regimes, +with gains over CLIP of $16.8\%$ on ImageNet and $18.1\%$ on ImageNet Real when +training with CC3M. Finally, our objective seems to encourage the model to +learn representations that separate objects from their attributes and +backgrounds, with gains of $3.3$-$5.6$\% over CLIP on ImageNet9. We hope the +proposed solution takes a small step towards developing richer learning +objectives for understanding sample relations in foundation models. + +
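+ A minimal sketch of the idea as I read it from the abstract: replace the one-hot target of a standard InfoNCE/CLIP-style objective with a soft target distribution obtained by row-normalising a cross-sample similarity graph (e.g., derived from class labels or caption similarity). This is an illustrative re-implementation, not the authors' exact loss; the temperature and normalisation choices are assumptions.
```python
import torch
import torch.nn.functional as F

def x_sample_contrastive_loss(z_a, z_b, target_sim, temperature=0.1):
    """z_a, z_b: (N, D) embeddings of two views / modalities.
    target_sim: (N, N) non-negative similarity graph between the N samples."""
    # normalise embeddings so the dot product is a cosine similarity
    z_a = F.normalize(z_a, dim=-1)
    z_b = F.normalize(z_b, dim=-1)
    logits = z_a @ z_b.t() / temperature                        # (N, N) predicted similarities
    # soft targets: each row of the similarity graph becomes a probability distribution
    targets = target_sim / target_sim.sum(dim=-1, keepdim=True).clamp_min(1e-8)
    return -(targets * F.log_softmax(logits, dim=-1)).sum(dim=-1).mean()
```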
+
+
+
+
+ + ☆ Estimating Earthquake Magnitude in Sentinel-1 Imagery via Ranking + + +
+ Earthquakes are commonly estimated using physical seismic stations; however, due to the installation requirements and costs of these stations, global coverage quickly becomes impractical. An efficient and lower-cost alternative is to develop machine learning models to globally monitor earth observation data to pinpoint regions impacted by these natural disasters. However, due to the small number of historically recorded earthquakes, this becomes a low-data regime problem requiring algorithmic improvements to achieve peak performance when learning to regress earthquake magnitude. In this paper, we propose to pose the estimation of earthquake magnitudes as a metric-learning problem, training models to not only estimate earthquake magnitude from Sentinel-1 satellite imagery but to additionally rank pairwise samples. Our experiments show up to a 30% improvement in MAE over prior regression-only methods, particularly transformer-based architectures.
+
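+ A hedged sketch of the combined objective this suggests: keep the usual magnitude regression term and add a pairwise ranking (metric-learning) term over all pairs in a batch. The specific margin, weighting, and use of L1 regression are assumptions for illustration, not the paper's exact formulation.
```python
import torch
import torch.nn.functional as F

def magnitude_ranking_loss(pred, target, margin=0.0, alpha=1.0):
    """pred, target: (N,) predicted and true magnitudes for a batch of Sentinel-1 samples."""
    reg = F.l1_loss(pred, target)                       # standard magnitude regression (MAE)
    diff_true = target[:, None] - target[None, :]       # (N, N) true magnitude differences
    sign = torch.sign(diff_true)                        # desired pairwise ordering
    # hinge: penalise pairs whose predicted ordering disagrees with the true ordering
    hinge = F.relu(margin - sign * (pred[:, None] - pred[None, :]))
    mask = sign != 0                                    # ignore pairs with equal magnitudes
    rank = hinge[mask].mean() if mask.any() else pred.new_zeros(())
    return reg + alpha * rank
```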
+
+
+
+
+ + ☆ Self-supervised pre-training with diffusion model for few-shot landmark + detection in x-ray images + + +
+ In the last few years, deep neural networks have been extensively applied in +the medical domain for different tasks, ranging from image classification and +segmentation to landmark detection. However, the application of these +technologies in the medical domain is often hindered by data scarcity, both in +terms of available annotations and images. This study introduces a new +self-supervised pre-training protocol based on diffusion models for landmark +detection in x-ray images. Our results show that the proposed self-supervised +framework can provide accurate landmark detection with a minimal number of +available annotated training images (up to 50), outperforming ImageNet +supervised pre-training and state-of-the-art self-supervised pre-trainings for +three popular x-ray benchmark datasets. To our knowledge, this is the first +exploration of diffusion models for self-supervised learning in landmark +detection, which may offer a valuable pre-training approach in few-shot +regimes, for mitigating data scarcity. + +
+
+
+
+
+ + ☆ Efficient Inference of Vision Instruction-Following Models with Elastic + Cache ECCV 2024 + + +
+ In the field of instruction-following large vision-language models (LVLMs), the efficient deployment of these models faces challenges, notably due to the high memory demands of their key-value (KV) caches. Conventional cache management strategies for LLMs focus on cache eviction, which often fails to address the specific needs of multimodal instruction-following models. Recognizing this gap, in this paper, we introduce Elastic Cache, a novel approach that benefits from applying distinct acceleration methods for the instruction encoding and output generation stages. We investigate the metrics of importance in different stages and propose an importance-driven cache merging strategy to prune redundant caches. Instead of discarding less important caches, our strategy identifies important key/value vectors as anchor points. Surrounding less important caches are then merged with these anchors, enhancing the preservation of contextual information in the KV caches while yielding an arbitrary acceleration ratio. For instruction encoding, we utilize the frequency to evaluate the importance of caches. Regarding output generation, we prioritize tokens based on their distance with an offset, such that both the initial and most recent tokens are retained. Results on a range of LVLMs demonstrate that Elastic Cache not only boosts efficiency but also notably outperforms existing pruning methods in language generation across various tasks. Code is available at https://github.com/liuzuyan/ElasticCache
+
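+ To make the anchor-and-merge idea above concrete, here is a rough, single-head sketch of importance-driven KV-cache merging. It is my paraphrase of the abstract, not the released implementation: the importance scores, keep ratio, and key-similarity assignment of non-anchor entries are all assumptions.
```python
import torch
import torch.nn.functional as F

def merge_kv_cache(keys, values, importance, keep_ratio=0.25):
    """keys, values: (T, D) cached key/value vectors for one attention head.
    importance:   (T,) per-position importance scores (e.g., attention frequency).
    Keeps the top positions as anchors and averages every remaining position into
    its most similar anchor instead of evicting it outright."""
    T = keys.shape[0]
    k = max(1, int(T * keep_ratio))
    anchor_idx = importance.topk(k).indices
    anchors = set(anchor_idx.tolist())
    rest_idx = [i for i in range(T) if i not in anchors]
    new_k, new_v = keys[anchor_idx].clone(), values[anchor_idx].clone()
    counts = torch.ones(k, 1, dtype=keys.dtype)
    if rest_idx:
        # assign each non-anchor entry to the anchor with the most similar key
        sim = F.normalize(keys[rest_idx], dim=-1) @ F.normalize(new_k, dim=-1).t()
        nearest = sim.argmax(dim=-1)
        for j, a in zip(rest_idx, nearest.tolist()):
            new_k[a] += keys[j]
            new_v[a] += values[j]
            counts[a] += 1
    return new_k / counts, new_v / counts   # running averages of merged entries
```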
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Keypoint Promptable Re-Identification + + +
+ Occluded Person Re-Identification (ReID) is a metric learning task that involves matching occluded individuals based on their appearance. While many studies have tackled occlusions caused by objects, multi-person occlusions remain less explored. In this work, we identify and address a critical challenge overlooked by previous occluded ReID methods: the Multi-Person Ambiguity (MPA) arising when multiple individuals are visible in the same bounding box, making it impossible to determine the intended ReID target among the candidates. Inspired by recent work on prompting in vision, we introduce Keypoint Promptable ReID (KPR), a novel formulation of the ReID problem that explicitly complements the input bounding box with a set of semantic keypoints indicating the intended target. Since promptable re-identification is an unexplored paradigm, existing ReID datasets lack the pixel-level annotations necessary for prompting. To bridge this gap and foster further research on this topic, we introduce Occluded-PoseTrack ReID, a novel ReID dataset with keypoint labels that features strong inter-person occlusions. Furthermore, we release custom keypoint labels for four popular ReID benchmarks. Experiments on person retrieval, as well as on pose tracking, demonstrate that our method systematically surpasses previous state-of-the-art approaches in various occluded scenarios. Our code, dataset and annotations are available at https://github.com/VlSomers/keypoint_promptable_reidentification.
+
+
+
+
+
+ + ☆ Multi-Resolution Histopathology Patch Graphs for Ovarian Cancer + Subtyping MICCAI 2024 + + +
+ Computer vision models are increasingly capable of classifying ovarian +epithelial cancer subtypes, but they differ from pathologists by processing +small tissue patches at a single resolution. Multi-resolution graph models +leverage the spatial relationships of patches at multiple magnifications, +learning the context for each patch. In this study, we conduct the most +thorough validation of a graph model for ovarian cancer subtyping to date. +Seven models were tuned and trained using five-fold cross-validation on a set +of 1864 whole slide images (WSIs) from 434 patients treated at Leeds Teaching +Hospitals NHS Trust. The cross-validation models were ensembled and evaluated +using a balanced hold-out test set of 100 WSIs from 30 patients, and an +external validation set of 80 WSIs from 80 patients in the Transcanadian Study. +The best-performing model, a graph model using 10x+20x magnification data, gave +balanced accuracies of 73%, 88%, and 99% in cross-validation, hold-out testing, +and external validation, respectively. However, this only exceeded the +performance of attention-based multiple instance learning in external +validation, with a 93% balanced accuracy. Graph models benefitted greatly from +using the UNI foundation model rather than an ImageNet-pretrained ResNet50 for +feature extraction, with this having a much greater effect on performance than +changing the subsequent classification approach. The accuracy of the combined +foundation model and multi-resolution graph network offers a step towards the +clinical applicability of these models, with a new highest-reported performance +for this task, though further validations are still required to ensure the +robustness and usability of the models. + +
+
+ comment: Initially submitted version of a paper which has been accepted in the + GRAIL workshop at MICCAI 2024 +
+
+
+
+
+ + ☆ DINOv2 Rocks Geological Image Analysis: Classification, Segmentation, + and Interpretability + + +
+ This study investigates the interpretability, classification, and +segmentation of CT-scan images of rock samples, with a particular focus on the +application of DINOv2 within Geosciences. We compared various segmentation +techniques to evaluate their efficacy, efficiency, and adaptability in +geological image analysis. The methods assessed include the Otsu thresholding +method, clustering techniques (K-means and fuzzy C-means), a supervised machine +learning approach (Random Forest), and deep learning methods (UNet and DINOv2). +We tested these methods using ten binary sandstone datasets and three +multi-class calcite datasets. To begin, we provide a thorough interpretability +analysis of DINOv2's features in the geoscientific context, discussing its +suitability and inherent ability to process CT-scanned rock data. In terms of +classification, the out-of-the-box DINOv2 demonstrates an impressive capability +to perfectly classify rock images, even when the CT scans are out of its +original training set. Regarding segmentation, thresholding and unsupervised +methods, while fast, perform poorly despite image preprocessing, whereas +supervised methods show better results. We underscore the computational demands +of deep learning but highlight its minimal intervention, superior +generalization, and performance without additional image preprocessing. +Additionally, we observe a lack of correlation between a network's depth or the +number of parameters and its performance. Our results show that a LoRA +fine-tuned DINOv2 excels in out-of-distribution segmentation and significantly +outperforms other methods in multi-class segmentation. By systematically +comparing these methods, we identify the most efficient strategy for meticulous +and laborious segmentation tasks. DINOv2 proves advantageous, achieving +segmentations that could be described as "better than ground-truth" against +relatively small training sets. + +
+
+
+
+
+ + ☆ SSTD: Stripe-Like Space Target Detection using Single-Point Supervision + + +
+ Stripe-like space target detection (SSTD) plays a key role in enhancing space situational awareness and assessing spacecraft behaviour. This domain faces three challenges: the lack of publicly available datasets, interference from stray light and stars, and the variability of stripe-like targets, which complicates pixel-level annotation. In response, we introduce `AstroStripeSet', a pioneering dataset designed for SSTD, aiming to bridge the gap in academic resources and advance research in SSTD. Furthermore, we propose a novel pseudo-label evolution teacher-student framework with single-point supervision. This framework starts with generating initial pseudo-labels using the zero-shot capabilities of the Segment Anything Model (SAM) in a single-point setting, and refines these labels iteratively. In our framework, the fine-tuned StripeSAM serves as the teacher and the newly developed StripeNet as the student, consistently improving segmentation performance by improving the quality of pseudo-labels. We also introduce `GeoDice', a new loss function customized for the linear characteristics of stripe-like targets. Extensive experiments show that the performance of our approach matches fully supervised methods on all evaluation metrics, establishing a new state-of-the-art (SOTA) benchmark. Our dataset and code will be made publicly available.
+
+
+
+
+
+ + ☆ CSWin-UNet: Transformer UNet with Cross-Shaped Windows for Medical Image + Segmentation + + +
+ Deep learning, especially convolutional neural networks (CNNs) and Transformer architectures, has become the focus of extensive research in medical image segmentation, achieving impressive results. However, CNNs come with inductive biases that limit their effectiveness in more complex, varied segmentation scenarios. Conversely, while Transformer-based methods excel at capturing global and long-range semantic details, they suffer from high computational demands. In this study, we propose CSWin-UNet, a novel U-shaped segmentation method that incorporates the CSWin self-attention mechanism into the UNet to facilitate horizontal and vertical stripe self-attention. This method significantly enhances both computational efficiency and receptive field interactions. Additionally, our innovative decoder utilizes a content-aware reassembly operator that strategically reassembles features, guided by predicted kernels, for precise image resolution restoration. Our extensive empirical evaluations on diverse datasets, including synapse multi-organ CT, cardiac MRI, and skin lesions, demonstrate that CSWin-UNet maintains low model complexity while delivering high segmentation accuracy.
+
+
+
+
+
+ + ☆ HVM-1: Large-scale video models pretrained with nearly 5000 hours of + human-like video data + + +
+ We introduce Human-like Video Models (HVM-1), large-scale video models +pretrained with nearly 5000 hours of curated human-like video data (mostly +egocentric, temporally extended, continuous video recordings), using the +spatiotemporal masked autoencoder (ST-MAE) algorithm. We release two 633M +parameter models trained at spatial resolutions of 224x224 and 448x448 pixels. +We evaluate the performance of these models in downstream few-shot video and +image recognition tasks and compare them against a model pretrained with 1330 +hours of short action-oriented video clips from YouTube (Kinetics-700). HVM-1 +models perform competitively against the Kinetics-700 pretrained model in +downstream evaluations despite substantial qualitative differences between the +spatiotemporal characteristics of the corresponding pretraining datasets. HVM-1 +models also learn more accurate and more robust object representations compared +to models pretrained with the image-based MAE algorithm on the same data, +demonstrating the potential benefits of learning to predict temporal +regularities in natural videos for learning better object representations. + +
+
+ comment: 10 pages, 5 figures, 1 table; code & models available from + https://github.com/eminorhan/hvm-1 +
+
+
+
+
+ + ☆ LKCell: Efficient Cell Nuclei Instance Segmentation with Large + Convolution Kernels + + +
+ The segmentation of cell nuclei in tissue images stained with the blood dye +hematoxylin and eosin (H$\&$E) is essential for various clinical applications +and analyses. Due to the complex characteristics of cellular morphology, a +large receptive field is considered crucial for generating high-quality +segmentation. However, previous methods face challenges in achieving a balance +between the receptive field and computational burden. To address this issue, we +propose LKCell, a high-accuracy and efficient cell segmentation method. Its +core insight lies in unleashing the potential of large convolution kernels to +achieve computationally efficient large receptive fields. Specifically, (1) We +transfer pre-trained large convolution kernel models to the medical domain for +the first time, demonstrating their effectiveness in cell segmentation. (2) We +analyze the redundancy of previous methods and design a new segmentation +decoder based on large convolution kernels. It achieves higher performance +while significantly reducing the number of parameters. We evaluate our method +on the most challenging benchmark and achieve state-of-the-art results (0.5080 +mPQ) in cell nuclei instance segmentation with only 21.6% FLOPs compared with +the previous leading method. Our source code and models are available at +https://github.com/hustvl/LKCell. + +
+
+
+
+
+ + ☆ GaussianSR: High Fidelity 2D Gaussian Splatting for Arbitrary-Scale + Image Super-Resolution + + +
+ Implicit neural representations (INRs) have significantly advanced the field +of arbitrary-scale super-resolution (ASSR) of images. Most existing INR-based +ASSR networks first extract features from the given low-resolution image using +an encoder, and then render the super-resolved result via a multi-layer +perceptron decoder. Although these approaches have shown promising results, +their performance is constrained by the limited representation ability of +discrete latent codes in the encoded features. In this paper, we propose a +novel ASSR method named GaussianSR that overcomes this limitation through 2D +Gaussian Splatting (2DGS). Unlike traditional methods that treat pixels as +discrete points, GaussianSR represents each pixel as a continuous Gaussian +field. The encoded features are simultaneously refined and upsampled by +rendering the mutually stacked Gaussian fields. As a result, long-range +dependencies are established to enhance representation ability. In addition, a +classifier is developed to dynamically assign Gaussian kernels to all pixels to +further improve flexibility. All components of GaussianSR (i.e., encoder, +classifier, Gaussian kernels, and decoder) are jointly learned end-to-end. +Experiments demonstrate that GaussianSR achieves superior ASSR performance with +fewer parameters than existing methods while enjoying interpretable and +content-aware feature aggregations. + +
+
+ comment: 13 pages, 12 figures +
+
+
+
+
+ + ☆ YOCO: You Only Calibrate Once for Accurate Extrinsic Parameter in + LiDAR-Camera Systems + + +
+ In a multi-sensor fusion system composed of cameras and LiDAR, precise extrinsic calibration contributes to the system's long-term stability and accurate perception of the environment. However, methods based on extracting and registering corresponding points still face challenges in terms of automation and precision. This paper proposes a novel fully automatic extrinsic calibration method for LiDAR-camera systems that circumvents the need for corresponding point registration. In our approach, a novel algorithm to extract the required LiDAR correspondence points is proposed. This method can effectively filter out irrelevant points by computing the orientation of plane point clouds and extracting points by applying distance- and density-based thresholds. We avoid the need for corresponding point registration by introducing extrinsic parameters between the LiDAR and camera into the projection of extracted points and constructing co-planar constraints. These parameters are then optimized to solve for the extrinsic calibration. We validated our method across multiple sets of LiDAR-camera systems. In synthetic experiments, our method demonstrates superior performance compared to current calibration techniques. Real-world data experiments further confirm the precision and robustness of the proposed algorithm, with average rotation and translation calibration errors between LiDAR and camera of less than 0.05 degrees and 0.015 m, respectively. This method enables automatic and accurate extrinsic calibration in a single step, emphasizing the potential of calibration algorithms beyond using corresponding point registration to enhance the automation and precision of LiDAR-camera system calibration.
+
+
+ comment: IEEE TRANSACTIONS ON INSTRUMENTATION AND MEASUREMENT +
+
+
+
+
+ + ☆ TiCoSS: Tightening the Coupling between Semantic Segmentation and Stereo + Matching within A Joint Learning Framework + + +
+ Semantic segmentation and stereo matching, respectively analogous to the +ventral and dorsal streams in our human brain, are two key components of +autonomous driving perception systems. Addressing these two tasks with separate +networks is no longer the mainstream direction in developing computer vision +algorithms, particularly with the recent advances in large vision models and +embodied artificial intelligence. The trend is shifting towards combining them +within a joint learning framework, especially emphasizing feature sharing +between the two tasks. The major contributions of this study lie in +comprehensively tightening the coupling between semantic segmentation and +stereo matching. Specifically, this study introduces three novelties: (1) a +tightly coupled, gated feature fusion strategy, (2) a hierarchical deep +supervision strategy, and (3) a coupling tightening loss function. The combined +use of these technical contributions results in TiCoSS, a state-of-the-art +joint learning framework that simultaneously tackles semantic segmentation and +stereo matching. Through extensive experiments on the KITTI and vKITTI2 +datasets, along with qualitative and quantitative analyses, we validate the +effectiveness of our developed strategies and loss function, and demonstrate +its superior performance compared to prior arts, with a notable increase in +mIoU by over 9%. Our source code will be publicly available at +mias.group/TiCoSS upon publication. + +
+
+
+
+
+ + ☆ RestoreAgent: Autonomous Image Restoration Agent via Multimodal Large + Language Models + + +
+ Natural images captured by mobile devices often suffer from multiple types of degradation, such as noise, blur, and low light. Traditional image restoration methods require manual selection of specific tasks, algorithms, and execution sequences, which is time-consuming and may yield suboptimal results. All-in-one models, though capable of handling multiple tasks, typically support only a limited range and often produce overly smooth, low-fidelity outcomes due to their broad data distribution fitting. To address these challenges, we first define a new pipeline for restoring images with multiple degradations, and then introduce RestoreAgent, an intelligent image restoration system leveraging multimodal large language models. RestoreAgent autonomously assesses the type and extent of degradation in input images and performs restoration through (1) determining the appropriate restoration tasks, (2) optimizing the task sequence, (3) selecting the most suitable models, and (4) executing the restoration. Experimental results demonstrate the superior performance of RestoreAgent in handling complex degradation, surpassing human experts. Furthermore, the system's modular design facilitates the fast integration of new tasks and models, enhancing its flexibility and scalability for various applications.
+
+
+
+
+
+ + ☆ AttentionHand: Text-driven Controllable Hand Image Generation for 3D + Hand Reconstruction in the Wild ECCV 2024 + + +
+ Recently, there has been a significant amount of research conducted on 3D hand reconstruction for use in various forms of human-computer interaction. However, 3D hand reconstruction in the wild is challenging due to the extreme lack of in-the-wild 3D hand datasets. Especially, when hands are in complex poses such as interacting hands, problems like appearance similarity, self-handed occlusion and depth ambiguity make it more difficult. To overcome these issues, we propose AttentionHand, a novel method for text-driven controllable hand image generation. Since AttentionHand can generate various and numerous in-the-wild hand images well-aligned with 3D hand labels, we can acquire a new 3D hand dataset, and can relieve the domain gap between indoor and outdoor scenes. Our method needs four easy-to-use modalities (i.e., an RGB image, a hand mesh image from the 3D label, a bounding box, and a text prompt). These modalities are embedded into the latent space by the encoding phase. Then, through the text attention stage, hand-related tokens from the given text prompt are attended to highlight hand-related regions of the latent embedding. After the highlighted embedding is fed to the visual attention stage, hand-related regions in the embedding are attended by conditioning global and local hand mesh images with the diffusion-based pipeline. In the decoding phase, the final feature is decoded to new hand images, which are well-aligned with the given hand mesh image and text prompt. As a result, AttentionHand achieved state-of-the-art performance among text-to-hand image generation models, and the performance of 3D hand mesh reconstruction was improved by additionally training with hand images generated by AttentionHand.
+
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Segmentation-guided MRI reconstruction for meaningfully diverse + reconstructions MICCAI 2024 + + +
+ Inverse problems, such as accelerated MRI reconstruction, are ill-posed and +an infinite amount of possible and plausible solutions exist. This may not only +lead to uncertainty in the reconstructed image but also in downstream tasks +such as semantic segmentation. This uncertainty, however, is mostly not +analyzed in the literature, even though probabilistic reconstruction models are +commonly used. These models can be prone to ignore plausible but unlikely +solutions like rare pathologies. Building on MRI reconstruction approaches +based on diffusion models, we add guidance to the diffusion process during +inference, generating two meaningfully diverse reconstructions corresponding to +an upper and lower bound segmentation. The reconstruction uncertainty can then +be quantified by the difference between these bounds, which we coin the +'uncertainty boundary'. We analyzed the behavior of the upper and lower bound +segmentations for a wide range of acceleration factors and found the +uncertainty boundary to be both more reliable and more accurate compared to +repeated sampling. Code is available at https://github.com/NikolasMorshuis/SGR + +
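+ For intuition, the 'uncertainty boundary' described above can be summarised as the disagreement region between the upper- and lower-bound segmentations. The following small sketch is an assumed, illustrative quantification (the paper may summarise the boundary differently).
```python
import numpy as np

def uncertainty_boundary(upper_mask, lower_mask):
    """upper_mask, lower_mask: boolean segmentation masks from the two guided
    reconstructions. Returns the disagreement mask and its size relative to the
    union of the two segmentations (an assumed summary statistic)."""
    upper_mask = np.asarray(upper_mask, dtype=bool)
    lower_mask = np.asarray(lower_mask, dtype=bool)
    disagree = np.logical_xor(upper_mask, lower_mask)
    union = np.logical_or(upper_mask, lower_mask)
    frac = float(disagree.sum()) / max(int(union.sum()), 1)
    return disagree, frac
```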
+
+ comment: Accepted at DGM4MICCAI 2024 +
+
+
+
+
+ + ☆ Network Inversion of Convolutional Neural Nets + + +
+ Neural networks have emerged as powerful tools across various applications, yet their decision-making process often remains opaque, leading to them being perceived as "black boxes." This opacity raises concerns about their interpretability and reliability, especially in safety-critical scenarios. Network inversion techniques offer a solution by allowing us to peek inside these black boxes, revealing the features and patterns learned by the networks behind their decision-making processes, thereby providing valuable insights into how neural networks arrive at their conclusions and making them more interpretable and trustworthy. This paper presents a simple yet effective approach to network inversion using a carefully conditioned generator that learns the data distribution in the input space of the trained neural network, enabling the reconstruction of inputs that would most likely lead to the desired outputs. To capture the diversity in the input space for a given output, instead of simply revealing the conditioning labels to the generator, we encode the conditioning label information into vectors in a concealed manner, further reinforced by heavy dropout in the generation process and minimisation of cosine similarity between the features corresponding to the generated images. The paper concludes with immediate applications of Network Inversion, including interpretability, explainability, and the generation of adversarial samples.
+
+
+
+
+
+ + ☆ Investigation to answer three key questions concerning plant pest + identification and development of a practical identification framework + + +
+ The development of practical and robust automated diagnostic systems for identifying plant pests is crucial for efficient agricultural production. In this paper, we first investigate three key research questions (RQs) that have not been addressed thus far in the field of image-based plant pest identification. Based on the knowledge gained, we then develop an accurate, robust, and fast plant pest identification framework using 334K images comprising 78 combinations of four plant portions (the leaf front, leaf back, fruit, and flower of cucumber, tomato, strawberry, and eggplant) and 20 pest species captured at 27 farms. The results reveal the following. (1) For an appropriate evaluation of the model, the test data should not include images of the field from which the training images were collected, or other considerations to increase the diversity of the test set should be taken into account. (2) Pre-extraction of ROIs, such as leaves and fruits, helps to improve identification accuracy. (3) Integration of closely related species that use the same control methods, and cross-crop training for the same pests, are both effective. Our two-stage plant pest identification framework, enabling ROI detection and convolutional neural network (CNN)-based identification, achieved a highly practical performance of 91.0% and 88.5% in mean accuracy and macro F1 score, respectively, for 12,223 instances of test data of 21 classes collected from unseen fields, where 25 classes of images from 318,971 samples were used for training; the average identification time was 476 ms/image.
+
+
+ comment: 40 pages, 10 figures +
+
+
+
+
+ + ☆ Joint RGB-Spectral Decomposition Model Guided Image Enhancement in + Mobile Photography + + +
+ The integration of miniaturized spectrometers into mobile devices offers new +avenues for image quality enhancement and facilitates novel downstream tasks. +However, the broader application of spectral sensors in mobile photography is +hindered by the inherent complexity of spectral images and the constraints of +spectral imaging capabilities. To overcome these challenges, we propose a joint +RGB-Spectral decomposition model guided enhancement framework, which consists +of two steps: joint decomposition and prior-guided enhancement. Firstly, we +leverage the complementarity between RGB and Low-resolution Multi-Spectral +Images (Lr-MSI) to predict shading, reflectance, and material semantic priors. +Subsequently, these priors are seamlessly integrated into the established +HDRNet to promote dynamic range enhancement, color mapping, and grid expert +learning, respectively. Additionally, we construct a high-quality Mobile-Spec +dataset to support our research, and our experiments validate the effectiveness +of Lr-MSI in the tone enhancement task. This work aims to establish a solid +foundation for advancing spectral vision in mobile photography. The code is +available at \url{https://github.com/CalayZhou/JDM-HDRNet}. + +
+
+
+
+
+ + ☆ Lightweight Language-driven Grasp Detection using Conditional + Consistency Model IROS 2024 + + +
+ Language-driven grasp detection is a fundamental yet challenging task in +robotics with various industrial applications. In this work, we present a new +approach for language-driven grasp detection that leverages the concept of +lightweight diffusion models to achieve fast inference time. By integrating +diffusion processes with grasping prompts in natural language, our method can +effectively encode visual and textual information, enabling more accurate and +versatile grasp positioning that aligns well with the text query. To overcome +the long inference time problem in diffusion models, we leverage the image and +text features as the condition in the consistency model to reduce the number of +denoising timesteps during inference. The intensive experimental results show +that our method outperforms other recent grasp detection methods and +lightweight diffusion models by a clear margin. We further validate our method +in real-world robotic experiments to demonstrate its fast inference time +capability. + +
+
+ comment: Accepted at IROS 2024 +
+
+
+
+
+ + ☆ SaccadeDet: A Novel Dual-Stage Architecture for Rapid and Accurate + Detection in Gigapixel Images ECML-PKDD 2024 + + +
+ The advancement of deep learning in object detection has predominantly +focused on megapixel images, leaving a critical gap in the efficient processing +of gigapixel images. These super high-resolution images present unique +challenges due to their immense size and computational demands. To address +this, we introduce 'SaccadeDet', an innovative architecture for gigapixel-level +object detection, inspired by the human eye saccadic movement. The cornerstone +of SaccadeDet is its ability to strategically select and process image regions, +dramatically reducing computational load. This is achieved through a two-stage +process: the 'saccade' stage, which identifies regions of probable interest, +and the 'gaze' stage, which refines detection in these targeted areas. Our +approach, evaluated on the PANDA dataset, not only achieves an 8x speed +increase over the state-of-the-art methods but also demonstrates significant +potential in gigapixel-level pathology analysis through its application to +Whole Slide Imaging. + +
+
+ comment: This paper is accepted to ECML-PKDD 2024 +
+
+
+
+
+ + ☆ Scaling Training Data with Lossy Image Compression + + +
+ Empirically-determined scaling laws have been broadly successful in +predicting the evolution of large machine learning models with training data +and number of parameters. As a consequence, they have been useful for +optimizing the allocation of limited resources, most notably compute time. + In certain applications, storage space is an important constraint, and data +format needs to be chosen carefully as a consequence. Computer vision is a +prominent example: images are inherently analog, but are always stored in a +digital format using a finite number of bits. Given a dataset of digital +images, the number of bits $L$ to store each of them can be further reduced +using lossy data compression. This, however, can degrade the quality of the +model trained on such images, since each example has lower resolution. + In order to capture this trade-off and optimize storage of training data, we +propose a `storage scaling law' that describes the joint evolution of test +error with sample size and number of bits per image. We prove that this law +holds within a stylized model for image compression, and verify it empirically +on two computer vision tasks, extracting the relevant parameters. We then show +that this law can be used to optimize the lossy compression level. At given +storage, models trained on optimally compressed images present a significantly +smaller test error with respect to models trained on the original data. +Finally, we investigate the potential benefits of randomizing the compression +level. + +
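+ The abstract does not state the exact functional form of the storage scaling law, so the following is only an assumed, illustrative form and fitting sketch: test error modelled as additive power laws in the number of training images n and the bits per image L. The parameter names and the synthetic measurements are made up purely to show the fitting call, not taken from the paper.
```python
import numpy as np
from scipy.optimize import curve_fit

def storage_scaling(X, e_inf, a, alpha, b, beta):
    """Assumed illustrative form: err(n, L) = e_inf + a * n**(-alpha) + b * L**(-beta),
    with n training images stored at L bits each after lossy compression.
    The paper's actual law may couple n and L differently."""
    n, L = X
    return e_inf + a * n ** (-alpha) + b * L ** (-beta)

# Toy usage: synthetic measurements generated from the assumed law itself,
# only to demonstrate how one would fit the coefficients.
rng = np.random.default_rng(0)
n = np.repeat([1e4, 3e4, 1e5, 3e5, 1e6], 3).astype(float)
L = np.tile([1e3, 1e4, 1e5], 5).astype(float)
err = storage_scaling((n, L), 0.05, 5.0, 0.35, 3.0, 0.40) + rng.normal(0, 1e-3, n.size)
params, _ = curve_fit(storage_scaling, (n, L), err,
                      p0=[0.1, 1.0, 0.3, 1.0, 0.3], maxfev=50000)
```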
+
+ comment: 21 pages, 27 figures +
+
+
+
+
+ + ☆ BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular + Depth Estimation + + +
+ By training over large-scale datasets, zero-shot monocular depth estimation +(MDE) methods show robust performance in the wild but often suffer from +insufficiently precise details. Although recent diffusion-based MDE approaches +exhibit appealing detail extraction ability, they still struggle in +geometrically challenging scenes due to the difficulty of gaining robust +geometric priors from diverse datasets. To leverage the complementary merits of +both worlds, we propose BetterDepth to efficiently achieve geometrically +correct affine-invariant MDE performance while capturing fine-grained details. +Specifically, BetterDepth is a conditional diffusion-based refiner that takes +the prediction from pre-trained MDE models as depth conditioning, in which the +global depth context is well-captured, and iteratively refines details based on +the input image. For the training of such a refiner, we propose global +pre-alignment and local patch masking methods to ensure the faithfulness of +BetterDepth to depth conditioning while learning to capture fine-grained scene +details. By efficient training on small-scale synthetic datasets, BetterDepth +achieves state-of-the-art zero-shot MDE performance on diverse public datasets +and in-the-wild scenes. Moreover, BetterDepth can improve the performance of +other MDE models in a plug-and-play manner without additional re-training. + +
+
+
+
+
+ + ☆ Real Time American Sign Language Detection Using Yolo-v9 + + +
+ This paper focuses on real-time American Sign Language detection. YOLO is a convolutional neural network (CNN) based model, first released in 2015, which has gained popularity in recent years for its real-time detection capabilities. Our study specifically targets the YOLO-v9 model, released in 2024. As the model is newly introduced, little work has been done on it, especially in sign language detection. Our paper provides deep insight into how YOLO-v9 works and how it performs better than previous models.
+
+
+ comment: 11 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Analyzing Brain Tumor Connectomics using Graphs and Persistent Homology MICCAI + + +
+ Recent advances in molecular and genetic research have identified a diverse range of brain tumor sub-types, shedding light on differences in their molecular mechanisms, heterogeneity, and origins. The present study performs whole-brain connectome analysis using diffusion-weighted images. To achieve this, both graph theory and persistent homology, a prominent approach in topological data analysis, are employed in order to quantify changes in the structural connectivity of the whole-brain connectome in subjects with brain tumors. Probabilistic tractography is used to map the number of streamlines connecting 84 distinct brain regions, as delineated by the Desikan-Killiany atlas from FreeSurfer. These streamline mappings form the connectome matrix, on which persistent homology-based analysis and graph theoretical analysis are executed to evaluate the discriminatory power between tumor sub-types that include meningioma and glioma. A detailed statistical analysis is conducted on persistent homology-derived topological features and graphical features to identify the brain regions where differences between study groups are statistically significant (p < 0.05). For classification purposes, graph-based local features are utilized, achieving a highest accuracy of 88%. In classifying tumor sub-types, an accuracy of 80% is attained. The findings obtained from this study underscore the potential of persistent homology and graph theoretical analysis of the whole-brain connectome in detecting alterations in structural connectivity patterns specific to different types of brain tumors.
+
+
+ comment: 15 Pages, 7 Figures, 2 Tables, TGI3-MICCAI Workshop +
+
+
+
+
+ + ☆ Segmentation by registration-enabled SAM prompt engineering using five + reference images + + +
+ The recently proposed Segment Anything Model (SAM) is a general tool for +image segmentation, but it requires additional adaptation and careful +fine-tuning for medical image segmentation, especially for small, +irregularly-shaped, and boundary-ambiguous anatomical structures such as the +knee cartilage that is of interest in this work. Repaired cartilage, after +certain surgical procedures, exhibits imaging patterns unseen to pre-training, +posing further challenges for using models like SAM with or without +general-purpose fine-tuning. To address this, we propose a novel +registration-based prompt engineering framework for medical image segmentation +using SAM. This approach utilises established image registration algorithms to +align the new image (to-be-segmented) and a small number of reference images, +without requiring segmentation labels. The spatial transformations generated by +registration align either the new image or pre-defined point-based prompts, +before using them as input to SAM. This strategy, requiring as few as five +reference images with defined point prompts, effectively prompts SAM for +inference on new images, without needing any segmentation labels. Evaluation of +MR images from patients who received cartilage stem cell therapy yielded Dice +scores of 0.89, 0.87, 0.53, and 0.52 for segmenting femur, tibia, femoral- and +tibial cartilages, respectively. This outperforms atlas-based label fusion and +is comparable to supervised nnUNet, an upper-bound fair baseline in this +application, both of which require full segmentation labels for reference +samples. The codes are available at: +https://github.com/chrissyinreallife/KneeSegmentWithSAM.git + +
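+ A minimal sketch of the registration-enabled prompting idea described above, under stated assumptions: the registration is represented here as a 3x3 pixel-space transform from the reference image to the new image, and `predictor` is assumed to expose the segment-anything `SamPredictor` interface (`set_image` / `predict` with point prompts). This is an illustration of the workflow, not the authors' code.
```python
import numpy as np

def warp_point_prompts(ref_points, transform):
    """Apply a 3x3 homogeneous transform (reference -> new image, pixel coords)
    to reference point prompts of shape (N, 2) given as (x, y)."""
    pts_h = np.concatenate([ref_points, np.ones((len(ref_points), 1))], axis=1)
    warped = (transform @ pts_h.T).T
    return warped[:, :2] / warped[:, 2:3]

def segment_with_reference_prompts(predictor, new_image, ref_points, transform):
    """Carry point prompts from a labelled reference image across a registration
    transform and let SAM segment the new image; returns the highest-scoring mask."""
    pts = warp_point_prompts(ref_points, transform)
    predictor.set_image(new_image)
    masks, scores, _ = predictor.predict(
        point_coords=pts, point_labels=np.ones(len(pts), dtype=int))
    return masks[np.argmax(scores)]
```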
+
+ comment: Accepted to the 11th International Workshop on Biomedical Image + Registration (WBIR 2024) +
+
+
+
+
+ + ☆ Guided Latent Slot Diffusion for Object-Centric Learning + + +
+ Slot attention aims to decompose an input image into a set of meaningful +object files (slots). These latent object representations enable various +downstream tasks. Yet, these slots often bind to object parts, not objects +themselves, especially for real-world datasets. To address this, we introduce +Guided Latent Slot Diffusion - GLASS, an object-centric model that uses +generated captions as a guiding signal to better align slots with objects. Our +key insight is to learn the slot-attention module in the space of generated +images. This allows us to repurpose the pre-trained diffusion decoder model, +which reconstructs the images from the slots, as a semantic mask generator +based on the generated captions. GLASS learns an object-level representation +suitable for multiple tasks simultaneously, e.g., segmentation, image +generation, and property prediction, outperforming previous methods. For object +discovery, GLASS achieves approx. a +35% and +10% relative improvement for mIoU +over the previous state-of-the-art (SOTA) method on the VOC and COCO datasets, +respectively, and establishes a new SOTA FID score for conditional image +generation amongst slot-attention-based methods. For the segmentation task, +GLASS surpasses SOTA weakly-supervised and language-based segmentation models, +which were specifically designed for the task. + +
+
+ comment: Project Page: https://guided-sa.github.io +
+
+
+
+
+ + ☆ Invariance of deep image quality metrics to affine transformations + + +
+ Deep architectures are the current state-of-the-art in predicting subjective image quality. Usually, these models are evaluated according to their ability to correlate with human opinion in databases with a range of distortions that may appear in digital media. However, these overlook affine transformations, which may better represent the changes that actually happen to images in natural conditions. Humans can be particularly invariant to these natural transformations, as opposed to the digital ones. In this work, we evaluate state-of-the-art deep image quality metrics by assessing their invariance to affine transformations, specifically: rotation, translation, scaling, and changes in spectral illumination. We propose a methodology to assign invisibility thresholds for any perceptual metric. This methodology involves transforming the distance measured by an arbitrary metric to a common distance representation based on available subjectively rated databases. We psychophysically measure an absolute detection threshold in that common representation and express it in the physical units of each affine transform for each metric. By doing so, we allow the analyzed metrics to be directly comparable with actual human thresholds. We find that none of the state-of-the-art metrics shows human-like results under this strong test based on invisibility thresholds. This means that tuning the models exclusively to predict the visibility of generic distortions may disregard other properties of human vision, such as invariances or invisibility thresholds.
+
+
+ comment: 12 pages 13 figures +
+
+
+
+
+ + ☆ ReCorD: Reasoning and Correcting Diffusion for HOI Generation ACM MM 2024 + + +
+ Diffusion models revolutionize image generation by leveraging natural +language to guide the creation of multimedia content. Despite significant +advancements in such generative models, challenges persist in depicting +detailed human-object interactions, especially regarding pose and object +placement accuracy. We introduce a training-free method named Reasoning and +Correcting Diffusion (ReCorD) to address these challenges. Our model couples +Latent Diffusion Models with Visual Language Models to refine the generation +process, ensuring precise depictions of HOIs. We propose an interaction-aware +reasoning module to improve the interpretation of the interaction, along with +an interaction correcting module to refine the output image for more precise +HOI generation delicately. Through a meticulous process of pose selection and +object positioning, ReCorD achieves superior fidelity in generated images while +efficiently reducing computational requirements. We conduct comprehensive +experiments on three benchmarks to demonstrate the significant progress in +solving text-to-image generation tasks, showcasing ReCorD's ability to render +complex interactions accurately by outperforming existing methods in HOI +classification score, as well as FID and Verb CLIP-Score. Project website is +available at https://alberthkyhky.github.io/ReCorD/ . + +
+
+ comment: Accepted by ACM MM 2024. Project website: + https://alberthkyhky.github.io/ReCorD/ +
+
+
+
+
+ + ☆ Separating Novel Features for Logical Anomaly Detection: A + Straightforward yet Effective Approach + + +
+ Vision-based inspection algorithms have significantly contributed to quality control in industrial settings, particularly in addressing structural defects like dents and contamination, which are prevalent in mass production. Extensive research efforts have led to the development of related benchmarks such as MVTec AD (Bergmann et al., 2019). However, in industrial settings, there can be instances of logical defects, where acceptable items are found in unsuitable locations or product pairs do not match as expected. Recent methods tackling logical defects effectively employ knowledge distillation to generate difference maps. Knowledge distillation (KD) is used to learn the normal data distribution in an unsupervised manner. Despite their effectiveness, these methods often overlook the potential false negatives. Excessive similarity between the teacher network and student network can hinder the generation of a suitable difference map for logical anomaly detection. This technical report provides insights on handling potential false negatives by utilizing a simple constraint in KD-based logical anomaly detection methods. We select EfficientAD as a state-of-the-art baseline and apply a margin-based constraint to its unsupervised learning scheme. Applying this constraint, we can improve the AUROC for MVTec LOCO AD by 1.3%.
+
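+ One way to read the margin-based constraint above, sketched below as an assumption rather than the exact constraint used with EfficientAD: the student is pulled toward the teacher on normal data only until the per-location distance drops below a margin, so the two networks never become so similar that logical anomalies stop producing a difference signal.
```python
import torch
import torch.nn.functional as F

def margin_distillation_loss(student_feat, teacher_feat, margin=0.1):
    """student_feat, teacher_feat: (B, C, H, W) feature maps on normal training images.
    The hinge zeroes the gradient once the squared distance at a location is already
    below the margin, preventing excessive teacher-student similarity."""
    d = (student_feat - teacher_feat.detach()).pow(2).mean(dim=1)  # (B, H, W) distance map
    return F.relu(d - margin).mean()
```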
+
+
+
+
+ + ☆ Amortized Posterior Sampling with Diffusion Prior Distillation + + +
+ We propose a variational inference approach to sample from the posterior distribution for solving inverse problems. From a pre-trained diffusion model, our approach trains a conditional flow model to minimize the divergence between the proposal variational distribution and the posterior distribution implicitly defined through the diffusion model. Once trained, the flow model is capable of sampling from the posterior distribution with a single NFE, amortized with respect to the measurement. The proposed method paves a new path for distilling a diffusion prior for efficient posterior sampling. We show that our method is applicable to standard signals in Euclidean space, as well as signals on manifolds.
+
+
+
+
+
+ + ☆ Hierarchical Object Detection and Recognition Framework for Practical + Plant Disease Diagnosis + + +
+ Recently, object detection methods (OD; e.g., YOLO-based models) have been +widely utilized in plant disease diagnosis. These methods demonstrate +robustness to distance variations and excel at detecting small lesions compared +to classification methods (CL; e.g., CNN models). However, there are issues +such as low diagnostic performance for hard-to-detect diseases and high +labeling costs. Additionally, since healthy cases cannot be explicitly trained, +there is a risk of false positives. We propose the Hierarchical object +detection and recognition framework (HODRF), a sophisticated and highly +integrated two-stage system that combines the strengths of both OD and CL for +plant disease diagnosis. In the first stage, HODRF uses OD to identify regions +of interest (ROIs) without specifying the disease. In the second stage, CL +diagnoses diseases surrounding the ROIs. HODRF offers several advantages: (1) +Since OD detects only one type of ROI, HODRF can detect diseases with limited +training images by leveraging its ability to identify other lesions. (2) While +OD over-detects healthy cases, HODRF significantly reduces these errors by +using CL in the second stage. (3) CL's accuracy improves in HODRF as it +identifies diagnostic targets given as ROIs, making it less vulnerable to size +changes. (4) HODRF benefits from CL's lower annotation costs, allowing it to +learn from a larger number of images. We implemented HODRF using YOLOv7 for OD +and EfficientNetV2 for CL and evaluated its performance on a large-scale +dataset (4 crops, 20 diseased and healthy classes, 281K images). HODRF +outperformed YOLOv7 alone by 5.8 to 21.5 points on healthy data and 0.6 to 7.5 +points on macro F1 scores, and it improved macro F1 by 1.1 to 7.2 points over +EfficientNetV2. + +
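+ The two-stage idea described above can be summarised in a few lines. The sketch below is my paraphrase of the framework, not the authors' implementation: `roi_detector`, `classifier`, and the healthy-class handling are illustrative assumptions about the interfaces.
```python
import numpy as np

def hodrf_diagnose(image, roi_detector, classifier, class_names, healthy_idx):
    """roi_detector(image) -> list of (x1, y1, x2, y2) disease-agnostic ROI boxes.
    classifier(crop)    -> vector of class probabilities over class_names
                           (diseases plus a healthy class at index healthy_idx)."""
    results = []
    for (x1, y1, x2, y2) in roi_detector(image):
        crop = image[y1:y2, x1:x2]
        probs = classifier(crop)
        label = int(np.argmax(probs))
        if label != healthy_idx:   # second stage suppresses detector false positives
            results.append(((x1, y1, x2, y2), class_names[label], float(probs[label])))
    return results
```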
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ StreamMOS: Streaming Moving Object Segmentation with Multi-View + Perception and Dual-Span Memory + + +
+ Moving object segmentation based on LiDAR is a crucial and challenging task for autonomous driving and mobile robotics. Most approaches explore spatio-temporal information from LiDAR sequences to predict moving objects in the current frame. However, they often focus on transferring temporal cues in a single inference and regard every prediction as independent of others. This may cause inconsistent segmentation results for the same object in different frames. To overcome this issue, we propose a streaming network with a memory mechanism, called StreamMOS, to build the association of features and predictions among multiple inferences. Specifically, we utilize a short-term memory to convey historical features, which can be regarded as a spatial prior of moving objects and adopted to enhance current inference by temporal fusion. Meanwhile, we build a long-term memory to store previous predictions and exploit them to refine the present forecast at voxel and instance levels through voting. Besides, we present a multi-view encoder with cascade projection and asymmetric convolution to extract motion features of objects in different representations. Extensive experiments validate that our algorithm achieves competitive performance on the SemanticKITTI and Sipailou Campus datasets. Code will be released at https://github.com/NEU-REAL/StreamMOS.git.
+
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Exploring the Effect of Dataset Diversity in Self-Supervised Learning + for Surgical Computer Vision MICCAI2024 + + +
+ Over the past decade, computer vision applications in minimally invasive +surgery have rapidly increased. Despite this growth, the impact of surgical +computer vision remains limited compared to other medical fields like pathology +and radiology, primarily due to the scarcity of representative annotated data. +Whereas transfer learning from large annotated datasets such as ImageNet has +been conventionally the norm to achieve high-performing models, recent +advancements in self-supervised learning (SSL) have demonstrated superior +performance. In medical image analysis, in-domain SSL pretraining has already +been shown to outperform ImageNet-based initialization. Although unlabeled data +in the field of surgical computer vision is abundant, the diversity within this +data is limited. This study investigates the role of dataset diversity in SSL +for surgical computer vision, comparing procedure-specific datasets against a +more heterogeneous general surgical dataset across three different downstream +surgical applications. The obtained results show that using solely +procedure-specific data can lead to substantial improvements of 13.8%, 9.5%, +and 36.8% compared to ImageNet pretraining. However, extending this data with +more heterogeneous surgical data further increases performance by an additional +5.0%, 5.2%, and 2.5%, suggesting that increasing diversity within SSL data is +beneficial for model performance. The code and pretrained model weights are +made publicly available at https://github.com/TimJaspers0801/SurgeNet. + +
+
+ comment: accepted - Data Engineering in Medical Imaging (DEMI) Workshop @ + MICCAI2024 +
+
+
+
+
+ + ☆ Advancing 3D Point Cloud Understanding through Deep Transfer Learning: A + Comprehensive Survey + + +
+ The 3D point cloud (3DPC) has significantly evolved and benefited from the
+advance of deep learning (DL). However, the latter faces various issues,
+including the lack of data or annotated data, the existence of a significant
+gap between training data and test data, and the requirement for high
+computational resources. To that end, deep transfer learning (DTL), which
+decreases dependency and costs by utilizing knowledge gained from a source
+data/task in training a target data/task, has been widely investigated.
+Numerous DTL frameworks have been suggested for aligning point clouds obtained
+from several scans of the same scene. Additionally, domain adaptation (DA),
+which is a subset of DTL, has been modified to enhance the point cloud data's
+quality by dealing with noise and missing points. Ultimately, fine-tuning and
+DA approaches have demonstrated their effectiveness in addressing the distinct
+difficulties inherent in point cloud data. This paper presents the first review
+shedding light on this aspect. It provides a comprehensive overview of the
+latest techniques for understanding 3DPC using DTL and DA. Accordingly, DTL's
+background is first presented along with the datasets and evaluation metrics. A
+well-defined taxonomy is introduced, and detailed comparisons are presented,
+considering different aspects such as knowledge transfer strategies and
+performance. The paper covers various applications, such as 3DPC object
+detection, semantic labeling, segmentation, classification, registration,
+downsampling/upsampling, and denoising. Furthermore, the article discusses the
+advantages and limitations of the presented frameworks, identifies open
+challenges, and suggests potential research directions.
+
+
+
+ comment: 55 pages, 9 tables, and 15 figures +
+
+
+
+
+ + ☆ Mew: Multiplexed Immunofluorescence Image Analysis through an Efficient + Multiplex Network ECCV 2024 + + +
+ Recent advancements in graph-based approaches for multiplexed +immunofluorescence (mIF) images have significantly propelled the field forward, +offering deeper insights into patient-level phenotyping. However, current +graph-based methodologies encounter two primary challenges: (1) Cellular +Heterogeneity, where existing approaches fail to adequately address the +inductive biases inherent in graphs, particularly the homophily characteristic +observed in cellular connectivity and; (2) Scalability, where handling cellular +graphs from high-dimensional images faces difficulties in managing a high +number of cells. To overcome these limitations, we introduce Mew, a novel +framework designed to efficiently process mIF images through the lens of +multiplex network. Mew innovatively constructs a multiplex network comprising +two distinct layers: a Voronoi network for geometric information and a +Cell-type network for capturing cell-wise homogeneity. This framework equips a +scalable and efficient Graph Neural Network (GNN), capable of processing the +entire graph during training. Furthermore, Mew integrates an interpretable +attention module that autonomously identifies relevant layers for image +classification. Extensive experiments on a real-world patient dataset from +various institutions highlight Mew's remarkable efficacy and efficiency, +marking a significant advancement in mIF image analysis. The source code of Mew +can be found here: \url{https://github.com/UNITES-Lab/Mew} + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ FlexiEdit: Frequency-Aware Latent Refinement for Enhanced Non-Rigid + Editing ECCV 2024 + + +
+ Current image editing methods primarily utilize DDIM Inversion, employing a
+two-branch diffusion approach to preserve the attributes and layout of the
+original image. However, these methods encounter challenges with non-rigid
+edits, which involve altering the image's layout or structure. Our
+comprehensive analysis reveals that the high-frequency components of the DDIM
+latent, crucial for retaining the original image's key features and layout,
+significantly contribute to these limitations. Addressing this, we introduce
+FlexiEdit, which enhances fidelity to input text prompts by refining the DDIM
+latent, reducing high-frequency components in targeted editing areas.
+FlexiEdit comprises two key components: (1) Latent Refinement, which modifies
+the DDIM latent to better accommodate layout adjustments, and (2) Edit Fidelity
+Enhancement via Re-inversion, aimed at ensuring the edits more accurately
+reflect the input text prompts. Our approach represents notable progress in
+image editing, particularly in performing complex non-rigid edits, showcasing
+its enhanced capability through comparative experiments.
+
+
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Move and Act: Enhanced Object Manipulation and Background Integrity for + Image Editing + + +
+ Current methods commonly utilize three-branch structures of inversion, +reconstruction, and editing, to tackle consistent image editing task. However, +these methods lack control over the generation position of the edited object +and have issues with background preservation. To overcome these limitations, we +propose a tuning-free method with only two branches: inversion and editing. +This approach allows users to simultaneously edit the object's action and +control the generation position of the edited object. Additionally, it achieves +improved background preservation. Specifically, we transfer the edited object +information to the target area and repair or preserve the background of other +areas during the inversion process at a specific time step. In the editing +stage, we use the image features in self-attention to query the key and value +of the corresponding time step in the inversion to achieve consistent image +editing. Impressive image editing results and quantitative evaluation +demonstrate the effectiveness of our method. The code is available at +https://github.com/mobiushy/move-act. + +
+
+
+
+
+ + ☆ DragText: Rethinking Text Embedding in Point-based Image Editing + + +
+ Point-based image editing enables accurate and flexible control through +content dragging. However, the role of text embedding in the editing process +has not been thoroughly investigated. A significant aspect that remains +unexplored is the interaction between text and image embeddings. In this study, +we show that during the progressive editing of an input image in a diffusion +model, the text embedding remains constant. As the image embedding increasingly +diverges from its initial state, the discrepancy between the image and text +embeddings presents a significant challenge. Moreover, we found that the text +prompt significantly influences the dragging process, particularly in +maintaining content integrity and achieving the desired manipulation. To +utilize these insights, we propose DragText, which optimizes text embedding in +conjunction with the dragging process to pair with the modified image +embedding. Simultaneously, we regularize the text optimization process to +preserve the integrity of the original text prompt. Our approach can be +seamlessly integrated with existing diffusion-based drag methods with only a +few lines of code. + +
+
+ comment: 22 pages, 18 figures +
+
+
+
+
+ + ☆ UMono: Physical Model Informed Hybrid CNN-Transformer Framework for + Underwater Monocular Depth Estimation + + +
+ Underwater monocular depth estimation serves as the foundation for tasks such
+as 3D reconstruction of underwater scenes. However, due to the influence of
+light and the medium, the underwater environment undergoes a distinctive imaging
+process, which presents challenges in accurately estimating depth from a single
+image. The existing methods fail to consider the unique characteristics of
+underwater environments, leading to inadequate estimation results and limited
+generalization performance. Furthermore, underwater depth estimation requires
+extracting and fusing both local and global features, which is not fully
+explored in existing methods. In this paper, an end-to-end learning framework
+for underwater monocular depth estimation called UMono is presented, which
+incorporates the characteristics of the underwater image formation model into
+the network architecture and effectively utilizes both local and global
+features of the underwater image. Experimental results demonstrate that the
+proposed method is effective for underwater monocular depth estimation and
+outperforms the existing methods in both quantitative and qualitative analyses.
+
+
+
+
+
+
+ + ☆ Towards the Spectral bias Alleviation by Normalizations in Coordinate + Networks + + +
+ Representing signals using coordinate networks has recently come to dominate
+the area of inverse problems and is widely applied in various scientific
+computing tasks. Still, there exists an issue of spectral bias in coordinate
+networks, limiting the capacity to learn high-frequency components. This
+problem is caused by the pathological distribution of the neural tangent
+kernel's (NTK's) eigenvalues of coordinate networks. We find that this
+pathological distribution could be improved using classical normalization
+techniques (batch normalization and layer normalization), which are commonly
+used in convolutional neural networks but rarely used in coordinate networks.
+We prove that normalization techniques greatly reduce the maximum and variance
+of the NTK's eigenvalues while only slightly modifying the mean value. Since
+the maximum eigenvalue is much larger than most others, this variance change
+shifts the eigenvalue distribution from a lower one to a higher one, so the
+spectral bias can be alleviated. Furthermore, we propose two new normalization
+techniques by combining these two techniques in different ways. The efficacy of
+these normalization techniques is substantiated by the significant improvements
+and new state-of-the-art results achieved by applying normalization-based
+coordinate networks to various tasks, including image compression, computed
+tomography reconstruction, shape representation, magnetic resonance imaging,
+novel view synthesis and multi-view stereo reconstruction.
+
+
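As a concrete illustration of the idea, the sketch below inserts LayerNorm after each hidden layer of a plain coordinate MLP. The width, depth, and exact placement of the normalization are assumptions here; the paper's specific batch/layer-normalization variants and their proposed combinations are not reproduced.

```python
import torch
import torch.nn as nn

class NormalizedCoordinateNet(nn.Module):
    """Coordinate MLP with LayerNorm after each hidden layer, sketching the idea
    that classical normalization reshapes the NTK eigenvalue spectrum."""
    def __init__(self, in_dim: int = 2, hidden: int = 256, out_dim: int = 1, depth: int = 4):
        super().__init__()
        layers, dim = [], in_dim
        for _ in range(depth):
            layers += [nn.Linear(dim, hidden), nn.LayerNorm(hidden), nn.ReLU()]
            dim = hidden
        layers.append(nn.Linear(dim, out_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, coords: torch.Tensor) -> torch.Tensor:
        # coords: (N, in_dim) query coordinates, typically scaled to [-1, 1]
        return self.net(coords)
```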
+
+
+
+
+ + ☆ Image Segmentation via Divisive Normalization: dealing with + environmental diversity + + +
+ Autonomous driving is a challenging scenario for image segmentation due to +the presence of uncontrolled environmental conditions and the eventually +catastrophic consequences of failures. Previous work suggested that a +biologically motivated computation, the so-called Divisive Normalization, could +be useful to deal with image variability, but its effects have not been +systematically studied over different data sources and environmental factors. +Here we put segmentation U-nets augmented with Divisive Normalization to work +far from training conditions to find where this adaptation is more critical. We +categorize the scenes according to their radiance level and dynamic range +(day/night), and according to their achromatic/chromatic contrasts. We also +consider video game (synthetic) images to broaden the range of environments. We +check the performance in the extreme percentiles of such categorization. Then, +we push the limits further by artificially modifying the images in +perceptually/environmentally relevant dimensions: luminance, contrasts and +spectral radiance. Results show that neural networks with Divisive +Normalization get better results in all the scenarios and their performance +remains more stable with regard to the considered environmental factors and +nature of the source. Finally, we explain the improvements in segmentation +performance in two ways: (1) by quantifying the invariance of the responses +that incorporate Divisive Normalization, and (2) by illustrating the adaptive +nonlinearity of the different layers that depends on the local activity. + +
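For readers unfamiliar with the operation, the following is a minimal sketch of a canonical divisive-normalization layer, in which each activation is divided by pooled activity from its spatial neighborhood. The pooling kernel, its initialization, and the semisaturation constant are illustrative assumptions and need not match the parameterization used to augment the U-nets above.

```python
import torch
import torch.nn as nn

class DivisiveNormalization2d(nn.Module):
    """Canonical divisive normalization: each activation is divided by a pooled
    measure of local activity (textbook form; exact parameterization may differ)."""
    def __init__(self, channels: int, kernel_size: int = 5, sigma: float = 0.1):
        super().__init__()
        self.sigma = sigma
        self.pool = nn.Conv2d(channels, channels, kernel_size,
                              padding=kernel_size // 2, bias=False)
        # Start from a uniform pooling kernel over channels and space.
        nn.init.constant_(self.pool.weight, 1.0 / (channels * kernel_size ** 2))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        denom = self.sigma + self.pool(x.abs())
        return x / denom
```

The division by locally pooled activity is what produces the adaptive, activity-dependent nonlinearity and response invariance the abstract credits for the robustness gains.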
+
+
+
+
+ + ☆ Unified Lexical Representation for Interpretable Visual-Language + Alignment + + +
+ Visual-Language Alignment (VLA) has gained a lot of attention since CLIP's
+groundbreaking work. Although CLIP performs well, the typical direct latent
+feature alignment lacks clarity in its representation and similarity scores. On
+the other hand, lexical representation, a vector whose elements represent the
+similarity between the sample and a word from the vocabulary, is naturally
+sparse and interpretable, providing exact matches for individual words.
+However, lexical representations are difficult to learn due to the lack of
+ground-truth supervision and false-discovery issues, and thus require complex
+designs to train effectively. In this paper, we introduce LexVLA, a more
+interpretable VLA framework by learning a unified lexical representation for
+both modalities without complex design. We use DINOv2 as our visual model for
+its local-inclined features and Llama 2, a generative language model, to
+leverage its in-context lexical prediction ability. To avoid false discovery,
+we propose an overuse penalty to prevent the lexical representation from
+falsely and frequently activating meaningless words. We demonstrate that these
+two pre-trained uni-modal models can be well-aligned by fine-tuning on a modest
+multi-modal dataset, avoiding intricate training configurations. On cross-modal
+retrieval benchmarks, LexVLA, trained on the CC-12M multi-modal dataset,
+outperforms baselines fine-tuned on larger datasets (e.g., YFCC15M) and those
+trained from scratch on even bigger datasets (e.g., 1.1B data, including
+CC-12M). We conduct extensive experiments to analyze LexVLA.
+
+
+
+
+
+
+ + ☆ Enhancing Model Performance: Another Approach to Vision-Language + Instruction Tuning + + +
+ The integration of large language models (LLMs) with vision-language (VL) +tasks has been a transformative development in the realm of artificial +intelligence, highlighting the potential of LLMs as a versatile general-purpose +chatbot. However, the current trend in this evolution focuses on the +integration of vision and language to create models that can operate in more +diverse and real-world contexts. We present a novel approach, termed Bottleneck +Adapter, specifically crafted for enhancing the multimodal functionalities of +these complex models, enabling joint optimization of the entire multimodal LLM +framework through a process known as Multimodal Model Tuning (MMT). Our +approach utilizes lightweight adapters to connect the image encoder and LLM +without the need for large, complex neural networks. Unlike the conventional +modular training schemes, our approach adopts an end-to-end optimization +regime, which, when combined with the adapters, facilitates the joint +optimization using a significantly smaller parameter set. Our method exhibits +robust performance with 90.12\% accuracy, outperforming both human-level +performance (88.4\%) and LaVIN-7B (89.41\%). + +
+
+
+
+
+ + ☆ A Unified Understanding of Adversarial Vulnerability Regarding Unimodal + Models and Vision-Language Pre-training Models + + +
+ With Vision-Language Pre-training (VLP) models demonstrating powerful +multimodal interaction capabilities, the application scenarios of neural +networks are no longer confined to unimodal domains but have expanded to more +complex multimodal V+L downstream tasks. The security vulnerabilities of +unimodal models have been extensively examined, whereas those of VLP models +remain challenging. We note that in CV models, the understanding of images +comes from annotated information, while VLP models are designed to learn image +representations directly from raw text. Motivated by this discrepancy, we +developed the Feature Guidance Attack (FGA), a novel method that uses text +representations to direct the perturbation of clean images, resulting in the +generation of adversarial images. FGA is orthogonal to many advanced attack +strategies in the unimodal domain, facilitating the direct application of rich +research findings from the unimodal to the multimodal scenario. By +appropriately introducing text attack into FGA, we construct Feature Guidance +with Text Attack (FGA-T). Through the interaction of attacking two modalities, +FGA-T achieves superior attack effects against VLP models. Moreover, +incorporating data augmentation and momentum mechanisms significantly improves +the black-box transferability of FGA-T. Our method demonstrates stable and +effective attack capabilities across various datasets, downstream tasks, and +both black-box and white-box settings, offering a unified baseline for +exploring the robustness of VLP models. + +
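The abstract does not spell out the optimization loop, but "using text representations to direct the perturbation of clean images" can be sketched as a PGD-style attack that pushes the image encoder's features away from the paired text features. The loss direction, step size, budget, and cosine-similarity objective below are assumptions for illustration, not FGA's exact guidance or hyperparameters.

```python
import torch
import torch.nn.functional as F

def feature_guidance_sketch(image, image_encoder, text_feat,
                            eps=8 / 255, alpha=2 / 255, steps=10):
    """L_inf-bounded perturbation that minimizes cosine similarity between the
    image features and the guiding text features (a hedged stand-in for FGA)."""
    adv = image.clone().detach()
    for _ in range(steps):
        adv.requires_grad_(True)
        sim = F.cosine_similarity(image_encoder(adv), text_feat, dim=-1).mean()
        grad = torch.autograd.grad(sim, adv)[0]
        adv = adv.detach() - alpha * grad.sign()          # step against similarity
        adv = image + (adv - image).clamp(-eps, eps)      # project to the budget
        adv = adv.clamp(0.0, 1.0)
    return adv.detach()
```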
+
+ comment: 14 pages, 9 figures, published in ACMMM2024(oral) +
+
+
+
+
+ + ☆ Harnessing Temporal Causality for Advanced Temporal Action Detection + + +
+ As a fundamental task in long-form video understanding, temporal action +detection (TAD) aims to capture inherent temporal relations in untrimmed videos +and identify candidate actions with precise boundaries. Over the years, various +networks, including convolutions, graphs, and transformers, have been explored +for effective temporal modeling for TAD. However, these modules typically treat +past and future information equally, overlooking the crucial fact that changes +in action boundaries are essentially causal events. Inspired by this insight, +we propose leveraging the temporal causality of actions to enhance TAD +representation by restricting the model's access to only past or future +context. We introduce CausalTAD, which combines causal attention and causal +Mamba to achieve state-of-the-art performance on multiple benchmarks. Notably, +with CausalTAD, we ranked 1st in the Action Recognition, Action Detection, and +Audio-Based Interaction Detection tracks at the EPIC-Kitchens Challenge 2024, +as well as 1st in the Moment Queries track at the Ego4D Challenge 2024. Our +code is available at https://github.com/sming256/OpenTAD/causaltad. + +
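The core restriction the abstract appeals to, letting each time step attend only to past (or only to future) context, can be expressed with a standard causal mask, as in the minimal sketch below. This is the generic mechanism rather than the authors' exact CausalTAD block, which also combines a causal Mamba branch; the future-only variant simply uses the transposed mask.

```python
import torch

def causal_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """Scaled dot-product attention where each time step attends only to the
    current and past steps, so future context cannot influence the output."""
    T = q.shape[-2]
    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)
    future = torch.triu(torch.ones(T, T, dtype=torch.bool, device=q.device), diagonal=1)
    scores = scores.masked_fill(future, float("-inf"))
    return scores.softmax(dim=-1) @ v
```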
+
+ comment: 1st in Moment Queries track at the Ego4D Challenge 2024; 1st in + Action Recognition, Action Detection, and Audio-Based Interaction Detection + tracks at the EPIC-Kitchens Challenge 2024 +
+
+
+
+
+ + ☆ Investigating learning-independent abstract reasoning in artificial + neural networks + + +
+ Humans are capable of solving complex abstract reasoning tests. Whether this
+ability reflects a learning-independent inference mechanism applicable to any
+novel unlearned problem or whether it is a manifestation of extensive training
+throughout life is an open question. Addressing this question in humans is
+challenging because it is impossible to control their prior training. However,
+assuming a similarity between the cognitive processing of Artificial Neural
+Networks (ANNs) and humans, the extent to which training is required for ANNs'
+abstract reasoning is informative about this question in humans. Previous
+studies demonstrated that ANNs can solve abstract reasoning tests. However,
+this success required extensive training. In this study, we examined the
+learning-independent abstract reasoning of ANNs. Specifically, we evaluated
+their performance without any pretraining, with the ANNs' weights being
+randomly initialized and changing only during problem solving. We found that
+naive ANN models can solve non-trivial visual reasoning tests, similar to those
+used to evaluate human learning-independent reasoning. We further studied the
+mechanisms that support this ability. Our results suggest the possibility of
+learning-independent abstract reasoning that does not require extensive
+training.
+
+
+
+
+
+
+ + ☆ Topology-Preserving Downsampling of Binary Images ECCV + + +
+ We present a novel discrete optimization-based approach to generate +downsampled versions of binary images that are guaranteed to have the same +topology as the original, measured by the zeroth and first Betti numbers of the +black regions, while having good similarity to the original image as measured +by IoU and Dice scores. To our best knowledge, all existing binary image +downsampling methods do not have such topology-preserving guarantees. We also +implemented a baseline morphological operation (dilation)-based approach that +always generates topologically correct results. However, we found the +similarity scores to be much worse. We demonstrate several applications of our +approach. First, generating smaller versions of medical image segmentation +masks for easier human inspection. Second, improving the efficiency of binary +image operations, including persistent homology computation and shortest path +computation, by substituting the original images with smaller ones. In +particular, the latter is a novel application that is made feasible only by the +full topology-preservation guarantee of our method. + +
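A small utility like the one below can check the guarantee the method targets, by comparing the zeroth and first Betti numbers of the black regions before and after downsampling. The foreground/background connectivity convention (8-connected foreground, 4-connected background) is a common digital-topology choice assumed here and may differ from the paper's.

```python
import numpy as np
from scipy import ndimage

def betti_numbers_2d(binary_img: np.ndarray) -> tuple[int, int]:
    """b0 = number of connected foreground components; b1 = number of holes,
    estimated as background components minus the single outer background."""
    fg = binary_img.astype(bool)
    b0 = ndimage.label(fg, structure=np.ones((3, 3)))[1]   # 8-connected foreground
    padded_bg = np.pad(~fg, 1, constant_values=True)       # guarantee one outer background
    b1 = ndimage.label(padded_bg)[1] - 1                   # 4-connected background
    return b0, b1
```

Topology preservation then amounts to asserting that `betti_numbers_2d(original) == betti_numbers_2d(downsampled)`, while IoU or Dice measures how similar the two images remain.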
+
+ comment: Accepted to The 18th European Conference on Computer Vision (ECCV) + 2024 +
+
+
+
+
+ + ☆ How Lightweight Can A Vision Transformer Be + + +
+ In this paper, we explore a strategy that uses Mixture-of-Experts (MoE) to +streamline, rather than augment, vision transformers. Each expert in an MoE +layer is a SwiGLU feedforward network, where V and W2 are shared across the +layer. No complex attention or convolutional mechanisms are employed. +Depth-wise scaling is applied to progressively reduce the size of the hidden +layer and the number of experts is increased in stages. Grouped query attention +is used. We studied the proposed approach with and without pre-training on +small datasets and investigated whether transfer learning works at this scale. +We found that the architecture is competitive even at a size of 0.67M +parameters. + +
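To make the expert design concrete, here is a minimal sketch of an MoE feed-forward layer in which each expert is a SwiGLU network and the V and W2 projections are shared across the layer, so only the gating projection is per-expert. The top-1 routing, the absence of biases, and the layer sizes are assumptions; depth-wise scaling and grouped query attention from the abstract are not shown.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SharedSwiGLUMoE(nn.Module):
    """MoE FFN with per-expert W1 and a V / W2 pair shared by all experts."""
    def __init__(self, dim: int, hidden: int, num_experts: int):
        super().__init__()
        self.router = nn.Linear(dim, num_experts)
        self.w1 = nn.ModuleList([nn.Linear(dim, hidden, bias=False)
                                 for _ in range(num_experts)])
        self.v = nn.Linear(dim, hidden, bias=False)    # shared across experts
        self.w2 = nn.Linear(hidden, dim, bias=False)   # shared across experts

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (tokens, dim); top-1 routing for simplicity.
        gate = self.router(x).softmax(-1)              # (tokens, num_experts)
        expert = gate.argmax(-1)
        out = torch.zeros_like(x)
        vx = self.v(x)
        for e, w1 in enumerate(self.w1):
            idx = expert == e
            if idx.any():
                h = F.silu(w1(x[idx])) * vx[idx]       # SwiGLU with shared V
                out[idx] = self.w2(h) * gate[idx, e].unsqueeze(-1)
        return out
```

Sharing V and W2 keeps the per-expert parameter cost to a single linear map, which is one way such a mixture can streamline rather than enlarge the model.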
+
+
+
+
+ + ☆ HF-Fed: Hierarchical based customized Federated Learning Framework for + X-Ray Imaging + + +
+ In clinical applications, X-ray technology is vital for noninvasive +examinations like mammography, providing essential anatomical information. +However, the radiation risk associated with X-ray procedures raises concerns. +X-ray reconstruction is crucial in medical imaging for detailed visual +representations of internal structures, aiding diagnosis and treatment without +invasive procedures. Recent advancements in deep learning (DL) have shown +promise in X-ray reconstruction, but conventional DL methods often require +centralized aggregation of large datasets, leading to domain shifts and privacy +issues. To address these challenges, we introduce the Hierarchical +Framework-based Federated Learning method (HF-Fed) for customized X-ray +imaging. HF-Fed tackles X-ray imaging optimization by decomposing the problem +into local data adaptation and holistic X-ray imaging. It employs a +hospital-specific hierarchical framework and a shared common imaging network +called Network of Networks (NoN) to acquire stable features from diverse data +distributions. The hierarchical hypernetwork extracts domain-specific +hyperparameters, conditioning the NoN for customized X-ray reconstruction. +Experimental results demonstrate HF-Fed's competitive performance, offering a +promising solution for enhancing X-ray imaging without data sharing. This study +significantly contributes to the literature on federated learning in +healthcare, providing valuable insights for policymakers and healthcare +providers. The source code and pre-trained HF-Fed model are available at +\url{https://tisharepo.github.io/Webpage/}. + +
+
+
+
+
+ + ☆ DAC: 2D-3D Retrieval with Noisy Labels via Divide-and-Conquer Alignment + and Correction ACM MM 2024 + + +
+ With the recent burst of 2D and 3D data, cross-modal retrieval has attracted
+increasing attention. However, manual labeling by non-experts will inevitably
+introduce corrupted annotations given ambiguous 2D/3D content. Though previous
+works have addressed this issue by designing a naive division strategy with
+hand-crafted thresholds, their performance generally exhibits great sensitivity
+to the threshold value. Besides, they fail to fully utilize the valuable
+supervisory signals within each divided subset. To tackle this problem, we
+propose a Divide-and-conquer 2D-3D cross-modal Alignment and Correction
+framework (DAC), which comprises Multimodal Dynamic Division (MDD) and Adaptive
+Alignment and Correction (AAC). Specifically, the former performs accurate
+sample division by adaptive credibility modeling for each sample based on the
+compensation information within the multimodal loss distribution. Then in AAC,
+samples in distinct subsets are exploited with different alignment strategies
+to fully enhance the semantic compactness and meanwhile alleviate over-fitting
+to noisy labels, where a self-correction strategy is introduced to improve the
+quality of representation. Moreover, to evaluate the effectiveness in
+real-world scenarios, we introduce a challenging noisy benchmark, namely
+Objaverse-N200, which comprises 200k-level samples annotated with 1156
+realistic noisy labels. Extensive experiments on both traditional and the newly
+proposed benchmarks demonstrate the generality and superiority of our DAC,
+where DAC outperforms state-of-the-art models by a large margin (i.e., with a
++5.9% gain on ModelNet40 and +5.8% on Objaverse-N200).
+
+
+
+ comment: accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models + + +
+ This paper investigates visual analogical reasoning in large multimodal +models (LMMs) compared to human adults and children. A "visual analogy" is an +abstract rule inferred from one image and applied to another. While benchmarks +exist for testing visual reasoning in LMMs, they require advanced skills and +omit basic visual analogies that even young children can make. Inspired by +developmental psychology, we propose a new benchmark of 1,400 visual +transformations of everyday objects to test LMMs on visual analogical reasoning +and compare them to children and adults. We structure the evaluation into three +stages: identifying what changed (e.g., color, number, etc.), how it changed +(e.g., added one object), and applying the rule to new scenarios. Our findings +show that while models like GPT-4V, LLaVA-1.5, and MANTIS identify the "what" +effectively, they struggle with quantifying the "how" and extrapolating this +rule to new objects. In contrast, children and adults exhibit much stronger +analogical reasoning at all three stages. Additionally, the strongest tested +model, GPT-4V, performs better in tasks involving simple visual attributes like +color and size, correlating with quicker human adult response times. +Conversely, more complex tasks such as number, rotation, and reflection, which +necessitate extensive cognitive processing and understanding of the 3D physical +world, present more significant challenges. Altogether, these findings +highlight the limitations of training models on data that primarily consists of +2D images and text. + +
+
+ comment: 9 pages. For the KiVA benchmark, see https://github.com/ey242/KiVA +
+
+
+
+
+ + ☆ ERIT Lightweight Multimodal Dataset for Elderly Emotion Recognition and + Multimodal Fusion Evaluation + + +
+ ERIT is a novel multimodal dataset designed to facilitate research in
+lightweight multimodal fusion. It contains text and image data collected from
+videos of elderly individuals reacting to various situations, as well as seven
+emotion labels for each data sample. Because of the use of labeled images of
+elderly users reacting emotionally, it also facilitates research on emotion
+recognition in an age group underrepresented in machine learning visual emotion
+recognition. The dataset is validated through comprehensive experiments
+indicating its importance in neural multimodal fusion research.
+
+
+
+
+
+
+ + ☆ Mpox Detection Advanced: Rapid Epidemic Response Through Synthetic Data + + +
+ Rapid development of disease detection models using computer vision is +crucial in responding to medical emergencies, such as epidemics or bioterrorism +events. Traditional data collection methods are often too slow in these +scenarios, requiring innovative approaches for quick, reliable model generation +from minimal data. Our study introduces a novel approach by constructing a +comprehensive computer vision model to detect Mpox lesions using only synthetic +data. Initially, these models generated a diverse set of synthetic images +representing Mpox lesions on various body parts (face, back, chest, leg, neck, +arm) across different skin tones as defined by the Fitzpatrick scale (fair, +brown, dark skin). Subsequently, we trained and tested a vision model with this +synthetic dataset to evaluate the diffusion models' efficacy in producing +high-quality training data and its impact on the vision model's medical image +recognition performance. The results were promising; the vision model achieved +a 97% accuracy rate, with 96% precision and recall for Mpox cases, and +similarly high metrics for normal and other skin disorder cases, demonstrating +its ability to correctly identify true positives and minimize false positives. +The model achieved an F1-Score of 96% for Mpox cases and 98% for normal and +other skin disorders, reflecting a balanced precision-recall relationship, thus +ensuring reliability and robustness in its predictions. Our proposed +SynthVision methodology indicates the potential to develop accurate computer +vision models with minimal data input for future medical emergencies. + +
+
+ comment: 8 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ CRASH: Crash Recognition and Anticipation System Harnessing with + Context-Aware and Temporal Focus Attentions + + +
+ Accurately and promptly predicting accidents among surrounding traffic agents
+from camera footage is crucial for the safety of autonomous vehicles (AVs).
+This task presents substantial challenges stemming from the unpredictable
+nature of traffic accidents, their long-tail distribution, the intricacies of
+traffic scene dynamics, and the inherently constrained field of vision of
+onboard cameras. To address these challenges, this study introduces a novel
+accident anticipation framework for AVs, termed CRASH. It seamlessly integrates
+five components: object detector, feature extractor, object-aware module,
+context-aware module, and multi-layer fusion. Specifically, we develop the
+object-aware module to prioritize high-risk objects in complex and ambiguous
+environments by calculating the spatial-temporal relationships between traffic
+agents. In parallel, the context-aware module is also devised to extend global
+visual information from the temporal to the frequency domain using the Fast
+Fourier Transform (FFT) and capture fine-grained visual features of potential
+objects and broader context cues within traffic scenes. To capture a wider
+range of visual cues, we further propose a multi-layer fusion that dynamically
+computes the temporal dependencies between different scenes and iteratively
+updates the correlations between different visual features for accurate and
+timely accident prediction. Evaluated on real-world datasets--the Dashcam
+Accident Dataset (DAD), Car Crash Dataset (CCD), and AnAn Accident Detection
+(A3D)--our model surpasses existing top baselines in critical evaluation
+metrics like Average Precision (AP) and mean Time-To-Accident (mTTA).
+Importantly, its robustness and adaptability are particularly evident in
+challenging driving scenarios with missing or limited training data,
+demonstrating significant potential for application in real-world autonomous
+driving systems.
+
+
+
+
+
+
+ + ☆ Enhancing Eye Disease Diagnosis with Deep Learning and Synthetic Data + Augmentation + + +
+ In recent years, the focus has been on improving the diagnosis of diabetic
+retinopathy (DR) using machine learning and deep learning technologies.
+Researchers have explored various approaches, including the use of
+high-definition medical imaging and AI-driven algorithms such as convolutional
+neural networks (CNNs) and generative adversarial networks (GANs). Among all
+the available tools, CNNs have emerged as the preferred tool due to their
+superior classification accuracy and efficiency. Although the accuracy of CNNs
+is comparatively better, it can be further improved by introducing hybrid
+models that combine various machine learning and deep learning models.
+Therefore, in this paper, an ensemble learning technique is proposed for early
+detection and management of DR with higher accuracy. The proposed model is
+tested on the APTOS dataset, where it shows superior validation accuracy
+($99\%$) in comparison to the previous models. Hence, the model can be helpful
+for early detection and treatment of DR, thereby enhancing the overall quality
+of care for affected individuals.
+
+
+
+ comment: 18 pages, 7 figures, 2 Tables +
+
+
+
+
+ + ☆ Balancing Complementarity and Consistency via Delayed Activation in + Incomplete Multi-view Clustering + + +
+ This paper studies a challenging issue in incomplete multi-view clustering,
+where valuable complementary information from other views is always ignored. To
+be specific, we propose a framework that effectively balances Complementarity
+and Consistency information in Incomplete Multi-view Clustering (CoCo-IMC).
+Specifically, we design a dual network of delayed activation, which achieves a
+balance of complementarity and consistency among different views. The delayed
+activation enriches the complementarity information that was ignored during
+consistency learning. Then, we recover the incomplete information and enhance
+the consistency learning by minimizing the conditional entropy and maximizing
+the mutual information across different views. This could be the first
+theoretical attempt to incorporate delayed activation into incomplete data
+recovery and the balance of complementarity and consistency. We have
+demonstrated the effectiveness of CoCo-IMC in extensive comparative experiments
+with 12 state-of-the-art baselines on four publicly available datasets.
+
+
+
+
+
+
+ + ☆ Enhancing Fine-grained Object Detection in Aerial Images via Orthogonal + Mapping + + +
+ Fine-Grained Object Detection (FGOD) is a critical task in high-resolution +aerial image analysis. This letter introduces Orthogonal Mapping (OM), a simple +yet effective method aimed at addressing the challenge of semantic confusion +inherent in FGOD. OM introduces orthogonal constraints in the feature space by +decoupling features from the last layer of the classification branch with a +class-wise orthogonal vector basis. This effectively mitigates semantic +confusion and enhances classification accuracy. Moreover, OM can be seamlessly +integrated into mainstream object detectors. Extensive experiments conducted on +three FGOD datasets (FAIR1M, ShipRSImageNet, and MAR20) demonstrate the +effectiveness and superiority of the proposed approach. Notably, with just one +line of code, OM achieves a 4.08% improvement in mean Average Precision (mAP) +over FCOS on the ShipRSImageNet dataset. Codes are released at +https://github.com/ZhuHaoranEIS/Orthogonal-FGOD. + +
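The "class-wise orthogonal vector basis" can be sketched as a final classification layer whose class vectors are mutually orthogonal, as below. Whether OM freezes this basis or learns it under an orthogonality constraint is not stated in the abstract, so the frozen random orthonormal basis here is purely illustrative.

```python
import torch
import torch.nn as nn

class OrthogonalClassifier(nn.Module):
    """Classification head whose class directions form a fixed orthonormal basis,
    so fine-grained class prototypes cannot collapse onto one another (sketch)."""
    def __init__(self, feat_dim: int, num_classes: int):
        super().__init__()
        assert feat_dim >= num_classes, "need at least num_classes feature dimensions"
        # QR decomposition of a random matrix yields orthonormal columns.
        q, _ = torch.linalg.qr(torch.randn(feat_dim, num_classes))
        self.register_buffer("basis", q)          # (feat_dim, num_classes), frozen

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        # feats: (N, feat_dim) features from the classification branch
        return feats @ self.basis                  # orthogonally mapped logits
```

Swapping a detector's last classification projection for such a head is the kind of one-line change the abstract alludes to when it reports the FCOS improvement.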
+
+
+
+
+ + ☆ Cost-effective Instruction Learning for Pathology Vision and Language + Analysis + + +
+ The advent of vision-language models fosters the interactive conversations +between AI-enabled models and humans. Yet applying these models into clinics +must deal with daunting challenges around large-scale training data, financial, +and computational resources. Here we propose a cost-effective instruction +learning framework for conversational pathology named as CLOVER. CLOVER only +trains a lightweight module and uses instruction tuning while freezing the +parameters of the large language model. Instead of using costly GPT-4, we +propose well-designed prompts on GPT-3.5 for building generation-based +instructions, emphasizing the utility of pathological knowledge derived from +the Internet source. To augment the use of instructions, we construct a +high-quality set of template-based instructions in the context of digital +pathology. From two benchmark datasets, our findings reveal the strength of +hybrid-form instructions in the visual question-answer in pathology. Extensive +results show the cost-effectiveness of CLOVER in answering both open-ended and +closed-ended questions, where CLOVER outperforms strong baselines that possess +37 times more training parameters and use instruction data generated from +GPT-4. Through the instruction tuning, CLOVER exhibits robustness of few-shot +learning in the external clinical dataset. These findings demonstrate that +cost-effective modeling of CLOVER could accelerate the adoption of rapid +conversational applications in the landscape of digital pathology. + +
+
+
+
+
+ + ☆ Multi-modal Data Binding for Survival Analysis Modeling with Incomplete + Data and Annotations MICCAI 2024 + + +
+ Survival analysis stands as a pivotal process in cancer treatment research, +crucial for predicting patient survival rates accurately. Recent advancements +in data collection techniques have paved the way for enhancing survival +predictions by integrating information from multiple modalities. However, +real-world scenarios often present challenges with incomplete data, +particularly when dealing with censored survival labels. Prior works have +addressed missing modalities but have overlooked incomplete labels, which can +introduce bias and limit model efficacy. To bridge this gap, we introduce a +novel framework that simultaneously handles incomplete data across modalities +and censored survival labels. Our approach employs advanced foundation models +to encode individual modalities and align them into a universal representation +space for seamless fusion. By generating pseudo labels and incorporating +uncertainty, we significantly enhance predictive accuracy. The proposed method +demonstrates outstanding prediction accuracy in two survival analysis tasks on +both employed datasets. This innovative approach overcomes limitations +associated with disparate modalities and improves the feasibility of +comprehensive survival analysis using multiple large foundation models. + +
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ ALMRR: Anomaly Localization Mamba on Industrial Textured Surface with + Feature Reconstruction and Refinement + + +
+ Unsupervised anomaly localization on industrial textured images has achieved
+remarkable results through reconstruction-based methods, yet existing
+approaches based on image reconstruction and feature reconstruction each have
+their own shortcomings. Firstly, image-based methods tend to reconstruct both
+normal and anomalous regions well, which leads to over-generalization.
+Feature-based methods contain a large amount of distinguishable semantic
+information; however, their feature structure is redundant and lacks anomalous
+information, which leads to significant reconstruction errors. In this paper,
+we propose an Anomaly Localization method based on Mamba with Feature
+Reconstruction and Refinement (ALMRR), which reconstructs semantic features
+based on Mamba and then refines them through a feature refinement module. To
+equip the model with prior knowledge of anomalies, we enhance it by adding
+artificially simulated anomalies to the original images. Unlike image
+reconstruction or repair, the features of synthesized defects are repaired
+along with those of normal areas. Finally, the aligned features containing rich
+semantic information are fed into the refinement module to obtain the anomaly
+map. Extensive experiments have been conducted on the MVTec-AD-Textured dataset
+and other real-world industrial datasets, which have demonstrated superior
+performance compared to state-of-the-art (SOTA) methods.
+
+
+
+
+
+
+ + ☆ SAM-MIL: A Spatial Contextual Aware Multiple Instance Learning Approach + for Whole Slide Image Classification + + +
+ Multiple Instance Learning (MIL) represents the predominant framework in Whole
+Slide Image (WSI) classification, covering aspects such as sub-typing,
+diagnosis, and beyond. Current MIL models predominantly rely on instance-level
+features derived from pretrained models such as ResNet. These models segment
+each WSI into independent patches and extract features from these local
+patches, leading to a significant loss of global spatial context and
+restricting the model's focus to merely local features. To address this issue,
+we propose a novel MIL framework, named SAM-MIL, that emphasizes spatial
+contextual awareness and explicitly incorporates spatial context by extracting
+comprehensive, image-level information. The Segment Anything Model (SAM)
+represents a pioneering visual segmentation foundational model that can capture
+segmentation features without the need for additional fine-tuning, rendering it
+an outstanding tool for extracting spatial context directly from raw WSIs. Our
+approach includes the design of group feature extraction based on spatial
+context and a SAM-Guided Group Masking strategy to mitigate class imbalance
+issues. We implement a dynamic mask ratio for different segmentation categories
+and supplement these with representative group features of categories.
+Moreover, SAM-MIL divides instances to generate additional pseudo-bags, thereby
+augmenting the training set, and introduces consistency of spatial context
+across pseudo-bags to further enhance the model's performance. Experimental
+results on the CAMELYON-16 and TCGA Lung Cancer datasets demonstrate that our
+proposed SAM-MIL model outperforms existing mainstream methods in WSI
+classification. Our open-source implementation code is available at
+https://github.com/FangHeng/SAM-MIL.
+
+
+
+ comment: accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ☆ Weighted Risk Invariance: Domain Generalization under Invariant Feature + Shift + + +
+ Learning models whose predictions are invariant under multiple environments +is a promising approach for out-of-distribution generalization. Such models are +trained to extract features $X_{\text{inv}}$ where the conditional distribution +$Y \mid X_{\text{inv}}$ of the label given the extracted features does not +change across environments. Invariant models are also supposed to generalize to +shifts in the marginal distribution $p(X_{\text{inv}})$ of the extracted +features $X_{\text{inv}}$, a type of shift we call an $\textit{invariant +covariate shift}$. However, we show that proposed methods for learning +invariant models underperform under invariant covariate shift, either failing +to learn invariant models$\unicode{x2014}$even for data generated from simple +and well-studied linear-Gaussian models$\unicode{x2014}$or having poor +finite-sample performance. To alleviate these problems, we propose +$\textit{weighted risk invariance}$ (WRI). Our framework is based on imposing +invariance of the loss across environments subject to appropriate reweightings +of the training examples. We show that WRI provably learns invariant models, +i.e. discards spurious correlations, in linear-Gaussian settings. We propose a +practical algorithm to implement WRI by learning the density +$p(X_{\text{inv}})$ and the model parameters simultaneously, and we demonstrate +empirically that WRI outperforms previous invariant learning methods under +invariant covariate shift. + +
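The abstract does not give the exact objective, but one plausible reading of "imposing invariance of the loss across environments subject to appropriate reweightings" is a penalty on the spread of density-reweighted per-environment risks, sketched below. The weighting by an estimate of $p(X_{\text{inv}})$ and the variance-style penalty are assumptions, not the paper's formulation.

```python
import torch

def wri_penalty(per_env_losses, per_env_weights):
    """Spread of reweighted per-environment risks.

    per_env_losses / per_env_weights: lists of 1-D tensors holding per-sample
    losses and importance weights derived from an estimated p(x_inv) for each
    training environment.
    """
    risks = torch.stack([(w * l).mean()
                         for l, w in zip(per_env_losses, per_env_weights)])
    return risks.var(unbiased=False)
```

In training, a penalty of this kind would be added to the average reweighted risk with a trade-off coefficient, mirroring how IRM-style invariance objectives are typically optimized alongside the learned density model.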
+
+
+
+
+ + ☆ A Reference-Based 3D Semantic-Aware Framework for Accurate Local Facial + Attribute Editing + + +
+ Facial attribute editing plays a crucial role in synthesizing realistic faces +with specific characteristics while maintaining realistic appearances. Despite +advancements, challenges persist in achieving precise, 3D-aware attribute +modifications, which are crucial for consistent and accurate representations of +faces from different angles. Current methods struggle with semantic +entanglement and lack effective guidance for incorporating attributes while +maintaining image integrity. To address these issues, we introduce a novel +framework that merges the strengths of latent-based and reference-based editing +methods. Our approach employs a 3D GAN inversion technique to embed attributes +from the reference image into a tri-plane space, ensuring 3D consistency and +realistic viewing from multiple perspectives. We utilize blending techniques +and predicted semantic masks to locate precise edit regions, merging them with +the contextual guidance from the reference image. A coarse-to-fine inpainting +strategy is then applied to preserve the integrity of untargeted areas, +significantly enhancing realism. Our evaluations demonstrate superior +performance across diverse editing tasks, validating our framework's +effectiveness in realistic and applicable facial attribute editing. + +
+
+
+
+
+ + ☆ UOUO: Uncontextualized Uncommon Objects for Measuring Knowledge Horizons + of Vision Language Models + + +
+ Smaller-scale Vision-Language Models (VLMs) often claim to perform on par
+with larger models in general-domain visual grounding and question-answering
+benchmarks while offering advantages in computational efficiency and storage.
+However, their ability to handle rare objects, which fall into the long tail of
+data distributions, is less understood. To rigorously evaluate this aspect, we
+introduce the "Uncontextualized Uncommon Objects" (UOUO) benchmark. This
+benchmark focuses on systematically testing VLMs with both large and small
+parameter counts on rare and specialized objects. Our comprehensive analysis
+reveals that while smaller VLMs maintain competitive performance on common
+datasets, they significantly underperform on tasks involving uncommon objects.
+We also propose an advanced, scalable pipeline for data collection and
+cleaning, ensuring the UOUO benchmark provides high-quality, challenging
+instances. These findings highlight the need to consider long-tail
+distributions when assessing the true capabilities of VLMs.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Adapting Mouse Pathological Model to Human Glomerular Lesion + Segmentation + + +
+ Moving from animal models to human applications in preclinical research +encompasses a broad spectrum of disciplines in medical science. A fundamental +element in the development of new drugs, treatments, diagnostic methods, and in +deepening our understanding of disease processes is the accurate measurement of +kidney tissues. Past studies have demonstrated the viability of translating +glomeruli segmentation techniques from mouse models to human applications. Yet, +these investigations tend to neglect the complexities involved in segmenting +pathological glomeruli affected by different lesions. Such lesions present a +wider range of morphological variations compared to healthy glomerular tissue, +which are arguably more valuable than normal glomeruli in clinical practice. +Furthermore, data on lesions from animal models can be more readily scaled up +from disease models and whole kidney biopsies. This brings up a question: +``\textit{Can a pathological segmentation model trained on mouse models be +effectively applied to human patients?}" To answer this question, we introduced +GLAM, a deep learning study for fine-grained segmentation of human kidney +lesions using a mouse model, addressing mouse-to-human transfer learning, by +evaluating different learning strategies for segmenting human pathological +lesions using zero-shot transfer learning and hybrid learning by leveraging +mouse samples. From the results, the hybrid learning model achieved superior +performance. + +
+
+
+
+
+ + ☆ Neural Surface Detection for Unsigned Distance Fields ECCV 2024 + + +
+ Extracting surfaces from Signed Distance Fields (SDFs) can be accomplished
+using traditional algorithms, such as Marching Cubes. However, since they rely
+on sign flips across the surface, these algorithms cannot be used directly on
+Unsigned Distance Fields (UDFs). In this work, we introduce a deep-learning
+approach to taking a UDF and turning it locally into an SDF, so that it can be
+effectively triangulated using existing algorithms. We show that it achieves
+better accuracy in surface detection than existing methods. Furthermore, it
+generalizes well to unseen shapes and datasets, while being parallelizable. We
+also demonstrate the flexibility of the method by using it in conjunction with
+DualMeshUDF, a state-of-the-art dual meshing method that can operate on UDFs,
+improving its results and removing the need to tune its parameters.
+
+
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Retinal IPA: Iterative KeyPoints Alignment for Multimodal Retinal + Imaging + + +
+ We propose a novel framework for retinal feature point alignment, designed +for learning cross-modality features to enhance matching and registration +across multi-modality retinal images. Our model draws on the success of +previous learning-based feature detection and description methods. To better +leverage unlabeled data and constrain the model to reproduce relevant +keypoints, we integrate a keypoint-based segmentation task. It is trained in a +self-supervised manner by enforcing segmentation consistency between different +augmentations of the same image. By incorporating a keypoint augmented +self-supervised layer, we achieve robust feature extraction across modalities. +Extensive evaluation on two public datasets and one in-house dataset +demonstrates significant improvements in performance for modality-agnostic +retinal feature alignment. Our code and model weights are publicly available at +\url{https://github.com/MedICL-VU/RetinaIPA}. + +
+
+
+
+
+ + ☆ SMiCRM: A Benchmark Dataset of Mechanistic Molecular Images + + +
+ Optical chemical structure recognition (OCSR) systems aim to extract the
+molecular structure information, usually in the form of a molecular graph or
+SMILES, from images of chemical molecules. While many tools have been developed
+for this purpose, challenges still exist due to different types of noises that
+might exist in the images. Specifically, we focus on the 'arrow-pushing'
+diagrams, a typical type of chemical image used to demonstrate electron flow in
+mechanistic steps. We present Structural molecular identifier of Molecular
+images in Chemical Reaction Mechanisms (SMiCRM), a dataset designed to
+benchmark machine recognition capabilities of chemical molecules with
+arrow-pushing annotations. Comprising 453 images, it spans a broad array of
+organic chemical reactions, each illustrated with molecular structures and
+mechanistic arrows. SMiCRM offers a rich collection of annotated molecule
+images for enhancing the benchmarking process for OCSR methods. This dataset
+includes a machine-readable molecular identity for each image as well as
+mechanistic arrows showing electron flow during chemical reactions. It presents
+a more authentic and challenging task for testing molecular recognition
+technologies, and achieving this task can greatly enrich the mechanistic
+information in computer-extracted chemical reaction data.
+
+
+
+ comment: Under Submission +
+
+
+
+
+ + ☆ MARINE: A Computer Vision Model for Detecting Rare Predator-Prey + Interactions in Animal Videos + + +
+ Encounters between predator and prey play an essential role in ecosystems, +but their rarity makes them difficult to detect in video recordings. Although +advances in action recognition (AR) and temporal action detection (AD), +especially transformer-based models and vision foundation models, have achieved +high performance on human action datasets, animal videos remain relatively +under-researched. This thesis addresses this gap by proposing the model MARINE, +which utilizes motion-based frame selection designed for fast animal actions +and DINOv2 feature extraction with a trainable classification head for action +recognition. MARINE outperforms VideoMAE in identifying predator attacks in +videos of fish, both on a small and specific coral reef dataset (81.53\% +against 52.64\% accuracy), and on a subset of the more extensive Animal Kingdom +dataset (94.86\% against 83.14\% accuracy). In a multi-label setting on a +representative sample of Animal Kingdom, MARINE achieves 23.79\% mAP, +positioning it mid-field among existing benchmarks. Furthermore, in an AD task +on the coral reef dataset, MARINE achieves 80.78\% AP (against VideoMAE's +34.89\%) although at a lowered t-IoU threshold of 25\%. Therefore, despite room +for improvement, MARINE offers an effective starter framework to apply to AR +and AD tasks on animal recordings and thus contribute to the study of natural +ecosystems. + +
+
+ comment: This is an MSc thesis by Zsofia Katona, supervised by the two other + authors +
+
+
+
+
+ + ♻ ☆ Streetscapes: Large-scale Consistent Street View Generation Using + Autoregressive Video Diffusion + + +
+ We present a method for generating Streetscapes: long sequences of views
+through an on-the-fly synthesized city-scale scene. Our generation is
+conditioned by language input (e.g., city name, weather), as well as an
+underlying map/layout hosting the desired trajectory. Compared to recent models
+for video generation or 3D view synthesis, our method can scale to much
+longer-range camera trajectories, spanning several city blocks, while
+maintaining visual quality and consistency. To achieve this goal, we build on
+recent work on video diffusion, used within an autoregressive framework that
+can easily scale to long sequences. In particular, we introduce a new temporal
+imputation method that prevents our autoregressive approach from drifting from
+the distribution of realistic city imagery. We train our Streetscapes system on
+a compelling source of data: posed imagery from Google Street View, along with
+contextual map data, which allows users to generate city views conditioned on
+any desired city layout, with controllable camera poses. Please see more
+results at our project page at https://boyangdeng.com/streetscapes.
+
+
+
+ comment: *Equal Contributions; Fixed few duplicated references from 1st + upload; Project Page: https://boyangdeng.com/streetscapes +
+
+
+
+
+ + ♻ ☆ Castling-ViT: Compressing Self-Attention via Switching Towards + Linear-Angular Attention at Vision Transformer Inference CVPR 2023 + + +
+ Vision Transformers (ViTs) have shown impressive performance but still +require a high computation cost as compared to convolutional neural networks +(CNNs), one reason is that ViTs' attention measures global similarities and +thus has a quadratic complexity with the number of input tokens. Existing +efficient ViTs adopt local attention (e.g., Swin) or linear attention (e.g., +Performer), which sacrifice ViTs' capabilities of capturing either global or +local context. In this work, we ask an important research question: Can ViTs +learn both global and local context while being more efficient during +inference? To this end, we propose a framework called Castling-ViT, which +trains ViTs using both linear-angular attention and masked softmax-based +quadratic attention, but then switches to having only linear angular attention +during ViT inference. Our Castling-ViT leverages angular kernels to measure the +similarities between queries and keys via spectral angles. And we further +simplify it with two techniques: (1) a novel linear-angular attention +mechanism: we decompose the angular kernels into linear terms and high-order +residuals, and only keep the linear terms; and (2) we adopt two parameterized +modules to approximate high-order residuals: a depthwise convolution and an +auxiliary masked softmax attention to help learn both global and local +information, where the masks for softmax attention are regularized to gradually +become zeros and thus incur no overhead during ViT inference. Extensive +experiments and ablation studies on three tasks consistently validate the +effectiveness of the proposed Castling-ViT, e.g., achieving up to a 1.8% higher +accuracy or 40% MACs reduction on ImageNet classification and 1.2 higher mAP on +COCO detection under comparable FLOPs, as compared to ViTs with vanilla +softmax-based attentions. + +
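The efficiency claim rests on the standard linear-attention identity: once the softmax is replaced by a decomposable kernel, attention can be computed as phi(Q)(phi(K)^T V), which is linear rather than quadratic in the number of tokens. The sketch below uses a generic elu-plus-one feature map rather than the paper's linear-angular kernel or its auxiliary masked-softmax branch, so treat it as the mechanism, not the method.

```python
import torch
import torch.nn.functional as F

def linear_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                     eps: float = 1e-6) -> torch.Tensor:
    """Kernelized attention: with a non-negative feature map phi, compute
    phi(Q) (phi(K)^T V) with a normalizer, in O(N) memory/time over tokens.
    q, k, v: (..., N, d) tensors."""
    phi_q, phi_k = F.elu(q) + 1, F.elu(k) + 1
    kv = phi_k.transpose(-2, -1) @ v                              # (..., d, d_v)
    z = phi_q @ phi_k.sum(dim=-2, keepdim=True).transpose(-2, -1) + eps  # (..., N, 1)
    return (phi_q @ kv) / z
```

Because the (d x d_v) summary is computed once and reused for every query, sequence length enters only linearly, which is the property the switch to linear-angular attention exploits at inference time.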
+
+ comment: CVPR 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ Per-Gaussian Embedding-Based Deformation for Deformable 3D Gaussian + Splatting ECCV 2024 + + +
+ As 3D Gaussian Splatting (3DGS) provides fast and high-quality novel view +synthesis, it is a natural extension to deform a canonical 3DGS to multiple +frames for representing a dynamic scene. However, previous works fail to +accurately reconstruct complex dynamic scenes. We attribute the failure to the +design of the deformation field, which is built as a coordinate-based function. +This approach is problematic because 3DGS is a mixture of multiple fields +centered at the Gaussians, not just a single coordinate-based framework. To +resolve this problem, we define the deformation as a function of per-Gaussian +embeddings and temporal embeddings. Moreover, we decompose deformations as +coarse and fine deformations to model slow and fast movements, respectively. +Also, we introduce a local smoothness regularization for per-Gaussian embedding +to improve the details in dynamic regions. Project page: +https://jeongminb.github.io/e-d3dgs/ + +
+
+ comment: ECCV 2024. Project page: https://jeongminb.github.io/e-d3dgs/ +
+
+
+
+
+ + ♻ ☆ HAIFIT: Human-Centered AI for Fashion Image Translation + + +
+ In the realm of fashion design, sketches serve as the canvas for expressing +an artist's distinctive drawing style and creative vision, capturing intricate +details like stroke variations and texture nuances. The advent of +sketch-to-image cross-modal translation technology has notably aided designers. +However, existing methods often compromise these sketch details during image +generation, resulting in images that deviate from the designer's intended +concept. This limitation hampers the ability to offer designers a precise +preview of the final output. To overcome this challenge, we introduce HAIFIT, a +novel approach that transforms sketches into high-fidelity, lifelike clothing +images by integrating multi-scale features and capturing extensive feature map +dependencies from diverse perspectives. Through extensive qualitative and +quantitative evaluations conducted on our self-collected dataset, our method +demonstrates superior performance compared to existing methods in generating +photorealistic clothing images. Our method excels in preserving the distinctive +style and intricate details essential for fashion design applications. In +addition, our method also offers clear advantages in model training and +inference speed, reducing designers' time costs and improving design +efficiency. + 
+
comment: 10 pages, 7 figures
+
+
+
+
+ + ♻ ☆ Reference-Based 3D-Aware Image Editing with Triplanes + + +
+ Generative Adversarial Networks (GANs) have emerged as powerful tools for +high-quality image generation and real image editing by manipulating their +latent spaces. Recent advancements in GANs include 3D-aware models such as +EG3D, which feature efficient triplane-based architectures capable of +reconstructing 3D geometry from single images. However, limited attention has +been given to providing an integrated framework for 3D-aware, high-quality, +reference-based image editing. This study addresses this gap by exploring and +demonstrating the effectiveness of the triplane space for advanced +reference-based edits. Our novel approach integrates encoding, automatic +localization, spatial disentanglement of triplane features, and fusion learning +to achieve the desired edits. Additionally, our framework demonstrates +versatility and robustness across various domains, extending its effectiveness +to animal face edits, partially stylized edits like cartoon faces, full-body +clothing edits, and 360-degree head edits. Our method shows state-of-the-art +performance over relevant latent direction, text, and image-guided 2D and +3D-aware diffusion and GAN methods, both qualitatively and quantitatively. + +
+
+ comment: 20 pages, including supplementary material +
+
+
+
+
+ + ♻ ☆ Looking at Model Debiasing through the Lens of Anomaly Detection + + +
+ It is widely recognized that deep neural networks are sensitive to bias in +the data. This means that during training these models are likely to learn +spurious correlations between data and labels, resulting in limited +generalization abilities and low performance. In this context, model debiasing +approaches can be devised aiming at reducing the model's dependency on such +unwanted correlations, either leveraging the knowledge of bias information or +not. In this work, we focus on the latter and more realistic scenario, showing +the importance of accurately predicting the bias-conflicting and bias-aligned +samples to obtain compelling performance in bias mitigation. On this ground, we +propose to conceive the problem of model bias from an out-of-distribution +perspective, introducing a new bias identification method based on anomaly +detection. We claim that when data is mostly biased, bias-conflicting samples +can be regarded as outliers with respect to the bias-aligned distribution in +the feature space of a biased model, thus allowing for precisely detecting them +with an anomaly detection method. Coupling the proposed bias identification +approach with bias-conflicting data upsampling and augmentation in a two-step +strategy, we reach state-of-the-art performance on synthetic and real benchmark +datasets. Ultimately, our proposed approach shows that the data bias issue does +not necessarily require complex debiasing methods, given that an accurate bias +identification procedure is defined. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Better Call SAL: Towards Learning to Segment Anything in Lidar ECCV 2024 + + +
+ We propose the SAL (Segment Anything in Lidar) method consisting of a +text-promptable zero-shot model for segmenting and classifying any object in +Lidar, and a pseudo-labeling engine that facilitates model training without +manual supervision. While the established paradigm for Lidar Panoptic +Segmentation (LPS) relies on manual supervision for a handful of object classes +defined a priori, we utilize 2D vision foundation models to generate 3D +supervision ``for free''. Our pseudo-labels consist of instance masks and +corresponding CLIP tokens, which we lift to Lidar using calibrated multi-modal +data. By training our model on these labels, we distill the 2D foundation +models into our Lidar SAL model. Even without manual labels, our model achieves +$91\%$ of the fully supervised state-of-the-art in terms of class-agnostic +segmentation and $54\%$ in terms of zero-shot Lidar Panoptic Segmentation. +Furthermore, we outperform several baselines that do not distill but only lift +image features to 3D. More importantly, we demonstrate that SAL supports +arbitrary class prompts, can be easily extended to new datasets, and shows +significant potential to improve with increasing amounts of self-labeled data. +Code and models are available at this +$\href{https://github.com/nv-dvl/segment-anything-lidar}{URL}$. + 
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Towards More Practical Group Activity Detection: A New Benchmark and + Model ECCV 2024 + + +
+ Group activity detection (GAD) is the task of identifying members of each +group and classifying the activity of the group at the same time in a video. +While GAD has been studied recently, there is still much room for improvement +in both dataset and methodology due to their limited capability to address +practical GAD scenarios. To resolve these issues, we first present a new +dataset, dubbed Caf\'e. Unlike existing datasets, Caf\'e is constructed +primarily for GAD and presents more practical scenarios and metrics, as well as +being large-scale and providing rich annotations. Along with the dataset, we +propose a new GAD model that deals with an unknown number of groups and latent +group members efficiently and effectively. We evaluated our model on three +datasets including Caf\'e, where it outperformed previous work in terms of both +accuracy and inference speed. + +
+
+ comment: Accepted to ECCV 2024, Project page: + https://cvlab.postech.ac.kr/research/CAFE +
+
+
+
+
+ + ♻ ☆ Action2Sound: Ambient-Aware Generation of Action Sounds from Egocentric + Videos ECCV 2024 + + +
+ Generating realistic audio for human actions is important for many +applications, such as creating sound effects for films or virtual reality +games. Existing approaches implicitly assume total correspondence between the +video and audio during training, yet many sounds happen off-screen and have +weak to no correspondence with the visuals -- resulting in uncontrolled ambient +sounds or hallucinations at test time. We propose a novel ambient-aware audio +generation model, AV-LDM. We devise a novel audio-conditioning mechanism to +learn to disentangle foreground action sounds from the ambient background +sounds in in-the-wild training videos. Given a novel silent video, our model +uses retrieval-augmented generation to create audio that matches the visual +content both semantically and temporally. We train and evaluate our model on +two in-the-wild egocentric video datasets, Ego4D and EPIC-KITCHENS, and we +introduce Ego4D-Sounds -- 1.2M curated clips with action-audio correspondence. +Our model outperforms an array of existing methods, allows controllable +generation of the ambient sound, and even shows promise for generalizing to +computer graphics game clips. Overall, our approach is the first to focus +video-to-audio generation faithfully on the observed visual content despite +training from uncurated clips with natural background sounds. + +
+
+ comment: Project page: https://vision.cs.utexas.edu/projects/action2sound. + ECCV 2024 camera-ready version +
+
+
+
+
+ + ♻ ☆ Self-supervised learning of video representations from a child's + perspective + + +
+ Children learn powerful internal models of the world around them from a few +years of egocentric visual experience. Can such internal models be learned from +a child's visual experience with highly generic learning algorithms or do they +require strong inductive biases? Recent advances in collecting large-scale, +longitudinal, developmentally realistic video datasets and generic +self-supervised learning (SSL) algorithms are allowing us to begin to tackle +this nature vs. nurture question. However, existing work typically focuses on +image-based SSL algorithms and visual capabilities that can be learned from +static images (e.g. object recognition), thus ignoring temporal aspects of the +world. To close this gap, here we train self-supervised video models on +longitudinal, egocentric headcam recordings collected from a child over a two +year period in their early development (6-31 months). The resulting models are +highly effective at facilitating the learning of action concepts from a small +number of labeled examples; they have favorable data size scaling properties; +and they display emergent video interpolation capabilities. Video models also +learn more robust object representations than image-based models trained with +the exact same data. These results suggest that important temporal aspects of a +child's internal model of the world may be learnable from their visual +experience using highly generic learning algorithms and without strong +inductive biases. + +
+
+ comment: Published as a conference paper at CogSci 2024; code & models + available from https://github.com/eminorhan/video-models +
+
+
+
+
+ + ♻ ☆ nnU-Net Revisited: A Call for Rigorous Validation in 3D Medical Image + Segmentation MICCAI 2024 + + +
+ The release of nnU-Net marked a paradigm shift in 3D medical image +segmentation, demonstrating that a properly configured U-Net architecture could +still achieve state-of-the-art results. Despite this, the pursuit of novel +architectures, and the respective claims of superior performance over the U-Net +baseline, continued. In this study, we demonstrate that many of these recent +claims fail to hold up when scrutinized for common validation shortcomings, +such as the use of inadequate baselines, insufficient datasets, and neglected +computational resources. By meticulously avoiding these pitfalls, we conduct a +thorough and comprehensive benchmarking of current segmentation methods +including CNN-based, Transformer-based, and Mamba-based approaches. In contrast +to current beliefs, we find that the recipe for state-of-the-art performance is +1) employing CNN-based U-Net models, including ResNet and ConvNeXt variants, 2) +using the nnU-Net framework, and 3) scaling models to modern hardware +resources. These results indicate an ongoing innovation bias towards novel +architectures in the field and underscore the need for more stringent +validation standards in the quest for scientific progress. + +
+
+ comment: Accepted at MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Uncovering Latent Memories: Assessing Data Leakage and Memorization + Patterns in Frontier AI Models + + +
+ Frontier AI systems are making transformative impacts across society, but +such benefits are not without costs: models trained on web-scale datasets +containing personal and private data raise profound concerns about data privacy +and security. Language models are trained on extensive corpora including +potentially sensitive or proprietary information, and the risk of data leakage +- where the model response reveals pieces of such information - remains +inadequately understood. Prior work has investigated what factors drive +memorization and has identified that sequence complexity and the number of +repetitions drive memorization. Here, we focus on the evolution of memorization +over training. We begin by reproducing findings that the probability of +memorizing a sequence scales logarithmically with the number of times it is +present in the data. We next show that sequences which are apparently not +memorized after the first encounter can be "uncovered" throughout the course of +training even without subsequent encounters, a phenomenon we term "latent +memorization". The presence of latent memorization presents a challenge for +data privacy as memorized sequences may be hidden at the final checkpoint of +the model but remain easily recoverable. To this end, we develop a diagnostic +test relying on the cross entropy loss to uncover latent memorized sequences +with high accuracy. + 
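A hedged sketch of a cross-entropy-based screen in this spirit is shown below; the `logits_fn` interface and the threshold are placeholders rather than the paper's actual diagnostic test.

```python
# Hedged sketch: screening candidate sequences for (latent) memorization by
# their mean per-token cross-entropy under the model; suspiciously low loss
# suggests the sequence may be recoverable.
import numpy as np

def sequence_cross_entropy(logits, token_ids):
    """logits: (T, V) next-token logits; token_ids: (T,) ground-truth tokens."""
    logits = logits - logits.max(axis=-1, keepdims=True)
    log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
    return -log_probs[np.arange(len(token_ids)), token_ids].mean()

def flag_possibly_memorized(sequences, logits_fn, threshold=0.5):
    """logits_fn is a placeholder for a model call returning (T, V) logits."""
    return [seq for seq in sequences
            if sequence_cross_entropy(logits_fn(seq), seq) < threshold]
```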
+
+
+
+
+ + ♻ ☆ 3D Diffuser Actor: Policy Diffusion with 3D Scene Representations + + +
+ Diffusion policies are conditional diffusion models that learn robot action +distributions conditioned on the robot and environment state. They have +recently been shown to outperform both deterministic and alternative action +distribution learning formulations. 3D robot policies use 3D scene feature +representations aggregated from a single or multiple camera views using sensed +depth. They have been shown to generalize better than their 2D counterparts +across camera viewpoints. We unify these two lines of work and present 3D +Diffuser Actor, a neural policy equipped with a novel 3D denoising transformer +that fuses information from the 3D visual scene, a language instruction and +proprioception to predict the noise in noised 3D robot pose trajectories. 3D +Diffuser Actor sets a new state-of-the-art on RLBench with an absolute +performance gain of 18.1% over the current SOTA on a multi-view setup and an +absolute gain of 13.1% on a single-view setup. On the CALVIN benchmark, it +improves over the current SOTA by a 9% relative increase. It also learns to +control a robot manipulator in the real world from a handful of demonstrations. +Through thorough comparisons with the current SOTA policies and ablations of +our model, we show 3D Diffuser Actor's design choices dramatically outperform +2D representations, regression and classification objectives, absolute +attentions, and holistic non-tokenized 3D scene embeddings. + 
+
+ comment: First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ InternVideo2: Scaling Foundation Models for Multimodal Video + Understanding ECCV2024 + + +
+ We introduce InternVideo2, a new family of video foundation models (ViFM) +that achieve the state-of-the-art results in video recognition, video-text +tasks, and video-centric dialogue. Our core design is a progressive training +approach that unifies the masked video modeling, crossmodal contrastive +learning, and next token prediction, scaling up the video encoder size to 6B +parameters. At the data level, we prioritize spatiotemporal consistency by +semantically segmenting videos and generating video-audio-speech captions. This +improves the alignment between video and text. Through extensive experiments, +we validate our designs and demonstrate superior performance on over 60 video +and audio tasks. Notably, our model outperforms others on various video-related +dialogue and long video understanding benchmarks, highlighting its ability to +reason and comprehend longer contexts. Code and models are available at +https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2/. + +
+
+ comment: a technical report about video understanding (accepted to ECCV2024) +
+
+
+
+
+ + ♻ ☆ Continual Panoptic Perception: Towards Multi-modal Incremental + Interpretation of Remote Sensing Images + + +
+ Continual learning (CL) breaks away from the one-way training paradigm and +enables a model to adapt to new data, semantics and tasks continuously. +However, current CL methods mainly focus on single tasks. Besides, CL models +are plagued by catastrophic forgetting and semantic drift due to the lack of +old data, which often occurs in remote-sensing interpretation because of the +intricate fine-grained semantics. In this paper, we propose Continual Panoptic +Perception (CPP), a unified continual learning model that leverages multi-task +joint learning covering pixel-level classification, instance-level segmentation +and image-level perception for universal interpretation in remote sensing +images. Concretely, we propose a collaborative cross-modal encoder (CCE) to +extract the input image features, which supports pixel classification and +caption generation synchronously. To inherit the knowledge from the old model +without exemplar memory, we propose a task-interactive knowledge distillation +(TKD) method, which leverages cross-modal optimization and task-asymmetric +pseudo-labeling (TPL) to alleviate catastrophic forgetting. Furthermore, we +also propose a joint optimization mechanism to achieve end-to-end multi-modal +panoptic perception. Experimental results on the fine-grained panoptic +perception dataset validate the effectiveness of the proposed model, and also +prove that joint optimization can boost sub-task CL efficiency with over 13\% +relative improvement on panoptic quality. + 
+
+ comment: Accepted in ACMMM 2024 +
+
+
+
+
+ + ♻ ☆ PARSE-Ego4D: Personal Action Recommendation Suggestions for Egocentric + Videos + + +
+ Intelligent assistance involves not only understanding but also action. +Existing ego-centric video datasets contain rich annotations of the videos, but +not of actions that an intelligent assistant could perform in the moment. To +address this gap, we release PARSE-Ego4D, a new set of personal action +recommendation annotations for the Ego4D dataset. We take a multi-stage +approach to generating and evaluating these annotations. First, we used a +prompt-engineered large language model (LLM) to generate context-aware action +suggestions and identified over 18,000 action suggestions. While these +synthetic action suggestions are valuable, the inherent limitations of LLMs +necessitate human evaluation. To ensure high-quality and user-centered +recommendations, we conducted a large-scale human annotation study that +provides grounding in human preferences for all of PARSE-Ego4D. We analyze the +inter-rater agreement and evaluate subjective preferences of participants. +Based on our synthetic dataset and complete human annotations, we propose +several new tasks for action suggestions based on ego-centric videos. We +encourage novel solutions that improve latency and energy requirements. The +annotations in PARSE-Ego4D will support researchers and developers who are +working on building action recommendation systems for augmented and virtual +reality systems. + +
+
+
+
+
+ + ♻ ☆ Semantic Diversity-aware Prototype-based Learning for Unbiased Scene + Graph Generation ECCV 2024 + + +
+ The scene graph generation (SGG) task involves detecting objects within an +image and predicting predicates that represent the relationships between the +objects. However, in SGG benchmark datasets, each subject-object pair is +annotated with a single predicate even though a single predicate may exhibit +diverse semantics (i.e., semantic diversity), so existing SGG models are +trained to predict the one and only predicate for each pair. This in turn +causes SGG models to overlook the semantic diversity that may exist in a +predicate, thus leading to biased predictions. In this paper, we propose a +novel model-agnostic Semantic Diversity-aware Prototype-based Learning (DPL) +framework that enables unbiased predictions based on the understanding of the +semantic diversity of predicates. Specifically, DPL learns the regions in the +semantic space covered by each predicate to distinguish among the various +different semantics that a single predicate can represent. Extensive +experiments demonstrate that our proposed model-agnostic DPL framework brings +significant performance improvement on existing SGG models, and also +effectively understands the semantic diversity of predicates. + 
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Practical X-ray Gastric Cancer Screening Using Refined Stochastic Data + Augmentation and Hard Boundary Box Training + + +
+ Endoscopy is widely used to diagnose gastric cancer and has a high diagnostic +performance, but because it must be performed by a physician, the number of +people who can be diagnosed is limited. Gastric X-ray, on the other hand, can +be performed by technicians and can screen a much larger number of patients +than endoscopy, but its correct diagnosis requires experience. We propose an +unprecedented and practical gastric cancer diagnosis support system for gastric +X-ray images, which will enable more people to be screened. The system is based +on a general deep learning-based object detection model and includes two novel +technical proposals: refined probabilistic stomach image augmentation (R-sGAIA) +and hard boundary box learning (HBBT). R-sGAIA is a probabilistic gastric fold +region enhancement method that provides more learning patterns for cancer +detection models. HBBT is an efficient training method for object detection +models that allows the use of unannotated negative (i.e., healthy control) +samples that cannot be used for training in conventional detection models, +thereby improving model performance. The sensitivity (SE) of the proposed +system for gastric cancer (90.2%) is higher than that of the expert (85.5%), +and two out of five detected candidate boxes are cancerous, achieving a high +precision while maintaining a high processing speed of 0.51 seconds/image. The +proposed system scored 5.9 points higher on the F1 score than methods using the +same object detection model and state-of-the-art data augmentation. In short, +the system quickly and efficiently shows the radiologist where to look, greatly +reducing the radiologist's workload. + 
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ MeshSegmenter: Zero-Shot Mesh Semantic Segmentation via Texture + Synthesis ECCV2024 + + +
+ We present MeshSegmenter, a simple yet effective framework designed for +zero-shot 3D semantic segmentation. This model successfully extends the +powerful capabilities of 2D segmentation models to 3D meshes, delivering +accurate 3D segmentation across diverse meshes and segment descriptions. +Specifically, our model leverages the Segment Anything Model (SAM) model to +segment the target regions from images rendered from the 3D shape. In light of +the importance of the texture for segmentation, we also leverage the pretrained +stable diffusion model to generate images with textures from 3D shape, and +leverage SAM to segment the target regions from images with textures. Textures +supplement the shape for segmentation and facilitate accurate 3D segmentation +even in geometrically non-prominent areas, such as segmenting a car door within +a car mesh. To achieve the 3D segments, we render 2D images from different +views and conduct segmentation for both textured and untextured images. Lastly, +we develop a multi-view revoting scheme that integrates 2D segmentation results +and confidence scores from various views onto the 3D mesh, ensuring the 3D +consistency of segmentation results and eliminating inaccuracies from specific +perspectives. Through these innovations, MeshSegmenter offers stable and +reliable 3D segmentation results both quantitatively and qualitatively, +highlighting its potential as a transformative tool in the field of 3D +zero-shot segmentation. The code is available at +\url{https://github.com/zimingzhong/MeshSegmenter}. + +
+
+ comment: The paper was accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ Holoported Characters: Real-time Free-viewpoint Rendering of Humans from + Sparse RGB Cameras CVPR + + +
+ We present the first approach to render highly realistic free-viewpoint +videos of a human actor in general apparel, from sparse multi-view recording to +display, in real-time at an unprecedented 4K resolution. At inference, our +method only requires four camera views of the moving actor and the respective +3D skeletal pose. It handles actors in wide clothing, and reproduces even +fine-scale dynamic detail, e.g. clothing wrinkles, face expressions, and hand +gestures. At training time, our learning-based approach expects dense +multi-view video and a rigged static surface scan of the actor. Our method +comprises three main stages. Stage 1 is a skeleton-driven neural approach for +high-quality capture of the detailed dynamic mesh geometry. Stage 2 is a novel +solution to create a view-dependent texture using four test-time camera views +as input. Finally, stage 3 comprises a new image-based refinement network +rendering the final 4K image given the output from the previous stages. Our +approach establishes a new benchmark for real-time rendering resolution and +quality using sparse input camera views, unlocking possibilities for immersive +telepresence. + +
+
+ comment: Project page: https://vcai.mpi-inf.mpg.de/projects/holochar/ 8 pages, + 2 tables and 8 figures; presented at Computer Vision and Pattern Recognition + (CVPR) 2024 +
+
+
+
+
+ + ♻ ☆ Auto-Vocabulary Segmentation for LiDAR Points CVPR 2024 + + +
+ Existing perception methods for autonomous driving fall short of recognizing +unknown entities not covered in the training data. Open-vocabulary methods +offer promising capabilities in detecting any object but are limited by +user-specified queries representing target classes. We propose AutoVoc3D, a +framework for automatic object class recognition and open-ended segmentation. +Evaluation on nuScenes showcases AutoVoc3D's ability to generate precise +semantic classes and accurate point-wise segmentation. Moreover, we introduce +Text-Point Semantic Similarity, a new metric to assess the semantic similarity +between text and point cloud without eliminating novel classes. + +
+
+ comment: Accepted by CVPR 2024 OpenSun3D Workshop +
+
+
+
+
+ + ♻ ☆ Category Adaptation Meets Projected Distillation in Generalized + Continual Category Discovery ECCV 2024 + + +
+ Generalized Continual Category Discovery (GCCD) tackles learning from +sequentially arriving, partially labeled datasets while uncovering new +categories. Traditional methods depend on feature distillation to prevent +forgetting the old knowledge. However, this strategy restricts the model's +ability to adapt and effectively distinguish new categories. To address this, +we introduce a novel technique integrating a learnable projector with feature +distillation, thus enhancing model adaptability without sacrificing past +knowledge. The resulting distribution shift of the previously learned +categories is mitigated with the auxiliary category adaptation network. We +demonstrate that while each component offers modest benefits individually, +their combination - dubbed CAMP (Category Adaptation Meets Projected +distillation) - significantly improves the balance between learning new +information and retaining old. CAMP exhibits superior performance across +several GCCD and Class Incremental Learning scenarios. The code is available at +https://github.com/grypesc/CAMP. + +
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ♻ ☆ SPOT: Scalable 3D Pre-training via Occupancy Prediction for Learning + Transferable 3D Representations + + +
+ Annotating 3D LiDAR point clouds for perception tasks is fundamental for many +applications e.g., autonomous driving, yet it still remains notoriously +labor-intensive. Pretraining-finetuning approach can alleviate the labeling +burden by fine-tuning a pre-trained backbone across various downstream datasets +as well as tasks. In this paper, we propose SPOT, namely Scalable Pre-training +via Occupancy prediction for learning Transferable 3D representations under +such a label-efficient fine-tuning paradigm. SPOT achieves effectiveness on +various public datasets with different downstream tasks, showcasing its general +representation power, cross-domain robustness and data scalability which are +three key factors for real-world application. Specifically, we both +theoretically and empirically show, for the first time, that general +representations learning can be achieved through the task of occupancy +prediction. Then, to address the domain gap caused by different LiDAR sensors +and annotation methods, we develop a beam re-sampling technique for point cloud +augmentation combined with class-balancing strategy. Furthermore, scalable +pre-training is observed, that is, the downstream performance across all the +experiments gets better with more pre-training data. Additionally, such +pre-training strategy also remains compatible with unlabeled data. The hope is +that our findings will facilitate the understanding of LiDAR points and pave +the way for future advancements in LiDAR pre-training. + +
+
+ comment: 15 pages, 8 figures, Code is available at + https://github.com/PJLab-ADG/3DTrans +
+
+
+
+
+ + ♻ ☆ SvANet: A Scale-variant Attention-based Network for Small Medical Object + Segmentation + + +
+ Early detection and accurate diagnosis can predict the risk of malignant +disease transformation, thereby increasing the probability of effective +treatment. A mild syndrome with small infected regions is an ominous warning +and is foremost in the early diagnosis of diseases. Deep learning algorithms, +such as convolutional neural networks (CNNs), have been used to segment natural +or medical objects, showing promising results. However, analyzing medical +objects of small areas in images remains a challenge due to information losses +and compression defects caused by convolution and pooling operations in CNNs. +These losses and defects become increasingly significant as the network +deepens, particularly for small medical objects. To address these challenges, +we propose a novel scale-variant attention-based network (SvANet) for accurate +small-scale object segmentation in medical images. The SvANet consists of Monte +Carlo attention, scale-variant attention, and vision transformer, which +incorporates cross-scale features and alleviates compression artifacts for +enhancing the discrimination of small medical objects. Quantitative +experimental results demonstrate the superior performance of SvANet, achieving +96.12%, 96.11%, 89.79%, 84.15%, 80.25%, 73.05%, and 72.58% in mean Dice +coefficient for segmenting kidney tumors, skin lesions, hepatic tumors, polyps, +surgical excision cells, retinal vasculatures, and sperms, which occupy less +than 1% of the image areas in KiTS23, ISIC 2018, ATLAS, PolypGen, TissueNet, +FIVES, and SpermHealth datasets, respectively. + +
+
+ comment: 14 pages, 9 figures, under review +
+
+
+
+
+ + ♻ ☆ Language-Driven 6-DoF Grasp Detection Using Negative Prompt Guidance ECCV 2024 + + +
+ 6-DoF grasp detection has been a fundamental and challenging problem in +robotic vision. While previous works have focused on ensuring grasp stability, +they often do not consider human intention conveyed through natural language, +hindering effective collaboration between robots and users in complex 3D +environments. In this paper, we present a new approach for language-driven +6-DoF grasp detection in cluttered point clouds. We first introduce +Grasp-Anything-6D, a large-scale dataset for the language-driven 6-DoF grasp +detection task with 1M point cloud scenes and more than 200M +language-associated 3D grasp poses. We further introduce a novel diffusion +model that incorporates a new negative prompt guidance learning strategy. The +proposed negative prompt strategy directs the detection process toward the +desired object while steering away from unwanted ones given the language input. +Our method enables an end-to-end framework where humans can command the robot +to grasp desired objects in a cluttered scene using natural language. Intensive +experimental results show the effectiveness of our method in both benchmarking +experiments and real-world scenarios, surpassing other baselines. In addition, +we demonstrate the practicality of our approach in real-world robotic +applications. Our project is available at +https://airvlab.github.io/grasp-anything. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ PCR-99: A Practical Method for Point Cloud Registration with 99% + Outliers + + +
+ We propose a robust method for point cloud registration that can handle both +unknown scales and extreme outlier ratios. Our method, dubbed PCR-99, uses a +deterministic 3-point sampling approach with two novel mechanisms that +significantly boost the speed: (1) an improved ordering of the samples based on +pairwise scale consistency, prioritizing the point correspondences that are +more likely to be inliers, and (2) an efficient outlier rejection scheme based +on triplet scale consistency, prescreening bad samples and reducing the number +of hypotheses to be tested. Our evaluation shows that, up to 98% outlier ratio, +the proposed method achieves comparable performance to the state of the art. At +99% outlier ratio, however, it outperforms the state of the art for both +known-scale and unknown-scale problems. Especially for the latter, we observe a +clear superiority in terms of robustness and speed. + +
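A hedged sketch of the pairwise scale-consistency ordering idea follows: correspondences whose pairwise scale ratios agree with a robust global estimate are ranked first as likely inliers. The tolerance, pair subsampling, and scoring are assumptions, not the paper's exact procedure.

```python
# Hedged sketch: rank putative 3D-3D correspondences by how many pairwise
# scale ratios they share with a robust global scale estimate.
import numpy as np

def scale_consistency_order(P, Q, tol=0.05, n_pairs=2000, seed=0):
    """P, Q: (N, 3) putative correspondences P[i] <-> Q[i]. Returns indices best-first."""
    rng = np.random.default_rng(seed)
    N = len(P)
    i = rng.integers(0, N, size=n_pairs)
    j = rng.integers(0, N, size=n_pairs)
    keep = i != j
    i, j = i[keep], j[keep]
    ratios = np.linalg.norm(Q[i] - Q[j], axis=1) / (np.linalg.norm(P[i] - P[j], axis=1) + 1e-9)
    s_hat = np.median(ratios)                       # robust global scale estimate
    consistent = np.abs(ratios - s_hat) < tol * s_hat
    scores = np.zeros(N)
    np.add.at(scores, i, consistent)                # count consistent pairs per correspondence
    np.add.at(scores, j, consistent)
    return np.argsort(-scores)                      # sample higher-scoring points first
```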
+
+
+
+
+ + ♻ ☆ BEVCar: Camera-Radar Fusion for BEV Map and Object Segmentation IROS + + +
+ Semantic scene segmentation from a bird's-eye-view (BEV) perspective plays a +crucial role in facilitating planning and decision-making for mobile robots. +Although recent vision-only methods have demonstrated notable advancements in +performance, they often struggle under adverse illumination conditions such as +rain or nighttime. While active sensors offer a solution to this challenge, the +prohibitively high cost of LiDARs remains a limiting factor. Fusing camera data +with automotive radars poses a more inexpensive alternative but has received +less attention in prior research. In this work, we aim to advance this +promising avenue by introducing BEVCar, a novel approach for joint BEV object +and map segmentation. The core novelty of our approach lies in first learning a +point-based encoding of raw radar data, which is then leveraged to efficiently +initialize the lifting of image features into the BEV space. We perform +extensive experiments on the nuScenes dataset and demonstrate that BEVCar +outperforms the current state of the art. Moreover, we show that incorporating +radar information significantly enhances robustness in challenging +environmental conditions and improves segmentation performance for distant +objects. To foster future research, we provide the weather split of the +nuScenes dataset used in our experiments, along with our code and trained +models at http://bevcar.cs.uni-freiburg.de. + +
+
+ comment: Accepted for the IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS), 2024 +
+
+
+
+
+ + ♻ ☆ Multicenter Privacy-Preserving Model Training for Deep Learning Brain + Metastases Autosegmentation + + +
+ Objectives: This work aims to explore the impact of multicenter data +heterogeneity on deep learning brain metastases (BM) autosegmentation +performance, and assess the efficacy of an incremental transfer learning +technique, namely learning without forgetting (LWF), to improve model +generalizability without sharing raw data. + Materials and methods: A total of six BM datasets from University Hospital +Erlangen (UKER), University Hospital Zurich (USZ), Stanford, UCSF, NYU and +BraTS Challenge 2023 on BM segmentation were used for this evaluation. First, +the multicenter performance of a convolutional neural network (DeepMedic) for +BM autosegmentation was established for exclusive single-center training and +for training on pooled data, respectively. Subsequently bilateral collaboration +was evaluated, where a UKER pretrained model is shared to another center for +further training using transfer learning (TL) either with or without LWF. + Results: For single-center training, average F1 scores of BM detection range +from 0.625 (NYU) to 0.876 (UKER) on respective single-center test data. Mixed +multicenter training notably improves F1 scores at Stanford and NYU, with +negligible improvement at other centers. When the UKER pretrained model is +applied to USZ, LWF achieves a higher average F1 score (0.839) than naive TL +(0.570) and single-center training (0.688) on combined UKER and USZ test data. +Naive TL improves sensitivity and contouring accuracy, but compromises +precision. Conversely, LWF demonstrates commendable sensitivity, precision and +contouring accuracy. When applied to Stanford, similar performance was +observed. + Conclusion: Data heterogeneity results in varying performance in BM +autosegmentation, posing challenges to model generalizability. LWF is a +promising approach to peer-to-peer privacy-preserving model training. + +
+
+ comment: Official published version in the Green Journal: + https://doi.org/10.1016/j.radonc.2024.110419 +
+
+
+
+
+ + ♻ ☆ Enhanced Deep Learning Methodologies and MRI Selection Techniques for + Dementia Diagnosis in the Elderly Population + + +
+ Dementia, a debilitating neurological condition affecting millions worldwide, +presents significant diagnostic challenges. In this work, we introduce a novel +methodology for the classification of demented and non-demented elderly +patients using 3D brain Magnetic Resonance Imaging (MRI) scans. Our approach +features a unique technique for selectively processing MRI slices, focusing on +the most relevant brain regions and excluding less informative sections. This +methodology is complemented by a confidence-based classification committee +composed of three custom deep learning models: Dem3D ResNet, Dem3D CNN, and +Dem3D EfficientNet. These models work synergistically to enhance +decision-making accuracy, leveraging their collective strengths. Tested on the +Open Access Series of Imaging Studies (OASIS) dataset, our method achieved an +impressive accuracy of 94.12%, surpassing existing methodologies. Furthermore, +validation on the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset +confirmed the robustness and generalizability of our approach. The use of +explainable AI (XAI) techniques and comprehensive ablation studies further +substantiate the effectiveness of our techniques, providing insights into the +decision-making process and the importance of our methodology. This research +offers a significant advancement in dementia diagnosis, providing a highly +accurate and efficient tool for clinical applications. + 
+
+
+
+
+ + ♻ ☆ SiNGR: Brain Tumor Segmentation via Signed Normalized Geodesic Transform + Regression MICCAI 2024 + + +
+ One of the primary challenges in brain tumor segmentation arises from the +uncertainty of voxels close to tumor boundaries. However, the conventional +process of generating ground truth segmentation masks fails to treat such +uncertainties properly. Those "hard labels" with 0s and 1s conceptually +influenced the majority of prior studies on brain image segmentation. As a +result, tumor segmentation is often solved through voxel classification. In +this work, we instead view this problem as a voxel-level regression, where the +ground truth represents a certainty mapping from any pixel to the border of the +tumor. We propose a novel ground truth label transformation, which is based on +a signed geodesic transform, to capture the uncertainty in brain tumors' +vicinity. We combine this idea with a Focal-like regression L1-loss that +enables effective regression learning in high-dimensional output space by +appropriately weighting voxels according to their difficulty. We thoroughly +conduct an experimental evaluation to validate the components of our proposed +method, compare it to a diverse array of state-of-the-art segmentation models, +and show that it is architecture-agnostic. The code of our method is made +publicly available (\url{https://github.com/Oulu-IMEDS/SiNGR/}). + +
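As a rough illustration of regressing a signed, normalized distance-style target instead of hard 0/1 labels, a sketch follows; it substitutes a Euclidean distance transform for the paper's geodesic transform and is only meant to convey the label construction.

```python
# Hedged sketch: convert a hard binary tumor mask into a signed, normalized
# distance-style target suitable for voxel-level regression.
import numpy as np
from scipy.ndimage import distance_transform_edt

def signed_normalized_target(mask):
    """mask: binary array (1 inside tumor). Returns values in [-1, 1]."""
    inside = distance_transform_edt(mask)            # distance to boundary, inside the tumor
    outside = distance_transform_edt(1 - mask)       # distance to boundary, outside the tumor
    signed = inside - outside                         # positive inside, negative outside
    return signed / (np.abs(signed).max() + 1e-9)     # normalize to [-1, 1]
```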
+
+ comment: Accepted as a conference paper at MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ The Platonic Representation Hypothesis + + +
+ We argue that representations in AI models, particularly deep networks, are +converging. First, we survey many examples of convergence in the literature: +over time and across multiple domains, the ways by which different neural +networks represent data are becoming more aligned. Next, we demonstrate +convergence across data modalities: as vision models and language models get +larger, they measure distance between datapoints in a more and more alike way. +We hypothesize that this convergence is driving toward a shared statistical +model of reality, akin to Plato's concept of an ideal reality. We term such a +representation the platonic representation and discuss several possible +selective pressures toward it. Finally, we discuss the implications of these +trends, their limitations, and counterexamples to our analysis. + +
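One simple way to quantify such cross-model alignment is linear CKA between two models' embeddings of the same inputs; the sketch below is illustrative only and is not the paper's exact metric.

```python
# Hedged sketch: linear CKA between two embedding matrices of the same
# datapoints; values near 1 indicate highly aligned representations.
import numpy as np

def linear_cka(X, Y):
    """X: (n, d1), Y: (n, d2) embeddings of the same n datapoints."""
    X = X - X.mean(axis=0)
    Y = Y - Y.mean(axis=0)
    hsic = np.linalg.norm(Y.T @ X, 'fro') ** 2
    return hsic / (np.linalg.norm(X.T @ X, 'fro') * np.linalg.norm(Y.T @ Y, 'fro'))
```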
+
+ comment: Equal contributions. Project: https://phillipi.github.io/prh/ Code: + https://github.com/minyoungg/platonic-rep +
+
+
+
+
+ + ♻ ☆ Image-level Regression for Uncertainty-aware Retinal Image Segmentation + + +
+ Accurate retinal vessel (RV) segmentation is a crucial step in the +quantitative assessment of retinal vasculature, which is needed for the early +detection of retinal diseases and other conditions. Numerous studies have been +conducted to tackle the problem of segmenting vessels automatically using a +pixel-wise classification approach. The common practice of creating ground +truth labels is to categorize pixels as foreground and background. This +approach is, however, biased, and it ignores the uncertainty of a human +annotator when it comes to annotating e.g. thin vessels. In this work, we +propose a simple and effective method that casts the RV segmentation task as an +image-level regression. For this purpose, we first introduce a novel +Segmentation Annotation Uncertainty-Aware (SAUNA) transform, which adds pixel +uncertainty to the ground truth using the pixel's closeness to the annotation +boundary and vessel thickness. To train our model with soft labels, we +generalize the earlier proposed Jaccard metric loss to arbitrary hypercubes for +soft Jaccard index (Intersection-over-Union) optimization. Additionally, we +employ a stable version of the Focal-L1 loss for pixel-wise regression. We +conduct thorough experiments and compare our method to a diverse set of +baselines across 5 retinal image datasets. Our empirical results indicate that +the integration of the SAUNA transform and these segmentation losses led to +significant performance boosts for different segmentation models. Particularly, +our methodology enables UNet-like architectures to substantially outperform +computational-intensive baselines. Our implementation is available at +\url{https://github.com/Oulu-IMEDS/SAUNA}. + +
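A hedged sketch of a SAUNA-style soft label built from boundary closeness and vessel thickness is shown below; the exact weighting used in the paper is not reproduced, and the thickness cap is an assumption.

```python
# Hedged sketch: soft vessel targets whose certainty decreases near the
# annotation boundary and for thin vessels.
import numpy as np
from scipy.ndimage import distance_transform_edt

def soft_vessel_label(mask, max_thickness=5.0):
    """mask: binary vessel annotation. Returns soft targets in [0, 1]."""
    dist_in = distance_transform_edt(mask)            # distance to background (thickness proxy)
    dist_out = distance_transform_edt(1 - mask)       # distance to the nearest vessel
    inside = np.clip(dist_in / max_thickness, 0, 1)   # thin vessels -> lower certainty
    outside = 1.0 - np.clip(dist_out / max_thickness, 0, 1)
    return np.where(mask > 0, 0.5 + 0.5 * inside, 0.5 * outside)
```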
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ LPGen: Enhancing High-Fidelity Landscape Painting Generation through + Diffusion Model + + +
+ Generating landscape paintings expands the possibilities of artistic +creativity and imagination. Traditional landscape painting methods involve +using ink or colored ink on rice paper, which requires substantial time and +effort. These methods are susceptible to errors and inconsistencies and lack +precise control over lines and colors. This paper presents LPGen, a +high-fidelity, controllable model for landscape painting generation, +introducing a novel multi-modal framework that integrates image prompts into +the diffusion model. We extract edges and contours by computing Canny edges +from the target landscape image. These, along with natural language text +prompts and drawing style references, are fed into the latent diffusion model +as conditions. We implement a decoupled cross-attention strategy to ensure +compatibility between image and text prompts, facilitating multi-modal image +generation. A decoder generates the final image. Quantitative and qualitative +analyses demonstrate that our method outperforms existing approaches in +landscape painting generation and exceeds the current state-of-the-art. The +LPGen network effectively controls the composition and color of landscape +paintings, generates more accurate images, and supports further research in +deep learning-based landscape painting generation. + 
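The Canny-edge conditioning step can be sketched as follows; the thresholds and file path are illustrative placeholders, not the paper's settings.

```python
# Hedged sketch: extract a Canny edge map from the target landscape image to
# serve as an image condition for the diffusion model.
import cv2

def canny_condition(image_path, low=100, high=200):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    edges = cv2.Canny(img, low, high)        # binary edge map (uint8, 0/255)
    return edges

edge_map = canny_condition("landscape.jpg")  # hypothetical input file
```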
+
+
+
+
+ + ♻ ☆ Deep Learning for Cancer Prognosis Prediction Using Portrait Photos by + StyleGAN Embedding MICCAI 2024 + + +
+ Survival prediction for cancer patients is critical for optimal treatment +selection and patient management. Current patient survival prediction methods +typically extract survival information from patients' clinical record data or +biological and imaging data. In practice, experienced clinicians can have a +preliminary assessment of patients' health status based on patients' observable +physical appearances, which are mainly facial features. However, such +assessment is highly subjective. In this work, the efficacy of objectively +capturing and using prognostic information contained in conventional portrait +photographs using deep learning for survival prediction purposes is +investigated for the first time. A pre-trained StyleGAN2 model is fine-tuned on +a custom dataset of our cancer patients' photos to empower its generator with +generative ability suitable for patients' photos. The StyleGAN2 is then used to +embed the photographs to its highly expressive latent space. Utilizing the +state-of-the-art survival analysis models and based on StyleGAN's latent space +photo embeddings, this approach achieved a C-index of 0.677, which is notably +higher than chance, evidencing the prognostic value embedded in simple 2D +facial images. In addition, thanks to StyleGAN's interpretable latent space, +our survival prediction model can be validated for relying on essential facial +features, eliminating any biases from extraneous information like clothing or +background. Moreover, a health attribute is obtained from regression +coefficients, which has important potential value for patient care. + 
+
+ comment: MICCAI 2024 Early Accept +
+
+
+
+
+ + ♻ ☆ FreeInit: Bridging Initialization Gap in Video Diffusion Models + + +
+ Though diffusion-based video generation has witnessed rapid progress, the +inference results of existing models still exhibit unsatisfactory temporal +consistency and unnatural dynamics. In this paper, we delve deep into the noise +initialization of video diffusion models, and discover an implicit +training-inference gap that contributes to the unsatisfactory inference +quality. Our key findings are: 1) the spatial-temporal frequency distribution of +the initial noise at inference is intrinsically different from that for +training, and 2) the denoising process is significantly influenced by the +low-frequency components of the initial noise. Motivated by these observations, +we propose a concise yet effective inference sampling strategy, FreeInit, which +significantly improves temporal consistency of videos generated by diffusion +models. Through iteratively refining the spatial-temporal low-frequency +components of the initial latent during inference, FreeInit is able to +compensate for the initialization gap between training and inference, thus +effectively improving the subject appearance and temporal consistency of +generation results. Extensive experiments demonstrate that FreeInit +consistently enhances the generation quality of various text-to-video diffusion +models without additional training or fine-tuning. + 
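A hedged sketch of one refinement step in this spirit: keep the low-frequency band of the previously denoised video latent and refresh its high frequencies with new noise before sampling again. The FFT axes, filter shape, and cutoff are assumptions; the official code linked below is the authoritative reference.

```python
# Hedged sketch: low-frequency refinement of the initial video latent.
import numpy as np

def reinitialize_latent(denoised_latent, cutoff=0.25, seed=0):
    """denoised_latent: (T, C, H, W) video latent from the previous sampling pass."""
    rng = np.random.default_rng(seed)
    freq = np.fft.fftn(denoised_latent, axes=(0, 2, 3))
    noise = np.fft.fftn(rng.standard_normal(denoised_latent.shape), axes=(0, 2, 3))
    # Boolean low-pass mask over the (temporal, height, width) frequencies.
    grids = np.meshgrid(*[np.fft.fftfreq(denoised_latent.shape[a]) for a in (0, 2, 3)],
                        indexing="ij")
    low_pass = (np.sqrt(sum(g ** 2 for g in grids)) < cutoff)[:, None, :, :]
    mixed = np.where(low_pass, freq, noise)     # keep low freqs, refresh high freqs
    return np.real(np.fft.ifftn(mixed, axes=(0, 2, 3)))
```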
+
+ comment: Project page: https://tianxingwu.github.io/pages/FreeInit/ Code: + https://github.com/TianxingWu/FreeInit +
+
+
+
+
+ + ♻ ☆ Bridging Sensor Gaps via Attention Gated Tuning for Hyperspectral Image + Classification + + +
+ Data-hungry HSI classification methods require high-quality labeled HSIs, +which are often costly to obtain. This characteristic limits the performance +potential of data-driven methods when dealing with limited annotated samples. +Bridging the domain gap between data acquired from different sensors allows us +to utilize abundant labeled data across sensors to break this bottleneck. In +this paper, we propose a novel Attention-Gated Tuning (AGT) strategy and a +triplet-structured transformer model, Tri-Former, to address this issue. The +AGT strategy serves as a bridge, allowing us to leverage existing labeled HSI +datasets, even RGB datasets to enhance the performance on new HSI datasets with +limited samples. Instead of inserting additional parameters inside the basic +model, we train a lightweight auxiliary branch that takes intermediate features +as input from the basic model and makes predictions. The proposed AGT resolves +conflicts between heterogeneous and even cross-modal data by suppressing the +disturbing information and enhances the useful information through a soft gate. +Additionally, we introduce Tri-Former, a triplet-structured transformer with a +spectral-spatial separation design that enhances parameter utilization and +computational efficiency, enabling easier and flexible fine-tuning. Comparison +experiments conducted on three representative HSI datasets captured by +different sensors demonstrate the proposed Tri-Former achieves better +performance compared to several state-of-the-art methods. Homologous, +heterologous and cross-modal tuning experiments verified the effectiveness of +the proposed AGT. Code has been released at: +\href{https://github.com/Cecilia-xue/AGT}{https://github.com/Cecilia-xue/AGT}. + +
+
+
+
+
+ + ♻ ☆ Reliable Spatial-Temporal Voxels For Multi-Modal Test-Time Adaptation + + +
+ Multi-modal test-time adaptation (MM-TTA) is proposed to adapt models to an +unlabeled target domain by leveraging the complementary multi-modal inputs in +an online manner. Previous MM-TTA methods for 3D segmentation rely on +predictions of cross-modal information in each input frame, while they ignore +the fact that predictions of geometric neighborhoods within consecutive frames +are highly correlated, leading to unstable predictions across time. To fill +this gap, we propose ReLiable Spatial-temporal Voxels (Latte), an MM-TTA method +that leverages reliable cross-modal spatial-temporal correspondences for +multi-modal 3D segmentation. Motivated by the fact that reliable predictions +should be consistent with their spatial-temporal correspondences, Latte +aggregates consecutive frames in a sliding-window manner and constructs +Spatial-Temporal (ST) voxels to capture temporally local prediction consistency +for each modality. After filtering out ST voxels with high ST entropy, Latte +conducts cross-modal learning for each point and pixel by attending to those +with reliable and consistent predictions among both spatial and temporal +neighborhoods. Experimental results show that Latte achieves state-of-the-art +performance on three different MM-TTA benchmarks compared to previous MM-TTA or +TTA methods. Visit our project site https://sites.google.com/view/eccv24-latte. + 
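A hedged sketch of the reliability filter: aggregate predictions inside each spatial-temporal voxel and keep only low-entropy voxels. The voxelization, normalization, and threshold are simplified assumptions.

```python
# Hedged sketch: entropy-based filtering of spatial-temporal (ST) voxels.
import numpy as np

def reliable_voxels(probs, voxel_ids, num_voxels, entropy_thresh=0.5):
    """probs: (N, C) per-point class probabilities pooled from a sliding window of frames;
    voxel_ids: (N,) index of the ST voxel each point falls into."""
    C = probs.shape[1]
    voxel_probs = np.zeros((num_voxels, C))
    np.add.at(voxel_probs, voxel_ids, probs)                      # accumulate per voxel
    voxel_probs /= np.maximum(voxel_probs.sum(1, keepdims=True), 1e-9)
    entropy = -(voxel_probs * np.log(voxel_probs + 1e-9)).sum(1) / np.log(C)
    return entropy < entropy_thresh                                # mask of reliable ST voxels
```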
+
+
+
+
+ + ♻ ☆ DreamScene360: Unconstrained Text-to-3D Scene Generation with Panoramic + Gaussian Splatting + + +
+ The increasing demand for virtual reality applications has highlighted the +significance of crafting immersive 3D assets. We present a text-to-3D +360$^{\circ}$ scene generation pipeline that facilitates the creation of +comprehensive 360$^{\circ}$ scenes for in-the-wild environments in a matter of +minutes. Our approach utilizes the generative power of a 2D diffusion model and +prompt self-refinement to create a high-quality and globally coherent panoramic +image. This image acts as a preliminary "flat" (2D) scene representation. +Subsequently, it is lifted into 3D Gaussians, employing splatting techniques to +enable real-time exploration. To produce consistent 3D geometry, our pipeline +constructs a spatially coherent structure by aligning the 2D monocular depth +into a globally optimized point cloud. This point cloud serves as the initial +state for the centroids of 3D Gaussians. In order to address invisible issues +inherent in single-view inputs, we impose semantic and geometric constraints on +both synthesized and input camera views as regularizations. These guide the +optimization of Gaussians, aiding in the reconstruction of unseen regions. In +summary, our method offers a globally consistent 3D scene within a +360$^{\circ}$ perspective, providing an enhanced immersive experience over +existing techniques. Project website at: http://dreamscene360.github.io/ + +
+
+
+
+
+ + ♻ ☆ COIN: Counterfactual inpainting for weakly supervised semantic + segmentation for medical images + + +
+ Deep learning is dramatically transforming the field of medical imaging and +radiology, enabling the identification of pathologies in medical images, +including computed tomography (CT) and X-ray scans. However, the performance of +deep learning models, particularly in segmentation tasks, is often limited by +the need for extensive annotated datasets. To address this challenge, the +capabilities of weakly supervised semantic segmentation are explored through +the lens of Explainable AI and the generation of counterfactual explanations. +The scope of this research is development of a novel counterfactual inpainting +approach (COIN) that flips the predicted classification label from abnormal to +normal by using a generative model. For instance, if the classifier deems an +input medical image X as abnormal, indicating the presence of a pathology, the +generative model aims to inpaint the abnormal region, thus reversing the +classifier's original prediction label. The approach enables us to produce +precise segmentations for pathologies without depending on pre-existing +segmentation masks. Crucially, image-level labels are utilized, which are +substantially easier to acquire than creating detailed segmentation masks. The +effectiveness of the method is demonstrated by segmenting synthetic targets and +actual kidney tumors from CT images acquired from Tartu University Hospital in +Estonia. The findings indicate that COIN greatly surpasses established +attribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an +alternative counterfactual explanation method introduced by Singla et al. This +evidence suggests that COIN is a promising approach for semantic segmentation +of tumors in CT images, and presents a step forward in making deep learning +applications more accessible and effective in healthcare, where annotated data +is scarce. + +
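A hedged sketch of how a segmentation could be read off a counterfactual inpainting: the mask is taken from where the generator had to change the image to flip the classifier's decision. The placeholder `inpaint_to_normal` stands in for a trained generative model, and the thresholding is illustrative only.

```python
# Hedged sketch: derive a pathology mask from the difference between the input
# image and its counterfactual "normal" inpainting.
import numpy as np

def coin_style_segmentation(image, inpaint_to_normal, thresh=0.1):
    healthy = inpaint_to_normal(image)               # counterfactual image (placeholder call)
    diff = np.abs(image.astype(np.float32) - healthy.astype(np.float32))
    diff = diff / (diff.max() + 1e-9)                # normalize change magnitude to [0, 1]
    return diff > thresh                             # binary pathology mask
```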
+
+ comment: This work has been accepted to be presented to The 2nd World + Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19, + 2024 - Valletta, Malta +
+
+
+
+
+ + ♻ ☆ Imperative Learning: A Self-supervised Neural-Symbolic Learning + Framework for Robot Autonomy + + +
+ Data-driven methods such as reinforcement and imitation learning have +achieved remarkable success in robot autonomy. However, their data-centric +nature still hinders them from generalizing well to ever-changing environments. +Moreover, collecting large datasets for robotic tasks is often impractical and +expensive. To overcome these challenges, we introduce a new self-supervised +neural-symbolic (NeSy) computational framework, imperative learning (IL), for +robot autonomy, leveraging the generalization abilities of symbolic reasoning. +The framework of IL consists of three primary components: a neural module, a +reasoning engine, and a memory system. We formulate IL as a special bilevel +optimization (BLO), which enables reciprocal learning over the three modules. +This overcomes the label-intensive obstacles associated with data-driven +approaches and takes advantage of symbolic reasoning concerning logical +reasoning, physical principles, geometric analysis, etc. We discuss several +optimization techniques for IL and verify their effectiveness in five distinct +robot autonomy tasks including path planning, rule induction, optimal control, +visual odometry, and multi-robot routing. Through various experiments, we show +that IL can significantly enhance robot autonomy capabilities and we anticipate +that it will catalyze further research across diverse domains. + +
+
+
+
+
+ + ♻ ☆ LangOcc: Self-Supervised Open Vocabulary Occupancy Estimation via Volume + Rendering + + +
+ The 3D occupancy estimation task has become an important challenge in the +area of vision-based autonomous driving recently. However, most existing +camera-based methods rely on costly 3D voxel labels or LiDAR scans for +training, limiting their practicality and scalability. Moreover, most methods +are tied to a predefined set of classes which they can detect. In this work we +present a novel approach for open vocabulary occupancy estimation called +LangOcc, that is trained only via camera images, and can detect arbitrary +semantics via vision-language alignment. In particular, we distill the +knowledge of the strong vision-language aligned encoder CLIP into a 3D +occupancy model via differentiable volume rendering. Our model estimates +vision-language aligned features in a 3D voxel grid using only images. It is +trained in a self-supervised manner by rendering our estimations back to 2D +space, where ground-truth features can be computed. This training mechanism +automatically supervises the scene geometry, allowing for a straight-forward +and powerful training method without any explicit geometry supervision. LangOcc +outperforms LiDAR-supervised competitors in open vocabulary occupancy by a +large margin, solely relying on vision-based training. We also achieve +state-of-the-art results in self-supervised semantic occupancy estimation on +the Occ3D-nuScenes dataset, despite not being limited to a specific set of +categories, thus demonstrating the effectiveness of our proposed +vision-language training. + +
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream
+tasks across diverse fields, such as language, vision, and multi-modality. To
+minimize the adaptation cost for downstream tasks, many Parameter-Efficient
+Fine-Tuning (PEFT) techniques have been proposed for language and 2D image
+pre-trained models. However, specialized PEFT methods for 3D pre-trained models
+are still under-explored. To this end, we introduce Point-PEFT, a novel
+framework for adapting point cloud pre-trained models with minimal learnable
+parameters. Specifically, for a pre-trained 3D model, we freeze most of its
+parameters and only tune the newly added PEFT modules on downstream tasks,
+which consist of a Point-prior Prompt and a Geometry-aware Adapter. The
+Point-prior Prompt adopts a set of learnable prompt tokens, for which we
+propose to construct a memory bank with domain-specific knowledge and utilize a
+parameter-free attention mechanism to enhance the prompt tokens. The
+Geometry-aware Adapter aims to aggregate point cloud features within spatial
+neighborhoods to capture fine-grained geometric information through local
+interactions. Extensive experiments indicate that our Point-PEFT can achieve
+better performance than full fine-tuning on various downstream tasks, while
+using only 5% of the trainable parameters, demonstrating the efficiency and
+effectiveness of our approach. Code is released at
+https://github.com/Ivan-Tang-3D/Point-PEFT.
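+ As an illustration of the parameter-free attention described above, here is a
+minimal sketch (PyTorch assumed; the function and tensor names are hypothetical,
+not the authors' released code): prompt tokens attend over a domain-specific
+memory bank using plain cosine similarity, with no learnable projections.
+
+    import torch
+    import torch.nn.functional as F
+
+    def parameter_free_prompt_enhance(prompts, memory_bank, tau=0.07):
+        # prompts: (n_prompts, d); memory_bank: (n_memory, d)
+        p = F.normalize(prompts, dim=-1)
+        m = F.normalize(memory_bank, dim=-1)
+        attn = (p @ m.T / tau).softmax(dim=-1)  # cosine-similarity weights, no learned parameters
+        return prompts + attn @ memory_bank     # fold aggregated memory features back into prompts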
+
+ comment: The specialized PEFT framework for 3D pre-trained models, which + achieves competitive performance to full fine-tuning, and significantly + reduces the computational resources. Project page: + https://github.com/Ivan-Tang-3D/Point-PEFT +
+
+
+
+
+ + ♻ ☆ Restoring Images in Adverse Weather Conditions via Histogram Transformer + + +
+ Transformer-based image restoration methods in adverse weather have achieved
+significant progress. Most of them use self-attention along the channel
+dimension or within spatially fixed-range blocks to reduce computational load.
+However, such a compromise results in limitations in capturing long-range
+spatial features. Inspired by the observation that weather-induced degradation
+factors mainly cause similar patterns of occlusion and brightness, in this work
+we propose an efficient Histogram Transformer (Histoformer) for restoring
+images affected by adverse weather. It is powered by a mechanism dubbed
+histogram self-attention, which sorts and segments spatial features into
+intensity-based bins. Self-attention is then applied across bins or within each
+bin to selectively focus on spatial features of dynamic range and process
+similarly degraded pixels over long ranges together. To boost histogram
+self-attention, we present a dynamic-range convolution that enables
+conventional convolution to operate over similar pixels rather than neighboring
+pixels. We also observe that common pixel-wise losses neglect the linear
+association and correlation between output and ground truth. Thus, we propose
+to leverage the Pearson correlation coefficient as a loss function to encourage
+the recovered pixels to follow the same ordering as the ground truth. Extensive
+experiments demonstrate the efficacy and superiority of our proposed method.
+The code has been released on GitHub.
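+ A minimal illustrative sketch of the Pearson-correlation loss mentioned above
+(PyTorch assumed; names are hypothetical and not the authors' released code):
+
+    import torch
+
+    def pearson_loss(pred, target, eps=1e-8):
+        # flatten each image, center it, and penalize low linear correlation
+        p = pred.flatten(1) - pred.flatten(1).mean(dim=1, keepdim=True)
+        t = target.flatten(1) - target.flatten(1).mean(dim=1, keepdim=True)
+        corr = (p * t).sum(dim=1) / (p.norm(dim=1) * t.norm(dim=1) + eps)
+        return (1.0 - corr).mean()
+
+ In practice such a term would presumably be combined with a standard
+pixel-wise loss rather than used alone.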
+
+ comment: 19 pages, 7 figures, 10MB +
+
+
+
+
+ + ♻ ☆ HarmonicNeRF: Geometry-Informed Synthetic View Augmentation for 3D Scene + Reconstruction in Driving Scenarios ACM MM 2024 + + +
+ In the realm of autonomous driving, achieving precise 3D reconstruction of +the driving environment is critical for ensuring safety and effective +navigation. Neural Radiance Fields (NeRF) have shown promise in creating highly +detailed and accurate models of complex environments. However, the application +of NeRF in autonomous driving scenarios encounters several challenges, +primarily due to the sparsity of viewpoints inherent in camera trajectories and +the constraints on data collection in unbounded outdoor scenes, which typically +occur along predetermined paths. This limitation not only reduces the available +scene information but also poses significant challenges for NeRF training, as +the sparse and path-distributed observational data leads to +under-representation of the scene's geometry. In this paper, we introduce +HarmonicNeRF, a novel approach for outdoor self-supervised monocular scene +reconstruction. HarmonicNeRF capitalizes on the strengths of NeRF and enhances +surface reconstruction accuracy by augmenting the input space with +geometry-informed synthetic views. This is achieved through the application of +spherical harmonics to generate novel radiance values, taking into careful +consideration the color observations from the limited available real-world +views. Additionally, our method incorporates proxy geometry to effectively +manage occlusion, generating radiance pseudo-labels that circumvent the +limitations of traditional image-warping techniques, which often fail in sparse +data conditions typical of autonomous driving environments. Extensive +experiments conducted on the KITTI, Argoverse, and NuScenes datasets +demonstrate our approach establishes new benchmarks in synthesizing novel depth +views and reconstructing scenes, significantly outperforming existing methods. +Project page: https://github.com/Jiawei-Yao0812/HarmonicNeRF + +
+
+ comment: Accepted by ACM MM 2024, project page: + https://github.com/Jiawei-Yao0812/HarmonicNeRF +
+
+
+
+
+ + ♻ ☆ CCVA-FL: Cross-Client Variations Adaptive Federated Learning for Medical + Imaging + + +
+ Federated Learning (FL) offers a privacy-preserving approach to train models +on decentralized data. Its potential in healthcare is significant, but +challenges arise due to cross-client variations in medical image data, +exacerbated by limited annotations. This paper introduces Cross-Client +Variations Adaptive Federated Learning (CCVA-FL) to address these issues. +CCVA-FL aims to minimize cross-client variations by transforming images into a +common feature space. It involves expert annotation of a subset of images from +each client, followed by the selection of a client with the least data +complexity as the target. Synthetic medical images are then generated using +Scalable Diffusion Models with Transformers (DiT) based on the target client's +annotated images. These synthetic images, capturing diversity and representing +the original data, are shared with other clients. Each client then translates +its local images into the target image space using image-to-image translation. +The translated images are subsequently used in a federated learning setting to +develop a server model. Our results demonstrate that CCVA-FL outperforms +Vanilla Federated Averaging by effectively addressing data distribution +differences across clients without compromising privacy. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Chemical Shift Encoding based Double Bonds Quantification in + Triglycerides using Deep Image Prior + + +
+ This study evaluated a deep learning-based method using Deep Image Prior +(DIP) to quantify triglyceride double bonds from chemical-shift encoded +multi-echo gradient echo images without network training. We employed a cost +function based on signal constraints to iteratively update the neural network +on a single dataset. The method was validated using phantom experiments and in +vivo scans. Results showed close alignment between measured and reference +double bond values, with phantom experiments yielding a Pearson correlation +coefficient of 0.96 (p = .0005). In vivo results demonstrated good agreement in +subcutaneous fat. We conclude that Deep Image Prior shows feasibility for +quantifying double bonds and fatty acid content from chemical-shift encoded +multi-echo MRI. + +
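+ A rough sketch of the Deep Image Prior style fitting loop the abstract
+describes (PyTorch assumed; `net` and `signal_model` are placeholders for an
+untrained network and a fat-water signal model, not the authors' code): the
+randomly initialized network is optimized on a single acquisition, with the
+cost comparing synthesized multi-echo signals to the measured echoes.
+
+    import torch
+
+    def fit_dip(net, echoes, echo_times, signal_model, steps=2000, lr=1e-3):
+        z = torch.randn(1, 32, *echoes.shape[-2:])     # fixed random network input
+        opt = torch.optim.Adam(net.parameters(), lr=lr)
+        for _ in range(steps):
+            params = net(z)                             # parameter maps (e.g. water, fat, R2*)
+            pred = signal_model(params, echo_times)     # synthesize the multi-echo signal
+            loss = (pred - echoes).abs().mean()         # signal-constraint cost, no training labels
+            opt.zero_grad(); loss.backward(); opt.step()
+        return net(z)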
+
+
+
+
+ + ♻ ☆ Domain Generalized Recaptured Screen Image Identification Using SWIN + Transformer + + +
+ An increasing number of classification approaches have been developed to
+address the issue of image rebroadcast and recapturing, a standard attack
+strategy in insurance fraud, face spoofing, and video piracy. However, most of
+them neglect scale variations and domain generalization scenarios, performing
+poorly in instances involving domain shifts, typically made worse by
+inter-domain and cross-domain scale variances. To overcome these issues, we
+propose a cascaded data augmentation and SWIN transformer domain generalization
+framework (DAST-DG) in the current research work. Initially, we examine the
+disparity in dataset representation. A feature generator is trained to make
+authentic images from various domains indistinguishable. This process is then
+applied to recaptured images, creating a dual adversarial learning setup.
+Extensive experiments demonstrate that our approach is practical and surpasses
+state-of-the-art methods across different databases. Our model achieves an
+accuracy of approximately 82% with a precision of 95% on high-variance
+datasets.
+
+ comment: 11 pages, 10 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ QE-BEV: Query Evolution for Bird's Eye View Object Detection in Varied + Contexts ACM MM 2024 + + +
+ 3D object detection plays a pivotal role in autonomous driving and robotics,
+demanding precise interpretation of Bird's Eye View (BEV) images. The dynamic
+nature of real-world environments necessitates the use of dynamic query
+mechanisms in 3D object detection to adaptively capture and process the complex
+spatio-temporal relationships present in these scenes. However, prior
+implementations of dynamic queries have often faced difficulties in effectively
+leveraging these relationships, particularly when it comes to integrating
+temporal information in a computationally efficient manner. Addressing this
+limitation, we introduce a framework built on a dynamic query evolution
+strategy that harnesses K-means clustering and Top-K attention mechanisms for
+refined spatio-temporal data processing. By dynamically segmenting the BEV
+space and prioritizing key features through Top-K attention, our model achieves
+a real-time, focused analysis of pertinent scene elements. Our extensive
+evaluation on the nuScenes and Waymo datasets showcases a marked improvement in
+detection accuracy, setting a new benchmark in the domain of query-based BEV
+object detection. Our dynamic query evolution strategy has the potential to
+push the boundaries of current BEV methods with enhanced adaptability and
+computational efficiency. Project page:
+https://github.com/Jiawei-Yao0812/QE-BEV
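+ A small illustrative sketch of the Top-K attention idea referenced above
+(PyTorch assumed; names and the choice of K are hypothetical, not the authors'
+code): each query attends only to its K highest-scoring keys, keeping the
+query update sparse and cheap.
+
+    import math
+    import torch
+
+    def topk_attention(q, k, v, K=32):
+        scores = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
+        vals, idx = scores.topk(K, dim=-1)                 # keep only the K best keys per query
+        masked = torch.full_like(scores, float("-inf")).scatter_(-1, idx, vals)
+        return masked.softmax(dim=-1) @ v                  # non-selected keys get zero weight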
+
+ comment: Accepted by ACM MM 2024, project page: + https://github.com/Jiawei-Yao0812/QE-BEV +
+
+
+
+
+ + ♻ ☆ YOLOv10 to Its Genesis: A Decadal and Comprehensive Review of The You + Only Look Once (YOLO) Series + + +
+ This review systematically examines the progression of the You Only Look Once
+(YOLO) object detection algorithms from YOLOv1 to the recently unveiled
+YOLOv10. Employing a reverse chronological analysis, this study examines the
+advancements introduced by YOLO algorithms, beginning with YOLOv10 and
+progressing through YOLOv9, YOLOv8, and subsequent versions to explore each
+version's contributions to enhancing speed, accuracy, and computational
+efficiency in real-time object detection. The study highlights the
+transformative impact of YOLO across five critical application areas:
+automotive safety, healthcare, industrial manufacturing, surveillance, and
+agriculture. By detailing the incremental technological advancements in
+subsequent YOLO versions, this review chronicles the evolution of YOLO and
+discusses the challenges and limitations of each earlier version. The evolution
+signifies a path towards integrating YOLO with multimodal, context-aware, and
+Artificial General Intelligence (AGI) systems for the next YOLO decade,
+promising significant implications for future developments in AI-driven
+applications.
+
+ comment: 11 Figures, 7 Tables +
+
+
+
+
+ + ♻ ☆ Enhancing Environmental Monitoring through Multispectral Imaging: The + WasteMS Dataset for Semantic Segmentation of Lakeside Waste + + +
+ Environmental monitoring of lakeside green areas is crucial for environmental +protection. Compared to manual inspections, computer vision technologies offer +a more efficient solution when deployed on-site. Multispectral imaging provides +diverse information about objects under different spectrums, aiding in the +differentiation between waste and lakeside lawn environments. This study +introduces WasteMS, the first multispectral dataset established for the +semantic segmentation of lakeside waste. WasteMS includes a diverse range of +waste types in lawn environments, captured under various lighting conditions. +We implemented a rigorous annotation process to label waste in images. +Representative semantic segmentation frameworks were used to evaluate +segmentation accuracy using WasteMS. Challenges encountered when using WasteMS +for segmenting waste on lakeside lawns were discussed. The WasteMS dataset is +available at https://github.com/zhuqinfeng1999/WasteMS. + +
+
+
+
+
+ + ♻ ☆ Harnessing Intra-group Variations Via a Population-Level Context for + Pathology Detection + + +
+ Realizing sufficient separability between the distributions of healthy and +pathological samples is a critical obstacle for pathology detection +convolutional models. Moreover, these models exhibit a bias for contrast-based +images, with diminished performance on texture-based medical images. This study +introduces the notion of a population-level context for pathology detection and +employs a graph theoretic approach to model and incorporate it into the latent +code of an autoencoder via a refinement module we term PopuSense. PopuSense +seeks to capture additional intra-group variations inherent in biomedical data +that a local or global context of the convolutional model might miss or smooth +out. Proof-of-concept experiments on contrast-based and texture-based images, +with minimal adaptation, encounter the existing preference for intensity-based +input. Nevertheless, PopuSense demonstrates improved separability in +contrast-based images, presenting an additional avenue for refining +representations learned by a model. + +
+
+
+
+
+ + ♻ ☆ Vision language models are blind + + +
+ While large language models with vision capabilities (VLMs), e.g., GPT-4o and
+Gemini 1.5 Pro, are powering various image-text applications and scoring high
+on many vision-understanding benchmarks, we find that they are surprisingly
+still struggling with low-level vision tasks that are easy for humans.
+Specifically, on BlindTest, our suite of 7 very simple tasks such as
+identifying (a) whether two circles overlap; (b) whether two lines intersect;
+(c) which letter is being circled in a word; and (d) counting circles in an
+Olympic-like logo, four state-of-the-art VLMs are only 58.57% accurate on
+average. Claude 3.5 Sonnet performs the best at 74.01% accuracy, but this is
+still far from the expected human accuracy of 100%. Across different image
+resolutions and line widths, VLMs consistently struggle with tasks that require
+precise spatial information and recognizing geometric primitives that overlap
+or are close together. Code and data are available at:
+https://vlmsareblind.github.io
+
+
+
+
+ + ♻ ☆ Multi-Modality Co-Learning for Efficient Skeleton-based Action + Recognition + + +
+ Skeleton-based action recognition has garnered significant attention due to +the utilization of concise and resilient skeletons. Nevertheless, the absence +of detailed body information in skeletons restricts performance, while other +multimodal methods require substantial inference resources and are inefficient +when using multimodal data during both training and inference stages. To +address this and fully harness the complementary multimodal features, we +propose a novel multi-modality co-learning (MMCL) framework by leveraging the +multimodal large language models (LLMs) as auxiliary networks for efficient +skeleton-based action recognition, which engages in multi-modality co-learning +during the training stage and keeps efficiency by employing only concise +skeletons in inference. Our MMCL framework primarily consists of two modules. +First, the Feature Alignment Module (FAM) extracts rich RGB features from video +frames and aligns them with global skeleton features via contrastive learning. +Second, the Feature Refinement Module (FRM) uses RGB images with temporal +information and text instruction to generate instructive features based on the +powerful generalization of multimodal LLMs. These instructive text features +will further refine the classification scores and the refined scores will +enhance the model's robustness and generalization in a manner similar to soft +labels. Extensive experiments on NTU RGB+D, NTU RGB+D 120 and Northwestern-UCLA +benchmarks consistently verify the effectiveness of our MMCL, which outperforms +the existing skeleton-based action recognition methods. Meanwhile, experiments +on UTD-MHAD and SYSU-Action datasets demonstrate the commendable generalization +of our MMCL in zero-shot and domain-adaptive action recognition. Our code is +publicly available at: https://github.com/liujf69/MMCL-Action. + +
+
+
+
+
+ + ♻ ☆ JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large + Language and Vision-Language Models + + +
+ The rapid evolution of artificial intelligence (AI) through developments in +Large Language Models (LLMs) and Vision-Language Models (VLMs) has brought +significant advancements across various technological domains. While these +models enhance capabilities in natural language processing and visual +interactive tasks, their growing adoption raises critical concerns regarding +security and ethical alignment. This survey provides an extensive review of the +emerging field of jailbreaking--deliberately circumventing the ethical and +operational boundaries of LLMs and VLMs--and the consequent development of +defense mechanisms. Our study categorizes jailbreaks into seven distinct types +and elaborates on defense strategies that address these vulnerabilities. +Through this comprehensive examination, we identify research gaps and propose +directions for future studies to enhance the security frameworks of LLMs and +VLMs. Our findings underscore the necessity for a unified perspective that +integrates both jailbreak strategies and defensive solutions to foster a +robust, secure, and reliable environment for the next generation of language +models. More details can be found on our website: +\url{https://chonghan-chen.com/llm-jailbreak-zoo-survey/}. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ♻ ☆ ReMamber: Referring Image Segmentation with Mamba Twister ECCV 2024 + + +
+ Referring Image Segmentation~(RIS) leveraging transformers has achieved great +success on the interpretation of complex visual-language tasks. However, the +quadratic computation cost makes it resource-consuming in capturing long-range +visual-language dependencies. Fortunately, Mamba addresses this with efficient +linear complexity in processing. However, directly applying Mamba to +multi-modal interactions presents challenges, primarily due to inadequate +channel interactions for the effective fusion of multi-modal data. In this +paper, we propose ReMamber, a novel RIS architecture that integrates the power +of Mamba with a multi-modal Mamba Twister block. The Mamba Twister explicitly +models image-text interaction, and fuses textual and visual features through +its unique channel and spatial twisting mechanism. We achieve competitive +results on three challenging benchmarks with a simple and efficient +architecture. Moreover, we conduct thorough analyses of ReMamber and discuss +other fusion designs using Mamba. These provide valuable perspectives for +future research. The code has been released at: +https://github.com/yyh-rain-song/ReMamber. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Momentum Auxiliary Network for Supervised Local Learning ECCV2024 + + +
+ Deep neural networks conventionally employ end-to-end backpropagation for
+their training process, which lacks biological credibility and triggers a
+locking dilemma during network parameter updates, leading to significant GPU
+memory use. Supervised local learning instead segments the network into
+multiple local blocks, each updated by an independent auxiliary network.
+However, these methods cannot replace end-to-end training due to lower
+accuracy, as gradients only propagate within their local block, creating a lack
+of information exchange between blocks. To address this issue and establish
+information transfer across blocks, we propose a Momentum Auxiliary Network
+(MAN) that establishes a dynamic interaction mechanism. The MAN leverages an
+exponential moving average (EMA) of the parameters from adjacent local blocks
+to enhance information flow. This auxiliary network, updated through EMA, helps
+bridge the informational gap between blocks. Nevertheless, we observe that
+directly applying EMA parameters has certain limitations due to feature
+discrepancies among local blocks. To overcome this, we introduce learnable
+biases, further boosting performance. We have validated our method on four
+image classification datasets (CIFAR-10, STL-10, SVHN, ImageNet), attaining
+superior performance and substantial memory savings. Notably, our method can
+reduce GPU memory usage by more than 45% on the ImageNet dataset compared to
+end-to-end training, while achieving higher performance. The Momentum Auxiliary
+Network thus offers a new perspective for supervised local learning. Our code
+is available at: https://github.com/JunhaoSu0/MAN.
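+ As an illustration of the EMA update described above, a minimal sketch
+(PyTorch assumed; the function name and momentum value are hypothetical, and
+matching parameter shapes are assumed, unlike a real implementation with
+learnable biases):
+
+    import torch
+
+    @torch.no_grad()
+    def ema_update(aux_net, adjacent_block, momentum=0.999):
+        # the auxiliary network tracks an exponential moving average of the
+        # adjacent local block's parameters, passing information across blocks
+        # without end-to-end backpropagation
+        for p_aux, p_blk in zip(aux_net.parameters(), adjacent_block.parameters()):
+            p_aux.mul_(momentum).add_(p_blk, alpha=1.0 - momentum)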
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ Diff-Reg v1: Diffusion Matching Model for Registration Problem + + +
+ Establishing reliable correspondences is essential for registration tasks +such as 3D and 2D3D registration. Existing methods commonly leverage geometric +or semantic point features to generate potential correspondences. However, +these features may face challenges such as large deformation, scale +inconsistency, and ambiguous matching problems (e.g., symmetry). Additionally, +many previous methods, which rely on single-pass prediction, may struggle with +local minima in complex scenarios. To mitigate these challenges, we introduce a +diffusion matching model for robust correspondence construction. Our approach +treats correspondence estimation as a denoising diffusion process within the +doubly stochastic matrix space, which gradually denoises (refines) a doubly +stochastic matching matrix to the ground-truth one for high-quality +correspondence estimation. It involves a forward diffusion process that +gradually introduces Gaussian noise into the ground truth matching matrix and a +reverse denoising process that iteratively refines the noisy matching matrix. +In particular, the feature extraction from the backbone occurs only once during +the inference phase. Our lightweight denoising module utilizes the same feature +at each reverse sampling step. Evaluation of our method on both 3D and 2D3D +registration tasks confirms its effectiveness. The code is available at +https://github.com/wuqianliang/Diff-Reg. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2401.00436 +
+
+
+
+
+ + ♻ ☆ Feature Re-Embedding: Towards Foundation Model-Level Performance in + Computational Pathology CVPR2024 + + +
+ Multiple instance learning (MIL) is the most widely used framework in
+computational pathology, encompassing sub-typing, diagnosis, prognosis, and
+more. However, the existing MIL paradigm typically requires an offline instance
+feature extractor, such as a pre-trained ResNet or a foundation model. This
+approach lacks the capability for feature fine-tuning within the specific
+downstream tasks, limiting its adaptability and performance. To address this
+issue, we propose a Re-embedded Regional Transformer (R$^2$T) for re-embedding
+the instance features online, which captures fine-grained local features and
+establishes connections across different regions. Unlike existing works that
+focus on pre-training a powerful feature extractor or designing a sophisticated
+instance aggregator, R$^2$T is tailored to re-embed instance features online.
+It serves as a portable module that can seamlessly integrate into mainstream
+MIL models. Extensive experimental results on common computational pathology
+tasks validate that: 1) feature re-embedding improves the performance of MIL
+models based on ResNet-50 features to the level of foundation model features,
+and further enhances the performance of foundation model features; 2) R$^2$T
+can introduce more significant performance improvements to various MIL models;
+3) R$^2$T-MIL, as an R$^2$T-enhanced AB-MIL, outperforms other recent methods
+by a large margin. The code is available at:
+https://github.com/DearCaat/RRT-MIL.
+
+ comment: Accepted by CVPR2024 +
+
+
+
+
+ + ♻ ☆ Exploring Semantic Perturbations on Grover + + +
+ With news and information being as easy to access as they currently are, it
+is more important than ever to ensure that people are not misled by what they
+read. Recently, the rise of neural fake news (AI-generated fake news) and its
+demonstrated effectiveness at fooling humans has prompted the development of
+models to detect it. One such model is the Grover model, which can both detect
+neural fake news to prevent it, and generate it to demonstrate how a model
+could be misused to fool human readers. In this work, we explore the Grover
+model's fake news detection capabilities by performing targeted attacks through
+perturbations on input news articles. Through this, we test Grover's resilience
+to these adversarial attacks and expose some potential vulnerabilities which
+should be addressed in further iterations to ensure it can detect all types of
+fake news accurately.
+
+
+
+
+ + ♻ ☆ CHOSEN: Compilation to Hardware Optimization Stack for Efficient Vision + Transformer Inference + + +
+ Vision Transformers (ViTs) represent a groundbreaking shift in machine
+learning approaches to computer vision. Unlike traditional approaches, ViTs
+employ the self-attention mechanism, which has been widely used in natural
+language processing, to analyze image patches. Despite their advantages in
+modeling visual tasks, deploying ViTs on hardware platforms, notably
+Field-Programmable Gate Arrays (FPGAs), introduces considerable challenges.
+These challenges stem primarily from the non-linear calculations and high
+computational and memory demands of ViTs. This paper introduces CHOSEN, a
+software-hardware co-design framework to address these challenges and offer an
+automated framework for ViT deployment on FPGAs in order to maximize
+performance. Our framework is built upon three fundamental contributions: a
+multi-kernel design that maximizes bandwidth, mainly targeting the benefits of
+multiple DDR memory banks; approximate non-linear functions that exhibit
+minimal accuracy degradation; and an efficient compiler that maximizes the
+performance and memory efficiency of the computing kernels by making efficient
+use of the available logic blocks on the FPGA and by employing a novel design
+space exploration algorithm to find the hardware configuration that achieves
+optimal throughput and latency. Compared to state-of-the-art ViT accelerators,
+CHOSEN achieves a 1.5x and 1.42x improvement in throughput on the DeiT-S and
+DeiT-B models.
+
+
+
+
+ + ♻ ☆ X-Portrait: Expressive Portrait Animation with Hierarchical Motion + Attention SIGGRAPH 2024 + + +
+ We propose X-Portrait, an innovative conditional diffusion model tailored for
+generating expressive and temporally coherent portrait animation. Specifically,
+given a single portrait as appearance reference, we aim to animate it with
+motion derived from a driving video, capturing both highly dynamic and subtle
+facial expressions along with wide-range head movements. At its core, we
+leverage the generative prior of a pre-trained diffusion model as the rendering
+backbone, while achieving fine-grained head pose and expression control with
+novel controlling signals within the framework of ControlNet. In contrast to
+conventional coarse explicit controls such as facial landmarks, our motion
+control module is trained to interpret the dynamics directly from the original
+driving RGB inputs. The motion accuracy is further enhanced with a patch-based
+local control module that effectively sharpens the motion attention to
+small-scale nuances like eyeball positions. Notably, to mitigate identity
+leakage from the driving signals, we train our motion control modules with
+scaling-augmented cross-identity images, ensuring maximized disentanglement
+from the appearance reference modules. Experimental results demonstrate the
+universal effectiveness of X-Portrait across a diverse range of facial
+portraits and expressive driving sequences, and showcase its proficiency in
+generating captivating portrait animations with consistently maintained
+identity characteristics.
+
+ comment: SIGGRAPH 2024 +
+
+
+
+
+ + ♻ ☆ Hierarchical and Decoupled BEV Perception Learning Framework for + Autonomous Driving + + +
+ Perception is essential for autonomous driving systems. Recent approaches
+based on Bird's-eye-view (BEV) representations and deep learning have made
+significant progress. However, challenging issues remain, including lengthy
+development cycles, poor reusability, and complex sensor setups in the
+perception algorithm development process. To tackle these challenges, this
+paper proposes a novel hierarchical BEV perception paradigm, aiming to provide
+a library of fundamental perception modules and a user-friendly graphical
+interface, enabling swift construction of customized models. We adopt a
+Pretrain-Finetune strategy to effectively utilize large-scale public datasets
+and streamline development processes. Moreover, we present a Multi-Module
+Learning (MML) approach, enhancing performance through synergistic and
+iterative training of multiple models. Extensive experimental results on the
+nuScenes dataset demonstrate that our approach yields significant improvements
+over the traditional training scheme.
+
+
+
+
+ + ♻ ☆ SOEDiff: Efficient Distillation for Small Object Editing + + +
+ In this paper, we delve into a new task known as small object editing (SOE),
+which focuses on text-based image inpainting within a constrained, small-sized
+area. Despite the remarkable success achieved by current image inpainting
+approaches, their application to the SOE task generally results in failure
+cases such as Object Missing, Text-Image Mismatch, and Distortion. These
+failures stem from the limited use of small-sized objects in training datasets
+and the downsampling operations employed by U-Net models, which hinder accurate
+generation. To overcome these challenges, we introduce a novel training-based
+approach, SOEDiff, aimed at enhancing the capability of baseline models like
+StableDiffusion in editing small-sized objects while minimizing training costs.
+Specifically, our method involves two key components: SO-LoRA, which
+efficiently fine-tunes low-rank matrices, and a Cross-Scale Score Distillation
+loss, which leverages high-resolution predictions from the pre-trained teacher
+diffusion model. Our method presents significant improvements on the test
+dataset collected from MSCOCO and OpenImage, validating the effectiveness of
+our proposed method in small object editing. In particular, when comparing
+SOEDiff with the SD-I model on the OpenImage-f dataset, we observe a 0.99
+improvement in CLIP-Score and a reduction of 2.87 in FID.
+
+
+
+
+ + ♻ ☆ Efficient OCR for Building a Diverse Digital History + + +
+ Thousands of users consult digital archives daily, but the information they +can access is unrepresentative of the diversity of documentary history. The +sequence-to-sequence architecture typically used for optical character +recognition (OCR) - which jointly learns a vision and language model - is +poorly extensible to low-resource document collections, as learning a +language-vision model requires extensive labeled sequences and compute. This +study models OCR as a character level image retrieval problem, using a +contrastively trained vision encoder. Because the model only learns characters' +visual features, it is more sample efficient and extensible than existing +architectures, enabling accurate OCR in settings where existing solutions fail. +Crucially, the model opens new avenues for community engagement in making +digital history more representative of documentary history. + +
+
+
+
+
+ + ♻ ☆ Evaluating geometric accuracy of NeRF reconstructions compared to SLAM + method + + +
+ As Neural Radiance Field (NeRF) implementations become faster, more
+efficient, and more accurate, their applicability to real-world mapping tasks
+becomes more accessible. Traditionally, 3D mapping, or scene reconstruction,
+has relied on expensive LiDAR sensing. Photogrammetry can perform image-based
+3D reconstruction but is computationally expensive and requires extremely dense
+image representation to recover complex geometry and photorealism. NeRFs
+perform 3D scene reconstruction by training a neural network on sparse image
+and pose data, achieving superior results to photogrammetry with less input
+data. This paper presents an evaluation of two NeRF scene reconstructions for
+the purpose of estimating the diameter of a vertical PVC cylinder. One of these
+is trained on commodity iPhone data and the other on robot-sourced imagery and
+poses. This neural geometry is compared to state-of-the-art LiDAR-inertial SLAM
+in terms of scene noise and metric accuracy.
+
+
+
+
+ + ♻ ☆ What Matters in Range View 3D Object Detection + + +
+ Lidar-based perception pipelines rely on 3D object detection models to +interpret complex scenes. While multiple representations for lidar exist, the +range-view is enticing since it losslessly encodes the entire lidar sensor +output. In this work, we achieve state-of-the-art amongst range-view 3D object +detection models without using multiple techniques proposed in past range-view +literature. We explore range-view 3D object detection across two modern +datasets with substantially different properties: Argoverse 2 and Waymo Open. +Our investigation reveals key insights: (1) input feature dimensionality +significantly influences the overall performance, (2) surprisingly, employing a +classification loss grounded in 3D spatial proximity works as well or better +compared to more elaborate IoU-based losses, and (3) addressing non-uniform +lidar density via a straightforward range subsampling technique outperforms +existing multi-resolution, range-conditioned networks. Our experiments reveal +that techniques proposed in recent range-view literature are not needed to +achieve state-of-the-art performance. Combining the above findings, we +establish a new state-of-the-art model for range-view 3D object detection -- +improving AP by 2.2% on the Waymo Open dataset while maintaining a runtime of +10 Hz. We establish the first range-view model on the Argoverse 2 dataset and +outperform strong voxel-based baselines. All models are multi-class and +open-source. Code is available at +https://github.com/benjaminrwilson/range-view-3d-detection. + +
+
+ comment: Fixed broken link +
+
+
+
+
+ + ♻ ☆ Assessing Brittleness of Image-Text Retrieval Benchmarks from + Vision-Language Models Perspective + + +
+ Image-text retrieval (ITR), an important task in information retrieval (IR), +is driven by pretrained vision-language models (VLMs) that consistently achieve +state-of-the-art performance. However, a significant challenge lies in the +brittleness of existing ITR benchmarks. In standard datasets for the task, +captions often provide broad summaries of scenes, neglecting detailed +information about specific concepts. Additionally, the current evaluation setup +assumes simplistic binary matches between images and texts and focuses on +intra-modality rather than cross-modal relationships, which can lead to +misinterpretations of model performance. Motivated by this gap, in this study, +we focus on examining the brittleness of the ITR evaluation pipeline with a +focus on concept granularity. We start by analyzing two common benchmarks, +MS-COCO and Flickr30k, and compare them with their augmented versions, +MS-COCO-FG and Flickr30k-FG, given a specified set of linguistic features +capturing concept granularity. We discover that Flickr30k-FG and MS COCO-FG +consistently achieve higher scores across all the selected features. To +investigate the performance of VLMs on coarse and fine-grained datasets, we +introduce a taxonomy of perturbations. We apply these perturbations to the +selected datasets. We evaluate four state-of-the-art models - ALIGN, AltCLIP, +CLIP, and GroupViT - on the standard and fine-grained datasets under zero-shot +conditions, with and without the applied perturbations. The results demonstrate +that although perturbations generally degrade model performance, the +fine-grained datasets exhibit a smaller performance drop than their standard +counterparts. Moreover, the relative performance drop across all setups is +consistent across all models and datasets, indicating that the issue lies +within the benchmarks. We conclude the paper by providing an agenda for +improving ITR evaluation pipelines. + +
+
+
+
+
+ + ♻ ☆ MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal + LLMs + + +
+ We introduce MIA-Bench, a new benchmark designed to evaluate multimodal large +language models (MLLMs) on their ability to strictly adhere to complex +instructions. Our benchmark comprises a diverse set of 400 image-prompt pairs, +each crafted to challenge the models' compliance with layered instructions in +generating accurate responses that satisfy specific requested patterns. +Evaluation results from a wide array of state-of-the-art MLLMs reveal +significant variations in performance, highlighting areas for improvement in +instruction fidelity. Additionally, we create extra training data and explore +supervised fine-tuning to enhance the models' ability to strictly follow +instructions without compromising performance on other tasks. We hope this +benchmark not only serves as a tool for measuring MLLM adherence to +instructions, but also guides future developments in MLLM training methods. + +
+
+
+
+
+ + ♻ ☆ Novel OCT mosaicking pipeline with Feature- and Pixel-based registration + + +
+ High-resolution Optical Coherence Tomography (OCT) images are crucial for +ophthalmology studies but are limited by their relatively narrow field of view +(FoV). Image mosaicking is a technique for aligning multiple overlapping images +to obtain a larger FoV. Current mosaicking pipelines often struggle with +substantial noise and considerable displacement between the input sub-fields. +In this paper, we propose a versatile pipeline for stitching multi-view +OCT/OCTA \textit{en face} projection images. Our method combines the strengths +of learning-based feature matching and robust pixel-based registration to align +multiple images effectively. Furthermore, we advance the application of a +trained foundational model, Segment Anything Model (SAM), to validate +mosaicking results in an unsupervised manner. The efficacy of our pipeline is +validated using an in-house dataset and a large public dataset, where our +method shows superior performance in terms of both accuracy and computational +efficiency. We also made our evaluation tool for image mosaicking and the +corresponding pipeline publicly available at +\url{https://github.com/MedICL-VU/OCT-mosaicking}. + +
+
+ comment: ISBI 2024 Oral +
+
+
+
+
+ + ♻ ☆ PolyOculus: Simultaneous Multi-view Image-based Novel View Synthesis + + +
+ This paper considers the problem of generative novel view synthesis (GNVS), +generating novel, plausible views of a scene given a limited number of known +views. Here, we propose a set-based generative model that can simultaneously +generate multiple, self-consistent new views, conditioned on any number of +views. Our approach is not limited to generating a single image at a time and +can condition on a variable number of views. As a result, when generating a +large number of views, our method is not restricted to a low-order +autoregressive generation approach and is better able to maintain generated +image quality over large sets of images. We evaluate our model on standard NVS +datasets and show that it outperforms the state-of-the-art image-based GNVS +baselines. Further, we show that the model is capable of generating sets of +views that have no natural sequential ordering, like loops and binocular +trajectories, and significantly outperforms other methods on such tasks. + +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ I can listen but cannot read: An evaluation of two-tower multimodal + systems for instrument recognition + + +
+ Music two-tower multimodal systems integrate audio and text modalities into a +joint audio-text space, enabling direct comparison between songs and their +corresponding labels. These systems enable new approaches for classification +and retrieval, leveraging both modalities. Despite the promising results they +have shown for zero-shot classification and retrieval tasks, closer inspection +of the embeddings is needed. This paper evaluates the inherent zero-shot +properties of joint audio-text spaces for the case-study of instrument +recognition. We present an evaluation and analysis of two-tower systems for +zero-shot instrument recognition and a detailed analysis of the properties of +the pre-joint and joint embeddings spaces. Our findings suggest that audio +encoders alone demonstrate good quality, while challenges remain within the +text encoder or joint space projection. Specifically, two-tower systems exhibit +sensitivity towards specific words, favoring generic prompts over musically +informed ones. Despite the large size of textual encoders, they do not yet +leverage additional textual context or infer instruments accurately from their +descriptions. Lastly, a novel approach for quantifying the semantic +meaningfulness of the textual space leveraging an instrument ontology is +proposed. This method reveals deficiencies in the systems' understanding of +instruments and provides evidence of the need for fine-tuning text encoders on +musical data. + +
+
+ comment: Accepted to ISMIR 2024 +
+
+
+
+
+ + ☆ Sample Enrichment via Temporary Operations on Subsequences for + Sequential Recommendation + + +
+ Sequential recommendation leverages interaction sequences to predict
+forthcoming user behaviors, crucial for crafting personalized recommendations.
+However, the true preferences of a user are inherently complex and
+high-dimensional, while the observed data is merely a simplified and
+low-dimensional projection of the rich preferences, which often leads to
+prevalent issues like data sparsity and inaccurate model training. To learn
+true preferences from the sparse data, most existing works endeavor to
+introduce some extra information or design some ingenious models. Although they
+have been shown to be effective, extra information usually increases the cost
+of data collection, and complex models may result in difficulty in deployment.
+Innovatively, we avoid the use of extra information or alterations to the
+model; instead, we fill the transformation space between the observed data and
+the underlying preferences with randomness. Specifically, we propose a novel
+model-agnostic and highly generic framework for sequential recommendation
+called sample enrichment via temporary operations on subsequences (SETO), which
+temporarily and separately enriches the transformation space via sequence
+enhancement operations with rationality constraints in training. The
+transformation space not only exists in the process from input samples to
+preferences but also from preferences to target samples. We highlight our
+SETO's effectiveness and versatility over multiple representative and
+state-of-the-art sequential recommendation models (including six single-domain
+sequential models and two cross-domain sequential models) across multiple
+real-world datasets (including three single-domain datasets, three cross-domain
+datasets, and a large-scale industry dataset).
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Text-Driven Neural Collaborative Filtering Model for Paper Source + Tracing KDD + + +
+ Identifying significant references within the complex interrelations of a +citation knowledge graph is challenging, which encompasses connections through +citations, authorship, keywords, and other relational attributes. The Paper +Source Tracing (PST) task seeks to automate the identification of pivotal +references for given scholarly articles utilizing advanced data mining +techniques. In the KDD CUP 2024, we design a recommendation-based framework +tailored for the PST task. This framework employs the Neural Collaborative +Filtering (NCF) model to generate final predictions. To process the textual +attributes of the papers and extract input features for the model, we utilize +SciBERT, a pre-trained language model. According to the experimental results, +our method achieved a score of 0.37814 on the Mean Average Precision (MAP) +metric, outperforming baseline models and ranking 11th among all participating +teams. The source code is publicly available at +https://github.com/MyLove-XAB/KDDCupFinal. + +
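+ A rough sketch of the described recipe (Hugging Face Transformers and PyTorch
+assumed): SciBERT embeds the paper text, and an NCF-style MLP scores
+(paper, candidate reference) pairs. Class and function names here are
+illustrative assumptions, not the team's released code.
+
+    import torch
+    import torch.nn as nn
+    from transformers import AutoTokenizer, AutoModel
+
+    tok = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
+    enc = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
+
+    def embed(texts):
+        batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
+        with torch.no_grad():
+            return enc(**batch).last_hidden_state[:, 0]   # [CLS] embeddings
+
+    class NCFScorer(nn.Module):
+        def __init__(self, dim=768, hidden=256):
+            super().__init__()
+            self.mlp = nn.Sequential(nn.Linear(2 * dim, hidden), nn.ReLU(),
+                                     nn.Linear(hidden, 1))
+
+        def forward(self, paper_emb, ref_emb):
+            # score how likely `ref` is a pivotal source for `paper`
+            return self.mlp(torch.cat([paper_emb, ref_emb], dim=-1)).squeeze(-1)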
+
+ comment: KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report + of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain +
+
+
+
+
+ + ☆ Supporting Evidence-Based Medicine by Finding Both Relevant and + Significant Works + + +
+ In this paper, we present a new approach to improving the relevance and
+reliability of medical IR, which builds upon the concept of Level of Evidence
+(LoE). The LoE framework categorizes medical publications into 7 distinct
+levels based on the underlying empirical evidence. Despite the LoE framework's
+relevance in medical research and evidence-based practice, only a few medical
+publications explicitly state their LoE. Therefore, we develop a classification
+model for automatically assigning LoE to medical publications, which
+successfully classifies over 26 million documents in the MEDLINE database into
+LoE classes. The subsequent retrieval experiments on TREC PM datasets show
+substantial improvements in retrieval relevance when LoE is used as a search
+filter.
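+ To make the "LoE as a search filter" idea concrete, a minimal hypothetical
+sketch (the `predict_loe` classifier, the cut-off, and the document handling
+are illustrative assumptions, not the paper's implementation):
+
+    def filter_by_loe(retrieved_docs, predict_loe, max_level=2):
+        # keep only documents whose predicted Level of Evidence is strong
+        # enough (level 1 = strongest evidence, e.g. systematic reviews)
+        return [doc for doc in retrieved_docs if predict_loe(doc) <= max_level]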
+
+
+
+
+ + ♻ ☆ Improving Stance Detection by Leveraging Measurement Knowledge from + Social Sciences: A Case Study of Dutch Political Tweets and Traditional + Gender Role Division + + +
+ Stance detection (SD) concerns automatically determining the viewpoint (i.e., +in favour of, against, or neutral) of a text's author towards a target. SD has +been applied to many research topics, among which the detection of stances +behind political tweets is an important one. In this paper, we apply SD to a +dataset of tweets from official party accounts in the Netherlands between 2017 +and 2021, with a focus on stances towards traditional gender role division, a +dividing issue between (some) Dutch political parties. To implement and improve +SD of traditional gender role division, we propose to leverage an established +survey instrument from social sciences, which has been validated for the +purpose of measuring attitudes towards traditional gender role division. Based +on our experiments, we show that using such a validated survey instrument helps +to improve SD performance. + +
+
+
+
+
+ + ♻ ☆ General-Purpose User Modeling with Behavioral Logs: A Snapchat Case + Study SIGIR 2024 + + +
+ Learning general-purpose user representations based on user behavioral logs +is an increasingly popular user modeling approach. It benefits from easily +available, privacy-friendly yet expressive data, and does not require extensive +re-tuning of the upstream user model for different downstream tasks. While this +approach has shown promise in search engines and e-commerce applications, its +fit for instant messaging platforms, a cornerstone of modern digital +communication, remains largely uncharted. We explore this research gap using +Snapchat data as a case study. Specifically, we implement a Transformer-based +user model with customized training objectives and show that the model can +produce high-quality user representations across a broad range of evaluation +tasks, among which we introduce three new downstream tasks that concern pivotal +topics in user research: user safety, engagement and churn. We also tackle the +challenge of efficient extrapolation of long sequences at inference time, by +applying a novel positional encoding method. + +
+
+ comment: SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Bridging Items and Language: A Transition Paradigm for Large Language + Model-Based Recommendation KDD 2024 + + +
+ Harnessing Large Language Models (LLMs) for recommendation is rapidly +emerging, which relies on two fundamental steps to bridge the recommendation +item space and the language space: 1) item indexing utilizes identifiers to +represent items in the language space, and 2) generation grounding associates +LLMs' generated token sequences to in-corpus items. However, previous methods +exhibit inherent limitations in the two steps. Existing ID-based identifiers +(e.g., numeric IDs) and description-based identifiers (e.g., titles) either +lose semantics or lack adequate distinctiveness. Moreover, prior generation +grounding methods might generate invalid identifiers, thus misaligning with +in-corpus items. To address these issues, we propose a novel Transition +paradigm for LLM-based Recommender (named TransRec) to bridge items and +language. Specifically, TransRec presents multi-facet identifiers, which +simultaneously incorporate ID, title, and attribute for item indexing to pursue +both distinctiveness and semantics. Additionally, we introduce a specialized +data structure for TransRec to ensure generating valid identifiers only and +utilize substring indexing to encourage LLMs to generate from any position of +identifiers. Lastly, TransRec presents an aggregated grounding module to +leverage generated multi-facet identifiers to rank in-corpus items efficiently. +We instantiate TransRec on two backbone models, BART-large and LLaMA-7B. +Extensive results on three real-world datasets under diverse settings validate +the superiority of TransRec. + +
+
+ comment: Accepted by KDD 2024 +
+
+
+
+
+ + ♻ ☆ Assessing Brittleness of Image-Text Retrieval Benchmarks from + Vision-Language Models Perspective + + +
+ Image-text retrieval (ITR), an important task in information retrieval (IR), +is driven by pretrained vision-language models (VLMs) that consistently achieve +state-of-the-art performance. However, a significant challenge lies in the +brittleness of existing ITR benchmarks. In standard datasets for the task, +captions often provide broad summaries of scenes, neglecting detailed +information about specific concepts. Additionally, the current evaluation setup +assumes simplistic binary matches between images and texts and focuses on +intra-modality rather than cross-modal relationships, which can lead to +misinterpretations of model performance. Motivated by this gap, in this study, +we focus on examining the brittleness of the ITR evaluation pipeline with a +focus on concept granularity. We start by analyzing two common benchmarks, +MS-COCO and Flickr30k, and compare them with their augmented versions, +MS-COCO-FG and Flickr30k-FG, given a specified set of linguistic features +capturing concept granularity. We discover that Flickr30k-FG and MS COCO-FG +consistently achieve higher scores across all the selected features. To +investigate the performance of VLMs on coarse and fine-grained datasets, we +introduce a taxonomy of perturbations. We apply these perturbations to the +selected datasets. We evaluate four state-of-the-art models - ALIGN, AltCLIP, +CLIP, and GroupViT - on the standard and fine-grained datasets under zero-shot +conditions, with and without the applied perturbations. The results demonstrate +that although perturbations generally degrade model performance, the +fine-grained datasets exhibit a smaller performance drop than their standard +counterparts. Moreover, the relative performance drop across all setups is +consistent across all models and datasets, indicating that the issue lies +within the benchmarks. We conclude the paper by providing an agenda for +improving ITR evaluation pipelines. + +
+
+
+
+
+ + ♻ ☆ UNIQORN: Unified Question Answering over RDF Knowledge Graphs and + Natural Language Text + + +
+ Question answering over RDF data like knowledge graphs has been greatly +advanced, with a number of good systems providing crisp answers for natural +language questions or telegraphic queries. Some of these systems incorporate +textual sources as additional evidence for the answering process, but cannot +compute answers that are present in text alone. Conversely, the IR and NLP +communities have addressed QA over text, but such systems barely utilize +semantic data and knowledge. This paper presents a method for complex questions +that can seamlessly operate over a mixture of RDF datasets and text corpora, or +individual sources, in a unified framework. Our method, called UNIQORN, builds +a context graph on-the-fly, by retrieving question-relevant evidences from the +RDF data and/or a text corpus, using fine-tuned BERT models. The resulting +graph typically contains all question-relevant evidences but also a lot of +noise. UNIQORN copes with this input by a graph algorithm for Group Steiner +Trees, that identifies the best answer candidates in the context graph. +Experimental results on several benchmarks of complex questions with multiple +entities and relations, show that UNIQORN significantly outperforms +state-of-the-art methods for heterogeneous QA -- in a full training mode, as +well as in zero-shot settings. The graph-based methodology provides +user-interpretable evidence for the complete answering process. + +
+
+ comment: 27 pages +
+
+
+
+
+
+
+
+ + Machine Learning 170 + +
+
+
+ + ☆ Sparse vs Contiguous Adversarial Pixel Perturbations in Multimodal + Models: An Empirical Analysis + + +
+ Assessing the robustness of multimodal models against adversarial examples is
+an important aspect of the safety of their users. We craft L0-norm perturbation
+attacks on the preprocessed input images. We launch them in a black-box setup
+against four multimodal models and two unimodal DNNs, considering both targeted
+and untargeted misclassification. Our attacks target less than 0.04% of the
+perturbed image area and integrate different spatial positionings of perturbed
+pixels: sparse positioning and pixels arranged in different contiguous shapes
+(row, column, diagonal, and patch). To the best of our knowledge, we are the
+first to assess the robustness of three state-of-the-art multimodal models
+(ALIGN, AltCLIP, GroupViT) against different sparse and contiguous pixel
+distribution perturbations. The obtained results indicate that unimodal DNNs
+are more robust than multimodal models. Furthermore, models using a CNN-based
+image encoder are more vulnerable than models with a ViT - for untargeted
+attacks, we obtain a 99% success rate by perturbing less than 0.02% of the
+image area.
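+ For illustration, a rough sketch of a sparse (scattered) L0-style pixel
+perturbation under the stated budget (NumPy assumed; the function name, random
+replacement values, and budget handling are assumptions, not the authors'
+attack, which would additionally query the victim model in a black-box loop):
+
+    import numpy as np
+
+    def sparse_pixel_perturbation(image, budget=0.0004, rng=None):
+        # perturb at most `budget` (0.04%) of the pixels at random positions
+        rng = rng or np.random.default_rng()
+        h, w, c = image.shape
+        n = max(1, int(budget * h * w))
+        ys = rng.integers(0, h, size=n)
+        xs = rng.integers(0, w, size=n)
+        adv = image.copy()
+        adv[ys, xs] = rng.integers(0, 256, size=(n, c))  # random replacement colors
+        return adv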
+
+
+
+
+ + ☆ VGGHeads: A Large-Scale Synthetic Dataset for 3D Human Heads + + +
+ Human head detection, keypoint estimation, and 3D head model fitting are +important tasks with many applications. However, traditional real-world +datasets often suffer from bias, privacy, and ethical concerns, and they have +been recorded in laboratory environments, which makes it difficult for trained +models to generalize. Here, we introduce VGGHeads -- a large scale synthetic +dataset generated with diffusion models for human head detection and 3D mesh +estimation. Our dataset comprises over 1 million high-resolution images, each +annotated with detailed 3D head meshes, facial landmarks, and bounding boxes. +Using this dataset we introduce a new model architecture capable of +simultaneous heads detection and head meshes reconstruction from a single image +in a single step. Through extensive experimental evaluations, we demonstrate +that models trained on our synthetic data achieve strong performance on real +images. Furthermore, the versatility of our dataset makes it applicable across +a broad spectrum of tasks, offering a general and comprehensive representation +of human heads. Additionally, we provide detailed information about the +synthetic data generation pipeline, enabling it to be re-used for other tasks +and domains. + +
+
+
+
+
+ + ☆ LoRA-Pro: Are Low-Rank Adapters Properly Optimized? + + +
+ Low-Rank Adaptation, also known as LoRA, has emerged as a prominent method +for parameter-efficient fine-tuning foundation models by re-parameterizing the +original matrix into the product of two low-rank matrices. Despite its +efficiency, LoRA often yields inferior performance compared to full +fine-tuning. In this paper, we propose LoRA-Pro to bridge this performance gap. +Firstly, we delve into the optimization processes in LoRA and full fine-tuning. +We reveal that while LoRA employs low-rank approximation, it neglects to +approximate the optimization process of full fine-tuning. To address this, we +introduce a novel concept called the "equivalent gradient." This virtual +gradient makes the optimization process on the re-parameterized matrix +equivalent to LoRA, which can be used to quantify the differences between LoRA +and full fine-tuning. The equivalent gradient is derived from the gradients of +matrices $A$ and $B$. To narrow the performance gap, our approach minimizes the +differences between the equivalent gradient and the gradient obtained from full +fine-tuning during the optimization process. By solving this objective, we +derive optimal closed-form solutions for updating matrices $A$ and $B$. Our +method constrains the optimization process, shrinking the performance gap +between LoRA and full fine-tuning. Extensive experiments on natural language +processing tasks validate the effectiveness of our method. + +
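
To make the "equivalent gradient" idea above concrete, here is a minimal PyTorch sketch with toy shapes and a toy loss; it illustrates the concept only and is not the authors' implementation:

```python
import torch

# LoRA keeps a frozen weight W0 and learns a low-rank update W = W0 + B @ A.
# The gradient induced on the full matrix by the gradients of A and B is
#   g_eq = B @ grad_A + grad_B @ A,
# which LoRA-Pro reportedly aligns with the full fine-tuning gradient.
d_out, d_in, r = 64, 32, 4
W0 = torch.randn(d_out, d_in)                        # frozen pretrained weight
A = (0.01 * torch.randn(r, d_in)).requires_grad_()   # low-rank factors
B = torch.zeros(d_out, r, requires_grad=True)

x = torch.randn(8, d_in)
loss = (x @ (W0 + B @ A).T).pow(2).mean()            # toy forward pass and loss
loss.backward()

with torch.no_grad():
    g_equivalent = B @ A.grad + B.grad @ A           # gradient induced on W
print(g_equivalent.shape)                            # torch.Size([64, 32])
```

LoRA-Pro then chooses the updates of A and B so that this induced gradient tracks the full fine-tuning gradient; the closed-form update rules are derived in the paper.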
+
+
+
+
+ + ☆ Numerical Literals in Link Prediction: A Critical Examination of Models + and Datasets + + +
+ Link Prediction (LP) is an essential task over Knowledge Graphs (KGs),
+traditionally focused on using and predicting the relations between entities.
+Textual entity descriptions have already been shown to be valuable, but models
+that incorporate numerical literals have shown only minor improvements on
+existing benchmark datasets. It is unclear whether a model is actually better
+at using numerical literals, or simply better at utilizing the graph structure.
+This raises doubts about the effectiveness of these methods and about the
+suitability of the existing benchmark datasets.
+ We propose a methodology to evaluate LP models that incorporate numerical
+literals. We propose i) a new synthetic dataset to better understand how well
+these models use numerical literals and ii) dataset ablation strategies to
+investigate potential difficulties with the existing datasets. We identify a
+prevalent trend: many models underutilize literal information and potentially
+rely on additional parameters for performance gains. Our investigation
+highlights the need for more extensive evaluations when releasing new models
+and datasets.
+
+
+
+
+ + ☆ Automated Ensemble Multimodal Machine Learning for Healthcare + + +
+ The application of machine learning in medicine and healthcare has led to the +creation of numerous diagnostic and prognostic models. However, despite their +success, current approaches generally issue predictions using data from a +single modality. This stands in stark contrast with clinician decision-making +which employs diverse information from multiple sources. While several +multimodal machine learning approaches exist, significant challenges in +developing multimodal systems remain that are hindering clinical adoption. In +this paper, we introduce a multimodal framework, AutoPrognosis-M, that enables +the integration of structured clinical (tabular) data and medical imaging using +automated machine learning. AutoPrognosis-M incorporates 17 imaging models, +including convolutional neural networks and vision transformers, and three +distinct multimodal fusion strategies. In an illustrative application using a +multimodal skin lesion dataset, we highlight the importance of multimodal +machine learning and the power of combining multiple fusion strategies using +ensemble learning. We have open-sourced our framework as a tool for the +community and hope it will accelerate the uptake of multimodal machine learning +in healthcare and spur further innovation. + +
+
+
+
+
+ + ☆ Recursive Introspection: Teaching Language Model Agents How to + Self-Improve + + +
+ A central piece in enabling intelligent agentic behavior in foundation models +is to make them capable of introspecting upon their behavior, reasoning, and +correcting their mistakes as more computation or interaction is available. Even +the strongest proprietary large language models (LLMs) do not quite exhibit the +ability of continually improving their responses sequentially, even in +scenarios where they are explicitly told that they are making a mistake. In +this paper, we develop RISE: Recursive IntroSpEction, an approach for +fine-tuning LLMs to introduce this capability, despite prior work hypothesizing +that this capability may not be possible to attain. Our approach prescribes an +iterative fine-tuning procedure, which attempts to teach the model how to alter +its response after having executed previously unsuccessful attempts to solve a +hard test-time problem, with optionally additional environment feedback. RISE +poses fine-tuning for a single-turn prompt as solving a multi-turn Markov +decision process (MDP), where the initial state is the prompt. Inspired by +principles in online imitation learning and reinforcement learning, we propose +strategies for multi-turn data collection and training so as to imbue an LLM +with the capability to recursively detect and correct its previous mistakes in +subsequent iterations. Our experiments show that RISE enables Llama2, Llama3, +and Mistral models to improve themselves with more turns on math reasoning +tasks, outperforming several single-turn strategies given an equal amount of +inference-time computation. We also find that RISE scales well, often attaining +larger benefits with more capable models. Our analysis shows that RISE makes +meaningful improvements to responses to arrive at the correct solution for +challenging prompts, without disrupting one-turn abilities as a result of +expressing more complex distributions. + +
+
+
+
+
+ + ☆ Exploring Scaling Trends in LLM Robustness + + +
+ Language model capabilities predictably improve from scaling a model's size +and training data. Motivated by this, increasingly large language models have +been trained, yielding an array of impressive capabilities. Yet these models +are vulnerable to adversarial prompts, such as "jailbreaks" that hijack models +to perform undesired behaviors, posing a significant risk of misuse. Prior work +indicates that computer vision models become more robust with model and data +scaling, raising the question: does language model robustness also improve with +scale? We study this question empirically, finding that larger models respond +substantially better to adversarial training, but there is little to no benefit +from model scale in the absence of explicit defenses. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ☆ Geometry Fidelity for Spherical Images ECCV 2024 + + +
+ Spherical or omni-directional images offer an immersive visual format +appealing to a wide range of computer vision applications. However, geometric +properties of spherical images pose a major challenge for models and metrics +designed for ordinary 2D images. Here, we show that direct application of +Fr\'echet Inception Distance (FID) is insufficient for quantifying geometric +fidelity in spherical images. We introduce two quantitative metrics accounting +for geometric constraints, namely Omnidirectional FID (OmniFID) and +Discontinuity Score (DS). OmniFID is an extension of FID tailored to +additionally capture field-of-view requirements of the spherical format by +leveraging cubemap projections. DS is a kernel-based seam alignment score of +continuity across borders of 2D representations of spherical images. In +experiments, OmniFID and DS quantify geometry fidelity issues that are +undetected by FID. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Differentiable Quantum Architecture Search in Asynchronous Quantum + Reinforcement Learning + + +
+ The emergence of quantum reinforcement learning (QRL) is propelled by +advancements in quantum computing (QC) and machine learning (ML), particularly +through quantum neural networks (QNN) built on variational quantum circuits +(VQC). These advancements have proven successful in addressing sequential +decision-making tasks. However, constructing effective QRL models demands +significant expertise due to challenges in designing quantum circuit +architectures, including data encoding and parameterized circuits, which +profoundly influence model performance. In this paper, we propose addressing +this challenge with differentiable quantum architecture search (DiffQAS), +enabling trainable circuit parameters and structure weights using +gradient-based optimization. Furthermore, we enhance training efficiency +through asynchronous reinforcement learning (RL) methods facilitating parallel +training. Through numerical simulations, we demonstrate that our proposed +DiffQAS-QRL approach achieves performance comparable to manually-crafted +circuit architectures across considered environments, showcasing stability +across diverse scenarios. This methodology offers a pathway for designing QRL +models without extensive quantum knowledge, ensuring robust performance and +fostering broader application of QRL. + +
+
+ comment: Accepted by IEEE International Conference on Quantum Computing and + Engineering - QCE 2024 +
+
+
+
+
+ + ☆ Sparse Incremental Aggregation in Multi-Hop Federated Learning SP + + +
+ This paper investigates federated learning (FL) in a multi-hop communication
+setup, such as in constellations with inter-satellite links. In this setup,
+part of the FL clients are responsible for forwarding other clients' results to
+the parameter server. Instead of using conventional routing, the communication
+efficiency can be improved significantly by using in-network model aggregation
+at each intermediate hop, known as incremental aggregation (IA). Prior works
+[1] have indicated diminishing gains for IA under gradient sparsification. Here
+we study this issue and propose several novel correlated sparsification methods
+for IA. Numerical results show that, for some of these algorithms, the full
+potential of IA is still available under sparsification without impairing
+convergence. We demonstrate a 15x improvement in communication efficiency over
+conventional routing and an 11x improvement over state-of-the-art (SoA) sparse
+IA.
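
A loose sketch of in-network (incremental) aggregation with per-hop sparsification; plain top-k magnitude sparsification stands in here for the paper's correlated sparsification schemes:

```python
import numpy as np

def topk_sparsify(g, k):
    # Keep only the k largest-magnitude entries of a gradient/update vector.
    out = np.zeros_like(g)
    idx = np.argsort(-np.abs(g))[:k]
    out[idx] = g[idx]
    return out

def relay(own_update, incoming_updates, k):
    # Incremental aggregation: sum what arrived with the local update,
    # then forward only a sparse message to the next hop.
    agg = own_update + sum(incoming_updates)
    return topk_sparsify(agg, k)

g1, g2, g3 = (np.random.randn(1000) for _ in range(3))
msg = relay(g2, [topk_sparsify(g1, 50)], k=50)   # client 1 -> relay client 2
msg = relay(g3, [msg], k=50)                     # relay client 3 -> parameter server
print(np.count_nonzero(msg))
```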
+
+ comment: This paper is accepted for the 25th IEEE International Workshop on + Signal Processing Advances in Wireless Communications (SPAWC) conference +
+
+
+
+
+ + ☆ AsEP: Benchmarking Deep Learning Methods for Antibody-specific Epitope + Prediction + + +
+ Epitope identification is vital for antibody design yet challenging due to
+the inherent variability in antibodies. While many deep learning methods have
+been developed for general protein binding site prediction tasks, whether they
+work for epitope prediction remains an understudied research question. The
+challenge is also heightened by the lack of a consistent evaluation pipeline
+with sufficient dataset size and epitope diversity. We introduce a filtered
+antibody-antigen complex structure dataset, AsEP (Antibody-specific Epitope
+Prediction). AsEP is the largest of its kind and provides clustered epitope
+groups, allowing the community to develop and test novel epitope prediction
+methods. AsEP comes with an easy-to-use interface in Python and pre-built graph
+representations of each antibody-antigen complex while also supporting
+customizable embedding methods. Based on this new dataset, we benchmark
+various representative general protein-binding site prediction methods and find
+that their performance is not as satisfactory as expected for epitope
+prediction. We thus propose a new method, WALLE, that leverages both protein
+language models and graph neural networks. WALLE demonstrates about a 5X
+performance gain over existing methods. Our empirical findings show that
+epitope prediction benefits from combining sequential embeddings provided by
+language models and geometrical information from graph representations,
+providing a guideline for future method design. In addition, we reformulate the
+task as bipartite link prediction, allowing easy model performance attribution
+and interpretability. We open-source our data and code at
+https://github.com/biochunan/AsEP-dataset.
+
+
+
+
+ + ☆ Gene Regulatory Network Inference from Pre-trained Single-Cell + Transcriptomics Transformer with Joint Graph Learning ICML 2024 + + +
+ Inferring gene regulatory networks (GRNs) from single-cell RNA sequencing
+(scRNA-seq) data is a complex challenge that requires capturing the intricate
+relationships between genes and their regulatory interactions. In this study,
+we tackle this challenge by leveraging the single-cell BERT-based pre-trained
+transformer model (scBERT), trained on extensive unlabeled scRNA-seq data, to
+augment structured biological knowledge from existing GRNs. We introduce a
+novel joint graph learning approach that combines the rich contextual
+representations learned by pre-trained single-cell language models with the
+structured knowledge encoded in GRNs using graph neural networks (GNNs). By
+integrating these two modalities, our approach effectively reasons over both
+the gene expression level constraints provided by the scRNA-seq data and the
+structured biological knowledge inherent in GRNs. We evaluate our method on
+human cell benchmark datasets from the BEELINE study with cell type-specific
+ground truth networks. The results demonstrate superior performance over
+current state-of-the-art baselines, offering a deeper understanding of cellular
+regulatory mechanisms.
+
+ comment: Accepted into the ICML 2024 AI for Science workshop +
+
+
+
+
+ + ☆ Quasar-ViT: Hardware-Oriented Quantization-Aware Architecture Search for + Vision Transformers + + +
+ Vision transformers (ViTs) have demonstrated their superior accuracy for +computer vision tasks compared to convolutional neural networks (CNNs). +However, ViT models are often computation-intensive for efficient deployment on +resource-limited edge devices. This work proposes Quasar-ViT, a +hardware-oriented quantization-aware architecture search framework for ViTs, to +design efficient ViT models for hardware implementation while preserving the +accuracy. First, Quasar-ViT trains a supernet using our row-wise flexible +mixed-precision quantization scheme, mixed-precision weight entanglement, and +supernet layer scaling techniques. Then, it applies an efficient +hardware-oriented search algorithm, integrated with hardware latency and +resource modeling, to determine a series of optimal subnets from supernet under +different inference latency targets. Finally, we propose a series of +model-adaptive designs on the FPGA platform to support the architecture search +and mitigate the gap between the theoretical computation reduction and the +practical inference speedup. Our searched models achieve 101.5, 159.6, and +251.6 frames-per-second (FPS) inference speed on the AMD/Xilinx ZCU102 FPGA +with 80.4%, 78.6%, and 74.9% top-1 accuracy, respectively, for the ImageNet +dataset, consistently outperforming prior works. + +
+
+ comment: Accepted by ICS 2024 +
+
+
+
+
+ + ☆ RIDA: A Robust Attack Framework on Incomplete Graphs + + +
+ Graph Neural Networks (GNNs) are vital in data science but are increasingly
+susceptible to adversarial attacks. To help researchers develop more robust GNN
+models, it is essential to focus on designing strong attack models as
+foundational benchmarks and guiding references. Among adversarial attacks,
+gray-box poisoning attacks are noteworthy due to their effectiveness and fewer
+constraints. These attacks exploit GNNs' need for retraining on updated data,
+thereby impacting their performance by perturbing these datasets. However,
+current research overlooks the real-world scenario of incomplete graphs. To
+address this gap, we introduce the Robust Incomplete Deep Attack Framework
+(RIDA). It is the first algorithm for robust gray-box poisoning attacks on
+incomplete graphs. The approach innovatively aggregates distant vertex
+information and ensures powerful data utilization. Extensive tests against 9
+SOTA baselines on 3 real-world datasets demonstrate RIDA's superiority in
+handling incompleteness and high attack performance on the incomplete graph.
+
+
+
+
+ + ☆ Unlocking Tokens as Data Points for Generalization Bounds on Larger + Language Models + + +
+ Large language models (LLMs) with billions of parameters excel at predicting +the next token in a sequence. Recent work computes non-vacuous +compression-based generalization bounds for LLMs, but these bounds are vacuous +for large models at the billion-parameter scale. Moreover, these bounds are +obtained through restrictive compression techniques, bounding compressed models +that generate low-quality text. Additionally, the tightness of these existing +bounds depends on the number of IID documents in a training set rather than the +much larger number of non-IID constituent tokens, leaving untapped potential +for tighter bounds. In this work, we instead use properties of martingales to +derive generalization bounds that benefit from the vast number of tokens in LLM +training sets. Since a dataset contains far more tokens than documents, our +generalization bounds not only tolerate but actually benefit from far less +restrictive compression schemes. With Monarch matrices, Kronecker +factorizations, and post-training quantization, we achieve non-vacuous +generalization bounds for LLMs as large as LLaMA2-70B. Unlike previous +approaches, our work achieves the first non-vacuous bounds for models that are +deployed in practice and generate high-quality text. + +
+
+
+
+
+ + ☆ StraightLine: An End-to-End Resource-Aware Scheduler for Machine + Learning Application Requests + + +
+ The life cycle of machine learning (ML) applications consists of two stages:
+model development and model deployment. However, traditional ML systems (e.g.,
+training-specific or inference-specific systems) focus on one particular stage
+or phase of the life cycle of ML applications. These systems often aim at
+optimizing model training or accelerating model inference, and they frequently
+assume homogeneous infrastructure, which may not always reflect real-world
+scenarios that include cloud data centers, local servers, containers, and
+serverless platforms. We present StraightLine, an end-to-end resource-aware
+scheduler that schedules the optimal resources (e.g., container, virtual
+machine, or serverless) for different ML application requests in a hybrid
+infrastructure. The key innovation is an empirical dynamic placement algorithm
+that intelligently places requests based on their unique characteristics (e.g.,
+request frequency, input data size, and data distribution). In contrast to
+existing ML systems, StraightLine offers end-to-end resource-aware placement,
+thereby significantly reducing response time and failure rate for model
+deployment when facing different computing resources in the hybrid
+infrastructure.
+
+ comment: 6 pages, 8 figures, to appear in AIoTC'24 +
+
+
+
+
+ + ☆ Maximum Entropy On-Policy Actor-Critic via Entropy Advantage Estimation + + +
+ Entropy Regularisation is a widely adopted technique that enhances policy +optimisation performance and stability. A notable form of entropy +regularisation is augmenting the objective with an entropy term, thereby +simultaneously optimising the expected return and the entropy. This framework, +known as maximum entropy reinforcement learning (MaxEnt RL), has shown +theoretical and empirical successes. However, its practical application in +straightforward on-policy actor-critic settings remains surprisingly +underexplored. We hypothesise that this is due to the difficulty of managing +the entropy reward in practice. This paper proposes a simple method of +separating the entropy objective from the MaxEnt RL objective, which +facilitates the implementation of MaxEnt RL in on-policy settings. Our +empirical evaluations demonstrate that extending Proximal Policy Optimisation +(PPO) and Trust Region Policy Optimisation (TRPO) within the MaxEnt framework +improves policy optimisation performance in both MuJoCo and Procgen tasks. +Additionally, our results highlight MaxEnt RL's capacity to enhance +generalisation. + +
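
One way to read the separation of the entropy objective is to estimate an entropy advantage alongside the usual reward advantage and add the two inside the clipped PPO objective. The sketch below assumes that reading; the coefficient `tau` and the two-advantage decomposition are illustrative, not the paper's exact formulation:

```python
import torch

def maxent_ppo_loss(logp_new, logp_old, adv_reward, adv_entropy, tau=0.01, clip=0.2):
    # Combine the reward advantage with a separately estimated entropy advantage,
    # then apply the standard PPO clipped surrogate.
    adv = adv_reward + tau * adv_entropy
    ratio = torch.exp(logp_new - logp_old)
    unclipped = ratio * adv
    clipped = torch.clamp(ratio, 1 - clip, 1 + clip) * adv
    return -torch.min(unclipped, clipped).mean()

logp_new = torch.randn(32, requires_grad=True)
logp_old = logp_new.detach() + 0.05 * torch.randn(32)
print(maxent_ppo_loss(logp_new, logp_old, torch.randn(32), torch.randn(32)))
```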
+
+
+
+
+ + ☆ IRIS: Wireless Ring for Vision-based Smart Home Interaction + + +
+ Integrating cameras into wireless smart rings has been challenging due to +size and power constraints. We introduce IRIS, the first wireless +vision-enabled smart ring system for smart home interactions. Equipped with a +camera, Bluetooth radio, inertial measurement unit (IMU), and an onboard +battery, IRIS meets the small size, weight, and power (SWaP) requirements for +ring devices. IRIS is context-aware, adapting its gesture set to the detected +device, and can last for 16-24 hours on a single charge. IRIS leverages the +scene semantics to achieve instance-level device recognition. In a study +involving 23 participants, IRIS consistently outpaced voice commands, with a +higher proportion of participants expressing a preference for IRIS over voice +commands regarding toggling a device's state, granular control, and social +acceptability. Our work pushes the boundary of what is possible with ring +form-factor devices, addressing system challenges and opening up novel +interaction capabilities. + +
+
+ comment: 15 pages, 17 figures, 6 tables, to be published in UIST 2024 +
+
+
+
+
+ + $\mathbb{X}$-Sample Contrastive Loss: Improving Contrastive Learning + with Sample Similarity Graphs + + +
+ Learning good representations involves capturing the diverse ways in which +data samples relate. Contrastive loss - an objective matching related samples - +underlies methods from self-supervised to multimodal learning. Contrastive +losses, however, can be viewed more broadly as modifying a similarity graph to +indicate how samples should relate in the embedding space. This view reveals a +shortcoming in contrastive learning: the similarity graph is binary, as only +one sample is the related positive sample. Crucially, similarities +\textit{across} samples are ignored. Based on this observation, we revise the +standard contrastive loss to explicitly encode how a sample relates to others. +We experiment with this new objective, called $\mathbb{X}$-Sample Contrastive, +to train vision models based on similarities in class or text caption +descriptions. Our study spans three scales: ImageNet-1k with 1 million, CC3M +with 3 million, and CC12M with 12 million samples. The representations learned +via our objective outperform both contrastive self-supervised and +vision-language models trained on the same data across a range of tasks. When +training on CC12M, we outperform CLIP by $0.6\%$ on both ImageNet and ImageNet +Real. Our objective appears to work particularly well in lower-data regimes, +with gains over CLIP of $16.8\%$ on ImageNet and $18.1\%$ on ImageNet Real when +training with CC3M. Finally, our objective seems to encourage the model to +learn representations that separate objects from their attributes and +backgrounds, with gains of $3.3$-$5.6$\% over CLIP on ImageNet9. We hope the +proposed solution takes a small step towards developing richer learning +objectives for understanding sample relations in foundation models. + +
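
The core change described above, replacing one-hot contrastive targets with a soft cross-sample similarity graph, can be sketched as follows (function and variable names are illustrative, not the authors' code):

```python
import torch
import torch.nn.functional as F

def x_sample_contrastive(img_emb, txt_emb, target_sim, temperature=0.07):
    # Standard image-text logits, but targets come from a similarity graph
    # over samples instead of the binary identity matrix.
    img_emb = F.normalize(img_emb, dim=-1)
    txt_emb = F.normalize(txt_emb, dim=-1)
    logits = img_emb @ txt_emb.T / temperature            # (B, B)
    targets = F.softmax(target_sim / temperature, dim=-1)  # soft cross-sample relations
    return F.cross_entropy(logits, targets)               # cross-entropy with soft labels

B, d = 16, 128
img, txt = torch.randn(B, d), torch.randn(B, d)
# Toy similarity graph: 1 on the diagonal plus smaller weights for "related" pairs.
sim = torch.eye(B) + 0.3 * (torch.rand(B, B) > 0.8).float()
print(x_sample_contrastive(img, txt, sim))
```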
+
+
+
+
+ + ☆ Unsupervised Training of Neural Cellular Automata on Edge Devices + + +
+ The disparity in access to machine learning tools for medical imaging across +different regions significantly limits the potential for universal healthcare +innovation, particularly in remote areas. Our research addresses this issue by +implementing Neural Cellular Automata (NCA) training directly on smartphones +for accessible X-ray lung segmentation. We confirm the practicality and +feasibility of deploying and training these advanced models on five Android +devices, improving medical diagnostics accessibility and bridging the tech +divide to extend machine learning benefits in medical imaging to low- and +middle-income countries (LMICs). We further enhance this approach with an +unsupervised adaptation method using the novel Variance-Weighted Segmentation +Loss (VWSL), which efficiently learns from unlabeled data by minimizing the +variance from multiple NCA predictions. This strategy notably improves model +adaptability and performance across diverse medical imaging contexts without +the need for extensive computational resources or labeled datasets, effectively +lowering the participation threshold. Our methodology, tested on three +multisite X-ray datasets -- Padchest, ChestX-ray8, and MIMIC-III -- +demonstrates improvements in segmentation Dice accuracy by 0.7 to 2.8%, +compared to the classic Med-NCA. Additionally, in extreme cases where no +digital copy is available and images must be captured by a phone from an X-ray +lightbox or monitor, VWSL enhances Dice accuracy by 5-20%, demonstrating the +method's robustness even with suboptimal image sources. + +
+
+
+
+
+ + ☆ Graph Neural Ordinary Differential Equations for Coarse-Grained + Socioeconomic Dynamics + + +
+ We present a data-driven machine-learning approach for modeling space-time +socioeconomic dynamics. Through coarse-graining fine-scale observations, our +modeling framework simplifies these complex systems to a set of tractable +mechanistic relationships -- in the form of ordinary differential equations -- +while preserving critical system behaviors. This approach allows for expedited +'what if' studies and sensitivity analyses, essential for informed +policy-making. Our findings, from a case study of Baltimore, MD, indicate that +this machine learning-augmented coarse-grained model serves as a powerful +instrument for deciphering the complex interactions between social factors, +geography, and exogenous stressors, offering a valuable asset for system +forecasting and resilience planning. + +
+
+
+
+
+ + ☆ Fine-Tuning Large Language Models for Stock Return Prediction Using + Newsflow + + +
+ Large language models (LLMs) and their fine-tuning techniques have +demonstrated superior performance in various language understanding and +generation tasks. This paper explores fine-tuning LLMs for stock return +forecasting with financial newsflow. In quantitative investing, return +forecasting is fundamental for subsequent tasks like stock picking, portfolio +optimization, etc. We formulate the model to include text representation and +forecasting modules. We propose to compare the encoder-only and decoder-only +LLMs, considering they generate text representations in distinct ways. The +impact of these different representations on forecasting performance remains an +open question. Meanwhile, we compare two simple methods of integrating LLMs' +token-level representations into the forecasting module. The experiments on +real news and investment universes reveal that: (1) aggregated representations +from LLMs' token-level embeddings generally produce return predictions that +enhance the performance of long-only and long-short portfolios; (2) in the +relatively large investment universe, the decoder LLMs-based prediction model +leads to stronger portfolios, whereas in the small universes, there are no +consistent winners. Among the three LLMs studied (DeBERTa, Mistral, Llama), +Mistral performs more robustly across different universes; (3) return +predictions derived from LLMs' text representations are a strong signal for +portfolio construction, outperforming conventional sentiment scores. + +
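
A minimal sketch of one simple integration method of the kind mentioned above: mean-pooling an LLM's token-level representations and mapping the pooled vector to a return forecast. The component names and dimensions are assumptions for illustration:

```python
import torch
import torch.nn as nn

class NewsReturnHead(nn.Module):
    """Aggregate token-level hidden states from a (frozen) LLM and regress a return."""
    def __init__(self, hidden_dim=768):
        super().__init__()
        self.head = nn.Linear(hidden_dim, 1)

    def forward(self, token_states, attention_mask):
        mask = attention_mask.unsqueeze(-1).float()
        # Mean over real (non-padding) tokens only.
        pooled = (token_states * mask).sum(1) / mask.sum(1).clamp(min=1)
        return self.head(pooled).squeeze(-1)              # predicted return per news item

token_states = torch.randn(4, 32, 768)                    # e.g. hidden states from an LLM
attention_mask = torch.ones(4, 32, dtype=torch.long)
print(NewsReturnHead()(token_states, attention_mask).shape)   # torch.Size([4])
```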
+
+
+
+
+ + ☆ Principal-Agent Reinforcement Learning + + +
+ Contracts are the economic framework which allows a principal to delegate a +task to an agent -- despite misaligned interests, and even without directly +observing the agent's actions. In many modern reinforcement learning settings, +self-interested agents learn to perform a multi-stage task delegated to them by +a principal. We explore the significant potential of utilizing contracts to +incentivize the agents. We model the delegated task as an MDP, and study a +stochastic game between the principal and agent where the principal learns what +contracts to use, and the agent learns an MDP policy in response. We present a +learning-based algorithm for optimizing the principal's contracts, which +provably converges to the subgame-perfect equilibrium of the principal-agent +game. A deep RL implementation allows us to apply our method to very large MDPs +with unknown transition dynamics. We extend our approach to multiple agents, +and demonstrate its relevance to resolving a canonical sequential social +dilemma with minimal intervention to agent rewards. + +
+
+
+
+
+ + ☆ HVM-1: Large-scale video models pretrained with nearly 5000 hours of + human-like video data + + +
+ We introduce Human-like Video Models (HVM-1), large-scale video models +pretrained with nearly 5000 hours of curated human-like video data (mostly +egocentric, temporally extended, continuous video recordings), using the +spatiotemporal masked autoencoder (ST-MAE) algorithm. We release two 633M +parameter models trained at spatial resolutions of 224x224 and 448x448 pixels. +We evaluate the performance of these models in downstream few-shot video and +image recognition tasks and compare them against a model pretrained with 1330 +hours of short action-oriented video clips from YouTube (Kinetics-700). HVM-1 +models perform competitively against the Kinetics-700 pretrained model in +downstream evaluations despite substantial qualitative differences between the +spatiotemporal characteristics of the corresponding pretraining datasets. HVM-1 +models also learn more accurate and more robust object representations compared +to models pretrained with the image-based MAE algorithm on the same data, +demonstrating the potential benefits of learning to predict temporal +regularities in natural videos for learning better object representations. + +
+
+ comment: 10 pages, 5 figures, 1 table; code & models available from + https://github.com/eminorhan/hvm-1 +
+
+
+
+
+ + ☆ Multi-Agent Deep Reinforcement Learning for Resilience Optimization in + 5G RAN + + +
+ Resilience is defined as the ability of a network to resist, adapt, and +quickly recover from disruptions, and to continue to maintain an acceptable +level of services from users' perspective. With the advent of future radio +networks, including advanced 5G and upcoming 6G, critical services become +integral to future networks, requiring uninterrupted service delivery for end +users. Unfortunately, with the growing network complexity, user mobility and +diversity, it becomes challenging to scale current resilience management +techniques that rely on local optimizations to large dense network deployments. +This paper aims to address this problem by globally optimizing the resilience +of a dense multi-cell network based on multi-agent deep reinforcement learning. +Specifically, our proposed solution can dynamically tilt cell antennas and +reconfigure transmit power to mitigate outages and increase both coverage and +service availability. A multi-objective optimization problem is formulated to +simultaneously satisfy resiliency constraints while maximizing the service +quality in the network area in order to minimize the impact of outages on +neighbouring cells. Extensive simulations then demonstrate that with our +proposed solution, the average service availability in terms of user throughput +can be increased by up to 50-60% on average, while reaching a coverage +availability of 99% in best cases. + +
+
+
+
+
+ + ☆ Cross-Vendor Reproducibility of Radiomics-based Machine Learning Models + for Computer-aided Diagnosis + + +
+ Background: The reproducibility of machine-learning models in prostate cancer +detection across different MRI vendors remains a significant challenge. +Methods: This study investigates Support Vector Machines (SVM) and Random +Forest (RF) models trained on radiomic features extracted from T2-weighted MRI +images using Pyradiomics and MRCradiomics libraries. Feature selection was +performed using the maximum relevance minimum redundancy (MRMR) technique. We +aimed to enhance clinical decision support through multimodal learning and +feature fusion. Results: Our SVM model, utilizing combined features from +Pyradiomics and MRCradiomics, achieved an AUC of 0.74 on the Multi-Improd +dataset (Siemens scanner) but decreased to 0.60 on the Philips test set. The RF +model showed similar trends, with notable robustness for models using +Pyradiomics features alone (AUC of 0.78 on Philips). Conclusions: These +findings demonstrate the potential of multimodal feature integration to improve +the robustness and generalizability of machine-learning models for clinical +decision support in prostate cancer detection. This study marks a significant +step towards developing reliable AI-driven diagnostic tools that maintain +efficacy across various imaging platforms. + +
+
+
+
+
+ + ☆ I can listen but cannot read: An evaluation of two-tower multimodal + systems for instrument recognition + + +
+ Music two-tower multimodal systems integrate audio and text modalities into a +joint audio-text space, enabling direct comparison between songs and their +corresponding labels. These systems enable new approaches for classification +and retrieval, leveraging both modalities. Despite the promising results they +have shown for zero-shot classification and retrieval tasks, closer inspection +of the embeddings is needed. This paper evaluates the inherent zero-shot +properties of joint audio-text spaces for the case-study of instrument +recognition. We present an evaluation and analysis of two-tower systems for +zero-shot instrument recognition and a detailed analysis of the properties of +the pre-joint and joint embeddings spaces. Our findings suggest that audio +encoders alone demonstrate good quality, while challenges remain within the +text encoder or joint space projection. Specifically, two-tower systems exhibit +sensitivity towards specific words, favoring generic prompts over musically +informed ones. Despite the large size of textual encoders, they do not yet +leverage additional textual context or infer instruments accurately from their +descriptions. Lastly, a novel approach for quantifying the semantic +meaningfulness of the textual space leveraging an instrument ontology is +proposed. This method reveals deficiencies in the systems' understanding of +instruments and provides evidence of the need for fine-tuning text encoders on +musical data. + +
+
+ comment: Accepted to ISMIR 2024 +
+
+
+
+
+ + ☆ Physics-informed nonlinear vector autoregressive models for the + prediction of dynamical systems + + +
+ Machine learning techniques have recently been of great interest for solving +differential equations. Training these models is classically a data-fitting +task, but knowledge of the expression of the differential equation can be used +to supplement the training objective, leading to the development of +physics-informed scientific machine learning. In this article, we focus on one +class of models called nonlinear vector autoregression (NVAR) to solve ordinary +differential equations (ODEs). Motivated by connections to numerical +integration and physics-informed neural networks, we explicitly derive the +physics-informed NVAR (piNVAR) which enforces the right-hand side of the +underlying differential equation regardless of NVAR construction. Because NVAR +and piNVAR completely share their learned parameters, we propose an augmented +procedure to jointly train the two models. Then, using both data-driven and +ODE-driven metrics, we evaluate the ability of the piNVAR model to predict +solutions to various ODE systems, such as the undamped spring, a Lotka-Volterra +predator-prey nonlinear model, and the chaotic Lorenz system. + +
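
An illustrative sketch of the NVAR setup with an ODE-residual check, assuming a single-delay quadratic feature map and the toy equation dx/dt = -x (not the paper's systems or joint training procedure):

```python
import numpy as np

def nvar_features(x_hist):
    # Constant + linear + quadratic features of the (delayed) state, shape (T, d).
    lin = x_hist
    quad = np.einsum('ti,tj->tij', x_hist, x_hist).reshape(len(x_hist), -1)
    return np.hstack([np.ones((len(x_hist), 1)), lin, quad])

dt = 0.01
t = np.arange(0, 5, dt)
x = np.exp(-t)[:, None]                       # trajectory of dx/dt = -x
Phi = nvar_features(x[:-1])
target = x[1:]                                # next-step prediction target

W, *_ = np.linalg.lstsq(Phi, target, rcond=None)          # data-driven linear readout
pred = Phi @ W
# Physics check: the predicted step should satisfy the right-hand side -x.
physics_residual = (pred - x[:-1]) / dt - (-x[:-1])
print(np.abs(physics_residual).mean())
```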
+
+
+
+
+ + ☆ The Geometry of Queries: Query-Based Innovations in Retrieval-Augmented + Generation + + +
+ Digital health chatbots powered by Large Language Models (LLMs) have the
+potential to significantly improve personal health management for chronic
+conditions by providing accessible and on-demand health coaching and
+question-answering. However, these chatbots risk providing unverified and
+inaccurate information because LLMs generate responses based on patterns
+learned from diverse internet data. Retrieval Augmented Generation (RAG) can
+help mitigate hallucinations and inaccuracies in LLM responses by grounding
+responses in reliable content. However, efficiently and accurately retrieving
+the most relevant content for real-time user questions remains a challenge. In
+this work, we introduce Query-Based Retrieval Augmented Generation (QB-RAG), a
+novel approach that pre-computes a database of potential queries from a content
+base using LLMs. For an incoming patient question, QB-RAG efficiently matches
+it against this pre-generated query database using vector search, improving
+alignment between user questions and the content. We establish a theoretical
+foundation for QB-RAG and provide a comparative analysis of existing retrieval
+enhancement techniques for RAG systems. Finally, our empirical evaluation
+demonstrates that QB-RAG significantly improves the accuracy of healthcare
+question answering, paving the way for robust and trustworthy LLM applications
+in digital health.
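
A hedged sketch of the QB-RAG retrieval step: an incoming question is matched against a pre-generated query base (questions an LLM produced for each content chunk), and the chunks behind the best-matching queries are returned. The embedding function here is a placeholder, and all names are illustrative:

```python
import numpy as np

def embed(texts):
    # Stand-in for a real embedding model: deterministic pseudo-embeddings per text.
    return np.stack([np.random.default_rng(abs(hash(t)) % 2**32).normal(size=64)
                     for t in texts])

content = ["Chunk about insulin dosing.", "Chunk about sleep hygiene."]
pregen_queries = ["How should I adjust insulin?", "How can I sleep better?"]
query_to_chunk = [0, 1]                               # which chunk each query came from

Q = embed(pregen_queries)
Q = Q / np.linalg.norm(Q, axis=1, keepdims=True)      # pre-computed query base

def retrieve(user_question, top_k=1):
    q = embed([user_question])[0]
    q = q / np.linalg.norm(q)
    scores = Q @ q                                    # cosine similarity vs. query base
    best = np.argsort(-scores)[:top_k]
    return [content[query_to_chunk[i]] for i in best]

print(retrieve("What should my insulin dose be?"))
```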
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Lifelong Graph Summarization with Neural Networks: 2012, 2022, and a + Time Warp + + +
+ Summarizing web graphs is challenging due to the heterogeneity of the modeled +information and its changes over time. We investigate the use of neural +networks for lifelong graph summarization. Assuming we observe the web graph at +a certain time, we train the networks to summarize graph vertices. We apply +this trained network to summarize the vertices of the changed graph at the next +point in time. Subsequently, we continue training and evaluating the network to +perform lifelong graph summarization. We use the GNNs Graph-MLP and GraphSAINT, +as well as an MLP baseline, to summarize the temporal graphs. We compare +$1$-hop and $2$-hop summaries. We investigate the impact of reusing parameters +from a previous snapshot by measuring the backward and forward transfer and the +forgetting rate of the neural networks. Our extensive experiments on ten weekly +snapshots of a web graph with over $100$M edges, sampled in 2012 and 2022, show +that all networks predominantly use $1$-hop information to determine the +summary, even when performing $2$-hop summarization. Due to the heterogeneity +of web graphs, in some snapshots, the $2$-hop summary produces over ten times +more vertex summaries than the $1$-hop summary. When using the network trained +on the last snapshot from 2012 and applying it to the first snapshot of 2022, +we observe a strong drop in accuracy. We attribute this drop over the ten-year +time warp to the strongly increased heterogeneity of the web graph in 2022. + +
+
+
+
+
+ + ☆ How to Train the Teacher Model for Effective Knowledge Distillation ECCV2024 + + +
+ Recently, it was shown that the role of the teacher in knowledge distillation +(KD) is to provide the student with an estimate of the true Bayes conditional +probability density (BCPD). Notably, the new findings propose that the +student's error rate can be upper-bounded by the mean squared error (MSE) +between the teacher's output and BCPD. Consequently, to enhance KD efficacy, +the teacher should be trained such that its output is close to BCPD in MSE +sense. This paper elucidates that training the teacher model with MSE loss +equates to minimizing the MSE between its output and BCPD, aligning with its +core responsibility of providing the student with a BCPD estimate closely +resembling it in MSE terms. In this respect, through a comprehensive set of +experiments, we demonstrate that substituting the conventional teacher trained +with cross-entropy loss with one trained using MSE loss in state-of-the-art KD +methods consistently boosts the student's accuracy, resulting in improvements +of up to 2.6\%. + +
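
The recipe amounts to replacing the teacher's cross-entropy objective with an MSE between its softmax output and the one-hot label. A minimal sketch, assuming a plain classifier (not the authors' training code):

```python
import torch
import torch.nn.functional as F

def teacher_mse_loss(logits, labels, num_classes):
    # Train the teacher so its softmax output approaches the (one-hot) label
    # in the MSE sense, i.e. an MSE estimate of the Bayes conditional probability.
    probs = F.softmax(logits, dim=-1)
    one_hot = F.one_hot(labels, num_classes).float()
    return F.mse_loss(probs, one_hot)

logits = torch.randn(8, 10, requires_grad=True)
labels = torch.randint(0, 10, (8,))
loss = teacher_mse_loss(logits, labels, 10)
loss.backward()
print(loss.item())
```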
+
+ comment: The paper was accepted at ECCV2024 +
+
+
+
+
+ + ☆ Peak-Controlled Logits Poisoning Attack in Federated Distillation + + +
+ Federated Distillation (FD) offers an innovative approach to distributed +machine learning, leveraging knowledge distillation for efficient and flexible +cross-device knowledge transfer without necessitating the upload of extensive +model parameters to a central server. While FD has gained popularity, its +vulnerability to poisoning attacks remains underexplored. To address this gap, +we previously introduced FDLA (Federated Distillation Logits Attack), a method +that manipulates logits communication to mislead and degrade the performance of +client models. However, the impact of FDLA on participants with different +identities and the effects of malicious modifications at various stages of +knowledge transfer remain unexplored. To this end, we present PCFDLA +(Peak-Controlled Federated Distillation Logits Attack), an advanced and more +stealthy logits poisoning attack method for FD. PCFDLA enhances the +effectiveness of FDLA by carefully controlling the peak values of logits to +create highly misleading yet inconspicuous modifications. Furthermore, we +introduce a novel metric for better evaluating attack efficacy, demonstrating +that PCFDLA maintains stealth while being significantly more disruptive to +victim models compared to its predecessors. Experimental results across various +datasets confirm the superior impact of PCFDLA on model accuracy, solidifying +its potential threat in federated distillation systems. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2401.03685 +
+
+
+
+
+ + ☆ ECG Arrhythmia Detection Using Disease-specific Attention-based Deep + Learning Model + + +
+ The electrocardiogram (ECG) is one of the most commonly used tools to
+diagnose cardiovascular disease in clinical practice. Although deep learning
+models have achieved very impressive success in the field of automatic ECG
+analysis, they often lack the model interpretability that is critically
+important in healthcare applications. To this end, many schemes, such as
+general-purpose attention mechanisms, the Grad-CAM technique, and ECG knowledge
+graphs, have been proposed for integration with deep learning models. However,
+they either result in decreased classification performance or are not
+consistent with how cardiologists interpret ECGs. In this study, we propose a
+novel disease-specific attention-based deep learning model (DANet) for
+arrhythmia detection from short ECG recordings. The novel idea is to introduce
+a soft-coding or hard-coding waveform enhanced module into existing deep neural
+networks, which amends original ECG signals with the guidance of the rule for
+diagnosis of a given disease type before being fed into the classification
+module. For the soft-coding DANet, we also develop a learning framework
+combining self-supervised pre-training with two-stage supervised training. To
+verify the effectiveness of our proposed DANet, we applied it to the problem of
+atrial premature contraction detection, and the experimental results show that
+it demonstrates superior performance compared to the benchmark model. Moreover,
+it also provides the waveform regions that deserve special attention in the
+model's decision-making process, allowing it to be a medical diagnostic
+assistant for physicians.
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Learning mental states estimation through self-observation: a + developmental synergy between intentions and beliefs representations in a + deep-learning model of Theory of Mind + + +
+ Theory of Mind (ToM), the ability to attribute beliefs, intentions, or mental +states to others, is a crucial feature of human social interaction. In complex +environments, where the human sensory system reaches its limits, behaviour is +strongly driven by our beliefs about the state of the world around us. +Accessing others' mental states, e.g., beliefs and intentions, allows for more +effective social interactions in natural contexts. Yet, these variables are not +directly observable, making understanding ToM a challenging quest of interest +for different fields, including psychology, machine learning and robotics. In +this paper, we contribute to this topic by showing a developmental synergy +between learning to predict low-level mental states (e.g., intentions, goals) +and attributing high-level ones (i.e., beliefs). Specifically, we assume that +learning beliefs attribution can occur by observing one's own decision +processes involving beliefs, e.g., in a partially observable environment. Using +a simple feed-forward deep learning model, we show that, when learning to +predict others' intentions and actions, more accurate predictions can be +acquired earlier if beliefs attribution is learnt simultaneously. Furthermore, +we show that the learning performance improves even when observed actors have a +different embodiment than the observer and the gain is higher when observing +beliefs-driven chunks of behaviour. We propose that our computational approach +can inform the understanding of human social cognitive development and be +relevant for the design of future adaptive social robots able to autonomously +understand, assist, and learn from human interaction partners in novel natural +environments and tasks. + +
+
+
+
+
+ + ☆ Quadratic Advantage with Quantum Randomized Smoothing Applied to + Time-Series Analysis + + +
+ As quantum machine learning continues to develop at a rapid pace, the +importance of ensuring the robustness and efficiency of quantum algorithms +cannot be overstated. Our research presents an analysis of quantum randomized +smoothing, how data encoding and perturbation modeling approaches can be +matched to achieve meaningful robustness certificates. By utilizing an +innovative approach integrating Grover's algorithm, a quadratic sampling +advantage over classical randomized smoothing is achieved. This strategy +necessitates a basis state encoding, thus restricting the space of meaningful +perturbations. We show how constrained $k$-distant Hamming weight perturbations +are a suitable noise distribution here, and elucidate how they can be +constructed on a quantum computer. The efficacy of the proposed framework is +demonstrated on a time series classification task employing a Bag-of-Words +pre-processing solution. The advantage of quadratic sample reduction is +recovered especially in the regime with large number of samples. This may allow +quantum computers to efficiently scale randomized smoothing to more complex +tasks beyond the reach of classical methods. + +
+
+ comment: Accepted at the IEEE International Conference on Quantum Computing + and Engineering (QCE) +
+
+
+
+
+ + ☆ Self-Supervision Improves Diffusion Models for Tabular Data Imputation CIKM 2024 + + +
+ The ubiquity of missing data has sparked considerable attention and focus on +tabular data imputation methods. Diffusion models, recognized as the +cutting-edge technique for data generation, demonstrate significant potential +in tabular data imputation tasks. However, in pursuit of diversity, vanilla +diffusion models often exhibit sensitivity to initialized noises, which hinders +the models from generating stable and accurate imputation results. +Additionally, the sparsity inherent in tabular data poses challenges for +diffusion models in accurately modeling the data manifold, impacting the +robustness of these models for data imputation. To tackle these challenges, +this paper introduces an advanced diffusion model named Self-supervised +imputation Diffusion Model (SimpDM for brevity), specifically tailored for +tabular data imputation tasks. To mitigate sensitivity to noise, we introduce a +self-supervised alignment mechanism that aims to regularize the model, ensuring +consistent and stable imputation predictions. Furthermore, we introduce a +carefully devised state-dependent data augmentation strategy within SimpDM, +enhancing the robustness of the diffusion model when dealing with limited data. +Extensive experiments demonstrate that SimpDM matches or outperforms +state-of-the-art imputation methods across various scenarios. + +
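
A loose sketch of the self-supervised alignment idea: impute the same masked entries under two independently sampled noises and penalise disagreement on the missing cells. The denoiser below is a stand-in, and the loss form is an assumption for illustration:

```python
import torch

def alignment_loss(denoiser, x_obs, mask):
    # mask: 1 = observed, 0 = missing. Impute twice with independent noises
    # and require the two imputations to agree on the missing cells.
    noise1, noise2 = torch.randn_like(x_obs), torch.randn_like(x_obs)
    imp1 = denoiser(torch.where(mask.bool(), x_obs, noise1), mask)
    imp2 = denoiser(torch.where(mask.bool(), x_obs, noise2), mask)
    return ((imp1 - imp2) ** 2 * (1 - mask)).mean()

denoiser = lambda x, m: x * 0.9                        # stand-in for a trained diffusion imputer
x = torch.randn(16, 8)
mask = (torch.rand(16, 8) > 0.3).float()
print(alignment_loss(denoiser, x, mask))
```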
+
+ comment: 10 pages, 5 figures. Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ HANNA: Hard-constraint Neural Network for Consistent Activity + Coefficient Prediction + + +
+ We present the first hard-constraint neural network for predicting activity +coefficients (HANNA), a thermodynamic mixture property that is the basis for +many applications in science and engineering. Unlike traditional neural +networks, which ignore physical laws and result in inconsistent predictions, +our model is designed to strictly adhere to all thermodynamic consistency +criteria. By leveraging deep-set neural networks, HANNA maintains symmetry +under the permutation of the components. Furthermore, by hard-coding physical +constraints in the network architecture, we ensure consistency with the +Gibbs-Duhem equation and in modeling the pure components. The model was trained +and evaluated on 317,421 data points for activity coefficients in binary +mixtures from the Dortmund Data Bank, achieving significantly higher prediction +accuracies than the current state-of-the-art model UNIFAC. Moreover, HANNA only +requires the SMILES of the components as input, making it applicable to any +binary mixture of interest. HANNA is fully open-source and available for free +use. + +
+
+
+
+
+ + ☆ Network Inversion of Convolutional Neural Nets + + +
+ Neural networks have emerged as powerful tools across various applications,
+yet their decision-making process often remains opaque, leading to them being
+perceived as "black boxes." This opacity raises concerns about their
+interpretability and reliability, especially in safety-critical scenarios.
+Network inversion techniques offer a solution by allowing us to peek inside
+these black boxes, revealing the features and patterns learned by the networks
+behind their decision-making processes and thereby providing valuable insights
+into how neural networks arrive at their conclusions, making them more
+interpretable and trustworthy. This paper presents a simple yet effective
+approach to network inversion using a carefully conditioned generator that
+learns the data distribution in the input space of the trained neural network,
+enabling the reconstruction of inputs that would most likely lead to the
+desired outputs. To capture the diversity in the input space for a given
+output, instead of simply revealing the conditioning labels to the generator,
+we encode the conditioning label information into vectors, complemented by
+heavy dropout in the generation process and minimisation of cosine similarity
+between the features corresponding to the generated images. The paper concludes
+with immediate applications of Network Inversion, including interpretability,
+explainability, and the generation of adversarial samples.
+
+
+
+
+ + ☆ Lightweight Industrial Cohorted Federated Learning for Heterogeneous + Assets + + +
+ Federated Learning (FL) is the most widely adopted collaborative learning
+approach for training decentralized Machine Learning (ML) models by exchanging
+learning between clients without sharing the data or compromising privacy.
+However, since great data similarity or homogeneity is taken for granted in all
+FL tasks, FL is still not specifically designed for the industrial setting.
+This is rarely the case for industrial data, because there are differences in
+machine type, firmware version, operational conditions, environmental factors,
+and hence, data distribution. Despite its popularity, it has been observed that
+FL performance degrades if the clients have heterogeneous data distributions.
+Therefore, we propose a Lightweight Industrial Cohorted FL (LICFL) algorithm
+that uses model parameters for cohorting without any additional on-edge
+(client-level) computations or communications beyond standard FL and mitigates
+the shortcomings from data heterogeneity in industrial applications. Our
+approach enhances client-level model performance by allowing clients to
+collaborate with similar clients and train more specialized or personalized
+models. Also, we propose an adaptive aggregation algorithm that extends the
+LICFL to Adaptive LICFL (ALICFL) for further improving the global model
+performance and speeding up the convergence. Through numerical experiments on
+real-time data, we demonstrate the efficacy of the proposed algorithms and
+compare the performance with existing approaches.
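
A minimal sketch of parameter-based cohorting followed by per-cohort averaging; k-means on flattened client parameters is an assumed clustering choice, not necessarily the algorithm used in the paper:

```python
import numpy as np
from sklearn.cluster import KMeans

def cohorted_fedavg(client_params, n_cohorts=2):
    # Cluster clients by their flattened model parameters (no extra on-client
    # computation beyond what standard FL already uploads), then average per cohort.
    X = np.stack([p.ravel() for p in client_params])
    cohorts = KMeans(n_clusters=n_cohorts, n_init=10, random_state=0).fit_predict(X)
    models = {c: X[cohorts == c].mean(axis=0) for c in range(n_cohorts)}
    return cohorts, models

# Two synthetic "machine types" with shifted parameter distributions.
clients = [np.random.randn(100) + (i % 2) * 3 for i in range(8)]
cohorts, models = cohorted_fedavg(clients)
print(cohorts)
```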
+
+
+
+
+ + ☆ iNNspector: Visual, Interactive Deep Model Debugging + + +
+ Deep learning model design, development, and debugging is a process driven by +best practices, guidelines, trial-and-error, and the personal experiences of +model developers. At multiple stages of this process, performance and internal +model data can be logged and made available. However, due to the sheer +complexity and scale of this data and process, model developers often resort to +evaluating their model performance based on abstract metrics like accuracy and +loss. We argue that a structured analysis of data along the model's +architecture and at multiple abstraction levels can considerably streamline the +debugging process. Such a systematic analysis can further connect the +developer's design choices to their impacts on the model behavior, facilitating +the understanding, diagnosis, and refinement of deep learning models. Hence, in +this paper, we (1) contribute a conceptual framework structuring the data space +of deep learning experiments. Our framework, grounded in literature analysis +and requirements interviews, captures design dimensions and proposes mechanisms +to make this data explorable and tractable. To operationalize our framework in +a ready-to-use application, we (2) present the iNNspector system. iNNspector +enables tracking of deep learning experiments and provides interactive +visualizations of the data on all levels of abstraction from multiple models to +individual neurons. Finally, we (3) evaluate our approach with three real-world +use-cases and a user study with deep learning developers and data analysts, +proving its effectiveness and usability. + +
+
+ comment: 41 pages paper, 4 pages references, 3 pages appendix, 19 figures, 2 + tables +
+
+
+
+
+ + ☆ On the Effect of Purely Synthetic Training Data for Different Automatic + Speech Recognition Architectures + + +
+ In this work we evaluate the utility of synthetic data for training automatic +speech recognition (ASR). We use the ASR training data to train a +text-to-speech (TTS) system similar to FastSpeech-2. With this TTS we reproduce +the original training data, training ASR systems solely on synthetic data. For +ASR, we use three different architectures, attention-based encoder-decoder, +hybrid deep neural network hidden Markov model and a Gaussian mixture hidden +Markov model, showing the different sensitivity of the models to synthetic data +generation. In order to extend previous work, we present a number of ablation +studies on the effectiveness of synthetic vs. real training data for ASR. In +particular we focus on how the gap between training on synthetic and real data +changes by varying the speaker embedding or by scaling the model size. For the +latter we show that the TTS models generalize well, even when training scores +indicate overfitting. + +
+
+ comment: Accepted at the SynData4GenAI 2024 workshop +
+
+
+
+
+ + ☆ Amortized Active Learning for Nonparametric Functions + + +
+ Active learning (AL) is a sequential learning scheme aiming to select the +most informative data. AL reduces data consumption and avoids the cost of +labeling large amounts of data. However, AL trains the model and solves an +acquisition optimization for each selection. It becomes expensive when the +model training or acquisition optimization is challenging. In this paper, we +focus on active nonparametric function learning, where the gold standard +Gaussian process (GP) approaches suffer from cubic time complexity. We propose +an amortized AL method, where new data are suggested by a neural network which +is trained up-front without any real data (Figure 1). Our method avoids +repeated model training and requires no acquisition optimization during the AL +deployment. We (i) utilize GPs as function priors to construct an AL simulator, +(ii) train an AL policy that can zero-shot generalize from simulation to real +learning problems of nonparametric functions and (iii) achieve real-time data +selection and comparable learning performances to time-consuming baseline +methods. + +
+
+
+
+
+ + ☆ Relating the Seemingly Unrelated: Principled Understanding of + Generalization for Generative Models in Arithmetic Reasoning Tasks + + +
+ Large language models (LLMs) have demonstrated impressive versatility across +numerous tasks, yet their generalization capabilities remain poorly understood. +To investigate these behaviors, arithmetic tasks serve as important venues. In +previous studies, seemingly unrelated mysteries still exist -- (1) models with +appropriate positional embeddings can correctly perform longer unseen +arithmetic operations such as addition, but their effectiveness varies in more +complex tasks like multiplication; (2) models perform well for longer unseen +cases in modular addition under specific moduli (e.g., modulo 100) but struggle +under very close moduli (e.g., modulo 101), regardless of the positional +encoding used. We believe previous studies have been treating the symptoms +rather than addressing the root cause -- they have paid excessive attention to +improving model components, while overlooking the differences in task +properties that may be the real drivers. This is confirmed by our unified +theoretical framework for different arithmetic scenarios. For example, unlike +multiplication, the digital addition task has the property of translation +invariance which naturally aligns with the relative positional encoding, and +this combination leads to successful generalization of addition to unseen +longer domains. The discrepancy in operations modulo 100 and 101 arises from +the base. Modulo 100, unlike 101, is compatible with the decimal system (base +10), such that unseen information in digits beyond the units digit and the tens +digit is actually not needed for the task. Extensive experiments with GPT-like +models validate our theoretical predictions. These findings deepen our +understanding of the generalization mechanisms, and facilitate more +data-efficient model training and objective-oriented AI alignment. + +
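The base-compatibility argument can be checked directly: modulo 100, the sum of two integers depends only on their last two decimal digits, while modulo 101 it does not. A tiny self-contained check (illustrative only, not from the paper):

```python
# Modulo 100, truncating both operands to their last two decimal digits never
# changes the answer; modulo 101, higher digits matter, so length
# generalization cannot rely on low digits alone.
import random

def needs_only_two_digits(modulus, trials=10_000):
    for _ in range(trials):
        a, b = random.randrange(10**6), random.randrange(10**6)
        truncated = (a % 100 + b % 100) % modulus   # keep only units and tens digits
        if truncated != (a + b) % modulus:
            return False
    return True

print(needs_only_two_digits(100))  # True: higher digits are irrelevant
print(needs_only_two_digits(101))  # False: higher digits matter
```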
+
+
+
+
+ + ☆ Neural Networks for Generating Better Local Optima in Topology + Optimization + + +
+ Neural networks have recently been employed as material discretizations +within adjoint optimization frameworks for inverse problems and topology +optimization. While advantageous regularization effects and better optima have +been found for some inverse problems, the benefit for topology optimization has +been limited -- where the focus of investigations has been the compliance +problem. We demonstrate how neural network material discretizations can, under +certain conditions, find better local optima in more challenging optimization +problems, where we here specifically consider acoustic topology optimization. +The chances of identifying a better optimum can significantly be improved by +running multiple partial optimizations with different neural network +initializations. Furthermore, we show that the neural network material +discretization's advantage comes from the interplay with the Adam optimizer and +emphasize its current limitations when competing with constrained and +higher-order optimization techniques. At the moment, this discretization has +only been shown to be beneficial for unconstrained first-order optimization. + +
+
+
+
+
+ + ☆ Scaling Training Data with Lossy Image Compression + + +
+ Empirically-determined scaling laws have been broadly successful in +predicting the evolution of large machine learning models with training data +and number of parameters. As a consequence, they have been useful for +optimizing the allocation of limited resources, most notably compute time. + In certain applications, storage space is an important constraint, and data +format needs to be chosen carefully as a consequence. Computer vision is a +prominent example: images are inherently analog, but are always stored in a +digital format using a finite number of bits. Given a dataset of digital +images, the number of bits $L$ to store each of them can be further reduced +using lossy data compression. This, however, can degrade the quality of the +model trained on such images, since each example has lower resolution. + In order to capture this trade-off and optimize storage of training data, we +propose a `storage scaling law' that describes the joint evolution of test +error with sample size and number of bits per image. We prove that this law +holds within a stylized model for image compression, and verify it empirically +on two computer vision tasks, extracting the relevant parameters. We then show +that this law can be used to optimize the lossy compression level. At given +storage, models trained on optimally compressed images present a significantly +smaller test error with respect to models trained on the original data. +Finally, we investigate the potential benefits of randomizing the compression +level. + +
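As an illustration, a scaling-law surface of this kind can be fitted to (sample size, bits per image, test error) measurements and then minimized under a fixed storage budget. The additive power-law form and all numbers below are assumptions for the sketch, not the paper's fitted law.

```python
# Illustrative sketch: fit an assumed surface err(n, L) ~ a*n**-b + c*L**-d + e
# and pick the compression level minimizing predicted error at a storage budget.
import numpy as np
from scipy.optimize import curve_fit

def storage_law(x, a, b, c, d, e):
    n, L = x
    return a * n**(-b) + c * L**(-d) + e

# (n, L, test_error) triples, e.g. from sweeping JPEG quality levels (made up).
n = np.array([1e4, 1e4, 1e5, 1e5, 1e6, 1e6])
L = np.array([2e3, 2e4, 2e3, 2e4, 2e3, 2e4])        # bits per image
err = np.array([0.32, 0.30, 0.22, 0.19, 0.17, 0.13])

popt, _ = curve_fit(storage_law, (n, L), err, p0=[1, 0.3, 1, 0.3, 0.05], maxfev=20000)

budget = 1e10                                        # total bits available
L_grid = np.logspace(3, 5, 50)                       # candidate bits per image
pred = storage_law((budget / L_grid, L_grid), *popt) # n = budget / L
print("best bits/image under budget:", L_grid[np.argmin(pred)])
```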
+
+ comment: 21 pages, 27 figures +
+
+
+
+
+ + ☆ Real Time American Sign Language Detection Using Yolo-v9 + + +
+ This paper focuses on real-time American Sign Language Detection. YOLO is a
+convolutional neural network (CNN) based model, which was first released in
+2015. In recent years, it gained popularity for its real-time detection
+capabilities. Our study specifically targets the YOLO-v9 model, released in
+2024. As the model is newly introduced, not much work has been done on it,
+especially not in Sign Language Detection. Our paper provides deep insight into
+how YOLO-v9 works and how it improves on previous models.
+
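As a hedged illustration of how such a detector is typically fine-tuned and run in real time, the ultralytics package ships YOLO-v9 weights; the dataset file asl.yaml below is hypothetical and the snippet is not the authors' training setup.

```python
# Sketch of fine-tuning and live inference with a YOLO-v9 checkpoint via the
# ultralytics package; "asl.yaml" (box annotations for ASL signs) is assumed.
from ultralytics import YOLO

model = YOLO("yolov9c.pt")                            # pretrained YOLO-v9 weights
model.train(data="asl.yaml", epochs=100, imgsz=640)   # fine-tune on ASL boxes
model.predict(source=0, show=True, conf=0.5)          # source=0: live webcam stream
```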
+
+ comment: 11 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Fast convergence of the Expectation Maximization algorithm under a + logarithmic Sobolev inequality + + +
+ By utilizing recently developed tools for constructing gradient flows on +Wasserstein spaces, we extend an analysis technique commonly employed to +understand alternating minimization algorithms on Euclidean space to the +Expectation Maximization (EM) algorithm via its representation as +coordinate-wise minimization on the product of a Euclidean space and a space of +probability distributions due to Neal and Hinton (1998). In so doing we obtain +finite sample error bounds and exponential convergence of the EM algorithm +under a natural generalisation of a log-Sobolev inequality. We further +demonstrate that the analysis technique is sufficiently flexible to allow also +the analysis of several variants of the EM algorithm. + +
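For reference, the Neal-Hinton coordinate-wise view mentioned above is commonly written as alternating maximization of a single free-energy objective (standard notation, not necessarily the paper's):

```latex
% Free-energy view of EM: both steps are coordinate ascent on F.
\begin{aligned}
F(q,\theta) &= \mathbb{E}_{q(z)}\!\left[\log p(x,z\mid\theta)\right] + \mathrm{H}(q)
             = \log p(x\mid\theta) - \mathrm{KL}\!\left(q(z)\,\|\,p(z\mid x,\theta)\right),\\
\text{E-step:}\quad q^{(t+1)} &= \arg\max_{q} F\!\left(q,\theta^{(t)}\right) = p\!\left(z\mid x,\theta^{(t)}\right),\\
\text{M-step:}\quad \theta^{(t+1)} &= \arg\max_{\theta} F\!\left(q^{(t+1)},\theta\right).
\end{aligned}
```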
+
+
+
+
+ + ☆ Comparison of different Artificial Neural Networks for Bitcoin price + forecasting + + +
+ This study investigates the impact of varying sequence lengths on the
+accuracy of predicting cryptocurrency returns using Artificial Neural Networks
+(ANNs). Utilizing the Mean Absolute Error (MAE) as a threshold criterion, we
+aim to enhance prediction accuracy by excluding returns that are smaller than
+this threshold, thus mitigating errors associated with minor returns. The
+subsequent evaluation focuses on the accuracy of predicted returns that exceed
+this threshold. We compare four sequence lengths: 168 hours (7 days), 72 hours
+(3 days), 24 hours, and 12 hours, each with a return prediction interval of 2
+hours. Our findings reveal the influence of sequence length on prediction
+accuracy and underscore the potential for optimized sequence configurations in
+financial forecasting models.
+
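One way to read the thresholded evaluation above (my reading, not the authors' code): take the model's MAE as the cut-off, keep only predictions whose magnitude exceeds it, and score directional accuracy on the kept cases.

```python
# Sketch of MAE-thresholded evaluation of return forecasts (interpretation of
# the protocol, function and variable names are illustrative).
import numpy as np

def filtered_accuracy(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    threshold = np.mean(np.abs(y_true - y_pred))            # MAE as the cut-off
    keep = np.abs(y_pred) >= threshold                       # ignore minor predicted returns
    hits = np.sign(y_pred[keep]) == np.sign(y_true[keep])    # direction correct?
    return hits.mean(), keep.mean()

acc, coverage = filtered_accuracy([0.01, -0.04, 0.002, 0.03], [0.02, -0.03, 0.01, -0.01])
print(f"accuracy on kept returns: {acc:.2f}, fraction kept: {coverage:.2f}")
```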
+
+ comment: 9 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ Guided Latent Slot Diffusion for Object-Centric Learning + + +
+ Slot attention aims to decompose an input image into a set of meaningful +object files (slots). These latent object representations enable various +downstream tasks. Yet, these slots often bind to object parts, not objects +themselves, especially for real-world datasets. To address this, we introduce +Guided Latent Slot Diffusion - GLASS, an object-centric model that uses +generated captions as a guiding signal to better align slots with objects. Our +key insight is to learn the slot-attention module in the space of generated +images. This allows us to repurpose the pre-trained diffusion decoder model, +which reconstructs the images from the slots, as a semantic mask generator +based on the generated captions. GLASS learns an object-level representation +suitable for multiple tasks simultaneously, e.g., segmentation, image +generation, and property prediction, outperforming previous methods. For object +discovery, GLASS achieves approx. a +35% and +10% relative improvement for mIoU +over the previous state-of-the-art (SOTA) method on the VOC and COCO datasets, +respectively, and establishes a new SOTA FID score for conditional image +generation amongst slot-attention-based methods. For the segmentation task, +GLASS surpasses SOTA weakly-supervised and language-based segmentation models, +which were specifically designed for the task. + +
+
+ comment: Project Page: https://guided-sa.github.io +
+
+
+
+
+ + ☆ Causal Deepsets for Off-policy Evaluation under Spatial or + Spatio-temporal Interferences + + +
+ Off-policy evaluation (OPE) is widely applied in sectors such as +pharmaceuticals and e-commerce to evaluate the efficacy of novel products or +policies from offline datasets. This paper introduces a causal deepset +framework that relaxes several key structural assumptions, primarily the +mean-field assumption, prevalent in existing OPE methodologies that handle +spatio-temporal interference. These traditional assumptions frequently prove +inadequate in real-world settings, thereby restricting the capability of +current OPE methods to effectively address complex interference effects. In +response, we advocate for the implementation of the permutation invariance (PI) +assumption. This innovative approach enables the data-driven, adaptive learning +of the mean-field function, offering a more flexible estimation method beyond +conventional averaging. Furthermore, we present novel algorithms that +incorporate the PI assumption into OPE and thoroughly examine their theoretical +foundations. Our numerical analyses demonstrate that this novel approach yields +significantly more precise estimations than existing baseline algorithms, +thereby substantially improving the practical applicability and effectiveness +of OPE methodologies. A Python implementation of our proposed method is +available at https://github.com/BIG-S2/Causal-Deepsets. + +
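A generic DeepSets block illustrates the permutation-invariance idea behind replacing mean-field averaging with a learned aggregation; this is not the authors' estimator, and the layer sizes are arbitrary.

```python
# Generic DeepSets aggregator: neighbours' features are encoded by phi, pooled
# by a symmetric sum, and decoded by rho, so the output is invariant to the
# ordering of neighbouring units.
import torch
import torch.nn as nn

class DeepSetsAggregator(nn.Module):
    def __init__(self, in_dim, hidden=64, out_dim=1):
        super().__init__()
        self.phi = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, hidden))
        self.rho = nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU(),
                                 nn.Linear(hidden, out_dim))

    def forward(self, neighbours):                 # (batch, n_neighbours, in_dim)
        pooled = self.phi(neighbours).sum(dim=1)   # symmetric pooling -> permutation invariant
        return self.rho(pooled)

x = torch.randn(8, 5, 4)                           # 8 units, 5 neighbours, 4 features each
model = DeepSetsAggregator(in_dim=4)
assert torch.allclose(model(x), model(x[:, torch.randperm(5), :]), atol=1e-5)
```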
+
+
+
+
+ + ☆ Separating Novel Features for Logical Anomaly Detection: A + Straightforward yet Effective Approach + + +
+ Vision-based inspection algorithms have significantly contributed to quality
+control in industrial settings, particularly in addressing structural defects
+like dents and contamination, which are prevalent in mass production. Extensive
+research efforts have led to the development of related benchmarks such as
+MVTec AD (Bergmann et al., 2019). However, in industrial settings, there can be
+instances of logical defects, where acceptable items are found in unsuitable
+locations or product pairs do not match as expected. Recent methods tackling
+logical defects effectively employ knowledge distillation to generate
+difference maps. Knowledge distillation (KD) is used to learn the normal data
+distribution in an unsupervised manner. Despite their effectiveness, these
+methods often overlook potential false negatives. Excessive similarity between
+the teacher network and the student network can hinder the generation of a
+suitable difference map for logical anomaly detection. This technical report
+provides insights on handling potential false negatives by utilizing a simple
+constraint in KD-based logical anomaly detection methods. We select EfficientAD
+as a state-of-the-art baseline and apply a margin-based constraint to its
+unsupervised learning scheme. Applying this constraint, we improve the AUROC
+for MVTec LOCO AD by 1.3%.
+
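One plausible form of a margin-based constraint, shown purely as an assumption for illustration (the report's exact formulation may differ): the student is pulled toward the teacher only while their feature distance exceeds a margin, so the two networks never become indistinguishable and the difference map keeps signal for logical anomalies.

```python
# Hypothetical hinge-style distillation constraint: zero loss once the
# student-teacher feature distance is within the margin.
import torch
import torch.nn.functional as F

def margin_distillation_loss(student_feat, teacher_feat, margin=0.1):
    dist = F.mse_loss(student_feat, teacher_feat, reduction="none").mean(dim=1)
    return F.relu(dist - margin).mean()

s = torch.randn(16, 384, requires_grad=True)   # student features (toy shapes)
t = torch.randn(16, 384)                       # teacher features
print(margin_distillation_loss(s, t).item())
```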
+
+
+
+
+ + ☆ Amortized Posterior Sampling with Diffusion Prior Distillation + + +
+ We propose a variational inference approach to sample from the posterior +distribution for solving inverse problems. From a pre-trained diffusion model, +our approach trains a conditional flow model to minimize the divergence between +the proposal variational distribution and the posterior distribution implicitly +defined through the diffusion model. Once trained, the flow model is capable of +sampling from the posterior distribution with a single NFE, amortized with +respect to the measurement. The proposed method paves a new path for distilling +a diffusion prior for efficient posterior sampling. We show that our method is +applicable to standard signals in Euclidean space, as well as signals on +manifold. + +
+
+
+
+
+ + ☆ The Power of Combining Data and Knowledge: GPT-4o is an Effective + Interpreter of Machine Learning Models in Predicting Lymph Node Metastasis of + Lung Cancer + + +
+ Lymph node metastasis (LNM) is a crucial factor in determining the initial +treatment for patients with lung cancer, yet accurate preoperative diagnosis of +LNM remains challenging. Recently, large language models (LLMs) have garnered +significant attention due to their remarkable text generation capabilities. +Leveraging the extensive medical knowledge learned from vast corpora, LLMs can +estimate probabilities for clinical problems, though their performance has +historically been inferior to data-driven machine learning models. In this +paper, we propose a novel ensemble method that combines the medical knowledge +acquired by LLMs with the latent patterns identified by machine learning models +to enhance LNM prediction performance. Initially, we developed machine learning +models using patient data. We then designed a prompt template to integrate the +patient data with the predicted probability from the machine learning model. +Subsequently, we instructed GPT-4o, the most advanced LLM developed by OpenAI, +to estimate the likelihood of LNM based on patient data and then adjust the +estimate using the machine learning output. Finally, we collected three outputs +from the GPT-4o using the same prompt and ensembled these results as the final +prediction. Using the proposed method, our models achieved an AUC value of +0.765 and an AP value of 0.415 for LNM prediction, significantly improving +predictive performance compared to baseline machine learning models. The +experimental results indicate that GPT-4o can effectively leverage its medical +knowledge and the probabilities predicted by machine learning models to achieve +more accurate LNM predictions. These findings demonstrate that LLMs can perform +well in clinical risk prediction tasks, offering a new paradigm for integrating +medical knowledge and patient data in clinical predictions. + +
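An illustrative sketch of the knowledge-plus-data ensembling step: the machine learning model's predicted probability is embedded in the prompt, GPT-4o is queried three times, and the three adjusted estimates are averaged. The prompt wording, feature names, and answer parsing below are hypothetical, not the paper's template.

```python
# Sketch using the OpenAI Python SDK; the prompt and the naive float() parsing
# are simplifications for illustration.
from openai import OpenAI

client = OpenAI()

def llm_adjusted_probability(patient_summary, ml_probability, n_samples=3):
    prompt = (
        "You are assisting with preoperative lung cancer assessment.\n"
        f"Patient data: {patient_summary}\n"
        f"A machine learning model estimates the probability of lymph node "
        f"metastasis as {ml_probability:.2f}.\n"
        "Considering both the patient data and this estimate, reply with a "
        "single adjusted probability between 0 and 1."
    )
    estimates = []
    for _ in range(n_samples):
        resp = client.chat.completions.create(
            model="gpt-4o", messages=[{"role": "user", "content": prompt}]
        )
        estimates.append(float(resp.choices[0].message.content.strip()))
    return sum(estimates) / len(estimates)      # simple ensemble of the 3 outputs
```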
+
+
+
+
+ + ☆ An Iterative Approach to Topic Modelling + + +
+ Topic modelling has become increasingly popular for summarizing text data,
+such as social media posts and articles. However, topic modelling is usually
+completed in one shot. Assessing the quality of the resulting topics is
+challenging. No effective methods or measures have been developed for assessing
+the results or for making further enhancements to the topics. In this research,
+we propose to use an iterative process to perform topic modelling that gives
+rise to a sense of completeness of the resulting topics when the process is
+complete. Using the BERTopic package, a popular method in topic modelling, we
+demonstrate how the modelling process can be applied iteratively to arrive at a
+set of topics that could not be further improved upon, using one of three
+selected measures for clustering comparison as the decision criterion. This
+demonstration is conducted using a subset of the COVIDSenti-A dataset. The
+early success leads us to believe that further research using this approach in
+conjunction with other topic modelling algorithms could be viable.
+
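A simplified reading of the iterative idea, shown as a sketch rather than the authors' procedure: refit BERTopic, compare consecutive topic assignments with one clustering-comparison measure (adjusted Rand index here; the paper evaluates three), and stop once the assignments no longer change appreciably.

```python
# Minimal iterative topic-modelling loop with an ARI-based stopping criterion;
# thresholds and the refit-only refinement step are assumptions.
from bertopic import BERTopic
from sklearn.metrics import adjusted_rand_score

def iterative_topic_modelling(docs, max_iters=5, stable_ari=0.95):
    prev_topics, topics = None, None
    for it in range(max_iters):
        topics, _ = BERTopic(verbose=False).fit_transform(docs)
        if prev_topics is not None:
            ari = adjusted_rand_score(prev_topics, topics)
            print(f"iteration {it}: ARI vs previous = {ari:.3f}")
            if ari >= stable_ari:          # assignments are no longer changing
                break
        prev_topics = topics
    return topics
```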
+
+
+
+
+ + ☆ DAM: Towards A Foundation Model for Time Series Forecasting + + +
+ It is challenging to scale time series forecasting models such that they +forecast accurately for multiple distinct domains and datasets, all with +potentially different underlying collection procedures (e.g., sample +resolution), patterns (e.g., periodicity), and prediction requirements (e.g., +reconstruction vs. forecasting). We call this general task universal +forecasting. Existing methods usually assume that input data is regularly +sampled, and they forecast to pre-determined horizons, resulting in failure to +generalise outside of the scope of their training. We propose the DAM - a +neural model that takes randomly sampled histories and outputs an adjustable +basis composition as a continuous function of time for forecasting to non-fixed +horizons. It involves three key components: (1) a flexible approach for using +randomly sampled histories from a long-tail distribution, that enables an +efficient global perspective of the underlying temporal dynamics while +retaining focus on the recent history; (2) a transformer backbone that is +trained on these actively sampled histories to produce, as representational +output, (3) the basis coefficients of a continuous function of time. We show +that a single univariate DAM, trained on 25 time series datasets, either +outperformed or closely matched existing SoTA models at multivariate long-term +forecasting across 18 datasets, including 8 held-out for zero-shot transfer, +even though these models were trained to specialise for each dataset-horizon +combination. This single DAM excels at zero-shot transfer and very-long-term +forecasting, performs well at imputation, is interpretable via basis function +composition and attention, can be tuned for different inference-cost +requirements, is robust to missing and irregularly sampled data {by design}. + +
+
+
+
+
+ + ☆ A Large-Scale Sensitivity Analysis on Latent Embeddings and + Dimensionality Reductions for Text Spatializations IEEE VIS 2024 + + +
+ The semantic similarity between documents of a text corpus can be visualized
+using map-like metaphors based on two-dimensional scatterplot layouts. These
+layouts result from a dimensionality reduction on the document-term matrix or a
+representation within a latent embedding, including topic models. The resulting
+layout therefore depends on the input data and the hyperparameters of the
+dimensionality reduction, and it is affected by changes in either. However,
+such changes to the layout require additional cognitive effort from the user.
+In this work, we present a sensitivity study that analyzes the stability of
+these layouts concerning (1) changes in the text corpora, (2) changes in the
+hyperparameters, and (3) randomness in the initialization. Our approach has two
+stages: data measurement and data analysis. First, we derived layouts for the
+combination of three text corpora and six text embeddings and a
+grid-search-inspired hyperparameter selection of the dimensionality reductions.
+Afterward, we quantified the similarity of the layouts through ten metrics
+concerning local and global structures and class separation. Second, we
+analyzed the resulting 42,817 tabular data points in a descriptive statistical
+analysis. From this, we derived guidelines for informed decisions on the layout
+algorithm and highlighted specific hyperparameter settings. We provide our
+implementation as a Git repository at
+https://github.com/hpicgs/Topic-Models-and-Dimensionality-Reduction-Sensitivity-Study
+and the results as a Zenodo archive at https://doi.org/10.5281/zenodo.12772898.
+
+
+ comment: To be published at IEEE VIS 2024 conference +
+
+
+
+
+ + ☆ EllipBench: A Large-scale Benchmark for Machine-learning based + Ellipsometry Modeling + + +
+ Ellipsometry is used to indirectly measure the optical properties and +thickness of thin films. However, solving the inverse problem of ellipsometry +is time-consuming since it involves human expertise to apply the data fitting +techniques. Many studies use traditional machine learning-based methods to +model the complex mathematical fitting process. In our work, we approach this +problem from a deep learning perspective. First, we introduce a large-scale +benchmark dataset to facilitate deep learning methods. The proposed dataset +encompasses 98 types of thin film materials and 4 types of substrate materials, +including metals, alloys, compounds, and polymers, among others. Additionally, +we propose a deep learning framework that leverages residual connections and +self-attention mechanisms to learn the massive data points. We also introduce a +reconstruction loss to address the common challenge of multiple solutions in +thin film thickness prediction. Compared to traditional machine learning +methods, our framework achieves state-of-the-art (SOTA) performance on our +proposed dataset. The dataset and code will be available upon acceptance. + +
+
+
+
+
+ + ☆ MDS-ED: Multimodal Decision Support in the Emergency Department -- a + Benchmark Dataset for Diagnoses and Deterioration Prediction in Emergency + Medicine + + +
+ Background: Benchmarking medical decision support algorithms often struggles +due to limited access to datasets, narrow prediction tasks, and restricted +input modalities. These limitations affect their clinical relevance and +performance in high-stakes areas like emergency care, complicating replication, +validation, and improvement of benchmarks. + Methods: We introduce a dataset based on MIMIC-IV, benchmarking protocol, and +initial results for evaluating multimodal decision support in the emergency +department (ED). We use diverse data modalities from the first 1.5 hours of +patient arrival, including demographics, biometrics, vital signs, lab values, +and electrocardiogram waveforms. We analyze 1443 clinical labels across two +contexts: predicting diagnoses with ICD-10 codes and forecasting patient +deterioration. + Results: Our multimodal diagnostic model achieves an AUROC score over 0.8 in +a statistically significant manner for 357 out of 1428 conditions, including +cardiac issues like myocardial infarction and non-cardiac conditions such as +renal disease and diabetes. The deterioration model scores above 0.8 in a +statistically significant manner for 13 out of 15 targets, including critical +events like cardiac arrest and mechanical ventilation, ICU admission as well as +short- and long-term mortality. Incorporating raw waveform data significantly +improves model performance, which represents one of the first robust +demonstrations of this effect. + Conclusions: This study highlights the uniqueness of our dataset, which +encompasses a wide range of clinical tasks and utilizes a comprehensive set of +features collected early during the emergency after arriving at the ED. The +strong performance, as evidenced by high AUROC scores across diagnostic and +deterioration targets, underscores the potential of our approach to +revolutionize decision-making in acute and emergency medicine. + +
+
+ comment: 14 pages, 1 figure, code available under + https://github.com/AI4HealthUOL/MDS-ED +
+
+
+
+
+ + ☆ Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease + Classification: A Systematic Review + + +
+ Parkinson's disease (PD), the second most prevalent neurodegenerative +disorder worldwide, frequently presents with early-stage speech impairments. +Recent advancements in Artificial Intelligence (AI), particularly deep learning +(DL), have significantly enhanced PD diagnosis through the analysis of speech +data. Nevertheless, the progress of research is restricted by the limited +availability of publicly accessible speech-based PD datasets, primarily due to +privacy and ethical concerns. This review covers the latest DL-based AI +approaches for speech-based PD classification, focusing on performance, +available resources and associated challenges of 33 scientific works published +between 2020 and March 2024. These DL approaches are categorized into +end-to-end (E2E) learning, transfer learning (TL) and deep acoustic features +(DAF) extraction. Among E2E approaches, Convolutional Neural Networks (CNNs) +are prevalent, though Transformers are increasingly popular. E2E approaches +face challenges such as limited data and computational resources, especially +with Transformers. TL addresses these issues by providing more robust PD +diagnosis and better generalizability across languages. DAF extraction aims to +improve the explainability and interpretability of results by examining the +specific effects of deep features on both other DL approaches and more +traditional machine learning (ML) methods. However, it often underperforms +compared to E2E and TL approaches. This review also discusses unresolved issues +related to bias, explainability and privacy, highlighting the need for future +research. + +
+
+ comment: Submitted in Applied Sciences - peer reviewed Open Access journal. + This research was funded by the NWO research programme AiNed Fellowship + Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant + number NGF.1607.22.013 +
+
+
+
+
+ + ☆ On the Opportunities of (Re)-Exploring Atmospheric Science by Foundation + Models: A Case Study + + +
+ Most state-of-the-art AI applications in atmospheric science are based on +classic deep learning approaches. However, such approaches cannot automatically +integrate multiple complicated procedures to construct an intelligent agent, +since each functionality is enabled by a separate model learned from +independent climate datasets. The emergence of foundation models, especially +multimodal foundation models, with their ability to process heterogeneous input +data and execute complex tasks, offers a substantial opportunity to overcome +this challenge. In this report, we want to explore a central question - how the +state-of-the-art foundation model, i.e., GPT-4o, performs various atmospheric +scientific tasks. Toward this end, we conduct a case study by categorizing the +tasks into four main classes, including climate data processing, physical +diagnosis, forecast and prediction, and adaptation and mitigation. For each +task, we comprehensively evaluate the GPT-4o's performance along with a +concrete discussion. We hope that this report may shed new light on future AI +applications and research in atmospheric science. + +
+
+ comment: 28 pages, 12 figures +
+
+
+
+
+ + ☆ Long-term Fairness in Ride-Hailing Platform ECML + + +
+ Matching in two-sided markets such as ride-hailing has recently received
+significant attention. However, existing studies on ride-hailing mainly focus
+on optimising efficiency, and fairness issues in ride-hailing have been
+neglected. Fairness issues in ride-hailing, including significant earnings
+differences between drivers and variance of passenger waiting times across
+different locations, have potential economic and ethical impacts. Recent
+studies that focus on fairness in ride-hailing exploit traditional optimisation
+methods and the Markov Decision Process to balance efficiency and fairness.
+However, there are several issues in these existing studies, such as myopic
+short-term decision-making from traditional optimisation and instability of
+fairness over a comparably longer horizon from both traditional optimisation
+and Markov Decision Process-based methods. To address these issues, we propose
+a dynamic Markov Decision Process model to alleviate fairness issues currently
+faced by ride-hailing, and seek a balance between efficiency and fairness, with
+two distinct characteristics: (i) a prediction module that predicts the number
+of requests that will be raised in the future from different locations, which
+allows the proposed method to consider long-term fairness over the whole
+timeline instead of considering fairness based only on historical and current
+data patterns; (ii) a customised scalarisation function for multi-objective
+multi-agent Q-learning that aims to balance efficiency and fairness. Extensive
+experiments on a publicly available real-world dataset demonstrate that our
+proposed method outperforms existing state-of-the-art methods.
+
+
+ comment: Accepted by ECML PKDD 2024 +
+
+
+
+
+ + ☆ IsUMap: Manifold Learning and Data Visualization leveraging + Vietoris-Rips filtrations + + +
+ This work introduces IsUMap, a novel manifold learning technique that +enhances data representation by integrating aspects of UMAP and Isomap with +Vietoris-Rips filtrations. We present a systematic and detailed construction of +a metric representation for locally distorted metric spaces that captures +complex data structures more accurately than the previous schemes. Our approach +addresses limitations in existing methods by accommodating non-uniform data +distributions and intricate local geometries. We validate its performance +through extensive experiments on examples of various geometric objects and +benchmark real-world datasets, demonstrating significant improvements in +representation quality. + +
+
+
+
+
+ + ☆ Image Segmentation via Divisive Normalization: dealing with + environmental diversity + + +
+ Autonomous driving is a challenging scenario for image segmentation due to +the presence of uncontrolled environmental conditions and the eventually +catastrophic consequences of failures. Previous work suggested that a +biologically motivated computation, the so-called Divisive Normalization, could +be useful to deal with image variability, but its effects have not been +systematically studied over different data sources and environmental factors. +Here we put segmentation U-nets augmented with Divisive Normalization to work +far from training conditions to find where this adaptation is more critical. We +categorize the scenes according to their radiance level and dynamic range +(day/night), and according to their achromatic/chromatic contrasts. We also +consider video game (synthetic) images to broaden the range of environments. We +check the performance in the extreme percentiles of such categorization. Then, +we push the limits further by artificially modifying the images in +perceptually/environmentally relevant dimensions: luminance, contrasts and +spectral radiance. Results show that neural networks with Divisive +Normalization get better results in all the scenarios and their performance +remains more stable with regard to the considered environmental factors and +nature of the source. Finally, we explain the improvements in segmentation +performance in two ways: (1) by quantifying the invariance of the responses +that incorporate Divisive Normalization, and (2) by illustrating the adaptive +nonlinearity of the different layers that depends on the local activity. + +
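A generic Divisive Normalization layer conveys the basic mechanism: each activation is divided by a learned pooling of neighbouring activity, adapting the response to local contrast and luminance. This is a simplified sketch; the paper's bio-inspired parameterisation, kernels, and colour handling are richer.

```python
# Simplified divisive normalization: divide each response by a local pool of
# rectified activity plus a learned semisaturation constant.
import torch
import torch.nn as nn

class DivisiveNormalization2d(nn.Module):
    def __init__(self, channels, kernel_size=5, eps=1e-4):
        super().__init__()
        self.pool = nn.Conv2d(channels, channels, kernel_size,
                              padding=kernel_size // 2, bias=False)
        nn.init.constant_(self.pool.weight, 1.0 / (channels * kernel_size**2))
        self.sigma = nn.Parameter(torch.full((1, channels, 1, 1), eps))

    def forward(self, x):
        denom = self.sigma.abs() + self.pool(x.abs())   # local activity pool
        return x / denom

x = torch.randn(2, 16, 32, 32)
print(DivisiveNormalization2d(16)(x).shape)   # torch.Size([2, 16, 32, 32])
```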
+
+
+
+
+ + ☆ Unified Lexical Representation for Interpretable Visual-Language + Alignment + + +
+ Visual-Language Alignment (VLA) has gained a lot of attention since CLIP's
+groundbreaking work. Although CLIP performs well, the typical direct latent
+feature alignment lacks clarity in its representation and similarity scores. On
+the other hand, lexical representation, a vector whose elements represent the
+similarity between the sample and words from the vocabulary, is a naturally
+sparse and interpretable representation, providing exact matches for individual
+words. However, lexical representations are difficult to learn due to the lack
+of ground-truth supervision and false-discovery issues, and thus require a
+complex design to train effectively. In this paper, we introduce LexVLA, a more
+interpretable VLA framework that learns a unified lexical representation for
+both modalities without complex design. We use DINOv2 as our visual model for
+its local-inclined features and Llama 2, a generative language model, to
+leverage its in-context lexical prediction ability. To avoid false discovery,
+we propose an overuse penalty that keeps the lexical representation from
+falsely and frequently activating meaningless words. We demonstrate that these
+two pre-trained uni-modal models can be well-aligned by fine-tuning on a modest
+multi-modal dataset, avoiding intricate training configurations. On cross-modal
+retrieval benchmarks, LexVLA, trained on the CC-12M multi-modal dataset,
+outperforms baselines fine-tuned on larger datasets (e.g., YFCC15M) and those
+trained from scratch on even bigger datasets (e.g., 1.1B data, including
+CC-12M). We conduct extensive experiments to analyze LexVLA.
+
+
+
+
+
+ + ☆ Optimal Hessian/Jacobian-Free Nonconvex-PL Bilevel Optimization ICML 2024 + + +
+ Bilevel optimization is widely applied in many machine learning tasks such as
+hyper-parameter learning, meta learning and reinforcement learning. Although
+many algorithms have recently been developed to solve bilevel optimization
+problems, they generally rely on (strongly) convex lower-level problems. More
+recently, some methods have been proposed to solve nonconvex-PL bilevel
+optimization problems, where the upper-level problems are possibly nonconvex,
+and the lower-level problems are also possibly nonconvex while satisfying the
+Polyak-{\L}ojasiewicz (PL) condition. However, these methods still have a high
+convergence complexity or a high computation complexity, such as requiring the
+computation of expensive Hessian/Jacobian matrices and their inverses. Thus, in
+this paper, we propose an efficient Hessian/Jacobian-free method (i.e.,
+HJFBiO) with the optimal convergence complexity to solve the nonconvex-PL
+bilevel problems. Theoretically, under some mild conditions, we prove that our
+HJFBiO method obtains an optimal convergence rate of $O(\frac{1}{T})$, where
+$T$ denotes the number of iterations, and has an optimal gradient complexity of
+$O(\epsilon^{-1})$ in finding an $\epsilon$-stationary solution. We conduct
+numerical experiments on the bilevel PL game and a hyper-representation
+learning task to demonstrate the efficiency of our proposed method.
+
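For readers unfamiliar with the PL condition referenced above, its standard statement on the lower-level objective $g$ is (standard form, not necessarily the paper's exact constants):

```latex
% Polyak-Lojasiewicz (PL) condition on the lower-level objective g:
% for some \mu > 0,
\frac{1}{2}\,\bigl\|\nabla_y g(x,y)\bigr\|^2 \;\ge\; \mu\,\Bigl(g(x,y) - \min_{y'} g(x,y')\Bigr)
\qquad \forall\, x, y,
% under which the abstract reports an O(1/T) convergence rate and an
% O(\epsilon^{-1}) gradient complexity for reaching an \epsilon-stationary point.
```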
+
+ comment: ICML 2024 (Oral). arXiv admin note: text overlap with + arXiv:2311.04520 +
+
+
+
+
+ + ☆ Advanced deep-reinforcement-learning methods for flow control: + group-invariant and positional-encoding networks improve learning speed and + quality + + +
+ Flow control is key to maximize energy efficiency in a wide range of +applications. However, traditional flow-control methods face significant +challenges in addressing non-linear systems and high-dimensional data, limiting +their application in realistic energy systems. This study advances +deep-reinforcement-learning (DRL) methods for flow control, particularly +focusing on integrating group-invariant networks and positional encoding into +DRL architectures. Our methods leverage multi-agent reinforcement learning +(MARL) to exploit policy invariance in space, in combination with +group-invariant networks to ensure local symmetry invariance. Additionally, a +positional encoding inspired by the transformer architecture is incorporated to +provide location information to the agents, mitigating action constraints from +strict invariance. The proposed methods are verified using a case study of +Rayleigh-B\'enard convection, where the goal is to minimize the Nusselt number +Nu. The group-invariant neural networks (GI-NNs) show faster convergence +compared to the base MARL, achieving better average policy performance. The +GI-NNs not only cut DRL training time in half but also notably enhance learning +reproducibility. Positional encoding further enhances these results, +effectively reducing the minimum Nu and stabilizing convergence. Interestingly, +group invariant networks specialize in improving learning speed and positional +encoding specializes in improving learning quality. These results demonstrate +that choosing a suitable feature-representation method according to the purpose +as well as the characteristics of each control problem is essential. We believe +that the results of this study will not only inspire novel DRL methods with +invariant and unique representations, but also provide useful insights for +industrial applications. + +
+
+
+
+
+ + ☆ Demystifying Verbatim Memorization in Large Language Models + + +
+ Large Language Models (LLMs) frequently memorize long sequences verbatim, +often with serious legal and privacy implications. Much prior work has studied +such verbatim memorization using observational data. To complement such work, +we develop a framework to study verbatim memorization in a controlled setting +by continuing pre-training from Pythia checkpoints with injected sequences. We +find that (1) non-trivial amounts of repetition are necessary for verbatim +memorization to happen; (2) later (and presumably better) checkpoints are more +likely to verbatim memorize sequences, even for out-of-distribution sequences; +(3) the generation of memorized sequences is triggered by distributed model +states that encode high-level features and makes important use of general +language modeling capabilities. Guided by these insights, we develop stress +tests to evaluate unlearning methods and find they often fail to remove the +verbatim memorized information, while also degrading the LM. Overall, these +findings challenge the hypothesis that verbatim memorization stems from +specific model weights or mechanisms. Rather, verbatim memorization is +intertwined with the LM's general capabilities and thus will be very difficult +to isolate and suppress without degrading model quality. + +
+
+
+
+
+ + ☆ NC-NCD: Novel Class Discovery for Node Classification CIKM'24 + + +
+ Novel Class Discovery (NCD) involves identifying new categories within
+unlabeled data by utilizing knowledge acquired from previously established
+categories. However, existing NCD methods often struggle to maintain a balance
+between performance on old and new categories. Discovering unlabeled new
+categories in a class-incremental way is more practical but also more
+challenging, as it is frequently hindered by either catastrophic forgetting of
+old categories or an inability to learn new ones. Furthermore, the
+implementation of NCD on continuously scalable graph-structured data remains an
+under-explored area. In response to these challenges, we introduce for the
+first time a more practical NCD scenario for node classification (i.e.,
+NC-NCD), and propose a novel self-training framework with prototype replay and
+distillation called SWORD, adapted to our NC-NCD setting. Our approach enables
+the model to cluster unlabeled new-category nodes after learning labeled nodes
+while preserving performance on old categories without reliance on old-category
+nodes. SWORD achieves this by employing a self-training strategy to learn new
+categories and preventing the forgetting of old categories through the joint
+use of feature prototypes and knowledge distillation. Extensive experiments on
+four common benchmarks demonstrate the superiority of SWORD over other
+state-of-the-art methods.
+
+
+ comment: Accepted by CIKM'24 +
+
+
+
+
+ + ☆ Nested replicator dynamics, nested logit choice, and similarity-based + learning + + +
+ We consider a model of learning and evolution in games whose action sets are +endowed with a partition-based similarity structure intended to capture +exogenous similarities between strategies. In this model, revising agents have +a higher probability of comparing their current strategy with other strategies +that they deem similar, and they switch to the observed strategy with +probability proportional to its payoff excess. Because of this implicit bias +toward similar strategies, the resulting dynamics - which we call the nested +replicator dynamics - do not satisfy any of the standard monotonicity +postulates for imitative game dynamics; nonetheless, we show that they retain +the main long-run rationality properties of the replicator dynamics, albeit at +quantitatively different rates. We also show that the induced dynamics can be +viewed as a stimulus-response model in the spirit of Erev & Roth (1998), with +choice probabilities given by the nested logit choice rule of Ben-Akiva (1973) +and McFadden (1978). This result generalizes an existing relation between the +replicator dynamics and the exponential weights algorithm in online learning, +and provides an additional layer of interpretation to our analysis and results. + +
+
+ comment: 37 pages, 9 figures +
+
+
+
+
+ + ☆ Automatic Data Labeling for Software Vulnerability Prediction Models: + How Far Are We? + + +
+ Background: Software Vulnerability (SV) prediction needs large-sized and +high-quality data to perform well. Current SV datasets mostly require expensive +labeling efforts by experts (human-labeled) and thus are limited in size. +Meanwhile, there are growing efforts in automatic SV labeling at scale. +However, the fitness of auto-labeled data for SV prediction is still largely +unknown. Aims: We quantitatively and qualitatively study the quality and use of +the state-of-the-art auto-labeled SV data, D2A, for SV prediction. Method: +Using multiple sources and manual validation, we curate clean SV data from +human-labeled SV-fixing commits in two well-known projects for investigating +the auto-labeled counterparts. Results: We discover that 50+% of the +auto-labeled SVs are noisy (incorrectly labeled), and they hardly overlap with +the publicly reported ones. Yet, SV prediction models utilizing the noisy +auto-labeled SVs can perform up to 22% and 90% better in Matthews Correlation +Coefficient and Recall, respectively, than the original models. We also reveal +the promises and difficulties of applying noise-reduction methods for +automatically addressing the noise in auto-labeled SV data to maximize the data +utilization for SV prediction. Conclusions: Our study informs the benefits and +challenges of using auto-labeled SVs, paving the way for large-scale SV +prediction. + +
+
+ comment: Accepted as a full paper in the technical track at The International + Symposium on Empirical Software Engineering and Measurement (ESEM) 2024 +
+
+
+
+
+ + ☆ EEG-SSM: Leveraging State-Space Model for Dementia Detection + + +
+ State-space models (SSMs) have garnered attention for effectively processing +long data sequences, reducing the need to segment time series into shorter +intervals for model training and inference. Traditionally, SSMs capture only +the temporal dynamics of time series data, omitting the equally critical +spectral features. This study introduces EEG-SSM, a novel state-space +model-based approach for dementia classification using EEG data. Our model +features two primary innovations: EEG-SSM temporal and EEG-SSM spectral +components. The temporal component is designed to efficiently process EEG +sequences of varying lengths, while the spectral component enhances the model +by integrating frequency-domain information from EEG signals. The synergy of +these components allows EEG-SSM to adeptly manage the complexities of +multivariate EEG data, significantly improving accuracy and stability across +different temporal resolutions. Demonstrating a remarkable 91.0 percent +accuracy in classifying Healthy Control (HC), Frontotemporal Dementia (FTD), +and Alzheimer's Disease (AD) groups, EEG-SSM outperforms existing models on the +same dataset. The development of EEG-SSM represents an improvement in the use +of state-space models for screening dementia, offering more precise and +cost-effective tools for clinical neuroscience. + +
+
+
+
+
+ + ☆ Enhancing Diversity in Multi-objective Feature Selection + + +
+ Feature selection plays a pivotal role in the data preprocessing and +model-building pipeline, significantly enhancing model performance, +interpretability, and resource efficiency across diverse domains. In +population-based optimization methods, the generation of diverse individuals +holds utmost importance for adequately exploring the problem landscape, +particularly in highly multi-modal multi-objective optimization problems. Our +study reveals that, in line with findings from several prior research papers, +commonly employed crossover and mutation operations lack the capability to +generate high-quality diverse individuals and tend to become confined to +limited areas around various local optima. This paper introduces an +augmentation to the diversity of the population in the well-established +multi-objective scheme of the genetic algorithm, NSGA-II. This enhancement is +achieved through two key components: the genuine initialization method and the +substitution of the worst individuals with new randomly generated individuals +as a re-initialization approach in each generation. The proposed +multi-objective feature selection method undergoes testing on twelve real-world +classification problems, with the number of features ranging from 2,400 to +nearly 50,000. The results demonstrate that replacing the last front of the +population with an equivalent number of new random individuals generated using +the genuine initialization method and featuring a limited number of features +substantially improves the population's quality and, consequently, enhances the +performance of the multi-objective algorithm. + +
+
+ comment: 8 pages, 3 figures, accepted to be published in IEEE WCCI 2024 + conference +
+
+
+
+
+ + ☆ Exploring the Limitations of Kolmogorov-Arnold Networks in + Classification: Insights to Software Training and Hardware Implementation + + +
+ Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have
+recently gained popularity and attention due to their ability to substitute for
+multi-layer perceptrons (MLPs) in artificial intelligence (AI) with higher
+accuracy and interpretability. However, KAN assessment is still limited and
+cannot provide an in-depth analysis of a specific domain. Furthermore, no study
+has been conducted on the implementation of KANs in hardware design, which
+would directly demonstrate whether KANs are truly superior to MLPs in practical
+applications. As a result, in this paper, we focus on verifying KANs for
+classification issues, a common but significant topic in AI, using four
+different types of datasets. Furthermore, the corresponding hardware
+implementation is considered using the Vitis high-level synthesis (HLS) tool.
+To the best of our knowledge, this is the first article to implement hardware
+for KANs. The results indicate that KANs cannot achieve higher accuracy than
+MLPs on highly complex datasets while utilizing substantially higher hardware
+resources. Therefore, MLP remains an effective approach for achieving accuracy
+and efficiency in software and hardware implementation.
+
+
+ comment: 6 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Integrating Ensemble Kalman Filter with AI-based Weather Prediction + Model ClimaX + + +
+ Artificial intelligence (AI)-based weather prediction research is growing +rapidly and has shown to be competitive with the advanced dynamic numerical +weather prediction models. However, research combining AI-based weather +prediction models with data assimilation remains limited partially because +long-term sequential data assimilation cycles are required to evaluate data +assimilation systems. This study explores integrating the local ensemble +transform Kalman filter (LETKF) with an AI-based weather prediction model +ClimaX. Our experiments demonstrated that the ensemble data assimilation cycled +stably for the AI-based weather prediction model using covariance inflation and +localization techniques inside the LETKF. While ClimaX showed some limitations +in capturing flow-dependent error covariance compared to dynamical models, the +AI-based ensemble forecasts provided reasonable and beneficial error covariance +in sparsely observed regions. These findings highlight the potential of AI +models in weather forecasting and the importance of physical consistency and +accurate error growth representation in improving ensemble data assimilation. + +
+
+
+
+
+ + ☆ KiVA: Kid-inspired Visual Analogies for Testing Large Multimodal Models + + +
+ This paper investigates visual analogical reasoning in large multimodal +models (LMMs) compared to human adults and children. A "visual analogy" is an +abstract rule inferred from one image and applied to another. While benchmarks +exist for testing visual reasoning in LMMs, they require advanced skills and +omit basic visual analogies that even young children can make. Inspired by +developmental psychology, we propose a new benchmark of 1,400 visual +transformations of everyday objects to test LMMs on visual analogical reasoning +and compare them to children and adults. We structure the evaluation into three +stages: identifying what changed (e.g., color, number, etc.), how it changed +(e.g., added one object), and applying the rule to new scenarios. Our findings +show that while models like GPT-4V, LLaVA-1.5, and MANTIS identify the "what" +effectively, they struggle with quantifying the "how" and extrapolating this +rule to new objects. In contrast, children and adults exhibit much stronger +analogical reasoning at all three stages. Additionally, the strongest tested +model, GPT-4V, performs better in tasks involving simple visual attributes like +color and size, correlating with quicker human adult response times. +Conversely, more complex tasks such as number, rotation, and reflection, which +necessitate extensive cognitive processing and understanding of the 3D physical +world, present more significant challenges. Altogether, these findings +highlight the limitations of training models on data that primarily consists of +2D images and text. + +
+
+ comment: 9 pages. For the KiVA benchmark, see https://github.com/ey242/KiVA +
+
+
+
+
+ + ☆ Online Learning for Autonomous Management of Intent-based 6G Networks + + +
+ The growing complexity of networks and the variety of future scenarios with +diverse and often stringent performance requirements call for a higher level of +automation. Intent-based management emerges as a solution to attain high level +of automation, enabling human operators to solely communicate with the network +through high-level intents. The intents consist of the targets in the form of +expectations (i.e., latency expectation) from a service and based on the +expectations the required network configurations should be done accordingly. It +is almost inevitable that when a network action is taken to fulfill one intent, +it can cause negative impacts on the performance of another intent, which +results in a conflict. In this paper, we aim to address the conflict issue and +autonomous management of intent-based networking, and propose an online +learning method based on the hierarchical multi-armed bandits approach for an +effective management. Thanks to this hierarchical structure, it performs an +efficient exploration and exploitation of network configurations with respect +to the dynamic network conditions. We show that our algorithm is an effective +approach regarding resource allocation and satisfaction of intent expectations. + +
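A minimal two-level UCB sketch conveys the hierarchical-bandit idea (an illustration only, not the authors' algorithm): a top-level bandit chooses a configuration group, a nested bandit chooses a concrete configuration inside that group, and both are updated with the observed intent-satisfaction reward.

```python
# Two-level UCB1 over hypothetical network-configuration groups; the reward is
# a stand-in for measured intent satisfaction.
import math
import random

class UCB1:
    def __init__(self, n_arms):
        self.counts = [0] * n_arms
        self.values = [0.0] * n_arms

    def select(self):
        for a, c in enumerate(self.counts):
            if c == 0:
                return a                          # play each arm once first
        total = sum(self.counts)
        return max(range(len(self.counts)),
                   key=lambda a: self.values[a]
                   + math.sqrt(2 * math.log(total) / self.counts[a]))

    def update(self, arm, reward):
        self.counts[arm] += 1
        self.values[arm] += (reward - self.values[arm]) / self.counts[arm]

groups = [UCB1(4) for _ in range(3)]              # 3 groups x 4 configurations
top = UCB1(len(groups))
for _ in range(1000):
    g = top.select()
    c = groups[g].select()
    reward = random.random() * (0.4 + 0.2 * g)    # stand-in for intent satisfaction
    groups[g].update(c, reward)
    top.update(g, reward)
print("preferred group:", max(range(3), key=lambda g: top.values[g]))
```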
+
+
+
+
+ + ☆ DualFed: Enjoying both Generalization and Personalization in Federated + Learning via Hierachical Representations + + +
+ In personalized federated learning (PFL), it is widely recognized that
+achieving both high model generalization and effective personalization poses a
+significant challenge due to their conflicting nature. As a result, existing
+PFL methods can only manage a trade-off between these two objectives. This
+raises an interesting question: Is it feasible to develop a model capable of
+achieving both objectives simultaneously? Our paper presents an affirmative
+answer, and the key lies in the observation that deep models inherently exhibit
+hierarchical architectures, which produce representations with various levels
+of generalization and personalization at different stages. A straightforward
+approach stemming from this observation is to select multiple representations
+from these layers and combine them to concurrently achieve generalization and
+personalization. However, the number of candidate representations is commonly
+huge, which makes this method infeasible due to high computational costs. To
+address this problem, we propose DualFed, a new method that can directly yield
+dual representations corresponding to generalization and personalization,
+respectively, thereby simplifying the optimization task. Specifically, DualFed
+inserts a personalized projection network between the encoder and the
+classifier. The pre-projection representations capture generalized information
+shareable across clients, and the post-projection representations effectively
+capture task-specific information on local clients. This design minimizes the
+mutual interference between generalization and personalization, thereby
+achieving a win-win situation. Extensive experiments show that DualFed can
+outperform other FL methods. Code is available at
+https://github.com/GuogangZhu/DualFed.
+
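A sketch of the architectural idea as described above (not the released code; layer sizes and the split between shared and client-local modules are assumptions): a shared encoder produces the pre-projection, more generalizable representation, while a client-specific projection head produces the post-projection, personalized representation that feeds the classifier.

```python
# Encoder -> (pre-projection, shared/generalized) -> projection (kept personal
# per client) -> (post-projection, personalized) -> classifier.
import torch
import torch.nn as nn

class DualRepresentationNet(nn.Module):
    def __init__(self, in_dim=784, feat_dim=256, proj_dim=128, n_classes=10):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_dim, 512), nn.ReLU(),
                                     nn.Linear(512, feat_dim))          # shared across clients
        self.projection = nn.Sequential(nn.Linear(feat_dim, proj_dim), nn.ReLU(),
                                        nn.Linear(proj_dim, proj_dim))  # personalized per client
        self.classifier = nn.Linear(proj_dim, n_classes)

    def forward(self, x):
        pre = self.encoder(x)          # generalized representation (aggregated server-side)
        post = self.projection(pre)    # personalized representation (stays on the client)
        return self.classifier(post), pre, post

logits, pre, post = DualRepresentationNet()(torch.randn(4, 784))
print(logits.shape, pre.shape, post.shape)
```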
+
+ comment: Accepted by ACM MutltiMedia 2024 +
+
+
+
+
+ + ☆ Optimal Trade and Industrial Policies in the Global Economy: A Deep + Learning Framework + + +
+ We propose a deep learning framework, DL-opt, designed to efficiently solve +for optimal policies in quantifiable general equilibrium trade models. DL-opt +integrates (i) a nested fixed point (NFXP) formulation of the optimization +problem, (ii) automatic implicit differentiation to enhance gradient descent +for solving unilateral optimal policies, and (iii) a best-response dynamics +approach for finding Nash equilibria. Utilizing DL-opt, we solve for +non-cooperative tariffs and industrial subsidies across 7 economies and 44 +sectors, incorporating sectoral external economies of scale. Our quantitative +analysis reveals significant sectoral heterogeneity in Nash policies: Nash +industrial subsidies increase with scale elasticities, whereas Nash tariffs +decrease with trade elasticities. Moreover, we show that global dual +competition, involving both tariffs and industrial subsidies, results in lower +tariffs and higher welfare outcomes compared to a global tariff war. These +findings highlight the importance of considering sectoral heterogeneity and +policy combinations in understanding global economic competition. + +
+
+
+
+
+ + ☆ Multi-modal Data Binding for Survival Analysis Modeling with Incomplete + Data and Annotations MICCAI 2024 + + +
+ Survival analysis stands as a pivotal process in cancer treatment research, +crucial for predicting patient survival rates accurately. Recent advancements +in data collection techniques have paved the way for enhancing survival +predictions by integrating information from multiple modalities. However, +real-world scenarios often present challenges with incomplete data, +particularly when dealing with censored survival labels. Prior works have +addressed missing modalities but have overlooked incomplete labels, which can +introduce bias and limit model efficacy. To bridge this gap, we introduce a +novel framework that simultaneously handles incomplete data across modalities +and censored survival labels. Our approach employs advanced foundation models +to encode individual modalities and align them into a universal representation +space for seamless fusion. By generating pseudo labels and incorporating +uncertainty, we significantly enhance predictive accuracy. The proposed method +demonstrates outstanding prediction accuracy in two survival analysis tasks on +both employed datasets. This innovative approach overcomes limitations +associated with disparate modalities and improves the feasibility of +comprehensive survival analysis using multiple large foundation models. + +
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Your Graph Recommender is Provably a Single-view Graph Contrastive + Learning + + +
+ Graph recommender (GR) is a type of graph neural network (GNN) encoder that +is customized for extracting information from the user-item interaction graph. +Due to its strong performance on the recommendation task, GR has gained +significant attention recently. Graph contrastive learning (GCL) is also a +popular research direction that aims to learn, often unsupervised, GNNs with +certain contrastive objectives. As a general graph representation learning +method, GCLs have been widely adopted with the supervised recommendation loss +for joint training of GRs. Despite the intersection of GR and GCL research, +theoretical understanding of the relationship between the two fields is +surprisingly sparse. This vacancy inevitably leads to inefficient scientific +research. + In this paper, we aim to bridge the gap between the field of GR and GCL from +the perspective of encoders and loss functions. With mild assumptions, we +theoretically show an astonishing fact that a graph recommender is equivalent to +a commonly-used single-view graph contrastive model. Specifically, we find that +(1) the classic encoder in GR is essentially a linear graph convolutional +network with one-hot inputs, and (2) the loss function in GR is well bounded by +a single-view GCL loss with certain hyperparameters. The first observation +enables us to explain crucial designs of GR models, e.g., the removal of +self-loops and nonlinearity. And the second finding can easily prompt many +cross-field research directions. We empirically show a remarkable result that +the recommendation loss and the GCL loss can be used interchangeably. The fact +that we can train GR models solely with the GCL loss is particularly +insightful, since before this work, GCLs were typically viewed as unsupervised +methods that need fine-tuning. We also discuss some potential future works +inspired by our theory. + +
+
+
+
+
+ + ☆ Text-Driven Neural Collaborative Filtering Model for Paper Source + Tracing KDD + + +
+ Identifying significant references within the complex interrelations of a +citation knowledge graph is challenging, which encompasses connections through +citations, authorship, keywords, and other relational attributes. The Paper +Source Tracing (PST) task seeks to automate the identification of pivotal +references for given scholarly articles utilizing advanced data mining +techniques. In the KDD CUP 2024, we design a recommendation-based framework +tailored for the PST task. This framework employs the Neural Collaborative +Filtering (NCF) model to generate final predictions. To process the textual +attributes of the papers and extract input features for the model, we utilize +SciBERT, a pre-trained language model. According to the experimental results, +our method achieved a score of 0.37814 on the Mean Average Precision (MAP) +metric, outperforming baseline models and ranking 11th among all participating +teams. The source code is publicly available at +https://github.com/MyLove-XAB/KDDCupFinal. + +
+
+ comment: KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report + of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain +
+
+
+
+
+ + ☆ A Two-Stage Imaging Framework Combining CNN and Physics-Informed Neural + Networks for Full-Inverse Tomography: A Case Study in Electrical Impedance + Tomography (EIT) + + +
+ Physics-Informed Neural Networks (PINNs) are a machine learning technique for +solving partial differential equations (PDEs) by incorporating PDEs as loss +terms in neural networks and minimizing the loss function during training. +Tomographic imaging, a method to reconstruct internal properties from external +measurement data, is highly complex and ill-posed, making it an inverse +problem. Recently, PINNs have shown significant potential in computational +fluid dynamics (CFD) and have advantages in solving inverse problems. However, +existing research has primarily focused on semi-inverse Electrical Impedance +Tomography (EIT), where internal electric potentials are accessible. The +practical full inverse EIT problem, where only boundary voltage measurements +are available, remains challenging. To address this, we propose a two-stage +hybrid learning framework combining Convolutional Neural Networks (CNNs) and +PINNs to solve the full inverse EIT problem. This framework integrates +data-driven and model-driven approaches, combines supervised and unsupervised +learning, and decouples the forward and inverse problems within the PINN +framework in EIT. Stage I: a U-Net constructs an end-to-end mapping from +boundary voltage measurements to the internal potential distribution using +supervised learning. Stage II: a Multilayer Perceptron (MLP)-based PINN takes +the predicted internal potentials as input to solve for the conductivity +distribution through unsupervised learning. + +
+
+
+
+
+ + ☆ Describe Where You Are: Improving Noise-Robustness for Speech Emotion + Recognition with Text Description of the Environment + + +
+ Speech emotion recognition (SER) systems often struggle in real-world +environments, where ambient noise severely degrades their performance. This +paper explores a novel approach that exploits prior knowledge of testing +environments to maximize SER performance under noisy conditions. To address +this task, we propose a text-guided, environment-aware training where an SER +model is trained with contaminated speech samples and their paired noise +description. We use a pre-trained text encoder to extract the text-based +environment embedding and then fuse it to a transformer-based SER model during +training and inference. We demonstrate the effectiveness of our approach +through our experiment with the MSP-Podcast corpus and real-world additive +noise samples collected from the Freesound repository. Our experiment indicates +that the text-based environment descriptions processed by a large language +model (LLM) produce representations that improve the noise-robustness of the +SER system. In addition, our proposed approach with an LLM yields better +performance than our environment-agnostic baselines, especially in low +signal-to-noise ratio (SNR) conditions. When testing at -5dB SNR level, our +proposed method shows better performance than our best baseline model by 31.8 % +(arousal), 23.5% (dominance), and 9.5% (valence). + +
+
+
+
+
+ + ☆ Improving Online Algorithms via ML Predictions + + +
+ In this work we study the problem of using machine-learned predictions to +improve the performance of online algorithms. We consider two classical +problems, ski rental and non-clairvoyant job scheduling, and obtain new online +algorithms that use predictions to make their decisions. These algorithms are +oblivious to the performance of the predictor, improve with better predictions, +but do not degrade much if the predictions are poor. + +
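For the ski-rental problem, the deterministic prediction-augmented rule commonly associated with this line of work can be sketched as follows; the parameter names are ours, and lambda in (0, 1] controls how much the prediction is trusted (small lambda trusts the prediction more, larger lambda preserves robustness):

```python
import math

def ski_rental_with_prediction(buy_cost: int, predicted_days: int,
                               lam: float, actual_days: int) -> int:
    """Sketch of a deterministic prediction-augmented ski-rental rule.
    If the prediction suggests a long season, buy early (after ~lam * buy_cost
    days); if it suggests a short season, rent longer before committing.
    Returns the total cost paid over the season."""
    if predicted_days >= buy_cost:
        buy_day = math.ceil(lam * buy_cost)      # prediction says "buy is worth it"
    else:
        buy_day = math.ceil(buy_cost / lam)      # prediction says "keep renting"
    if actual_days >= buy_day:
        return (buy_day - 1) + buy_cost          # rent until buy_day, then buy
    return actual_days                            # season ended before buying
```

With good predictions the cost approaches the offline optimum; with bad predictions it degrades gracefully, which is the consistency/robustness trade-off the abstract describes.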
+
+ comment: Conference version appeared in Neurips 2018 +
+
+
+
+
+ + ☆ Weighted Risk Invariance: Domain Generalization under Invariant Feature + Shift + + +
+ Learning models whose predictions are invariant under multiple environments +is a promising approach for out-of-distribution generalization. Such models are +trained to extract features $X_{\text{inv}}$ where the conditional distribution +$Y \mid X_{\text{inv}}$ of the label given the extracted features does not +change across environments. Invariant models are also supposed to generalize to +shifts in the marginal distribution $p(X_{\text{inv}})$ of the extracted +features $X_{\text{inv}}$, a type of shift we call an $\textit{invariant +covariate shift}$. However, we show that proposed methods for learning +invariant models underperform under invariant covariate shift, either failing +to learn invariant models$\unicode{x2014}$even for data generated from simple +and well-studied linear-Gaussian models$\unicode{x2014}$or having poor +finite-sample performance. To alleviate these problems, we propose +$\textit{weighted risk invariance}$ (WRI). Our framework is based on imposing +invariance of the loss across environments subject to appropriate reweightings +of the training examples. We show that WRI provably learns invariant models, +i.e. discards spurious correlations, in linear-Gaussian settings. We propose a +practical algorithm to implement WRI by learning the density +$p(X_{\text{inv}})$ and the model parameters simultaneously, and we demonstrate +empirically that WRI outperforms previous invariant learning methods under +invariant covariate shift. + +
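The exact WRI objective is not spelled out in this abstract; as a hedged illustration only, the sketch below implements one plausible reading: per-example losses in each environment are reweighted (e.g., by estimated densities of the invariant features), and the variance of the reweighted risks across environments is penalized.

```python
import torch

def weighted_risk_invariance_penalty(losses_per_env, weights_per_env):
    """Assumed-form WRI-style penalty (not the paper's exact objective).

    losses_per_env:  list of 1-D tensors of per-example losses, one per environment
    weights_per_env: list of 1-D tensors of per-example weights (e.g. density ratios)
    Returns the variance of the weighted risks across environments.
    """
    weighted_risks = []
    for losses, w in zip(losses_per_env, weights_per_env):
        weighted_risks.append((w * losses).sum() / w.sum().clamp_min(1e-8))
    risks = torch.stack(weighted_risks)
    return ((risks - risks.mean()) ** 2).mean()
```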
+
+
+
+
+ + ☆ Diffusion-based subsurface multiphysics monitoring and forecasting + + +
+ Carbon capture and storage (CCS) plays a crucial role in mitigating +greenhouse gas emissions, particularly from industrial outputs. Seismic +monitoring can provide an accurate and robust means of ensuring the +effectiveness of CCS and mitigating associated risks. However, conventional +seismic wave equation-based approaches are computationally demanding, which +hinders real-time applications. In addition to efficiency, forecasting and +uncertainty analysis are not easy to handle using such +numerical-simulation-based approaches. To this end, we propose a novel +subsurface multiphysics monitoring and forecasting framework utilizing video +diffusion models. This approach can generate high-quality representations of +CO$_2$ evolution and associated changes in subsurface elastic properties. With +reconstruction guidance, forecasting and inversion can be achieved conditioned +on historical frames and/or observational data. Meanwhile, due to the +generative nature of the approach, we can quantify uncertainty in the +prediction. Tests based on the Compass model show that the proposed method +successfully captured the inherently complex physical phenomena associated with +CO$_2$ monitoring, and it can predict and invert the subsurface elastic +properties and CO$_2$ saturation with consistency in their evolution. + +
+
+
+
+
+ + ☆ Model-driven Heart Rate Estimation and Heart Murmur Detection based on + Phonocardiogram + + +
+ Acoustic signals are crucial for health monitoring, particularly heart sounds +which provide essential data like heart rate and detect cardiac anomalies such +as murmurs. This study utilizes a publicly available phonocardiogram (PCG) +dataset to estimate heart rate using model-driven methods and extends the +best-performing model to a multi-task learning (MTL) framework for simultaneous +heart rate estimation and murmur detection. Heart rate estimates are derived +using a sliding window technique on heart sound snippets, analyzed with a +combination of acoustic features (Mel spectrogram, cepstral coefficients, power +spectral density, root mean square energy). Our findings indicate that a 2D +convolutional neural network (\textbf{\texttt{2dCNN}}) is most effective for +heart rate estimation, achieving a mean absolute error (MAE) of 1.312 bpm. We +systematically investigate the impact of different feature combinations and +find that utilizing all four features yields the best results. The MTL model +(\textbf{\texttt{2dCNN-MTL}}) achieves accuracy over 95% in murmur detection, +surpassing existing models, while maintaining an MAE of 1.636 bpm in heart rate +estimation, satisfying the requirements stated by Association for the +Advancement of Medical Instrumentation (AAMI). + +
+
+ comment: 6 pages, 10 figures +
+
+
+
+
+ + ☆ HDL-GPT: High-Quality HDL is All You Need + + +
+ This paper presents Hardware Description Language Generative Pre-trained +Transformers (HDL-GPT), a novel approach that leverages the vast repository of +open-source Hardware Description Language (HDL) code to train superior quality +large code models. The core premise of this paper is the hypothesis that +high-quality HDL is all you need to create models with exceptional performance +and broad zero-shot generalization abilities. The paper elucidates the methods +employed for the curation and augmentation of large corpora from open-source +HDL code, transforming highly variable quality data into high-quality data +through careful prompting and context maintenance. We demonstrate that the +careful selection, filtering, and augmentation of data across HDLs can yield +powerful models that surpass current state-of-the-art models. We also explore +the impact of different fine-tuning methods on the quality of results. We +describe experimental results across a range of fine-tuned SOTA LLMs, +substantiating our claims. We demonstrate improvements of 50% to 200% over SOTA +HDL models on current benchmarks in tasks spanning HDL circuit +explanation, code generation, formal and simulation testbench creation, bug +triaging, and bug fixing. HDL-GPT opens new avenues for the development +of advanced model training techniques for circuit design tasks. + +
+
+ comment: DAC 2024 Invited Paper +
+
+
+
+
+ + ☆ Self-Directed Synthetic Dialogues and Revisions Technical Report + + +
+ Synthetic data has become an important tool in the fine-tuning of language +models to follow instructions and solve complex problems. Nevertheless, the +majority of open datasets to date lack multi-turn data and are collected from +closed models, limiting progress on advancing open fine-tuning methods. We +introduce Self Directed Synthetic Dialogues (SDSD), an experimental dataset +consisting of guided conversations of language models talking to themselves. +The dataset consists of multi-turn conversations generated with DBRX, Llama 2 +70B, and Mistral Large, all instructed to follow a conversation plan generated +prior to the conversation. We also explore including principles from +Constitutional AI and other related works to create synthetic preference data +via revisions to the final conversation turn. We hope this work encourages +further exploration in multi-turn data and the use of open models for expanding +the impact of synthetic data. + +
+
+ comment: 25 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ PersonaGym: Evaluating Persona Agents and LLMs + + +
+ Persona agents, which are LLM agents that act according to an assigned +persona, have demonstrated impressive contextual response capabilities across +various applications. These persona agents offer significant enhancements +across diverse sectors, such as education, healthcare, and entertainment, where +model developers can align agent responses to different user requirements, +thereby broadening the scope of agent applications. However, evaluating persona +agent performance is incredibly challenging due to the complexity of assessing +persona adherence in free-form interactions across various environments that +are relevant to each persona agent. We introduce PersonaGym, the first dynamic +evaluation framework for assessing persona agents, and PersonaScore, the first +automated human-aligned metric grounded in decision theory for comprehensive +large-scale evaluation of persona agents. Our evaluation of 6 open and +closed-source LLMs, using a benchmark encompassing 200 personas and 10,000 +questions, reveals significant opportunities for advancement in persona agent +capabilities across state-of-the-art models. For example, Claude 3.5 Sonnet +shows only a 2.97% relative improvement in PersonaScore over GPT 3.5 despite +being a much more advanced model. Importantly, we find that increased model +size and complexity do not necessarily imply enhanced persona agent +capabilities, thereby highlighting the pressing need for algorithmic and +architectural invention towards faithful and performant persona agents. + +
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ Adversarial Robust Decision Transformer: Enhancing Robustness of RvS via + Minimax Returns-to-go + + +
+ Decision Transformer (DT), as one of the representative Reinforcement +Learning via Supervised Learning (RvS) methods, has achieved strong performance +in offline learning tasks by leveraging the powerful Transformer architecture +for sequential decision-making. However, in adversarial environments, these +methods can be non-robust, since the return is dependent on the strategies of +both the decision-maker and adversary. Training a probabilistic model +conditioned on observed return to predict action can fail to generalize, as the +trajectories that achieve a return in the dataset might have done so due to a +weak and suboptimal behavior adversary. To address this, we propose a +worst-case-aware RvS algorithm, the Adversarial Robust Decision Transformer +(ARDT), which learns and conditions the policy on in-sample minimax +returns-to-go. ARDT aligns the target return with the worst-case return learned +through minimax expectile regression, thereby enhancing robustness against +powerful test-time adversaries. In experiments conducted on sequential games +with full data coverage, ARDT can generate a maximin (Nash Equilibrium) +strategy, the solution with the largest adversarial robustness. In large-scale +sequential games and continuous adversarial RL environments with partial data +coverage, ARDT demonstrates significantly superior robustness to powerful +test-time adversaries and attains higher worst-case returns compared to +contemporary DT methods. + +
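The abstract's key ingredient is expectile regression toward worst-case (minimax) returns-to-go. As a rough sketch of that ingredient only (not the paper's full alternating min/max training procedure), the asymmetric expectile loss below pulls the fitted return toward the lower tail of observed returns when tau is small:

```python
import torch

def expectile_loss(pred: torch.Tensor, target: torch.Tensor, tau: float = 0.1):
    """Asymmetric expectile regression loss |tau - 1{u<0}| * u^2 with u = target - pred.
    A small tau penalizes over-estimation more than under-estimation, so the fit
    gravitates toward worst-case (low) returns-to-go; tau close to 1 would do the
    opposite. Values here are illustrative assumptions."""
    diff = target - pred
    weight = torch.abs(tau - (diff < 0).float())
    return (weight * diff.pow(2)).mean()
```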
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Simulation of Neural Responses to Classical Music Using Organoid + Intelligence Methods + + +
+ Music is a complex auditory stimulus capable of eliciting significant changes +in brain activity, influencing cognitive processes such as memory, attention, +and emotional regulation. However, the underlying mechanisms of music-induced +cognitive processes remain largely unknown. Organoid intelligence and deep +learning models show promise for simulating and analyzing these neural +responses to classical music, an area significantly unexplored in computational +neuroscience. Hence, we present the PyOrganoid library, an innovative tool that +facilitates the simulation of organoid learning models, integrating +sophisticated machine learning techniques with biologically inspired organoid +simulations. Our study features the development of the Pianoid model, a "deep +organoid learning" model that utilizes a Bidirectional LSTM network to predict +EEG responses based on audio features from classical music recordings. This +model demonstrates the feasibility of using computational methods to replicate +complex neural processes, providing valuable insights into music perception and +cognition. Likewise, our findings emphasize the utility of synthetic models in +neuroscience research and highlight the PyOrganoid library's potential as a +versatile tool for advancing studies in neuroscience and artificial +intelligence. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ Large Language Model Integrated Healthcare Cyber-Physical Systems + Architecture + + +
+ Cyber-physical systems have become an essential part of the modern healthcare +industry. The healthcare cyber-physical systems (HCPS) combine physical and +cyber components to improve the healthcare industry. While HCPS has many +advantages, it also has some drawbacks, such as a lengthy data entry process, a +lack of real-time processing, and limited real-time patient visualization. To +overcome these issues, this paper presents an innovative approach to +integrating large language models (LLMs) to enhance the efficiency of the +healthcare system. By incorporating LLMs at various layers, HCPS can leverage +advanced AI capabilities to improve patient outcomes, advance data processing, +and enhance decision-making. + +
+
+
+
+
+ + ☆ The seismic purifier: An unsupervised approach to seismic signal + detection via representation learning + + +
+ In this paper, we develop an unsupervised learning approach to earthquake +detection. We train a specific class of deep auto-encoders that learn to +reproduce the input waveforms after a data-compressive bottleneck, and then use +a simple triggering algorithm at the bottleneck to label waveforms as noise or +signal. + Our approach is motivated by the intuition that efficient compression of data +should represent signals differently from noise, and is facilitated by a +time-axis-preserving approach to auto-encoding and intuitively-motivated +choices on the architecture and triggering. + We demonstrate that the detection performance of the unsupervised approach is +comparable to, and in some cases better than, some of the state-of-the-art +supervised methods. Moreover, it has strong \emph{cross-dataset +generalization}. By experimenting with various modifications, we demonstrate +that the detection performance is insensitive to various technical choices made +in the algorithm. + Our approach has the potential to be useful for other signal detection +problems with time series data. + +
+
+ comment: Submitted to IEEE-TGRS +
+
+
+
+
+ + ☆ Gaussian Process Kolmogorov-Arnold Networks + + +
+ In this paper, we introduce a probabilistic extension to Kolmogorov Arnold +Networks (KANs) by incorporating Gaussian Processes (GPs) as non-linear neurons, +which we refer to as GP-KAN. A fully analytical approach to handling the output +distribution of one GP as an input to another GP is achieved by considering the +function inner product of a GP function sample with the input distribution. +These GP neurons exhibit robust non-linear modelling capabilities while using +few parameters and can be easily and fully integrated into a feed-forward network +structure. They provide inherent uncertainty estimates to the model prediction +and can be trained directly on the log-likelihood objective function, without +needing variational lower bounds or approximations. In the context of MNIST +classification, a GP-KAN model with 80 thousand parameters achieved +98.5% prediction accuracy, compared to current state-of-the-art models with 1.5 +million parameters. + +
+
+ comment: related code: https://github.com/siyuan0/gp-kan +
+
+
+
+
+ + ☆ SCALE: Self-regulated Clustered federAted LEarning in a Homogeneous + Environment + + +
+ Federated Learning (FL) has emerged as a transformative approach for enabling +distributed machine learning while preserving user privacy, yet it faces +challenges like communication inefficiencies and reliance on centralized +infrastructures, leading to increased latency and costs. This paper presents a +novel FL methodology that overcomes these limitations by eliminating the +dependency on edge servers, employing a server-assisted Proximity Evaluation +for dynamic cluster formation based on data similarity, performance indices, +and geographical proximity. Our integrated approach enhances operational +efficiency and scalability through a Hybrid Decentralized Aggregation Protocol, +which merges local model training with peer-to-peer weight exchange and a +centralized final aggregation managed by a dynamically elected driver node, +significantly curtailing global communication overhead. Additionally, the +methodology includes Decentralized Driver Selection, Check-pointing to reduce +network traffic, and a Health Status Verification Mechanism for system +robustness. Validated using the breast cancer dataset, our architecture not +only demonstrates a nearly tenfold reduction in communication overhead but also +shows remarkable improvements in reducing training latency and energy +consumption while maintaining high learning performance, offering a scalable, +efficient, and privacy-preserving solution for the future of federated learning +ecosystems. + +
+
+ comment: This research article has been accepted at the COMPSAC conference and + will be published by IEEE +
+
+
+
+
+ + ☆ Mathematical theory of deep learning + + +
+ This book provides an introduction to the mathematical analysis of deep +learning. It covers fundamental results in approximation theory, optimization +theory, and statistical learning theory, which are the three main pillars of +deep neural network theory. Serving as a guide for students and researchers in +mathematics and related fields, the book aims to equip readers with +foundational knowledge on the topic. It prioritizes simplicity over generality, +and presents rigorous yet accessible results to help build an understanding of +the essential mathematical concepts underpinning deep learning. + +
+
+
+
+
+ + ♻ ☆ Dr. Jekyll and Mr. Hyde: Two Faces of LLMs + + +
+ Recently, we have witnessed a rise in the use of Large Language Models +(LLMs), especially in applications like chatbot assistants. Safety mechanisms +and specialized training procedures are implemented to prevent improper +responses from these assistants. In this work, we bypass these measures for +ChatGPT and Gemini (and, to some extent, Bing chat) by making them impersonate +complex personas with personality characteristics that are not aligned with a +truthful assistant. We start by creating elaborate biographies of these +personas, which we then use in a new session with the same chatbots. Our +conversations then follow a role-play style to elicit prohibited responses. +Using personas, we show that prohibited responses are actually provided, making +it possible to obtain unauthorized, illegal, or harmful information. This work +shows that by using adversarial personas, one can overcome safety mechanisms +set out by ChatGPT and Gemini. We also introduce several ways of activating +such adversarial personas, which show that both chatbots are vulnerable to this +kind of attack. With the same principle, we introduce two defenses that push +the model to interpret trustworthy personalities and make it more robust +against such attacks. + +
+
+
+
+
+ + ♻ ☆ Can time series forecasting be automated? A benchmark and analysis + + +
+ In the field of machine learning and artificial intelligence, time series +forecasting plays a pivotal role across various domains such as finance, +healthcare, and weather. However, selecting the most suitable +forecasting method for a given dataset is a complex task due to the diversity +of data patterns and characteristics. This research aims to address this +challenge by proposing a comprehensive benchmark for evaluating and ranking +time series forecasting methods across a wide range of datasets. This study +investigates the comparative performance of many methods from two prominent +time series forecasting frameworks, AutoGluon-Timeseries and sktime, to shed +light on their applicability in different real-world scenarios. This research +contributes to the field of time series forecasting by providing a robust +benchmarking methodology and facilitating informed decision-making when +choosing forecasting methods for achieving optimal predictions. + +
+
+
+
+
+ + ♻ ☆ Block Verification Accelerates Speculative Decoding + + +
+ Speculative decoding is an effective method for lossless acceleration of +large language models during inference. It uses a fast model to draft a block +of tokens which are then verified in parallel by the target model, and provides +a guarantee that the output is distributed identically to a sample from the +target model. In prior works, draft verification is performed independently +token-by-token. Surprisingly, we show that this approach is not optimal. We +propose Block Verification, a simple draft verification algorithm that verifies +the entire block jointly and provides additional wall-clock speedup. We prove +that the proposed mechanism is optimal in the expected number of tokens +produced each iteration and specifically is never worse than the standard +token-level verification. Empirically, block verification provides modest but +consistent wall-clock speedups over the standard token verification algorithm +of 5%-8% in a range of tasks and datasets. Given that block verification does +not increase code complexity, maintains the strong lossless guarantee of the +standard speculative decoding verification algorithm, cannot deteriorate +performance, and, in fact, consistently improves it, it can be used as a good +default in speculative decoding implementations. + +
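For context, the sketch below shows the standard token-by-token acceptance rule used in baseline speculative decoding, i.e. the procedure the abstract argues is suboptimal; block verification instead accepts or rejects the drafted block jointly, which is not shown here. Variable names and the NumPy formulation are our own.

```python
import numpy as np

def token_level_verify(draft_tokens, p_draft, p_target, rng=None):
    """Standard token-by-token draft verification (baseline, not block verification).
    p_draft[i] and p_target[i] are the draft/target distributions over the vocabulary
    at position i; draft_tokens[i] is the drafted token. Returns the accepted prefix
    plus one corrected token sampled after the first rejection."""
    rng = rng or np.random.default_rng()
    out = []
    for i, tok in enumerate(draft_tokens):
        accept_prob = min(1.0, p_target[i][tok] / max(p_draft[i][tok], 1e-12))
        if rng.random() < accept_prob:
            out.append(int(tok))                       # accept drafted token
        else:
            residual = np.clip(p_target[i] - p_draft[i], 0.0, None)
            if residual.sum() == 0.0:                  # distributions identical
                residual = np.asarray(p_target[i], dtype=float)
            residual = residual / residual.sum()
            out.append(int(rng.choice(len(residual), p=residual)))
            break                                      # stop at first rejection
    return out
```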
+
+
+
+
+ + ♻ ☆ ShiftAddLLM: Accelerating Pretrained LLMs via Post-Training + Multiplication-Less Reparameterization + + +
+ Large language models (LLMs) have shown impressive performance on language +tasks but face challenges when deployed on resource-constrained devices due to +their extensive parameters and reliance on dense multiplications, resulting in +high memory demands and latency bottlenecks. Shift-and-add reparameterization +offers a promising solution by replacing costly multiplications with +hardware-friendly primitives in both the attention and multi-layer perceptron +(MLP) layers of an LLM. However, current reparameterization techniques require +training from scratch or full parameter fine-tuning to restore accuracy, which +is resource-intensive for LLMs. To address this, we propose accelerating +pretrained LLMs through post-training shift-and-add reparameterization, +creating efficient multiplication-free models, dubbed ShiftAddLLM. +Specifically, we quantize each weight matrix into binary matrices paired with +group-wise scaling factors. The associated multiplications are reparameterized +into (1) shifts between activations and scaling factors and (2) queries and +adds according to the binary matrices. To reduce accuracy loss, we present a +multi-objective optimization method to minimize both weight and output +activation reparameterization errors. Additionally, based on varying +sensitivity across layers to reparameterization, we develop an automated bit +allocation strategy to further reduce memory usage and latency. Experiments on +five LLM families and eight tasks consistently validate the effectiveness of +ShiftAddLLM, achieving average perplexity improvements of 5.6 and 22.7 points +at comparable or lower latency compared to the most competitive quantized LLMs +at 3 and 2 bits, respectively, and more than 80% memory and energy reductions +over the original LLMs. Codes and models are available at +https://github.com/GATECH-EIC/ShiftAddLLM. + +
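As a hedged illustration of the weight reparameterization step described above (not the paper's actual kernels, which also replace the remaining arithmetic with shifts and use automated bit allocation), a group-wise one-bit quantization into sign matrices plus scaling factors might look like this; the group size and naming are assumptions:

```python
import torch

def binarize_with_group_scales(W: torch.Tensor, group_size: int = 128):
    """Quantize a weight matrix into a sign (binary) matrix plus group-wise scaling
    factors, grouping along the input dimension. Returns the signs, the scales, and
    the dequantized approximation. Note torch.sign maps exact zeros to 0."""
    out_dim, in_dim = W.shape
    assert in_dim % group_size == 0, "in_dim must be divisible by group_size"
    Wg = W.view(out_dim, in_dim // group_size, group_size)
    scales = Wg.abs().mean(dim=-1, keepdim=True)       # one scale per group
    signs = torch.sign(Wg)                             # values in {-1, 0, +1}
    W_hat = (signs * scales).view(out_dim, in_dim)     # dequantized approximation
    return signs.view(out_dim, in_dim), scales.squeeze(-1), W_hat
```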
+
+
+
+
+ + ♻ ☆ ShiftAddViT: Mixture of Multiplication Primitives Towards Efficient + Vision Transformer NeurIPS 2023 + + +
+ Vision Transformers (ViTs) have shown impressive performance and have become +a unified backbone for multiple vision tasks. However, both the attention +mechanism and multi-layer perceptrons (MLPs) in ViTs are not sufficiently +efficient due to dense multiplications, leading to costly training and +inference. To this end, we propose to reparameterize pre-trained ViTs with a +mixture of multiplication primitives, e.g., bitwise shifts and additions, +towards a new type of multiplication-reduced model, dubbed +$\textbf{ShiftAddViT}$, which aims to achieve end-to-end inference speedups on +GPUs without requiring training from scratch. Specifically, all +$\texttt{MatMuls}$ among queries, keys, and values are reparameterized using +additive kernels, after mapping queries and keys to binary codes in Hamming +space. The remaining MLPs or linear layers are then reparameterized with shift +kernels. We utilize TVM to implement and optimize those customized kernels for +practical hardware deployment on GPUs. We find that such a reparameterization +on attention maintains model accuracy, while inevitably leading to accuracy +drops when being applied to MLPs. To marry the best of both worlds, we further +propose a new mixture of experts (MoE) framework to reparameterize MLPs by +taking multiplication or its primitives as experts, e.g., multiplication and +shift, and designing a new latency-aware load-balancing loss. Such a loss helps +to train a generic router for assigning a dynamic amount of input tokens to +different experts according to their latency. Extensive experiments on various +2D/3D Transformer-based vision tasks consistently validate the effectiveness of +our proposed ShiftAddViT, achieving up to $\textbf{5.18$\times$}$ latency +reductions on GPUs and $\textbf{42.9}$% energy savings, while maintaining a +comparable accuracy as original or efficient ViTs. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ When Linear Attention Meets Autoregressive Decoding: Towards More + Effective and Efficient Linearized Large Language Models ICML 2024 + + +
+ Autoregressive Large Language Models (LLMs) have achieved impressive +performance in language tasks but face two significant bottlenecks: (1) +quadratic complexity in the attention module as the number of tokens increases, +and (2) limited efficiency due to the sequential processing nature of +autoregressive LLMs during generation. While linear attention and speculative +decoding offer potential solutions, their applicability and synergistic +potential for enhancing autoregressive LLMs remain uncertain. We conduct the +first comprehensive study on the efficacy of existing linear attention methods +for autoregressive LLMs, integrating them with speculative decoding. We +introduce an augmentation technique for linear attention that ensures +compatibility with speculative decoding, enabling more efficient training and +serving of LLMs. Extensive experiments and ablation studies involving seven +existing linear attention models and five encoder/decoder-based LLMs +consistently validate the effectiveness of our augmented linearized LLMs. +Notably, our approach achieves up to a 6.67 reduction in perplexity on the +LLaMA model and up to a 2$\times$ speedup during generation compared to prior +linear attention methods. Codes and models are available at +https://github.com/GATECH-EIC/Linearized-LLM. + +
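For readers unfamiliar with the linear attention family this work builds on, a generic (non-causal) kernelized attention can be sketched as below; this is background only, with an assumed elu+1 feature map, and does not include the paper's augmentation for speculative decoding:

```python
import torch
import torch.nn.functional as F

def linear_attention(Q, K, V, eps: float = 1e-6):
    """Generic linear attention: cost is linear in sequence length because the
    key-value summary (K^T V) is aggregated once instead of forming a T x T map.
    Shapes: Q, K are (B, T, d); V is (B, T, e)."""
    phi_q = F.elu(Q) + 1
    phi_k = F.elu(K) + 1
    kv = torch.einsum("btd,bte->bde", phi_k, V)        # (B, d, e) summary
    z = phi_k.sum(dim=1)                                # (B, d) normalizer
    num = torch.einsum("btd,bde->bte", phi_q, kv)
    den = torch.einsum("btd,bd->bt", phi_q, z).clamp_min(eps).unsqueeze(-1)
    return num / den
```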
+
+ comment: Accepted by ICML 2024; 17 pages; 10 figures; 16 tables +
+
+
+
+
+ + ♻ ☆ Wasserstein approximation schemes based on Voronoi partitions + + +
+ We consider structured approximation of measures in Wasserstein space +$\mathrm{W}_p(\mathbb{R}^d)$ for $p\in[1,\infty)$ using general measure +approximants compactly supported on Voronoi regions derived from a scaled +Voronoi partition of $\mathbb{R}^d$. We show that if a full rank lattice +$\Lambda$ is scaled by a factor of $h\in(0,1]$, then approximation of a measure +based on the Voronoi partition of $h\Lambda$ is $O(h)$ regardless of $d$ or +$p$. We then use a covering argument to show that $N$-term approximations of +compactly supported measures is $O(N^{-\frac1d})$ which matches known rates for +optimal quantizers and empirical measure approximation in most instances. +Additionally, we generalize our construction to nonuniform Voronoi partitions, +highlighting the flexibility and robustness of our approach for various measure +approximation scenarios. Finally, we extend these results to noncompactly +supported measures with sufficient decay. Our findings are pertinent to +applications in computer vision and machine learning where measures are used to +represent structured data such as images. + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Model Editing ACL 2024 + + +
+ ROME and MEMIT are largely believed to be two different model editing +algorithms, with the major difference between them being the ability to perform +batched edits. In this paper, we unify these two algorithms under a single +conceptual umbrella, optimizing for the same goal, which we call the +preservation-memorization objective. ROME uses an equality constraint to +optimize this objective to perform one edit at a time, whereas MEMIT employs a +more flexible least-square constraint that allows for batched edits. We +generalize ROME and enable batched editing with equality constraint in the form +of EMMET - an Equality-constrained Mass Model Editing algorithm for +Transformers, a new batched memory-editing algorithm. EMMET can perform +batched-edits up to a batch-size of 10,000, with very similar performance to +MEMIT across multiple dimensions. With the introduction of EMMET, we truly +unify ROME and MEMIT and show that both algorithms are equivalent in terms of +their optimization objective, their abilities (singular and batched editing), +their model editing performance and their limitations. + +
+
+ comment: Under review. To appear as poster at KnowledgeableLM Workshop + co-located with ACL 2024 +
+
+
+
+
+ + ♻ ☆ Light Curve Classification with DistClassiPy: a new distance-based + classifier + + +
+ The rise of synoptic sky surveys has ushered in an era of big data in +time-domain astronomy, making data science and machine learning essential tools +for studying celestial objects. While tree-based models (e.g. Random Forests) +and deep learning models dominate the field, we explore the use of different +distance metrics to aid in the classification of astrophysical objects. We +developed DistClassiPy, a new distance metric based classifier. The direct use +of distance metrics is unexplored in time-domain astronomy, but distance-based +methods can help make classification more interpretable and decrease +computational costs. In particular, we applied DistClassiPy to classify light +curves of variable stars, comparing the distances between objects of different +classes. Using 18 distance metrics on a catalog of 6,000 variable stars across +10 classes, we demonstrate classification and dimensionality reduction. Our +classifier meets state-of-the-art performance but has lower computational +requirements and improved interpretability. Additionally, DistClassiPy can be +tailored to specific objects by identifying the most effective distance metric +for that classification. To facilitate broader applications within and beyond +astronomy, we have made DistClassiPy open-source and available at +https://pypi.org/project/distclassipy/. + +
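A minimal sketch of the general idea of a distance-metric based classifier (not DistClassiPy's actual API; see the PyPI package for that) could look like the following, assuming tabular light-curve features: each class is represented by a per-feature median template and test points are assigned to the nearest template under a chosen metric.

```python
import numpy as np
from scipy.spatial.distance import cdist

class DistanceClassifierSketch:
    """Illustrative distance-based classifier: nearest class template under a
    user-chosen distance metric (any metric supported by scipy's cdist)."""

    def __init__(self, metric: str = "canberra"):
        self.metric = metric

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.templates_ = np.vstack(
            [np.median(X[y == c], axis=0) for c in self.classes_]
        )
        return self

    def predict(self, X):
        D = cdist(X, self.templates_, metric=self.metric)  # (n_samples, n_classes)
        return self.classes_[np.argmin(D, axis=1)]
```

Swapping the metric string changes the geometry of the decision rule, which is the kind of per-class metric selection the abstract alludes to.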
+
+ comment: Accepted for publication in Astronomy and Computing (2024). 24 pages, + 19 figures +
+
+
+
+
+ + ♻ ☆ Longhorn: State Space Models are Amortized Online Learners + + +
+ The most fundamental capability of modern AI methods such as Large Language +Models (LLMs) is the ability to predict the next token in a long sequence of +tokens, known as "sequence modeling." Although the Transformer model is the +current dominant approach to sequence modeling, its quadratic computational +cost with respect to sequence length is a significant drawback. State-space +models (SSMs) offer a promising alternative due to their linear decoding +efficiency and high parallelizability during training. However, existing SSMs +often rely on seemingly ad hoc linear recurrence designs. In this work, we +explore SSM design through the lens of online learning, conceptualizing SSMs as +meta-modules for specific online learning problems. This approach links SSM +design to formulating precise online learning objectives, with state transition +rules derived from optimizing these objectives. Based on this insight, we +introduce a novel deep SSM architecture based on the implicit update for +optimizing an online regression objective. Our experimental results show that +our models outperform state-of-the-art SSMs, including the Mamba model, on +standard sequence modeling benchmarks and language modeling tasks. + +
+
+
+
+
+ + ♻ ☆ Harmonic LLMs are Trustworthy + + +
+ We introduce an intuitive method to test the robustness (stability and +explainability) of any black-box LLM in real-time via its local deviation from +harmonicity, denoted as $\gamma$. To the best of our knowledge, this is the +first completely model-agnostic and unsupervised method of measuring the +robustness of any given response from an LLM, based upon the model itself +conforming to a purely mathematical standard. To show general application and +immediacy of results, we measure $\gamma$ in 10 popular LLMs (ChatGPT, +Claude-2.1, Claude3.0, GPT-4, GPT-4o, Smaug-72B, Mixtral-8x7B, Llama2-7B, +Mistral-7B and MPT-7B) across thousands of queries in three objective domains: +WebQA, ProgrammingQA, and TruthfulQA. Across all models and domains tested, +human annotation confirms that $\gamma \to 0$ indicates trustworthiness, and +conversely searching higher values of $\gamma$ easily exposes examples of +hallucination, a fact that enables efficient adversarial prompt generation +through stochastic gradient ascent in $\gamma$. The low-$\gamma$ leaders among +the models in the respective domains are GPT-4o, GPT-4, and Smaug-72B, +providing evidence that mid-size open-source models can win out against large +commercial models. + +
+
+ comment: 15 pages, 2 figures, 16 tables; added Claude-3.0, GPT-4o, Mistral-7B, + Mixtral-8x7B, and more annotation for other models +
+
+
+
+
+ + ♻ ☆ No Representation, No Trust: Connecting Representation, Collapse, and + Trust Issues in PPO ICML + + +
+ Reinforcement learning (RL) is inherently rife with non-stationarity since +the states and rewards the agent observes during training depend on its +changing policy. Therefore, networks in deep RL must be capable of adapting to +new observations and fitting new targets. However, previous works have observed +that networks in off-policy deep value-based methods exhibit a decrease in +representation rank, often correlated with an inability to continue learning or +a collapse in performance. Although this phenomenon has generally been +attributed to neural network learning under non-stationarity, it has been +overlooked in on-policy policy optimization methods which are often thought +capable of training indefinitely. In this work, we empirically study +representation dynamics in Proximal Policy Optimization (PPO) on the Atari and +MuJoCo environments, revealing that PPO agents are also affected by feature +rank deterioration and loss of plasticity. We show that this is aggravated with +stronger non-stationarity, ultimately driving the actor's performance to +collapse, regardless of the performance of the critic. We ask why the trust +region, specific to methods like PPO, cannot alleviate or prevent the collapse. +We find that there is a connection between representation collapse and the +degradation of the trust region, one exacerbating the other, and present +Proximal Feature Optimization (PFO), a novel auxiliary loss that, along with +other interventions, shows that regularizing the representation dynamics +improves the performance of PPO agents. + +
+
+ comment: ICML ARLET workshop version. Code and run histories are available at + https://github.com/CLAIRE-Labo/no-representation-no-trust +
+
+
+
+
+ + ♻ ☆ Evaluating the design space of diffusion-based generative models + + +
+ Most existing theoretical investigations of the accuracy of diffusion models, +albeit significant, assume the score function has been approximated to a +certain accuracy, and then use this a priori bound to control the error of +generation. This article instead provides a first quantitative understanding of +the whole generation process, i.e., both training and sampling. More precisely, +it conducts a non-asymptotic convergence analysis of denoising score matching +under gradient descent. In addition, a refined sampling error analysis for +variance exploding models is also provided. The combination of these two +results yields a full error analysis, which elucidates (again, but this time +theoretically) how to design the training and sampling processes for effective +generation. For instance, our theory implies a preference toward noise +distribution and loss weighting in training that qualitatively agree with the +ones used in [Karras et al. 2022]. It also provides perspectives on the choices +of time and variance schedules in sampling: when the score is well trained, the +design in [Song et al. 2020] is more preferable, but when it is less trained, +the design in [Karras et al. 2022] becomes more preferable. + +
+
+ comment: Comments are welcome. Out of admiration we titled our paper after + EDM, and hoped theorists' humor is not too corny +
+
+
+
+
+ + ♻ ☆ Looking at Model Debiasing through the Lens of Anomaly Detection + + +
+ It is widely recognized that deep neural networks are sensitive to bias in +the data. This means that during training these models are likely to learn +spurious correlations between data and labels, resulting in limited +generalization abilities and low performance. In this context, model debiasing +approaches can be devised aiming at reducing the model's dependency on such +unwanted correlations, either leveraging the knowledge of bias information or +not. In this work, we focus on the latter and more realistic scenario, showing +the importance of accurately predicting the bias-conflicting and bias-aligned +samples to obtain compelling performance in bias mitigation. On this ground, we +propose to conceive the problem of model bias from an out-of-distribution +perspective, introducing a new bias identification method based on anomaly +detection. We claim that when data is mostly biased, bias-conflicting samples +can be regarded as outliers with respect to the bias-aligned distribution in +the feature space of a biased model, thus allowing for precisely detecting them +with an anomaly detection method. Coupling the proposed bias identification +approach with bias-conflicting data upsampling and augmentation in a two-step +strategy, we reach state-of-the-art performance on synthetic and real benchmark +datasets. Ultimately, our proposed approach shows that the data bias issue does +not necessarily require complex debiasing methods, given that an accurate bias +identification procedure is defined. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Generative Learning of Continuous Data by Tensor Networks + + +
+ Beyond their origin in modeling many-body quantum systems, tensor networks +have emerged as a promising class of models for solving machine learning +problems, notably in unsupervised generative learning. While possessing many +desirable features arising from their quantum-inspired nature, tensor network +generative models have previously been largely restricted to binary or +categorical data, limiting their utility in real-world modeling problems. We +overcome this by introducing a new family of tensor network generative models +for continuous data, which are capable of learning from distributions +containing continuous random variables. We develop our method in the setting of +matrix product states, first deriving a universal expressivity theorem proving +the ability of this model family to approximate any reasonably smooth +probability density function with arbitrary precision. We then benchmark the +performance of this model on several synthetic and real-world datasets, finding +that the model learns and generalizes well on distributions of continuous and +discrete variables. We develop methods for modeling different data domains, and +introduce a trainable compression layer which is found to increase model +performance given limited memory or computational resources. Overall, our +methods give important theoretical and empirical evidence of the efficacy of +quantum-inspired methods for the rapidly growing field of generative learning. + +
+
+ comment: 21 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Review of Machine Learning Methods for Additive Manufacturing of + Functionally Graded Materials + + +
+ Additive Manufacturing (AM) is a transformative manufacturing technology +enabling direct fabrication of complex parts layer-by-layer from 3D modeling +data. Among AM applications, the fabrication of Functionally Graded Materials +(FGMs) has significant importance due to the potential to enhance component +performance across several industries. FGMs are manufactured with a gradient +composition transition between dissimilar materials, enabling the design of new +materials with location-dependent mechanical and physical properties. This +study presents a comprehensive review of published literature pertaining to the +implementation of Machine Learning (ML) techniques in AM, with an emphasis on +ML-based methods for optimizing FGMs fabrication processes. Through an +extensive survey of the literature, this review article explores the role of ML +in addressing the inherent challenges in FGMs fabrication and encompasses +parameter optimization, defect detection, and real-time monitoring. The article +also provides a discussion of future research directions and challenges in +employing ML-based methods in AM fabrication of FGMs. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Self-supervised learning of video representations from a child's + perspective + + +
+ Children learn powerful internal models of the world around them from a few +years of egocentric visual experience. Can such internal models be learned from +a child's visual experience with highly generic learning algorithms or do they +require strong inductive biases? Recent advances in collecting large-scale, +longitudinal, developmentally realistic video datasets and generic +self-supervised learning (SSL) algorithms are allowing us to begin to tackle +this nature vs. nurture question. However, existing work typically focuses on +image-based SSL algorithms and visual capabilities that can be learned from +static images (e.g. object recognition), thus ignoring temporal aspects of the +world. To close this gap, here we train self-supervised video models on +longitudinal, egocentric headcam recordings collected from a child over a two +year period in their early development (6-31 months). The resulting models are +highly effective at facilitating the learning of action concepts from a small +number of labeled examples; they have favorable data size scaling properties; +and they display emergent video interpolation capabilities. Video models also +learn more robust object representations than image-based models trained with +the exact same data. These results suggest that important temporal aspects of a +child's internal model of the world may be learnable from their visual +experience using highly generic learning algorithms and without strong +inductive biases. + +
+
+ comment: Published as a conference paper at CogSci 2024; code & models + available from https://github.com/eminorhan/video-models +
+
+
+
+
+ + ♻ ☆ Uncovering Latent Memories: Assessing Data Leakage and Memorization + Patterns in Frontier AI Models + + +
+ Frontier AI systems are making transformative impacts across society, but +such benefits are not without costs: models trained on web-scale datasets +containing personal and private data raise profound concerns about data privacy +and security. Language models are trained on extensive corpora including +potentially sensitive or proprietary information, and the risk of data leakage +- where the model response reveals pieces of such information - remains +inadequately understood. Prior work has investigated what factors drive +memorization and has identified that sequence complexity and the number of +repetitions drive memorization. Here, we focus on the evolution of memorization +over training. We begin by reproducing findings that the probability of +memorizing a sequence scales logarithmically with the number of times it is +present in the data. We next show that sequences which are apparently not +memorized after the first encounter can be "uncovered" throughout the course of +training even without subsequent encounters, a phenomenon we term "latent +memorization". The presence of latent memorization presents a challenge for +data privacy as memorized sequences may be hidden at the final checkpoint of +the model but remain easily recoverable. To this end, we develop a diagnostic +test relying on the cross entropy loss to uncover latent memorized sequences +with high accuracy. + +
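As a hedged illustration of the basic quantity such a loss-based diagnostic builds on, the sketch below computes the mean cross-entropy a causal LM assigns to a candidate sequence using the Hugging Face transformers API; unusually low loss on a long, high-entropy string is a hint (not proof) of memorization, and the paper's actual latent-memorization test is more involved than this.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def sequence_cross_entropy(model_name: str, text: str) -> float:
    """Mean token-level cross-entropy of `text` under a causal LM.
    Lower values mean the model finds the sequence more predictable."""
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(ids, labels=ids).loss  # shifted next-token cross entropy
    return loss.item()
```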
+
+
+
+
+ + ♻ ☆ Clustering with minimum spanning trees: How good can it be? + + +
+ Minimum spanning trees (MSTs) provide a convenient representation of datasets +in numerous pattern recognition activities. Moreover, they are relatively fast +to compute. In this paper, we quantify the extent to which they are meaningful +in low-dimensional partitional data clustering tasks. By identifying the upper +bounds for the agreement between the best (oracle) algorithm and the expert +labels from a large battery of benchmark data, we discover that MST methods can +be very competitive. Next, we review, study, extend, and generalise a few +existing, state-of-the-art MST-based partitioning schemes. This leads to some +new noteworthy approaches. Overall, the Genie and the information-theoretic +methods often outperform the non-MST algorithms such as K-means, Gaussian +mixtures, spectral clustering, Birch, density-based, and classical hierarchical +agglomerative procedures. Nevertheless, we identify that there is still some +room for improvement, and thus the development of novel algorithms is +encouraged. + +
+
+
+
+
+ + ♻ ☆ Normalised clustering accuracy: An asymmetric external cluster validity + measure + + +
+ There is no, nor will there ever be, single best clustering algorithm. +Nevertheless, we would still like to be able to distinguish between methods +that work well on certain task types and those that systematically +underperform. Clustering algorithms are traditionally evaluated using either +internal or external validity measures. Internal measures quantify different +aspects of the obtained partitions, e.g., the average degree of cluster +compactness or point separability. However, their validity is questionable +because the clusterings they endorse can sometimes be meaningless. External +measures, on the other hand, compare the algorithms' outputs to fixed ground +truth groupings provided by experts. In this paper, we argue that the commonly +used classical partition similarity scores, such as the normalised mutual +information, Fowlkes-Mallows, or adjusted Rand index, miss some desirable +properties. In particular, they do not identify worst-case scenarios correctly, +nor are they easily interpretable. As a consequence, the evaluation of +clustering algorithms on diverse benchmark datasets can be difficult. To remedy +these issues, we propose and analyse a new measure: a version of the optimal +set-matching accuracy, which is normalised, monotonic with respect to some +similarity relation, scale-invariant, and corrected for the imbalancedness of +cluster sizes (but neither symmetric nor adjusted for chance). + +
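As a rough sketch of the optimal set-matching accuracy underlying this proposal, one can match predicted cluster labels to reference labels with the Hungarian algorithm and then correct for a chance-like baseline; the specific 1/k normalisation below is an illustrative assumption, not necessarily the exact correction defined in the paper.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def set_matching_accuracy(y_true, y_pred, normalise: bool = True) -> float:
    """Optimal set-matching accuracy: permute predicted cluster labels to best
    match the reference partition, then compute accuracy; optionally rescale so
    that ~0 corresponds to a chance-like baseline and 1 to a perfect match."""
    true_ids, y_t = np.unique(y_true, return_inverse=True)
    pred_ids, y_p = np.unique(y_pred, return_inverse=True)
    k = len(true_ids)
    C = np.zeros((k, len(pred_ids)), dtype=float)
    np.add.at(C, (y_t, y_p), 1.0)              # confusion matrix
    rows, cols = linear_sum_assignment(-C)     # maximise total matched counts
    acc = C[rows, cols].sum() / len(y_t)
    if not normalise:
        return acc
    return (acc - 1.0 / k) / (1.0 - 1.0 / k)
```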
+
+
+
+
+ + ♻ ☆ 3D Diffuser Actor: Policy Diffusion with 3D Scene Representations + + +
+ Diffusion policies are conditional diffusion models that learn robot action +distributions conditioned on the robot and environment state. They have +recently shown to outperform both deterministic and alternative action +distribution learning formulations. 3D robot policies use 3D scene feature +representations aggregated from a single or multiple camera views using sensed +depth. They have shown to generalize better than their 2D counterparts across +camera viewpoints. We unify these two lines of work and present 3D Diffuser +Actor, a neural policy equipped with a novel 3D denoising transformer that +fuses information from the 3D visual scene, a language instruction and +proprioception to predict the noise in noised 3D robot pose trajectories. 3D +Diffuser Actor sets a new state-of-the-art on RLBench with an absolute +performance gain of 18.1% over the current SOTA on a multi-view setup and an +absolute gain of 13.1% on a single-view setup. On the CALVIN benchmark, it +improves over the current SOTA by a 9% relative increase. It also learns to +control a robot manipulator in the real world from a handful of demonstrations. +Through thorough comparisons with the current SOTA policies and ablations of +our model, we show 3D Diffuser Actor's design choices dramatically outperform +2D representations, regression and classification objectives, absolute +attentions, and holistic non-tokenized 3D scene embeddings. + +
+
+ comment: First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Diagnosing and fixing common problems in Bayesian optimization for + molecule design ICML 2024 + + +
+ Bayesian optimization (BO) is a principled approach to molecular design +tasks. In this paper we explain three pitfalls of BO which can cause poor +empirical performance: an incorrect prior width, over-smoothing, and inadequate +acquisition function maximization. We show that with these issues addressed, +even a basic BO setup is able to achieve the highest overall performance on the +PMO benchmark for molecule design (Gao et al 2022). These results suggest that +BO may benefit from more attention in the machine learning for molecules +community. + +
+
+ comment: 8 pages, 4 figures. ICML 2024 AI for science workshop + (https://openreview.net/forum?id=V4aG4wsoIt). Code at: + https://github.com/AustinT/basic-mol-bo-workshop2024 +
+
+
+
+
+ + ♻ ☆ Anatomizing Deep Learning Inference in Web Browsers + + +
+ Web applications have increasingly adopted Deep Learning (DL) through +in-browser inference, wherein DL inference performs directly within Web +browsers. The actual performance of in-browser inference and its impacts on the +quality of experience (QoE) remain unexplored, and urgently require new QoE +measurements beyond traditional ones, e.g., mainly focusing on page load time. +To bridge this gap, we make the first comprehensive performance measurement of +in-browser inference to date. Our approach proposes new metrics to measure +in-browser inference: responsiveness, smoothness, and inference accuracy. Our +extensive analysis involves 9 representative DL models across Web browsers of +50 popular PC devices and 20 mobile devices. The results reveal that in-browser +inference exhibits a substantial latency gap, averaging 16.9 times slower on +CPU and 4.9 times slower on GPU compared to native inference on PC devices. The +gap on mobile CPU and mobile GPU is 15.8 times and 7.8 times, respectively. +Furthermore, we identify contributing factors to such latency gap, including +underutilized hardware instruction sets, inherent overhead in the runtime +environment, resource contention within the browser, and inefficiencies in +software libraries and GPU abstractions. Additionally, in-browser inference +imposes significant memory demands, at times exceeding 334.6 times the size of +the DL models themselves, partly attributable to suboptimal memory management. +We also observe that in-browser inference leads to a significant 67.2% increase +in the time it takes for GUI components to render within Web browsers, +significantly affecting the overall user QoE of Web applications reliant on +this technology + +
+
+ comment: Accepted by ACM Transactions on Software Engineering and Methodology + (TOSEM) +
+
+
+
+
+ + ♻ ☆ Resolving Discrepancies in Compute-Optimal Scaling of Language Models + + +
+ Kaplan et al. and Hoffmann et al. developed influential scaling laws for the +optimal model size as a function of the compute budget, but these laws yield +substantially different predictions. We explain the discrepancy by reproducing +the Kaplan scaling law on two datasets (OpenWebText2 and RefinedWeb) and +identifying three factors causing the difference: last layer computational +cost, warmup duration, and scale-dependent optimizer tuning. With these factors +corrected, we obtain excellent agreement with the Hoffmann et al. (i.e., +"Chinchilla") scaling law. Counter to a hypothesis of Hoffmann et al., we find +that careful learning rate decay is not essential for the validity of their +scaling law. As a secondary result, we derive scaling laws for the optimal +learning rate and batch size, finding that tuning the AdamW $\beta_2$ parameter +is essential at lower batch sizes. + +
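A compute-optimal scaling law of the kind compared here is typically a power law N_opt(C) = a * C^b fitted in log space to (compute, optimal model size) pairs. The snippet below illustrates only that fitting step, on synthetic numbers rather than the paper's data; the exponents quoted in the comment are the commonly cited Kaplan-style (~0.73) and Chinchilla-style (~0.5) values, not results from this work.

import numpy as np

# Synthetic (compute, compute-optimal model size) pairs; in practice these come
# from IsoFLOP-style sweeps. A power law N_opt = a * C^b is linear in log space.
C = np.array([1e18, 1e19, 1e20, 1e21, 1e22])
N_opt = np.array([1.1e8, 3.4e8, 1.2e9, 3.8e9, 1.3e10])

slope, intercept = np.polyfit(np.log(C), np.log(N_opt), deg=1)
a, b = np.exp(intercept), slope
print(f"N_opt(C) ~ {a:.3e} * C^{b:.3f}")
# Kaplan-style fits report b around 0.73 while Chinchilla-style fits report
# b around 0.5; the abstract attributes the gap to last-layer cost, warmup
# duration, and scale-dependent optimizer tuning.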
+
+ comment: Fixing bug in small models with tuned LR +
+
+
+
+
+ + ♻ ☆ A Non-Expert's Introduction to Data Ethics for Mathematicians + + +
+ I give a short introduction to data ethics. I begin with some background +information and societal context for data ethics. I then discuss data ethics in +mathematical-science education and indicate some available course material. I +briefly highlight a few efforts -- at my home institution and elsewhere -- on +data ethics, society, and social good. I then discuss open data in research, +research replicability and some other ethical issues in research, and the +tension between privacy and open data and code, and a few controversial studies +and reactions to studies. I then discuss ethical principles, institutional +review boards, and a few other considerations in the scientific use of human +data. I then briefly survey a variety of research and lay articles that are +relevant to data ethics and data privacy. I conclude with a brief summary and +some closing remarks. + My focal audience is mathematicians, but I hope that this chapter will also +be useful to others. I am not an expert about data ethics, and this chapter +provides only a starting point on this wide-ranging topic. I encourage you to +examine the resources that I discuss and to reflect carefully on data ethics, +its role in mathematics education, and the societal implications of data and +data analysis. As data and technology continue to evolve, I hope that such +careful reflection will continue throughout your life. + +
+
+ comment: A few more small tweaks. This is a book chapter. It is associated + with my data-ethics lecture at the 2021 AMS Short Course on Mathematical and + Computational Methods for Complex Social Systems +
+
+
+
+
+ + ♻ ☆ Equivariant Ensembles and Regularization for Reinforcement Learning in + Map-based Path Planning IROS 2024 + + +
+ In reinforcement learning (RL), exploiting environmental symmetries can +significantly enhance efficiency, robustness, and performance. However, +ensuring that the deep RL policy and value networks are respectively +equivariant and invariant to exploit these symmetries is a substantial +challenge. Related works try to design networks that are equivariant and +invariant by construction, limiting them to a very restricted library of +components, which in turn hampers the expressiveness of the networks. This +paper proposes a method to construct equivariant policies and invariant value +functions without specialized neural network components, which we term +equivariant ensembles. We further add a regularization term for adding +inductive bias during training. In a map-based path planning case study, we +show how equivariant ensembles and regularization benefit sample efficiency and +performance. + +
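The construction described above can be illustrated with ordinary, unconstrained networks plus group averaging: evaluate the policy on every transformed observation, map the outputs back through the inverse transform, and average; averaging the value over transformed observations gives invariance. The sketch below assumes a grid-map observation, the four 90-degree rotations as the symmetry group, and an [up, right, down, left] action order; the cyclic shift used to map actions back depends on these conventions, and the policy/value functions are random placeholders, not trained RL networks.

import numpy as np

def rotate_obs(obs, k):
    """Rotate a (C, H, W) grid observation by k * 90 degrees (CCW with row 0 on top)."""
    return np.rot90(obs, k=k, axes=(1, 2)).copy()

def rotate_action_back(action_logits, k):
    """Apply the inverse group action to logits over [up, right, down, left];
    under the assumed conventions this is a cyclic shift by k."""
    return np.roll(action_logits, k)

def equivariant_policy(policy_fn, obs):
    """Group-average: the result is equivariant for any base policy_fn."""
    logits = [rotate_action_back(policy_fn(rotate_obs(obs, k)), k) for k in range(4)]
    return np.mean(logits, axis=0)

def invariant_value(value_fn, obs):
    """Average of values over transformed observations: invariant by construction."""
    return np.mean([value_fn(rotate_obs(obs, k)) for k in range(4)])

# Placeholder networks, purely for illustration.
rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3 * 8 * 8))
policy_fn = lambda o: W @ o.ravel()
value_fn = lambda o: float(o.sum())

obs = rng.normal(size=(3, 8, 8))
print(equivariant_policy(policy_fn, obs), invariant_value(value_fn, obs))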
+
+ comment: Accepted at IROS 2024. A video can be found here: + https://youtu.be/L6NOdvU7n7s. The code is available at + https://github.com/theilem/uavSim +
+
+
+
+
+ + ♻ ☆ Expressivity and Generalization: Fragment-Biases for Molecular GNNs + + +
+ Although recent advances in higher-order Graph Neural Networks (GNNs) improve +the theoretical expressiveness and molecular property predictive performance, +they often fall short of the empirical performance of models that explicitly +use fragment information as inductive bias. However, for these approaches, +there exists no theoretical expressivity study. In this work, we propose the +Fragment-WL test, an extension to the well-known Weisfeiler & Leman (WL) test, +which enables the theoretical analysis of these fragment-biased GNNs. Building on +the insights gained from the Fragment-WL test, we develop a new GNN +architecture and a fragmentation with infinite vocabulary that significantly +boosts expressiveness. We show the effectiveness of our model on synthetic and +real-world data where we outperform all GNNs on Peptides and have 12% lower +error than all GNNs on ZINC and 34% lower error than other fragment-biased +models. Furthermore, we show that our model exhibits superior generalization +capabilities compared to the latest transformer-based architectures, +positioning it as a robust solution for a range of molecular modeling tasks. +
+
+
+
+
+ + ♻ ☆ Particle identification with machine learning from incomplete data in + the ALICE experiment + + +
+ The ALICE experiment at the LHC measures properties of the strongly +interacting matter formed in ultrarelativistic heavy-ion collisions. Such +studies require accurate particle identification (PID). ALICE provides PID +information via several detectors for particles with momentum from about 100 +MeV/c up to 20 GeV/c. Traditionally, particles are selected with rectangular +cuts. A much better performance can be achieved with machine learning (ML) +methods. Our solution uses multiple neural networks (NN) serving as binary +classifiers. Moreover, we extended our particle classifier with Feature Set +Embedding and attention in order to train on data with incomplete samples. We +also present the integration of the ML project with the ALICE analysis +software, and we discuss domain adaptation, the ML technique needed to transfer +the knowledge between simulated and real experimental data. + +
+
+ comment: Proceedings of 3rd Artificial Intelligence for the Electron-Ion + Collider workshop -- AI4EIC2023, 28.11-1.12.2023 +
+
+
+
+
+ + ♻ ☆ Category Adaptation Meets Projected Distillation in Generalized + Continual Category Discovery ECCV 2024 + + +
+ Generalized Continual Category Discovery (GCCD) tackles learning from +sequentially arriving, partially labeled datasets while uncovering new +categories. Traditional methods depend on feature distillation to prevent +forgetting the old knowledge. However, this strategy restricts the model's +ability to adapt and effectively distinguish new categories. To address this, +we introduce a novel technique integrating a learnable projector with feature +distillation, thus enhancing model adaptability without sacrificing past +knowledge. The resulting distribution shift of the previously learned +categories is mitigated with the auxiliary category adaptation network. We +demonstrate that while each component offers modest benefits individually, +their combination - dubbed CAMP (Category Adaptation Meets Projected +distillation) - significantly improves the balance between learning new +information and retaining old. CAMP exhibits superior performance across +several GCCD and Class Incremental Learning scenarios. The code is available at +https://github.com/grypesc/CAMP. + +
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ♻ ☆ The Larger the Better? Improved LLM Code-Generation via Budget + Reallocation + + +
+ It is a common belief that large language models (LLMs) are better than +smaller-sized ones. However, larger models also require significantly more time +and compute during inference. This raises the question: what happens when both +models operate under the same budget (e.g., compute, run-time)? To address +this question, we analyze code generation LLMs of various sizes and make +comparisons such as running a 70B model once vs. generating five outputs from a +13B model. We consider a standard unit-test setup, which can be used to select +the correct output from the smaller model. Our findings reveal that the +repeated use of smaller models can yield consistent improvements, with gains of +up to 15% across five tasks. On the other hand, in scenarios where unit-tests +are unavailable, a ranking-based selection of candidates from the smaller model +falls short of the performance of a single output from larger ones. Our results +highlight the potential of using smaller models instead of larger ones, and the +importance of studying approaches for ranking LLM outputs. +
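The budget-matched comparison described above amounts to drawing k samples from the smaller model and letting unit tests pick a winner. A minimal sketch follows; generate_candidates and passes_unit_tests are hypothetical stand-ins for a model API and a test harness, not functions from the paper.

from typing import Callable, List, Optional

def best_of_k(prompt: str,
              generate_candidates: Callable[[str, int], List[str]],
              passes_unit_tests: Callable[[str], bool],
              k: int = 5) -> Optional[str]:
    """Spend the budget on k samples from a smaller model and use unit tests
    (when available) to select a correct candidate, mirroring the
    '70B once vs. five 13B outputs' comparison."""
    candidates = generate_candidates(prompt, k)
    for code in candidates:
        if passes_unit_tests(code):        # oracle-style selection
            return code
    return None                            # no candidate passed; report failure / fall back

# Hypothetical usage (both callables are placeholders, not a real API):
# solution = best_of_k(problem_statement, small_model.sample, run_hidden_tests, k=5)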
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ A unified law of robustness for Bregman divergence losses + + +
+ In contemporary deep learning practice, models are often trained to near-zero +loss, i.e., to nearly interpolate the training data. However, the number of +parameters in the model is usually far more than the number of data points $n$, +the theoretical minimum needed for interpolation: a phenomenon referred to as +overparameterization. In an interesting piece of work that contributes to the +considerable research that has been devoted to understanding overparameterization, +Bubeck and Sellke showed that for a broad class of covariate distributions +(specifically those satisfying a natural notion of concentration of measure), +overparameterization is necessary for robust interpolation, i.e., if the +interpolating function is required to be Lipschitz. However, their robustness +results were proved only in the setting of regression with square loss. In +practice, however, many other kinds of losses are used, e.g. cross entropy loss +for classification. In this work, we generalize Bubeck and Sellke's result to +Bregman divergence losses, which form a common generalization of square loss +and cross-entropy loss. Our generalization relies on identifying a +bias-variance type decomposition that lies at the heart of the proof of Bubeck +and Sellke. +
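For readers who have not met Bregman divergences, the snippet below spells out the definition used here, D_phi(y, y') = phi(y) - phi(y') - <grad phi(y'), y - y'>, and checks numerically that it recovers the squared loss (phi equal to the squared norm) and the KL divergence underlying cross-entropy (phi equal to the negative entropy). It is an illustration of the definition, not part of the paper's proof.

import numpy as np

def bregman(phi, grad_phi, y, y_hat):
    """Bregman divergence D_phi(y, y_hat) = phi(y) - phi(y_hat) - <grad phi(y_hat), y - y_hat>."""
    return phi(y) - phi(y_hat) - np.dot(grad_phi(y_hat), y - y_hat)

# phi(x) = ||x||^2 recovers the squared loss.
sq = lambda x: np.dot(x, x)
sq_grad = lambda x: 2 * x
y, y_hat = np.array([1.0, 2.0]), np.array([0.5, 1.0])
assert np.isclose(bregman(sq, sq_grad, y, y_hat), np.sum((y - y_hat) ** 2))

# phi(p) = sum p log p (negative entropy) recovers the KL divergence, i.e., the
# excess of cross-entropy over entropy, on the probability simplex.
negent = lambda p: np.sum(p * np.log(p))
negent_grad = lambda p: np.log(p) + 1.0
p, q = np.array([0.7, 0.3]), np.array([0.4, 0.6])
kl = np.sum(p * np.log(p / q))
assert np.isclose(bregman(negent, negent_grad, p, q), kl)
print("both identities check out")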
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Physics-Enhanced Graph Neural Networks For Soft Sensing in Industrial + Internet of Things + + +
+ The Industrial Internet of Things (IIoT) is reshaping manufacturing, +industrial processes, and infrastructure management. By fostering new levels of +automation, efficiency, and predictive maintenance, IIoT is transforming +traditional industries into intelligent, seamlessly interconnected ecosystems. +However, achieving highly reliable IIoT can be hindered by factors such as the +cost of installing large numbers of sensors, limitations in retrofitting +existing systems with sensors, or harsh environmental conditions that may make +sensor installation impractical. Soft (virtual) sensing leverages mathematical +models to estimate variables from physical sensor data, offering a solution to +these challenges. Data-driven and physics-based modeling are the two main +methodologies widely used for soft sensing. The choice between these strategies +depends on the complexity of the underlying system, with the data-driven +approach often being preferred when the physics-based inference models are +intricate and present challenges for state estimation. However, conventional +deep learning models are typically hindered by their inability to explicitly +represent the complex interactions among various sensors. To address this +limitation, we adopt Graph Neural Networks (GNNs), renowned for their ability +to effectively capture the complex relationships between sensor measurements. +In this research, we propose physics-enhanced GNNs, which integrate principles +of physics into graph-based methodologies. This is achieved by augmenting the +input graph with additional nodes derived from the underlying characteristics +of the physical processes. Our evaluation of the proposed methodology on the +case study of district heating networks reveals significant improvements over +purely data-driven GNNs, even in the presence of noise and parameter +inaccuracies. +
+
+ comment: 14 pages, 10 figures. Accepted to IEEE Internet of Things Journal +
+
+
+
+
+ + ♻ ☆ Improving probabilistic forecasts of extreme wind speeds by training + statistical post-processing models with weighted scoring rules + + +
+ Accurate forecasts of extreme wind speeds are of high importance for many +applications. Such forecasts are usually generated by ensembles of numerical +weather prediction (NWP) models, which however can be biased and have errors in +dispersion, thus necessitating the application of statistical post-processing +techniques. In this work we aim to improve statistical post-processing models +for probabilistic predictions of extreme wind speeds. We do this by adjusting +the training procedure used to fit ensemble model output statistics (EMOS) +models - a commonly applied post-processing technique - and propose estimating +parameters using the so-called threshold-weighted continuous ranked probability +score (twCRPS), a proper scoring rule that places special emphasis on +predictions over a threshold. We show that training using the twCRPS leads to +improved extreme event performance of post-processing models for a variety of +thresholds. We find a distribution body-tail trade-off where improved +performance for probabilistic predictions of extreme events comes with worse +performance for predictions of the distribution body. However, we introduce +strategies to mitigate this trade-off based on weighted training and linear +pooling. Finally, we consider some synthetic experiments to explain the +training impact of the twCRPS and derive closed-form expressions of the twCRPS +for a number of distributions, giving the first such collection in the +literature. The results will enable researchers and practitioners alike to +improve the performance of probabilistic forecasting models for extremes and +other events of interest. + +
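A sample-based estimate of the twCRPS is straightforward once the weight is expressed through a chaining function; for the weight w(z) = 1{z >= t} that function is v(z) = max(z, t). The sketch below uses this standard kernel-score form on a synthetic wind-speed ensemble; it illustrates the score itself, not the paper's EMOS training pipeline.

import numpy as np

def tw_crps(ensemble, obs, threshold):
    """Sample-based threshold-weighted CRPS with weight w(z) = 1{z >= threshold},
    using the chaining function v(z) = max(z, threshold):
        twCRPS = E|v(X) - v(y)| - 0.5 * E|v(X) - v(X')|."""
    v = np.maximum(ensemble, threshold)
    vy = max(obs, threshold)
    term1 = np.mean(np.abs(v - vy))
    term2 = 0.5 * np.mean(np.abs(v[:, None] - v[None, :]))
    return term1 - term2

rng = np.random.default_rng(1)
ens = rng.gamma(shape=2.0, scale=4.0, size=50)    # synthetic wind-speed ensemble (m/s)
print("CRPS   :", tw_crps(ens, obs=18.0, threshold=0.0))    # w is 1 on the whole support
print("twCRPS :", tw_crps(ens, obs=18.0, threshold=15.0))   # emphasis on speeds >= 15 m/s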
+
+
+
+
+ + ♻ ☆ Detection of Correlated Random Vectors + + +
+ In this paper, we investigate the problem of deciding whether two standard +normal random vectors $\mathsf{X}\in\mathbb{R}^{n}$ and +$\mathsf{Y}\in\mathbb{R}^{n}$ are correlated or not. This is formulated as a +hypothesis testing problem, where under the null hypothesis, these vectors are +statistically independent, while under the alternative, $\mathsf{X}$ and a +randomly and uniformly permuted version of $\mathsf{Y}$, are correlated with +correlation $\rho$. We analyze the thresholds at which optimal testing is +information-theoretically impossible and possible, as a function of $n$ and +$\rho$. To derive our information-theoretic lower bounds, we develop a novel +technique for evaluating the second moment of the likelihood ratio using an +orthogonal polynomials expansion, which among other things, reveals a +surprising connection to integer partition functions. We also study a +multi-dimensional generalization of the above setting, where rather than two +vectors we observe two databases/matrices, and furthermore allow for partial +correlations between these two. + +
+
+ comment: 42 pages +
+
+
+
+
+ + ♻ ☆ Q-Pensieve: Boosting Sample Efficiency of Multi-Objective RL Through + Memory Sharing of Q-Snapshots + + +
+ Many real-world continuous control problems face the dilemma of weighing +pros and cons; multi-objective reinforcement learning (MORL) serves as a +generic framework of learning control policies for different preferences over +objectives. However, the existing MORL methods either rely on multiple passes +of explicit search for finding the Pareto front and therefore are not +sample-efficient, or utilize a shared policy network for coarse knowledge +sharing among policies. To boost the sample efficiency of MORL, we propose +Q-Pensieve, a policy improvement scheme that stores a collection of Q-snapshots +to jointly determine the policy update direction and thereby enables data +sharing at the policy level. We show that Q-Pensieve can be naturally +integrated with soft policy iteration with a convergence guarantee. To +substantiate this concept, we propose the technique of Q replay buffer, which +stores the learned Q-networks from the past iterations, and arrive at a +practical actor-critic implementation. Through extensive experiments and an +ablation study, we demonstrate that with much fewer samples, the proposed +algorithm can outperform the benchmark MORL methods on a variety of MORL +benchmark tasks. +
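At its core, the Q-snapshot idea can be pictured as a small buffer of frozen past Q-networks whose estimates are aggregated (here by an elementwise max) when forming the policy-improvement target. The sketch below shows only that mechanism; the soft policy iteration, multi-objective preference conditioning, and actor updates of the actual method are omitted, and the tiny Q-network is a placeholder.

import copy
from collections import deque
import torch
import torch.nn as nn

class QSnapshotBuffer:
    """Keep the last few Q-network snapshots and aggregate their estimates."""
    def __init__(self, maxlen=3):
        self.snapshots = deque(maxlen=maxlen)

    def push(self, q_net):
        self.snapshots.append(copy.deepcopy(q_net).eval())

    def max_q(self, state_action):
        with torch.no_grad():
            qs = torch.stack([q(state_action) for q in self.snapshots], dim=0)
        return qs.max(dim=0).values        # elementwise max over stored snapshots

q_net = nn.Sequential(nn.Linear(6, 64), nn.ReLU(), nn.Linear(64, 1))
buffer = QSnapshotBuffer(maxlen=3)
for _ in range(3):
    buffer.push(q_net)                      # in training, push a snapshot every few updates
    # ... gradient updates to q_net would happen here ...

sa = torch.randn(32, 6)                     # batch of (state, action) inputs
target_q = buffer.max_q(sa)                 # would feed the policy-improvement target
print(target_q.shape)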
+
+ comment: 20 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ The Platonic Representation Hypothesis + + +
+ We argue that representations in AI models, particularly deep networks, are +converging. First, we survey many examples of convergence in the literature: +over time and across multiple domains, the ways by which different neural +networks represent data are becoming more aligned. Next, we demonstrate +convergence across data modalities: as vision models and language models get +larger, they measure distance between datapoints in a more and more alike way. +We hypothesize that this convergence is driving toward a shared statistical +model of reality, akin to Plato's concept of an ideal reality. We term such a +representation the platonic representation and discuss several possible +selective pressures toward it. Finally, we discuss the implications of these +trends, their limitations, and counterexamples to our analysis. + +
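Convergence claims of this kind are made precise with representation-alignment metrics computed for two models on the same inputs. The paper's own measurements use neighborhood-based kernel alignment; the snippet below instead shows linear CKA, a common alternative alignment score, purely to illustrate how such a comparison is computed. The feature matrices are random stand-ins for real model activations.

import numpy as np

def linear_cka(X, Y):
    """Linear CKA between two feature matrices with one row per datapoint.
    Features are centered per dimension before comparison."""
    X = X - X.mean(axis=0, keepdims=True)
    Y = Y - Y.mean(axis=0, keepdims=True)
    num = np.linalg.norm(Y.T @ X, ord="fro") ** 2
    den = np.linalg.norm(X.T @ X, ord="fro") * np.linalg.norm(Y.T @ Y, ord="fro")
    return num / den

rng = np.random.default_rng(0)
shared = rng.normal(size=(500, 32))        # latent structure both "models" reflect
feats_a = shared @ rng.normal(size=(32, 128)) + 0.1 * rng.normal(size=(500, 128))
feats_b = shared @ rng.normal(size=(32, 64)) + 0.1 * rng.normal(size=(500, 64))
print("alignment (linear CKA):", round(linear_cka(feats_a, feats_b), 3))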
+
+ comment: Equal contributions. Project: https://phillipi.github.io/prh/ Code: + https://github.com/minyoungg/platonic-rep +
+
+
+
+
+ + ♻ ☆ Neural Fractional Differential Equations + + +
+ Fractional Differential Equations (FDEs) are essential tools for modelling +complex systems in science and engineering. They extend the traditional +concepts of differentiation and integration to non-integer orders, enabling a +more precise representation of processes characterised by non-local and +memory-dependent behaviours. + This property is useful in systems where variables do not respond to changes +instantaneously, but instead exhibit a strong memory of past interactions. + Having this in mind, and drawing inspiration from Neural Ordinary +Differential Equations (Neural ODEs), we propose the Neural FDE, a novel deep +neural network architecture that adjusts a FDE to the dynamics of data. + This work provides a comprehensive overview of the numerical method employed +in Neural FDEs and the Neural FDE architecture. The numerical outcomes suggest +that, despite being more computationally demanding, the Neural FDE may +outperform the Neural ODE in modelling systems with memory or dependencies on +past states, and it can effectively be applied to learn more intricate +dynamical systems. + +
+
+
+
+
+ + ♻ ☆ KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache ICML2024 + + +
+ Efficiently serving large language models (LLMs) requires batching of many +requests to reduce the cost per request. Yet, with larger batch sizes and +longer context lengths, the key-value (KV) cache, which stores attention keys +and values to avoid re-computations, significantly increases memory demands and +becomes the new bottleneck in speed and memory usage. Additionally, the loading +of the KV cache causes the computational core to be idle, which limits the +inference speed. A straightforward and effective solution to reduce KV cache +size is quantization, which decreases the total bytes taken by KV cache. +However, there is a lack of in-depth studies that explore the element +distribution of KV cache to understand the hardness and limitation of KV cache +quantization. To fill the gap, we conducted a comprehensive study on the +element distribution in KV cache of popular LLMs. Our findings indicate that +the key cache should be quantized per-channel, i.e., group elements along the +channel dimension and quantize them together. In contrast, the value cache +should be quantized per-token. From this analysis, we developed a tuning-free +2bit KV cache quantization algorithm named KIVI. With hardware-friendly +implementation, KIVI can enable Llama, Falcon, and Mistral models to maintain +almost the same quality while using $\mathbf{2.6\times}$ less peak memory +(including model weight). This reduction in memory usage enables up to +$\mathbf{4\times}$ larger batch size, bringing $\mathbf{2.35\times \sim +3.47\times}$ throughput on real LLM inference workload. The source code is +available at https://github.com/jy-yuan/KIVI. + +
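The per-channel versus per-token distinction can be made concrete with a simplified asymmetric quantizer: for the key cache each channel shares a scale and zero point computed across tokens, while for the value cache each token shares them across channels. The sketch below ignores group sizes, the residual full-precision window, and the bit packing of the actual KIVI implementation, and uses small random tensors in place of a real KV cache.

import torch

def asym_quantize(x, bits=2, dim=-1):
    """Asymmetric quantization with a shared scale and zero point along `dim`."""
    qmax = 2 ** bits - 1
    mn = x.amin(dim=dim, keepdim=True)
    mx = x.amax(dim=dim, keepdim=True)
    scale = (mx - mn).clamp(min=1e-8) / qmax
    q = ((x - mn) / scale).round().clamp(0, qmax)
    return q, scale, mn

def dequantize(q, scale, mn):
    return q * scale + mn

# Toy KV cache of shape (tokens, channels).
kv_keys = torch.randn(128, 64)
kv_vals = torch.randn(128, 64)

# Keys: per-channel quantization (constants shared within a channel, across tokens).
kq, ks, kz = asym_quantize(kv_keys, bits=2, dim=0)
# Values: per-token quantization (constants shared within a token, across channels).
vq, vs, vz = asym_quantize(kv_vals, bits=2, dim=1)

print("key error  :", (dequantize(kq, ks, kz) - kv_keys).abs().mean().item())
print("value error:", (dequantize(vq, vs, vz) - kv_vals).abs().mean().item())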
+
+ comment: ICML2024 +
+
+
+
+
+ + ♻ ☆ When AI Eats Itself: On the Caveats of Data Pollution in the Era of + Generative AI + + +
+ Generative artificial intelligence (AI) technologies and large models are +producing realistic outputs across various domains, such as images, text, +speech, and music. Creating these advanced generative models requires +significant resources, particularly large and high-quality datasets. To +minimize training expenses, many algorithm developers use data created by the +models themselves as a cost-effective training solution. However, not all +synthetic data effectively improve model performance, necessitating a strategic +balance in the use of real versus synthetic data to optimize outcomes. + Currently, the previously well-controlled integration of real and synthetic +data is becoming uncontrollable. The widespread and unregulated dissemination +of synthetic data online leads to the contamination of datasets traditionally +compiled through web scraping, now mixed with unlabeled synthetic data. This +trend portends a future where generative AI systems may increasingly rely +blindly on consuming self-generated data, raising concerns about model +performance and ethical issues. What will happen if generative AI continuously +consumes itself without discernment? What measures can we take to mitigate the +potential adverse effects? + There is a significant gap in the scientific literature regarding the impact +of synthetic data use in generative AI, particularly in terms of the fusion of +multimodal information. To address this research gap, this review investigates +the consequences of integrating synthetic data blindly on training generative +AI on both image and text modalities and explores strategies to mitigate these +effects. The goal is to offer a comprehensive view of synthetic data's role, +advocating for a balanced approach to its use and exploring practices that +promote the sustainable development of generative AI technologies in the era of +large models. + +
+
+
+
+
+ + ♻ ☆ Batchless Normalization: How to Normalize Activations Across Instances + with Minimal Memory Requirements + + +
+ In training neural networks, batch normalization has many benefits, not all +of them entirely understood. But it also has some drawbacks. Foremost is +arguably memory consumption, as computing the batch statistics requires all +instances within the batch to be processed simultaneously, whereas without +batch normalization it would be possible to process them one by one while +accumulating the weight gradients. Another drawback is that the distribution +parameters (mean and standard deviation) are unlike all other model parameters +in that they are not trained using gradient descent but require special +treatment, complicating implementation. In this paper, I show a simple and +straightforward way to address these issues. The idea, in short, is to add +terms to the loss that, for each activation, cause the minimization of the +negative log likelihood of a Gaussian distribution that is used to normalize +the activation. Among other benefits, this will hopefully contribute to the +democratization of AI research by means of lowering the hardware requirements +for training larger models. +
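A minimal reading of this proposal is a normalization layer with learnable per-feature mean and standard deviation, where the Gaussian negative log-likelihood of incoming activations is added to the task loss so those parameters are trained by ordinary gradient descent, one instance at a time. The sketch below follows that reading; the detach inside the penalty and the penalty weight are simplifications of this sketch, not necessarily the author's exact formulation.

import torch
import torch.nn as nn

class BatchlessNorm(nn.Module):
    """Normalize activations with learnable per-feature Gaussian parameters.
    The parameters are fit by adding the Gaussian negative log-likelihood of the
    incoming activations to the loss, so no batch statistics are required."""
    def __init__(self, num_features):
        super().__init__()
        self.mu = nn.Parameter(torch.zeros(num_features))
        self.log_sigma = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        sigma = self.log_sigma.exp()
        # Gaussian NLL of the activations (constant terms dropped); detaching x
        # keeps this term from pulling activations toward mu (a sketch choice).
        self.nll = (0.5 * ((x.detach() - self.mu) / sigma) ** 2 + self.log_sigma).mean()
        return (x - self.mu) / sigma

net = nn.Sequential(nn.Linear(10, 32), BatchlessNorm(32), nn.ReLU(), nn.Linear(32, 1))
x, y = torch.randn(4, 10), torch.randn(4, 1)
task_loss = nn.functional.mse_loss(net(x), y)
norm_loss = sum(m.nll for m in net.modules() if isinstance(m, BatchlessNorm))
(task_loss + 0.1 * norm_loss).backward()   # one loss; mu and sigma get ordinary gradients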
+
+ comment: 17 pages (12 without appendices), 12 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Knowledge boosting during low-latency inference + + +
+ Models for low-latency, streaming applications could benefit from the +knowledge capacity of larger models, but edge devices cannot run these models +due to resource constraints. A possible solution is to transfer hints during +inference from a large model running remotely to a small model running +on-device. However, this incurs a communication delay that breaks real-time +requirements and does not guarantee that both models will operate on the same +data at the same time. We propose knowledge boosting, a novel technique that +allows a large model to operate on time-delayed input during inference, while +still boosting small model performance. Using a streaming neural network that +processes 8 ms chunks, we evaluate different speech separation and enhancement +tasks with communication delays of up to six chunks or 48 ms. Our results show +larger gains where the performance gap between the small and large models is +wide, demonstrating a promising method for large-small model collaboration for +low-latency applications. Code, dataset, and audio samples available at +https://knowledgeboosting.cs.washington.edu/. + +
+
+ comment: Accepted by Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Unsupervised Outlier Detection using Random Subspace and Subsampling + Ensembles of Dirichlet Process Mixtures + + +
+ Probabilistic mixture models are recognized as effective tools for +unsupervised outlier detection owing to their interpretability and global +characteristics. Among these, Dirichlet process mixture models stand out as a +strong alternative to conventional finite mixture models for both clustering +and outlier detection tasks. Unlike finite mixture models, Dirichlet process +mixtures are infinite mixture models that automatically determine the number of +mixture components based on the data. Despite their advantages, the adoption of +Dirichlet process mixture models for unsupervised outlier detection has been +limited by challenges related to computational inefficiency and sensitivity to +outliers in the construction of outlier detectors. Additionally, Dirichlet +process Gaussian mixtures struggle to effectively model non-Gaussian data with +discrete or binary features. To address these challenges, we propose a novel +outlier detection method that utilizes ensembles of Dirichlet process Gaussian +mixtures. This unsupervised algorithm employs random subspace and subsampling +ensembles to ensure efficient computation and improve the robustness of the +outlier detector. The ensemble approach further improves the suitability of the +proposed method for detecting outliers in non-Gaussian data. Furthermore, our +method uses variational inference for Dirichlet process mixtures, which ensures +both efficient and rapid computation. Empirical analyses using benchmark +datasets demonstrate that our method outperforms existing approaches in +unsupervised outlier detection. + +
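A minimal version of such an ensemble can be assembled from scikit-learn's variational Dirichlet-process Gaussian mixture: fit several mixtures on random subsamples and random feature subspaces, and average the negative log-densities as the outlier score. The component cap, ensemble size, and sampling rates below are illustrative choices, not the paper's settings.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

def dpgmm_ensemble_scores(X, n_estimators=10, subsample=0.5, max_features=0.5, seed=0):
    """Outlier scores from an ensemble of Dirichlet-process Gaussian mixtures fit
    on random subsamples and random feature subspaces (higher = more outlying)."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    scores = np.zeros(n)
    for _ in range(n_estimators):
        feats = rng.choice(d, size=max(1, int(max_features * d)), replace=False)
        rows = rng.choice(n, size=max(2, int(subsample * n)), replace=False)
        dpgmm = BayesianGaussianMixture(
            n_components=10,                                   # upper bound; the DP prior prunes
            weight_concentration_prior_type="dirichlet_process",
            covariance_type="full", max_iter=200,
            random_state=int(rng.integers(1_000_000)),
        ).fit(X[np.ix_(rows, feats)])
        scores += -dpgmm.score_samples(X[:, feats])            # negative log-density
    return scores / n_estimators

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, size=(300, 6)),
               rng.normal(6, 1, size=(5, 6))])                 # 5 planted outliers
scores = dpgmm_ensemble_scores(X)
print("top-5 suspected outliers:", np.argsort(scores)[-5:])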
+
+
+
+
+ + ♻ ☆ COIN: Counterfactual inpainting for weakly supervised semantic + segmentation for medical images + + +
+ Deep learning is dramatically transforming the field of medical imaging and +radiology, enabling the identification of pathologies in medical images, +including computed tomography (CT) and X-ray scans. However, the performance of +deep learning models, particularly in segmentation tasks, is often limited by +the need for extensive annotated datasets. To address this challenge, the +capabilities of weakly supervised semantic segmentation are explored through +the lens of Explainable AI and the generation of counterfactual explanations. +The scope of this research is the development of a novel counterfactual inpainting +approach (COIN) that flips the predicted classification label from abnormal to +normal by using a generative model. For instance, if the classifier deems an +input medical image X as abnormal, indicating the presence of a pathology, the +generative model aims to inpaint the abnormal region, thus reversing the +classifier's original prediction label. The approach enables us to produce +precise segmentations for pathologies without depending on pre-existing +segmentation masks. Crucially, image-level labels are utilized, which are +substantially easier to acquire than creating detailed segmentation masks. The +effectiveness of the method is demonstrated by segmenting synthetic targets and +actual kidney tumors from CT images acquired from Tartu University Hospital in +Estonia. The findings indicate that COIN greatly surpasses established +attribution methods, such as RISE, ScoreCAM, and LayerCAM, as well as an +alternative counterfactual explanation method introduced by Singla et al. This +evidence suggests that COIN is a promising approach for semantic segmentation +of tumors in CT images, and presents a step forward in making deep learning +applications more accessible and effective in healthcare, where annotated data +is scarce. +
+
+ comment: This work has been accepted to be presented to The 2nd World + Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19, + 2024 - Valletta, Malta +
+
+
+
+
+ + ♻ ☆ ServerlessLLM: Low-Latency Serverless Inference for Large Language + Models + + +
+ This paper presents ServerlessLLM, a distributed system designed to support +low-latency serverless inference for Large Language Models (LLMs). By +harnessing the substantial near-GPU storage and memory capacities of inference +servers, ServerlessLLM achieves effective local checkpoint storage, minimizing +the need for remote checkpoint downloads and ensuring efficient checkpoint +loading. The design of ServerlessLLM features three core contributions: (i) +\emph{fast multi-tier checkpoint loading}, featuring a new loading-optimized +checkpoint format and a multi-tier loading system, fully utilizing the +bandwidth of complex storage hierarchies on GPU servers; (ii) \emph{efficient +live migration of LLM inference}, which enables newly initiated inferences to +capitalize on local checkpoint storage while ensuring minimal user +interruption; and (iii) \emph{startup-time-optimized model scheduling}, which +assesses the locality statuses of checkpoints on each server and schedules the +model onto servers that minimize the time to start the inference. Comprehensive +evaluations, including microbenchmarks and real-world scenarios, demonstrate +that ServerlessLLM dramatically outperforms state-of-the-art serverless +systems, reducing latency by 10 - 200X across various LLM inference workloads. + +
+
+ comment: 18th USENIX Symposium on Operating Systems Design and Implementation +
+
+
+
+
+ + ♻ ☆ Nyström Kernel Stein Discrepancy + + +
+ Kernel methods underpin many of the most successful approaches in data +science and statistics, and they allow representing probability measures as +elements of a reproducing kernel Hilbert space without loss of information. +Recently, the kernel Stein discrepancy (KSD), which combines Stein's method +with kernel techniques, gained considerable attention. Through the Stein +operator, KSD allows the construction of powerful goodness-of-fit tests where +it is sufficient to know the target distribution up to a multiplicative +constant. However, the typical U- and V-statistic-based KSD estimators suffer +from a quadratic runtime complexity, which hinders their application in +large-scale settings. In this work, we propose a Nystr\"om-based KSD +acceleration -- with runtime $\mathcal O\!\left(mn+m^3\right)$ for $n$ samples +and $m\ll n$ Nystr\"om points -- , show its $\sqrt{n}$-consistency under the +null with a classical sub-Gaussian assumption, and demonstrate its +applicability for goodness-of-fit testing on a suite of benchmarks. + +
+
+ comment: Update proof of Lemma B.3, milder Assumption 1, more experiments +
+
+
+
+
+ + ♻ ☆ Enhancing Counterfactual Explanation Search with Diffusion Distance and + Directional Coherence + + +
+ A pressing issue in the adoption of AI models is the increasing demand for +more human-centric explanations of their predictions. To advance towards more +human-centric explanations, understanding how humans produce and select +explanations has been beneficial. In this work, inspired by insights of human +cognition we propose and test the incorporation of two novel biases to enhance +the search for effective counterfactual explanations. Central to our +methodology is the application of diffusion distance, which emphasizes data +connectivity and actionability in the search for feasible counterfactual +explanations. In particular, diffusion distance effectively weights more those +points that are more interconnected by numerous short-length paths. This +approach brings closely connected points nearer to each other, identifying a +feasible path between them. We also introduce a directional coherence term that +allows the expression of a preference for the alignment between the joint and +marginal directional changes in feature space to reach a counterfactual. This +term enables the generation of counterfactual explanations that align with a +set of marginal predictions based on expectations of how the outcome of the +model varies by changing one feature at a time. We evaluate our method, named +Coherent Directional Counterfactual Explainer (CoDiCE), and the impact of the +two novel biases against existing methods such as DiCE, FACE, Prototypes, and +Growing Spheres. Through a series of ablation experiments on both synthetic and +real datasets with continuous and mixed-type features, we demonstrate the +effectiveness of our method. + +
+
+ comment: This work has been accepted to be presented to The 2nd World + Conference on eXplainable Artificial Intelligence (xAI 2024), July 17-19, + 2024 - Valletta, Malta +
+
+
+
+
+ + ♻ ☆ Imperative Learning: A Self-supervised Neural-Symbolic Learning + Framework for Robot Autonomy + + +
+ Data-driven methods such as reinforcement and imitation learning have +achieved remarkable success in robot autonomy. However, their data-centric +nature still hinders them from generalizing well to ever-changing environments. +Moreover, collecting large datasets for robotic tasks is often impractical and +expensive. To overcome these challenges, we introduce a new self-supervised +neural-symbolic (NeSy) computational framework, imperative learning (IL), for +robot autonomy, leveraging the generalization abilities of symbolic reasoning. +The framework of IL consists of three primary components: a neural module, a +reasoning engine, and a memory system. We formulate IL as a special bilevel +optimization (BLO), which enables reciprocal learning over the three modules. +This overcomes the label-intensive obstacles associated with data-driven +approaches and takes advantage of symbolic reasoning concerning logical +reasoning, physical principles, geometric analysis, etc. We discuss several +optimization techniques for IL and verify their effectiveness in five distinct +robot autonomy tasks including path planning, rule induction, optimal control, +visual odometry, and multi-robot routing. Through various experiments, we show +that IL can significantly enhance robot autonomy capabilities and we anticipate +that it will catalyze further research across diverse domains. + +
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream +tasks across diverse fields, such as language, vision, and multi-modality. To +minimize the adaption cost for downstream tasks, many Parameter-Efficient +Fine-Tuning (PEFT) techniques are proposed for language and 2D image +pre-trained models. However, the specialized PEFT method for 3D pre-trained +models is still under-explored. To this end, we introduce Point-PEFT, a novel +framework for adapting point cloud pre-trained models with minimal learnable +parameters. Specifically, for a pre-trained 3D model, we freeze most of its +parameters, and only tune the newly added PEFT modules on downstream tasks, +which consist of a Point-prior Prompt and a Geometry-aware Adapter. The +Point-prior Prompt adopts a set of learnable prompt tokens, for which we +propose to construct a memory bank with domain-specific knowledge, and utilize +a parameter-free attention to enhance the prompt tokens. The Geometry-aware +Adapter aims to aggregate point cloud features within spatial neighborhoods to +capture fine-grained geometric information through local interactions. +Extensive experiments indicate that our Point-PEFT can achieve better +performance than the full fine-tuning on various downstream tasks, while using +only 5% of the trainable parameters, demonstrating the efficiency and +effectiveness of our approach. Code is released at +https://github.com/Ivan-Tang-3D/Point-PEFT. + +
+
+ comment: The specialized PEFT framework for 3D pre-trained models, which + achieves competitive performance to full fine-tuning, and significantly + reduces the computational resources. Project page: + https://github.com/Ivan-Tang-3D/Point-PEFT +
+
+
+
+
+ + ♻ ☆ Behavioral Testing: Can Large Language Models Implicitly Resolve + Ambiguous Entities? + + +
+ One of the major aspects contributing to the striking performance of large +language models (LLMs) is the vast amount of factual knowledge accumulated +during pre-training. Yet, many LLMs suffer from self-inconsistency, which +raises doubts about their trustworthiness and reliability. In this paper, we +focus on entity type ambiguity and analyze current state-of-the-art LLMs for +their proficiency and consistency in applying their factual knowledge when +prompted for entities under ambiguity. To do so, we propose an evaluation +protocol that disentangles knowing from applying knowledge, and test +state-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform +poorly with ambiguous prompts, achieving only 80% accuracy. Our results further +demonstrate systematic discrepancies in LLM behavior and their failure to +consistently apply information, indicating that the models can exhibit +knowledge without being able to utilize it, significant biases for preferred +readings, as well as self-inconsistencies. Our study highlights the importance +of handling entity ambiguity in the future for more trustworthy LLMs. +
+
+
+
+
+ + ♻ ☆ Node-like as a Whole: Structure-aware Searching and Coarsening for Graph + Classification + + +
+ Graph Transformers (GTs) have made remarkable achievements in graph-level +tasks. However, most existing works regard graph structures as a form of +guidance or bias for enhancing node representations, which focuses on +node-central perspectives and lacks explicit representations of edges and +structures. One natural question is, can we treat graph structures node-like as +a whole to learn high-level features? Through experimental analysis, we explore +the feasibility of this assumption. Based on our findings, we propose a novel +multi-view graph representation learning model via structure-aware searching +and coarsening (GRLsc) on GT architecture for graph classification. +Specifically, we build three unique views, original, coarsening, and +conversion, to learn a thorough structural representation. We compress loops +and cliques via hierarchical heuristic graph coarsening and restrict them with +well-designed constraints, which builds the coarsening view to learn high-level +interactions between structures. We also introduce line graphs for edge +embeddings and switch to edge-central perspective to construct the conversion +view. Experiments on eight real-world datasets demonstrate the improvements of +GRLsc over 28 baselines from various architectures. + +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Cross-View Contrastive Pre-training for Check-in + Sequence Representation Learning + + +
+ The rapid growth of location-based services (LBS) has yielded massive amounts +of data on human mobility. Effectively extracting meaningful representations +for user-generated check-in sequences is pivotal for facilitating various +downstream services. However, the user-generated check-in data are +simultaneously influenced by the surrounding objective circumstances and the +user's subjective intention. Specifically, the temporal uncertainty and spatial +diversity exhibited in check-in data make it difficult to capture the +macroscopic spatial-temporal patterns of users and to understand the semantics +of user mobility activities. Furthermore, the distinct characteristics of the +temporal and spatial information in check-in sequences call for an effective +fusion method to incorporate these two types of information. In this paper, we +propose a novel Spatial-Temporal Cross-view Contrastive Representation (STCCR) +framework for check-in sequence representation learning. Specifically, STCCR +addresses the above challenges by employing self-supervision from "spatial +topic" and "temporal intention" views, facilitating effective fusion of spatial +and temporal information at the semantic level. Besides, STCCR leverages +contrastive clustering to uncover users' shared spatial topics from diverse +mobility activities, while employing angular momentum contrast to mitigate the +impact of temporal uncertainty and noise. We extensively evaluate STCCR on +three real-world datasets and demonstrate its superior performance across three +downstream tasks. + +
+
+ comment: This paper has been accepted as a regular paper at IEEE TKDE +
+
+
+
+
+ + ♻ ☆ CCVA-FL: Cross-Client Variations Adaptive Federated Learning for Medical + Imaging + + +
+ Federated Learning (FL) offers a privacy-preserving approach to train models +on decentralized data. Its potential in healthcare is significant, but +challenges arise due to cross-client variations in medical image data, +exacerbated by limited annotations. This paper introduces Cross-Client +Variations Adaptive Federated Learning (CCVA-FL) to address these issues. +CCVA-FL aims to minimize cross-client variations by transforming images into a +common feature space. It involves expert annotation of a subset of images from +each client, followed by the selection of a client with the least data +complexity as the target. Synthetic medical images are then generated using +Scalable Diffusion Models with Transformers (DiT) based on the target client's +annotated images. These synthetic images, capturing diversity and representing +the original data, are shared with other clients. Each client then translates +its local images into the target image space using image-to-image translation. +The translated images are subsequently used in a federated learning setting to +develop a server model. Our results demonstrate that CCVA-FL outperforms +Vanilla Federated Averaging by effectively addressing data distribution +differences across clients without compromising privacy. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Individual Privacy Accounting for Differentially Private Stochastic + Gradient Descent + + +
+ Differentially private stochastic gradient descent (DP-SGD) is the workhorse +algorithm for recent advances in private deep learning. It provides a single +privacy guarantee to all datapoints in the dataset. We propose output-specific +$(\varepsilon,\delta)$-DP to characterize privacy guarantees for individual +examples when releasing models trained by DP-SGD. We also design an efficient +algorithm to investigate individual privacy across a number of datasets. We +find that most examples enjoy stronger privacy guarantees than the worst-case +bound. We further discover that the training loss and the privacy parameter of +an example are well-correlated. This implies groups that are underserved in +terms of model utility simultaneously experience weaker privacy guarantees. For +example, on CIFAR-10, the average $\varepsilon$ of the class with the lowest +test accuracy is 44.2\% higher than that of the class with the highest +accuracy. + +
+
+ comment: Add clarification about the applicability of Definition 4 +
+
+
+
+
+ + ♻ ☆ HyperMoE: Towards Better Mixture of Experts via Transferring Among + Experts + + +
+ The Mixture of Experts (MoE) for language models has been proven effective in +augmenting the capacity of models by dynamically routing each input token to a +specific subset of experts for processing. Despite the success, most existing +methods face a challenge for balance between sparsity and the availability of +expert knowledge: enhancing performance through increased use of expert +knowledge often results in diminishing sparsity during expert selection. To +mitigate this contradiction, we propose HyperMoE, a novel MoE framework built +upon Hypernetworks. This framework integrates the computational processes of +MoE with the concept of knowledge transferring in multi-task learning. Specific +modules generated based on the information of unselected experts serve as +supplementary information, which allows the knowledge of experts not selected +to be used while maintaining selection sparsity. Our comprehensive empirical +evaluations across multiple datasets and backbones establish that HyperMoE +significantly outperforms existing MoE methods under identical conditions +concerning the number of experts. + +
+
+
+
+
+ + ♻ ☆ Statistical Batch-Based Bearing Fault Detection + + +
+ In the domain of rotating machinery, bearings are vulnerable to different +mechanical faults, including ball, inner, and outer race faults. Various +techniques can be used in condition-based monitoring, from classical signal +analysis to deep learning methods. Based on the complex working conditions of +rotary machines, multivariate statistical process control charts such as +Hotelling's $T^2$ and Squared Prediction Error are useful for providing early +warnings. However, these methods are rarely applied to condition monitoring of +rotating machinery due to the univariate nature of the datasets. In the present +paper, we propose a multivariate statistical process control-based fault +detection method that utilizes multivariate data composed of Fourier transform +features extracted for fixed-time batches. Our approach makes use of the +multidimensional nature of Fourier transform characteristics, which record more +detailed information about the machine's status, in an effort to enhance early +defect detection and diagnosis. Experiments with varying vibration measurement +locations (Fan End, Drive End), fault types (ball, inner, and outer race +faults), and motor loads (0-3 horsepower) are used to validate the suggested +approach. The outcomes illustrate our method's effectiveness in fault detection +and point to possible broader uses in industrial maintenance. + +
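The pipeline the abstract describes can be sketched in a few lines: split the vibration signal into fixed-time batches, extract coarse FFT-magnitude features per batch, fit the in-control mean and covariance on healthy batches, and monitor Hotelling's T^2 against a control limit. The synthetic signal, the binning scheme, and the empirical 99% limit below are stand-ins for the real bearing data and the chart design used in the paper.

import numpy as np

def fft_features(signal, batch_size=1024, n_bins=16):
    """Split a signal into fixed-size batches and return per-batch FFT magnitude features."""
    n_batches = len(signal) // batch_size
    batches = signal[: n_batches * batch_size].reshape(n_batches, batch_size)
    mags = np.abs(np.fft.rfft(batches, axis=1))
    # Aggregate the spectrum into a few coarse bins per batch.
    return np.array([m[: (m.size // n_bins) * n_bins].reshape(n_bins, -1).mean(axis=1) for m in mags])

def hotelling_t2(features, mean, cov_inv):
    diff = features - mean
    return np.einsum("ij,jk,ik->i", diff, cov_inv, diff)

rng = np.random.default_rng(0)
healthy = rng.normal(size=200_000)                              # baseline vibration signal
faulty = healthy[:50_000] + 0.8 * np.sin(2 * np.pi * 0.12 * np.arange(50_000))  # added fault tone

F_healthy = fft_features(healthy)
mean, cov = F_healthy.mean(axis=0), np.cov(F_healthy, rowvar=False)
cov_inv = np.linalg.pinv(cov)

t2_healthy = hotelling_t2(F_healthy, mean, cov_inv)
limit = np.quantile(t2_healthy, 0.99)                           # empirical control limit
t2_faulty = hotelling_t2(fft_features(faulty), mean, cov_inv)
print(f"alarms on faulty batches: {(t2_faulty > limit).mean():.0%}")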
+
+
+
+
+ + ♻ ☆ High Significant Fault Detection in Azure Core Workload Insights AAAI 2024 + + +
+ Azure Core workload insights have time-series data with different metric +units. Faults or anomalies are observed in these time-series data owing to +faults observed with respect to metric name, resources region, dimensions, and +the dimension value associated with the data. For Azure Core, an important task +is to highlight faults or anomalies to the user on a dashboard that they can +perceive easily. The reported anomalies should be highly significant +and limited in number, e.g., 5-20 anomalies per hour. The reported +anomalies will have significant user perception and high reconstruction error +in any time-series forecasting model. Hence, our task is to automatically +identify 'high significant anomalies' and their associated information for user +perception. +
+
+ comment: Published in IAAI 2024, which is the Industrial track of AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Variational Inference with Coverage Guarantees in Simulation-Based + Inference + + +
+ Amortized variational inference is an often employed framework in +simulation-based inference that produces a posterior approximation that can be +rapidly computed given any new observation. Unfortunately, there are few +guarantees about the quality of these approximate posteriors. We propose +Conformalized Amortized Neural Variational Inference (CANVI), a procedure that +is scalable, easily implemented, and provides guaranteed marginal coverage. +Given a collection of candidate amortized posterior approximators, CANVI +constructs conformalized predictors based on each candidate, compares the +predictors using a metric known as predictive efficiency, and returns the most +efficient predictor. CANVI ensures that the resulting predictor constructs +regions that contain the truth with a user-specified level of probability. +CANVI is agnostic to design decisions in formulating the candidate +approximators and only requires access to samples from the forward model, +permitting its use in likelihood-free settings. We prove lower bounds on the +predictive efficiency of the regions produced by CANVI and explore how the +quality of a posterior approximation relates to the predictive efficiency of +prediction regions based on that approximation. Finally, we demonstrate the +accurate calibration and high predictive efficiency of CANVI on a suite of +simulation-based inference benchmark tasks and an important scientific task: +analyzing galaxy emission spectra. + +
+
+
+
+
+ + ♻ ☆ Goodness-of-Fit and Clustering of Spherical Data: the QuadratiK package + in R and Python + + +
+ We introduce the QuadratiK package that incorporates innovative data analysis +methodologies. The presented software, implemented in both R and Python, offers +a comprehensive set of goodness-of-fit tests and clustering techniques using +kernel-based quadratic distances, thereby bridging the gap between the +statistical and machine learning literatures. Our software implements one, two +and k-sample tests for goodness of fit, providing an efficient and +mathematically sound way to assess the fit of probability distributions. +Expanded capabilities of our software include supporting tests for uniformity +on the d-dimensional Sphere based on Poisson kernel densities. Particularly +noteworthy is the incorporation of a unique clustering algorithm specifically +tailored for spherical data that leverages a mixture of Poisson kernel-based +densities on the sphere. Alongside this, our software includes additional +graphical functions, aiding the users in validating, as well as visualizing and +representing clustering results. This enhances interpretability and usability +of the analysis. In summary, our R and Python packages serve as a powerful +suite of tools, offering researchers and practitioners the means to delve +deeper into their data, draw robust inference, and conduct potentially +impactful analyses and inference across a wide array of disciplines. + +
+
+ comment: 36 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 + Language Variants ACL 2024 + + +
+ We present Belebele, a multiple-choice machine reading comprehension (MRC) +dataset spanning 122 language variants. Significantly expanding the language +coverage of natural language understanding (NLU) benchmarks, this dataset +enables the evaluation of text models in high-, medium-, and low-resource +languages. Each question is based on a short passage from the Flores-200 +dataset and has four multiple-choice answers. The questions were carefully +curated to discriminate between models with different levels of general +language comprehension. The English dataset on its own proves difficult enough +to challenge state-of-the-art language models. Being fully parallel, this +dataset enables direct comparison of model performance across all languages. We +use this dataset to evaluate the capabilities of multilingual masked language +models (MLMs) and large language models (LLMs). We present extensive results +and find that despite significant cross-lingual transfer in English-centric +LLMs, much smaller MLMs pretrained on balanced multilingual data still +understand far more languages. We also observe that larger vocabulary size and +conscious vocabulary construction correlate with better performance on +low-resource languages. Overall, Belebele opens up new avenues for evaluating +and analyzing the multilingual capabilities of NLP systems. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ SES: Bridging the Gap Between Explainability and Prediction of Graph + Neural Networks ICDE 2024 + + +
+ Despite the Graph Neural Networks' (GNNs) proficiency in analyzing graph +data, achieving high-accuracy and interpretable predictions remains +challenging. Existing GNN interpreters typically provide post-hoc explanations +disjointed from GNNs' predictions, resulting in misrepresentations. +Self-explainable GNNs offer built-in explanations during the training process. +However, they cannot exploit the explanatory outcomes to augment prediction +performance, and they fail to provide high-quality explanations of node +features and require additional processes to generate explainable subgraphs, +which is costly. To address the aforementioned limitations, we propose a +self-explained and self-supervised graph neural network (SES) to bridge the gap +between explainability and prediction. SES comprises two processes: explainable +training and enhanced predictive learning. During explainable training, SES +employs a global mask generator co-trained with a graph encoder and directly +produces crucial structure and feature masks, reducing time consumption and +providing node feature and subgraph explanations. In the enhanced predictive +learning phase, mask-based positive-negative pairs are constructed utilizing +the explanations to compute a triplet loss and enhance the node representations +by contrastive learning. + +
+
+ comment: Accepted as a conference paper at ICDE 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Combinatorial Optimization via Heat Diffusion + + +
+ Combinatorial optimization problems are widespread but inherently challenging +due to their discrete nature. The primary limitation of existing methods is +that they can only access a small fraction of the solution space at each +iteration, resulting in limited efficiency in searching for the global optimum. To +overcome this challenge, diverging from conventional efforts of expanding the +solver's search scope, we focus on enabling information to actively propagate +to the solver through heat diffusion. By transforming the target function while +preserving its optima, heat diffusion facilitates information flow from distant +regions to the solver, providing more efficient navigation. Utilizing heat +diffusion, we propose a framework for solving general combinatorial +optimization problems. The proposed methodology demonstrates superior +performance across a range of the most challenging and widely encountered +combinatorial optimizations. Echoing recent advancements in harnessing +thermodynamics for generative artificial intelligence, our study further +reveals its significant potential in advancing combinatorial optimization. +
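One way to picture information "propagating to the solver" is a smoothed surrogate of the objective whose smoothing scale is annealed, so that distant structure is visible early and finer detail re-emerges later. The sketch below applies this idea to a toy max-cut instance using Gaussian smoothing and a score-function gradient estimator; it is a generic continuation-style stand-in for intuition, not the paper's heat-diffusion operator or its solver.

import numpy as np

rng = np.random.default_rng(0)
n = 30
W = rng.random((n, n)) < 0.2
W = np.triu(W, 1).astype(float)
W = W + W.T                                                   # random undirected graph

def cut_value(s):                                             # s in {-1, +1}^n
    return 0.25 * np.sum(W * (1 - np.outer(s, s)))

# Smoothed objective F_sigma(x) = E_eps[ cut_value(sign(x + sigma * eps)) ],
# with the gradient estimated by the standard Gaussian-smoothing identity.
x = np.zeros(n)
for step in range(300):
    sigma = np.interp(step, [0, 300], [2.0, 0.2])             # anneal the smoothing scale
    eps = rng.normal(size=(64, n))
    vals = []
    for e in eps:
        spins = np.where(x + sigma * e >= 0, 1.0, -1.0)
        vals.append(cut_value(spins))
    vals = np.array(vals)
    grad = ((vals - vals.mean())[:, None] * eps).mean(axis=0) / sigma
    x += 0.5 * grad                                           # ascent on the smoothed objective

s = np.where(x >= 0, 1.0, -1.0)
print("cut value found:", cut_value(s), "of", W.sum() / 2, "edges")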
+
+ comment: Code is available in https://github.com/AwakerMhy/HeO +
+
+
+
+
+ + ♻ ☆ A Survey on Hypergraph Neural Networks: An In-Depth and Step-By-Step + Guide KDD 2024 + + +
+ Higher-order interactions (HOIs) are ubiquitous in real-world complex systems +and applications. Investigation of deep learning for HOIs, thus, has become a +valuable agenda for the data mining and machine learning communities. As +networks of HOIs are expressed mathematically as hypergraphs, hypergraph neural +networks (HNNs) have emerged as a powerful tool for representation learning on +hypergraphs. Given the emerging trend, we present the first survey dedicated to +HNNs, with an in-depth and step-by-step guide. Broadly, the present survey +overviews HNN architectures, training strategies, and applications. First, we +break existing HNNs down into four design components: (i) input features, (ii) +input structures, (iii) message-passing schemes, and (iv) training strategies. +Second, we examine how HNNs address and learn HOIs with each of their +components. Third, we overview the recent applications of HNNs in +recommendation, bioinformatics and medical science, time series analysis, and +computer vision. Lastly, we conclude with a discussion on limitations and +future directions. + +
+
+ comment: To appear in KDD 2024 (survey paper). The typo in Equation (5) has + been fixed +
+
+
+
+
+ + ♻ ☆ Towards the Law of Capacity Gap in Distilling Language Models + + +
+ Language model (LM) distillation is a trending area that aims to distil the +knowledge residing in a large teacher LM to a small student one. While various +methods have been proposed to maximize the effectiveness of the distillation, +significant challenges persist, particularly when there is a substantial +capacity gap between the teacher and student LMs. This issue, often referred to +as the \textit{curse} of capacity gap, suggests that a larger teacher does not +necessarily result in a superior student compared to one distilled from a +smaller teacher. In other words, there is likely an optimal teacher yielding +the best student along the scaling course of the teacher. However, the curse of +capacity gap cannot be tackled without notable compute overhead, as indicated +in previous studies. In the context of large LMs (LLMs), previously viable +approaches become much less meaningful, as it is an impossible triangle to +distill an expected student from an optimal teacher with small compute +overhead. Fortunately, the impossible triangle can become possible +provided an inducted \textit{law} of capacity gap. In this paper, we take the +spirit of scaling laws and reveal that the optimal teacher scale almost +consistently follows a linear scaling with the student scale across different +model architectures and data scales. The law later guides us to distil a 3B +student LM (termed \textsc{MiniMA}) from LLaMA2-7B. \textsc{MiniMA} is +demonstrated to outperform a wide range of 3B competitors and could even +compete with several 7B models. +
+
+ comment: 32 pages, 10 figures, 15 tables, work in progress. Code and + checkpoints are available at https://github.com/GeneZC/MiniMA +
+
+
+
+
+ + ♻ ☆ A Priori Uncertainty Quantification of Reacting Turbulence Closure + Models using Bayesian Neural Networks + + +
+ While many physics-based closure model forms have been posited for the +sub-filter scale (SFS) in large eddy simulation (LES), vast amounts of data +available from direct numerical simulation (DNS) create opportunities to +leverage data-driven modeling techniques. Albeit flexible, data-driven models +still depend on the dataset and the functional form of the model chosen. +Increased adoption of such models requires reliable uncertainty estimates both +in the data-informed and out-of-distribution regimes. In this work, we employ +Bayesian neural networks (BNNs) to capture both epistemic and aleatoric +uncertainties in a reacting flow model. In particular, we model the filtered +progress variable scalar dissipation rate which plays a key role in the +dynamics of turbulent premixed flames. We demonstrate that BNN models can +provide unique insights about the structure of uncertainty of the data-driven +closure models. We also propose a method for the incorporation of +out-of-distribution information in a BNN. The efficacy of the model is +demonstrated by a priori evaluation on a dataset consisting of a variety of +flame conditions and fuels. + +
+
+
+
+
+ + ♻ ☆ Robust experimental data assimilation for the Spalart-Allmaras + turbulence model + + +
+ This study presents a methodology focusing on the use of computational model +and experimental data fusion to improve the Spalart-Allmaras (SA) closure model +for Reynolds-averaged Navier-Stokes solutions. In particular, our goal is to +develop a technique that not only assimilates sparse experimental data to +improve turbulence model performance, but also preserves generalization for +unseen cases by recovering classical SA behavior. We achieve our goals using +data assimilation, namely the Ensemble Kalman filtering approach (EnKF), to +calibrate the coefficients of the SA model for separated flows. A holistic +calibration strategy is implemented via the parameterization of the production, +diffusion, and destruction terms. This calibration relies on the assimilation +of experimental data collected in the form of velocity profiles, skin friction, +and pressure coefficients. Despite using observational data from a single flow +condition around a backward-facing step (BFS), the recalibrated SA model +demonstrates generalization to other separated flows, including cases such as +the 2D NASA wall mounted hump (2D-WMH) and modified BFS. Significant +improvement is observed in the quantities of interest, i.e., skin friction +coefficient ($C_f$) and pressure coefficient ($C_p$) for each flow tested. +Finally, it is also demonstrated that the newly proposed model recovers SA +proficiency for flows, such as a NACA-0012 airfoil and axisymmetric jet (ASJ), +and that the individually calibrated terms in the SA model target specific +flow-physics wherein the calibrated production term improves the re-circulation +zone while destruction improves the recovery zone. + +
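The assimilation step in this kind of calibration is typically an ensemble Kalman analysis over the model coefficients. A generic stochastic EnKF parameter update is sketched below; it is not the authors' implementation, and the observation operator, ensemble size, and noise level are placeholder assumptions that would come from the actual CFD setup.

```python
import numpy as np

def enkf_parameter_update(params_ens, predicted_obs_ens, y_obs, obs_noise_std):
    """One stochastic EnKF analysis step for parameter calibration.

    params_ens:        (N, p) ensemble of model coefficients (e.g. SA constants)
    predicted_obs_ens: (N, m) model outputs at observation locations per member
    y_obs:             (m,)   experimental observations (velocity, C_f, C_p, ...)
    obs_noise_std:     float  assumed observation-noise level (placeholder)
    """
    N = params_ens.shape[0]
    X = params_ens - params_ens.mean(axis=0)                   # parameter anomalies
    Y = predicted_obs_ens - predicted_obs_ens.mean(axis=0)     # output anomalies

    C_xy = X.T @ Y / (N - 1)                                   # cross-covariance (p, m)
    C_yy = Y.T @ Y / (N - 1)                                   # output covariance (m, m)
    R = obs_noise_std**2 * np.eye(y_obs.size)                  # observation-error covariance

    K = C_xy @ np.linalg.solve(C_yy + R, np.eye(y_obs.size))   # Kalman gain (p, m)

    # Perturb observations per member (stochastic EnKF) and update the parameters.
    perturbed = y_obs + obs_noise_std * np.random.randn(N, y_obs.size)
    innovations = perturbed - predicted_obs_ens
    return params_ens + innovations @ K.T

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    ens = rng.normal([0.14, 0.62], 0.02, size=(40, 2))         # toy coefficient ensemble
    pred = ens @ rng.normal(size=(2, 6)) + rng.normal(0, 0.01, (40, 6))
    y = rng.normal(0, 1, 6)                                    # toy "experimental" data
    print(enkf_parameter_update(ens, pred, y, obs_noise_std=0.05).mean(axis=0))
```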
+
+
+
+
+ + ♻ ☆ JailbreakZoo: Survey, Landscapes, and Horizons in Jailbreaking Large + Language and Vision-Language Models + + +
+ The rapid evolution of artificial intelligence (AI) through developments in +Large Language Models (LLMs) and Vision-Language Models (VLMs) has brought +significant advancements across various technological domains. While these +models enhance capabilities in natural language processing and visual +interactive tasks, their growing adoption raises critical concerns regarding +security and ethical alignment. This survey provides an extensive review of the +emerging field of jailbreaking--deliberately circumventing the ethical and +operational boundaries of LLMs and VLMs--and the consequent development of +defense mechanisms. Our study categorizes jailbreaks into seven distinct types +and elaborates on defense strategies that address these vulnerabilities. +Through this comprehensive examination, we identify research gaps and propose +directions for future studies to enhance the security frameworks of LLMs and +VLMs. Our findings underscore the necessity for a unified perspective that +integrates both jailbreak strategies and defensive solutions to foster a +robust, secure, and reliable environment for the next generation of language +models. More details can be found on our website: +\url{https://chonghan-chen.com/llm-jailbreak-zoo-survey/}. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ♻ ☆ Targeted stochastic gradient Markov chain Monte Carlo for hidden Markov + models with rare latent states + + +
+ Markov chain Monte Carlo (MCMC) algorithms for hidden Markov models often +rely on the forward-backward sampler. This makes them computationally slow as +the length of the time series increases, motivating the development of +sub-sampling-based approaches. These approximate the full posterior by using +small random subsequences of the data at each MCMC iteration within stochastic +gradient MCMC. In the presence of imbalanced data resulting from rare latent +states, subsequences often exclude rare latent state data, leading to +inaccurate inference and prediction/detection of rare events. We propose a +targeted sub-sampling (TASS) approach that over-samples observations +corresponding to rare latent states when calculating the stochastic gradient of +parameters associated with them. TASS uses an initial clustering of the data to +construct subsequence weights that reduce the variance in gradient estimation. +This leads to improved sampling efficiency, in particular in settings where the +rare latent states correspond to extreme observations. We demonstrate +substantial gains in predictive and inferential accuracy on real and synthetic +examples. + +
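To make the targeted sub-sampling idea concrete, the sketch below draws weighted subsequences so that windows containing rare (extreme) observations are over-sampled, and attaches an importance weight so the stochastic gradient estimate stays unbiased. The z-score flagging used here in place of the paper's initial clustering, the window length, and the weight values are simplifying assumptions.

```python
import numpy as np

def subsequence_weights(obs, rare_threshold=3.0, L=50):
    """Weight each subsequence start; up-weight windows with 'extreme' points.

    The |z-score| rule is a crude stand-in for the paper's clustering step used
    to identify observations likely generated by rare latent states.
    """
    z = (obs - obs.mean()) / obs.std()
    rare = np.abs(z) > rare_threshold
    starts = np.arange(len(obs) - L)
    w = np.array([1.0 + 10.0 * rare[s:s + L].any() for s in starts])
    return starts, w / w.sum(), L

def sample_minibatch(obs, rng):
    """Draw one weighted subsequence for a stochastic-gradient MCMC update."""
    starts, probs, L = subsequence_weights(obs)
    s = rng.choice(starts, p=probs)
    # Importance weight (uniform prob / sampling prob) keeps the stochastic
    # gradient unbiased despite the targeted over-sampling.
    iw = (1.0 / len(starts)) / probs[np.where(starts == s)[0][0]]
    return obs[s:s + L], iw

rng = np.random.default_rng(0)
y = np.concatenate([rng.normal(0, 1, 5000), rng.normal(8, 1, 20), rng.normal(0, 1, 5000)])
batch, iw = sample_minibatch(y, rng)
print("subsequence length:", len(batch), "importance weight:", round(iw, 3))
```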
+
+
+
+
+ + ♻ ☆ Bayesian Modelling in Practice: Using Uncertainty to Improve + Trustworthiness in Medical Applications ICML2019 + + +
+ The Intensive Care Unit (ICU) is a hospital department where machine learning +has the potential to provide valuable assistance in clinical decision making. +Classical machine learning models usually only provide point-estimates and no +uncertainty of predictions. In practice, uncertain predictions should be +presented to doctors with extra care in order to prevent potentially +catastrophic treatment decisions. In this work we show how Bayesian modelling +and the predictive uncertainty that it provides can be used to mitigate risk of +misguided prediction and to detect out-of-domain examples in a medical setting. +We derive analytically a bound on the prediction loss with respect to +predictive uncertainty. The bound shows that uncertainty can mitigate loss. +Furthermore, we apply a Bayesian Neural Network to the MIMIC-III dataset, +predicting risk of mortality of ICU patients. Our empirical results show that +uncertainty can indeed prevent potential errors and reliably identifies +out-of-domain patients. These results suggest that Bayesian predictive +uncertainty can greatly improve trustworthiness of machine learning models in +high-risk settings such as the ICU. + +
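As a rough illustration of how predictive uncertainty can gate clinical predictions, the sketch below uses Monte Carlo dropout as a cheap stand-in for the paper's Bayesian neural network and defers to a clinician when the predictive standard deviation is high. The architecture, threshold, and features are placeholders, not the setup used on MIMIC-III.

```python
import torch
import torch.nn as nn

class MortalityNet(nn.Module):
    """Small classifier with dropout kept active at prediction time (MC dropout),
    used here as a cheap stand-in for a full Bayesian neural network."""
    def __init__(self, n_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        return torch.sigmoid(self.net(x))

@torch.no_grad()
def predict_with_uncertainty(model, x, n_samples=50):
    model.train()  # keep dropout stochastic to approximate posterior sampling
    probs = torch.stack([model(x) for _ in range(n_samples)])  # (S, N, 1)
    return probs.mean(0).squeeze(-1), probs.std(0).squeeze(-1)

# Usage sketch: defer to a clinician when the model is uncertain.
model = MortalityNet(n_features=32)
x = torch.randn(5, 32)                      # placeholder patient features
risk, uncertainty = predict_with_uncertainty(model, x)
for r, u in zip(risk, uncertainty):
    flag = "refer to clinician" if u > 0.15 else "auto-report"
    print(f"mortality risk {r:.2f} ± {u:.2f} -> {flag}")
```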
+
+ comment: Presented at AISG @ ICML2019: + https://aiforsocialgood.github.io/icml2019/index.htm +
+
+
+
+
+ + ♻ ☆ Decentralized Blockchain-based Robust Multi-agent Multi-armed Bandit + + +
+ We study a robust, i.e. in presence of malicious participants, multi-agent +multi-armed bandit problem where multiple participants are distributed on a +fully decentralized blockchain, with the possibility of some being malicious. +The rewards of arms are homogeneous among the honest participants, following +time-invariant stochastic distributions, which are revealed to the participants +only when certain conditions are met to ensure that the coordination mechanism +is secure enough. The coordination mechanism's objective is to efficiently +ensure the cumulative rewards gained by the honest participants are maximized. +To this end, we are the first to incorporate advanced techniques from +blockchains, as well as novel mechanisms, into such a cooperative decision +making framework to design optimal strategies for honest participants. This +framework allows various malicious behaviors and the maintenance of security +and participant privacy. More specifically, we select a pool of validators who +communicate to all participants, design a new consensus mechanism based on +digital signatures for these validators, invent a UCB-based strategy that +requires less information from participants through secure multi-party +computation, and design the chain-participant interaction and an incentive +mechanism to encourage participants' participation. Notably, we are the first +to prove the theoretical regret of the proposed algorithm and claim its +optimality. Unlike existing work that integrates blockchains with learning +problems such as federated learning which mainly focuses on optimality via +computational experiments, we demonstrate that the regret of honest +participants is upper bounded by $\log{T}$ under certain assumptions. The +regret bound is consistent with the multi-agent multi-armed bandit problem, +both without malicious participants and with purely Byzantine attacks which do +not affect the entire system. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ♻ ☆ ExcelFormer: A neural network surpassing GBDTs on tabular data + + +
+ Data organized in tabular format is ubiquitous in real-world applications, +and users often craft tables with biased feature definitions and flexibly set +prediction targets of their interests. Thus, a rapid development of a robust, +effective, dataset-versatile, user-friendly tabular prediction approach is +highly desired. While Gradient Boosting Decision Trees (GBDTs) and existing +deep neural networks (DNNs) have been extensively utilized by professional +users, they present several challenges for casual users, particularly: (i) the +dilemma of model selection due to their different dataset preferences, and (ii) +the need for heavy hyperparameter searching, failing which their performances +are deemed inadequate. In this paper, we delve into this question: Can we +develop a deep learning model that serves as a "sure bet" solution for a wide +range of tabular prediction tasks, while also being user-friendly for casual +users? We delve into three key drawbacks of deep tabular models, encompassing: +(P1) lack of rotational variance property, (P2) large data demand, and (P3) +over-smooth solution. We propose ExcelFormer, addressing these challenges +through a semi-permeable attention module that effectively constrains the +influence of less informative features to break the DNNs' rotational invariance +property (for P1), data augmentation approaches tailored for tabular data (for +P2), and attentive feedforward network to boost the model fitting capability +(for P3). These designs collectively make ExcelFormer a "sure bet" solution for +diverse tabular datasets. Extensive and stratified experiments conducted on +real-world datasets demonstrate that our model outperforms previous approaches +across diverse tabular data prediction tasks, and this framework can be +friendly to casual users, offering ease of use without the heavy hyperparameter +tuning. + +
+
+
+
+
+ + ♻ ☆ MELTing point: Mobile Evaluation of Language Transformers + + +
+ Transformers have revolutionized the machine learning landscape, gradually
+making their way into everyday tasks and equipping our computers with "sparks
+of intelligence". However, their runtime requirements have prevented them from
+being broadly deployed on mobile. As personal devices become increasingly
+powerful and prompt privacy becomes an ever more pressing issue, we explore the
+current state of mobile execution of Large Language Models (LLMs). To achieve
+this, we have created our own automation infrastructure, MELT, which supports
+the headless execution and benchmarking of LLMs on device, supporting different
+models, devices and frameworks, including Android, iOS and Nvidia Jetson
+devices. We evaluate popular instruction fine-tuned LLMs and leverage different
+frameworks to measure their end-to-end and granular performance, tracing their
+memory and energy requirements along the way. Our analysis is the first
+systematic study of on-device LLM execution, quantifying performance, energy
+efficiency and accuracy across various state-of-the-art models and showcasing
+the state of on-device intelligence in the era of hyperscale models. Results
+highlight the performance heterogeneity across targets and corroborate that
+LLM inference is largely memory-bound. Quantization drastically reduces memory
+requirements and renders execution viable, but at a non-negligible accuracy
+cost. Drawing from its energy footprint and thermal behavior, the continuous
+execution of LLMs remains elusive, as both factors negatively affect user
+experience. Lastly, our experience shows that the ecosystem is still in its
+infancy, and algorithmic as well as hardware breakthroughs can significantly
+shift the execution cost. We expect NPU acceleration and framework-hardware
+co-design to be the biggest bet towards efficient standalone execution, with
+the alternative of offloading tailored towards edge deployments.
+
+
+ comment: Accepted at the 30th Annual International Conference On Mobile + Computing And Networking (MobiCom 2024) +
+
+
+
+
+ + ♻ ☆ Symmetries in Overparametrized Neural Networks: A Mean-Field View + + +
+ We develop a Mean-Field (MF) view of the learning dynamics of +overparametrized Artificial Neural Networks (NN) under data symmetric in law +wrt the action of a general compact group $G$. We consider for this a class of +generalized shallow NNs given by an ensemble of $N$ multi-layer units, jointly +trained using stochastic gradient descent (SGD) and possibly +symmetry-leveraging (SL) techniques, such as Data Augmentation (DA), Feature +Averaging (FA) or Equivariant Architectures (EA). We introduce the notions of +weakly and strongly invariant laws (WI and SI) on the parameter space of each +single unit, corresponding, respectively, to $G$-invariant distributions, and +to distributions supported on parameters fixed by the group action (which +encode EA). This allows us to define symmetric models compatible with taking +$N\to\infty$ and give an interpretation of the asymptotic dynamics of DA, FA +and EA in terms of Wasserstein Gradient Flows describing their MF limits. When +activations respect the group action, we show that, for symmetric data, DA, FA +and freely-trained models obey the exact same MF dynamic, which stays in the +space of WI laws and minimizes therein the population risk. We also give a +counterexample to the general attainability of an optimum over SI laws. Despite +this, quite remarkably, we show that the set of SI laws is also preserved by +the MF dynamics even when freely trained. This sharply contrasts the finite-$N$ +setting, in which EAs are generally not preserved by unconstrained SGD. We +illustrate the validity of our findings as $N$ gets larger in a teacher-student +experimental setting, training a student NN to learn from a WI, SI or arbitrary +teacher model through various SL schemes. We last deduce a data-driven +heuristic to discover the largest subspace of parameters supporting SI +distributions for a problem, that could be used for designing EA with minimal +generalization error. + +
+
+
+
+
+ + ♻ ☆ A Review of Large Language Models and Autonomous Agents in Chemistry + + +
+ Large language models (LLMs) have emerged as powerful tools in chemistry,
+significantly impacting molecule design, property prediction, and synthesis
+optimization. This review highlights LLM capabilities in these domains and
+their potential to accelerate scientific discovery through automation. We also
+review LLM-based autonomous agents: LLMs with a broader set of tools to
+interact with their surrounding environment. These agents perform diverse tasks
+such as paper scraping, interfacing with automated laboratories, and synthesis
+planning. As agents are an emerging topic, we extend the scope of our review of
+agents beyond chemistry and discuss them across other scientific domains. This
+review covers the recent history, current capabilities, and design of LLMs and
+autonomous agents, addressing specific challenges, opportunities, and future
+directions in chemistry. Key challenges include data quality and integration,
+model interpretability, and the need for standard benchmarks, while future
+directions point towards more sophisticated multi-modal agents and enhanced
+collaboration between agents and experimental methods. Due to the quick pace of
+this field, a repository has been built to keep track of the latest studies:
+https://github.com/ur-whitelab/LLMs-in-science.
+
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ ReCorD: Reasoning and Correcting Diffusion for HOI Generation ACM MM 2024 + + +
+ Diffusion models revolutionize image generation by leveraging natural
+language to guide the creation of multimedia content. Despite significant
+advancements in such generative models, challenges persist in depicting
+detailed human-object interactions, especially regarding pose and object
+placement accuracy. We introduce a training-free method named Reasoning and
+Correcting Diffusion (ReCorD) to address these challenges. Our model couples
+Latent Diffusion Models with Visual Language Models to refine the generation
+process, ensuring precise depictions of HOIs. We propose an interaction-aware
+reasoning module to improve the interpretation of the interaction, along with
+an interaction correcting module that delicately refines the output image for
+more precise HOI generation. Through a meticulous process of pose selection and
+object positioning, ReCorD achieves superior fidelity in generated images while
+efficiently reducing computational requirements. We conduct comprehensive
+experiments on three benchmarks to demonstrate the significant progress in
+solving text-to-image generation tasks, showcasing ReCorD's ability to render
+complex interactions accurately by outperforming existing methods in HOI
+classification score, as well as FID and Verb CLIP-Score. Project website is
+available at https://alberthkyhky.github.io/ReCorD/ .
+
+
+ comment: Accepted by ACM MM 2024. Project website: + https://alberthkyhky.github.io/ReCorD/ +
+
+
+
+
+ + ☆ Shapley Value-based Contrastive Alignment for Multimodal Information + Extraction + + +
+ The rise of social media and the exponential growth of multimodal +communication necessitates advanced techniques for Multimodal Information +Extraction (MIE). However, existing methodologies primarily rely on direct +Image-Text interactions, a paradigm that often faces significant challenges due +to semantic and modality gaps between images and text. In this paper, we +introduce a new paradigm of Image-Context-Text interaction, where large +multimodal models (LMMs) are utilized to generate descriptive textual context +to bridge these gaps. In line with this paradigm, we propose a novel Shapley +Value-based Contrastive Alignment (Shap-CA) method, which aligns both +context-text and context-image pairs. Shap-CA initially applies the Shapley +value concept from cooperative game theory to assess the individual +contribution of each element in the set of contexts, texts and images towards +total semantic and modality overlaps. Following this quantitative evaluation, a +contrastive learning strategy is employed to enhance the interactive +contribution within context-text/image pairs, while minimizing the influence +across these pairs. Furthermore, we design an adaptive fusion module for +selective cross-modal fusion. Extensive experiments across four MIE datasets +demonstrate that our method significantly outperforms existing state-of-the-art +methods. + +
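The Shapley-value scoring of individual elements described here can be approximated with a standard Monte Carlo permutation estimator. A generic sketch follows; the overlap_score function is a placeholder for the paper's semantic/modality overlap measure, not its actual scoring model.

```python
import random

def overlap_score(subset):
    """Placeholder value function: in the paper this would measure the total
    semantic/modality overlap achieved by a subset of contexts, texts, images."""
    return sum(len(x) for x in subset) ** 0.5  # toy stand-in

def shapley_values(elements, value_fn, n_permutations=200, seed=0):
    """Monte Carlo estimate of each element's Shapley contribution."""
    rng = random.Random(seed)
    contrib = {e: 0.0 for e in elements}
    for _ in range(n_permutations):
        order = elements[:]
        rng.shuffle(order)
        prefix, prev = [], value_fn([])
        for e in order:
            prefix.append(e)
            cur = value_fn(prefix)
            contrib[e] += cur - prev   # marginal gain of adding e
            prev = cur
    return {e: v / n_permutations for e, v in contrib.items()}

items = ["context_caption", "original_text", "image_patch"]
print(shapley_values(items, overlap_score))
```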
+
+ comment: Accepted at ACM Multimedia 2024 +
+
+
+
+
+ + ☆ Exploring Bengali Religious Dialect Biases in Large Language Models with + Evaluation Perspectives + + +
+ While Large Language Models (LLMs) have created a massive technological impact
+in the past decade, allowing for human-enabled applications, they can produce
+output that contains stereotypes and biases, especially when using low-resource
+languages. This can be of great ethical concern when dealing with sensitive
+topics such as religion. As a means toward making LLMs more fair, we explore
+bias from a religious perspective in Bengali, focusing specifically on two main
+religious dialects: Hindu and Muslim-majority dialects. Here, we perform
+different experiments and audits, presenting a comparative analysis of different
+sentences using three commonly used LLMs: ChatGPT, Gemini, and Microsoft
+Copilot, pertaining to the Hindu and Muslim dialects of specific words and
+showcasing which ones catch the social biases and which do not. Furthermore, we
+analyze our findings and relate them to potential reasons and evaluation
+perspectives, considering their global impact with over 300 million speakers
+worldwide. With this work, we hope to establish the rigor for creating more
+fairness in LLMs, as these are widely used as creative writing agents.
+
+
+ comment: 10 Pages, 4 Figures. Accepted to the 1st Human-centered Evaluation + and Auditing of Language Models Workshop at CHI 2024 (Workshop website: + https://heal-workshop.github.io/#:~:text=Exploring%20Bengali%20Religious%20Dialect%20Biases%20in%20Large%20Language%20Models%20with%20Evaluation%20Perspectives) +
+
+
+
+
+ + ♻ ☆ Continual Panoptic Perception: Towards Multi-modal Incremental + Interpretation of Remote Sensing Images + + +
+ Continual learning (CL) breaks off the one-way training manner and enables a
+model to adapt to new data, semantics and tasks continuously. However, current
+CL methods mainly focus on single tasks. Besides, CL models are plagued by
+catastrophic forgetting and semantic drift owing to the lack of old data, which
+often occurs in remote-sensing interpretation due to the intricate fine-grained
+semantics. In this paper, we propose Continual Panoptic Perception (CPP), a
+unified continual learning model that leverages multi-task joint learning
+covering pixel-level classification, instance-level segmentation and
+image-level perception for universal interpretation in remote sensing images.
+Concretely, we propose a collaborative cross-modal encoder (CCE) to extract the
+input image features, which supports pixel classification and caption
+generation synchronously. To inherit the knowledge from the old model without
+exemplar memory, we propose a task-interactive knowledge distillation (TKD)
+method, which leverages cross-modal optimization and task-asymmetric
+pseudo-labeling (TPL) to alleviate catastrophic forgetting. Furthermore, we
+also propose a joint optimization mechanism to achieve end-to-end multi-modal
+panoptic perception. Experimental results on the fine-grained panoptic
+perception dataset validate the effectiveness of the proposed model, and also
+prove that joint optimization can boost sub-task CL efficiency with over 13\%
+relative improvement on panoptic quality.
+
+
+ comment: Accepted in ACMMM 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Environmental Monitoring through Multispectral Imaging: The + WasteMS Dataset for Semantic Segmentation of Lakeside Waste + + +
+ Environmental monitoring of lakeside green areas is crucial for environmental +protection. Compared to manual inspections, computer vision technologies offer +a more efficient solution when deployed on-site. Multispectral imaging provides +diverse information about objects under different spectrums, aiding in the +differentiation between waste and lakeside lawn environments. This study +introduces WasteMS, the first multispectral dataset established for the +semantic segmentation of lakeside waste. WasteMS includes a diverse range of +waste types in lawn environments, captured under various lighting conditions. +We implemented a rigorous annotation process to label waste in images. +Representative semantic segmentation frameworks were used to evaluate +segmentation accuracy using WasteMS. Challenges encountered when using WasteMS +for segmenting waste on lakeside lawns were discussed. The WasteMS dataset is +available at https://github.com/zhuqinfeng1999/WasteMS. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 81 + +
+
+
+ + ☆ I Could've Asked That: Reformulating Unanswerable Questions + + +
+ When seeking information from unfamiliar documents, users frequently pose +questions that cannot be answered by the documents. While existing large +language models (LLMs) identify these unanswerable questions, they do not +assist users in reformulating their questions, thereby reducing their overall +utility. We curate CouldAsk, an evaluation benchmark composed of existing and +new datasets for document-grounded question answering, specifically designed to +study reformulating unanswerable questions. We evaluate state-of-the-art +open-source and proprietary LLMs on CouldAsk. The results demonstrate the +limited capabilities of these models in reformulating questions. Specifically, +GPT-4 and Llama2-7B successfully reformulate questions only 26% and 12% of the +time, respectively. Error analysis shows that 62% of the unsuccessful +reformulations stem from the models merely rephrasing the questions or even +generating identical questions. We publicly release the benchmark and the code +to reproduce the experiments. + +
+
+
+
+
+ + ☆ WildHallucinations: Evaluating Long-form Factuality in LLMs with + Real-World Entity Queries + + +
+ While hallucinations of large language models (LLMs) prevail as a major +challenge, existing evaluation benchmarks on factuality do not cover the +diverse domains of knowledge that the real-world users of LLMs seek information +about. To bridge this gap, we introduce WildHallucinations, a benchmark that +evaluates factuality. It does so by prompting LLMs to generate information +about entities mined from user-chatbot conversations in the wild. These +generations are then automatically fact-checked against a systematically +curated knowledge source collected from web search. Notably, half of these +real-world entities do not have associated Wikipedia pages. We evaluate 118,785 +generations from 15 LLMs on 7,919 entities. We find that LLMs consistently +hallucinate more on entities without Wikipedia pages and exhibit varying +hallucination rates across different domains. Finally, given the same base +models, adding a retrieval component only slightly reduces hallucinations but +does not eliminate hallucinations. + +
+
+
+
+
+ + ☆ CMR Scaling Law: Predicting Critical Mixture Ratios for Continual + Pre-training of Language Models + + +
+ Large Language Models (LLMs) excel in diverse tasks but often underperform in +specialized fields due to limited domain-specific or proprietary corpus. +Continual pre-training (CPT) enhances LLM capabilities by imbuing new +domain-specific or proprietary knowledge while replaying general corpus to +prevent catastrophic forgetting. The data mixture ratio of general corpus and +domain-specific corpus, however, has been chosen heuristically, leading to +sub-optimal training efficiency in practice. In this context, we attempt to +re-visit the scaling behavior of LLMs under the hood of CPT, and discover a +power-law relationship between loss, mixture ratio, and training tokens scale. +We formalize the trade-off between general and domain-specific capabilities, +leading to a well-defined Critical Mixture Ratio (CMR) of general and domain +data. By striking the balance, CMR maintains the model's general ability and +achieves the desired domain transfer, ensuring the highest utilization of +available resources. Therefore, if we value the balance between efficiency and +effectiveness, CMR can be consider as the optimal mixture ratio.Through +extensive experiments, we ascertain the predictability of CMR, and propose CMR +scaling law and have substantiated its generalization. These findings offer +practical guidelines for optimizing LLM training in specialized domains, +ensuring both general and domain-specific performance while efficiently +managing training resources. + +
+
+
+
+
+ + ☆ Fluent Student-Teacher Redteaming + + +
+ Many publicly available language models have been safety tuned to reduce the +likelihood of toxic or liability-inducing text. Users or security analysts +attempt to jailbreak or redteam these models with adversarial prompts which +cause compliance with requests. One attack method is to apply discrete +optimization techniques to the prompt. However, the resulting attack strings +are often gibberish text, easily filtered by defenders due to high measured +perplexity, and may fail for unseen tasks and/or well-tuned models. In this +work, we improve existing algorithms (primarily GCG and BEAST) to develop +powerful and fluent attacks on safety-tuned models like Llama-2 and Phi-3. Our +technique centers around a new distillation-based approach that encourages the +victim model to emulate a toxified finetune, either in terms of output +probabilities or internal activations. To encourage human-fluent attacks, we +add a multi-model perplexity penalty and a repetition penalty to the objective. +We also enhance optimizer strength by allowing token insertions, token swaps, +and token deletions and by using longer attack sequences. The resulting process +is able to reliably jailbreak the most difficult target models with prompts +that appear similar to human-written prompts. On Advbench we achieve attack +success rates $>93$% for Llama-2-7B, Llama-3-8B, and Vicuna-7B, while +maintaining model-measured perplexity $<33$; we achieve $95$% attack success +for Phi-3, though with higher perplexity. We also find a universally-optimized +single fluent prompt that induces $>88$% compliance on previously unseen tasks +across Llama-2-7B, Phi-3-mini and Vicuna-7B and transfers to other black-box +models. + +
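A very rough sketch of the kind of combined objective described here — a distillation term toward a toxified finetune plus fluency and repetition penalties on the prompt — is given below. It is not the authors' released attack code; the weights, tensor shapes, and the way the reference-LM log-probabilities are obtained are all assumptions.

```python
import torch
import torch.nn.functional as F

def attack_objective(victim_logits, toxic_teacher_logits, prompt_logprobs, prompt_ids,
                     w_distill=1.0, w_fluency=0.1, w_repeat=0.1):
    """Combined objective sketch for optimizing an adversarial prompt.

    victim_logits:        (T, V) victim-model logits on the target continuation
    toxic_teacher_logits: (T, V) logits of a toxified finetune on the same tokens
    prompt_logprobs:      (P,)   log-probs a reference LM assigns to prompt tokens
    prompt_ids:           (P,)   current adversarial prompt token ids
    """
    # 1) Distillation term: make the victim imitate the toxified teacher.
    distill = F.kl_div(F.log_softmax(victim_logits, -1),
                       F.softmax(toxic_teacher_logits, -1),
                       reduction="batchmean")
    # 2) Fluency term: low perplexity of the prompt under the reference LM.
    fluency = -prompt_logprobs.mean()
    # 3) Repetition penalty: discourage repeated tokens in the prompt.
    _, counts = prompt_ids.unique(return_counts=True)
    repetition = (counts.float() - 1.0).clamp(min=0).sum() / prompt_ids.numel()
    return w_distill * distill + w_fluency * fluency + w_repeat * repetition

# Dummy usage with random tensors, just to show the call signature.
loss = attack_objective(torch.randn(8, 100), torch.randn(8, 100),
                        torch.randn(12).abs().neg(), torch.randint(0, 100, (12,)))
print(float(loss))
```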
+
+
+
+
+ + ☆ Dependency Transformer Grammars: Integrating Dependency Structures into + Transformer Language Models + + +
+ Syntactic Transformer language models aim to achieve better generalization +through simultaneously modeling syntax trees and sentences. While prior work +has been focusing on adding constituency-based structures to Transformers, we +introduce Dependency Transformer Grammars (DTGs), a new class of Transformer +language model with explicit dependency-based inductive bias. DTGs simulate +dependency transition systems with constrained attention patterns by modifying +attention masks, incorporate the stack information through relative positional +encoding, and augment dependency arc representation with a combination of token +embeddings and operation embeddings. When trained on a dataset of sentences +annotated with dependency trees, DTGs achieve better generalization while +maintaining comparable perplexity with Transformer language model baselines. +DTGs also outperform recent constituency-based models, showing that dependency +can better guide Transformer language models. Our code is released at +https://github.com/zhaoyd1/Dep_Transformer_Grammars. + +
+
+
+
+
+ + ☆ CovScore: Evaluation of Multi-Document Abstractive Title Set Generation + + +
+ This paper introduces CovScore, an automatic reference-less methodology for
+evaluating thematic title sets, extracted from a corpus of documents. While
+such extraction methods are widely used, evaluating their effectiveness remains
+an open question. Moreover, some existing practices heavily rely on slow and
+laborious human annotation procedures. Inspired by recently introduced
+LLM-based judge methods, we propose a novel methodology that decomposes quality
+into five main metrics along different aspects of evaluation. This framing
+simplifies and expedites the manual evaluation process and enables automatic
+and independent LLM-based evaluation. As a test case, we apply our approach to
+a corpus of Holocaust survivor testimonies, motivated both by its relevance to
+title set extraction and by the moral significance of this pursuit. We validate
+the methodology by experimenting with naturalistic and synthetic title set
+generation systems and comparing their performance under the proposed metrics.
+
+
+
+
+
+ + ☆ PERSONA: A Reproducible Testbed for Pluralistic Alignment + + +
+ The rapid advancement of language models (LMs) necessitates robust alignment +with diverse user values. However, current preference optimization approaches +often fail to capture the plurality of user opinions, instead reinforcing +majority viewpoints and marginalizing minority perspectives. We introduce +PERSONA, a reproducible test bed designed to evaluate and improve pluralistic +alignment of LMs. We procedurally generate diverse user profiles from US census +data, resulting in 1,586 synthetic personas with varied demographic and +idiosyncratic attributes. We then generate a large-scale evaluation dataset +containing 3,868 prompts and 317,200 feedback pairs obtained from our synthetic +personas. Leveraging this dataset, we systematically evaluate LM capabilities +in role-playing diverse users, verified through human judges, and the +establishment of both a benchmark, PERSONA Bench, for pluralistic alignment +approaches as well as an extensive dataset to create new and future benchmarks. +The full dataset and benchmarks are available here: +https://www.synthlabs.ai/research/persona. + +
+
+
+
+
+ + ☆ A Comprehensive Approach to Misspelling Correction with BERT and + Levenshtein Distance + + +
+ Writing, as an omnipresent form of human communication, permeates nearly +every aspect of contemporary life. Consequently, inaccuracies or errors in +written communication can lead to profound consequences, ranging from financial +losses to potentially life-threatening situations. Spelling mistakes, among the +most prevalent writing errors, are frequently encountered due to various +factors. This research aims to identify and rectify diverse spelling errors in +text using neural networks, specifically leveraging the Bidirectional Encoder +Representations from Transformers (BERT) masked language model. To achieve this +goal, we compiled a comprehensive dataset encompassing both non-real-word and +real-word errors after categorizing different types of spelling mistakes. +Subsequently, multiple pre-trained BERT models were employed. To ensure optimal +performance in correcting misspelling errors, we propose a combined approach +utilizing the BERT masked language model and Levenshtein distance. The results +from our evaluation data demonstrate that the system presented herein exhibits +remarkable capabilities in identifying and rectifying spelling mistakes, often +surpassing existing systems tailored for the Persian language. + +
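A minimal sketch of combining a BERT masked language model with Levenshtein distance for correction — mask the suspicious word, then pick the fill-mask candidate that is both close in edit distance and probable in context — might look as follows. The model choice, candidate count, and edit-distance cutoff are assumptions, and the paper targets Persian rather than the English model used here.

```python
from transformers import pipeline

def levenshtein(a, b):
    """Classic dynamic-programming edit distance."""
    dp = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, cb in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (ca != cb))
    return dp[-1]

def correct(sentence, suspect, fill_mask, max_edit=2):
    """Mask the suspect word, then rank BERT candidates by edit distance and score."""
    masked = sentence.replace(suspect, fill_mask.tokenizer.mask_token, 1)
    candidates = fill_mask(masked, top_k=20)
    scored = [(c["token_str"].strip(), c["score"],
               levenshtein(suspect, c["token_str"].strip())) for c in candidates]
    # Keep candidates within a small edit distance of the typo, prefer high LM score.
    close = [c for c in scored if c[2] <= max_edit] or scored
    return min(close, key=lambda c: (c[2], -c[1]))[0]

fill_mask = pipeline("fill-mask", model="bert-base-uncased")  # placeholder model choice
print(correct("I recieved the package yesterday.", "recieved", fill_mask))
```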
+
+ comment: 12 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ MMRA: A Benchmark for Multi-granularity Multi-image Relational + Association + + +
+ Given the remarkable success that large visual language models (LVLMs) have
+achieved in image perception tasks, the endeavor to make LVLMs perceive the
+world like humans is drawing increasing attention. Current multi-modal
+benchmarks mainly focus on the objective facts or certain topic-related
+potential knowledge within an image, but overlook the associative relations
+between multiple images. Therefore, we define a multi-image relation
+association task, and meticulously curate the \textbf{MMRA} benchmark, a
+\textbf{M}ulti-granularity \textbf{M}ulti-image \textbf{R}elational
+\textbf{A}ssociation benchmark, consisting of \textbf{1026} samples. In order to
+systematically and comprehensively evaluate mainstream LVLMs, we establish an
+associational relation system among images that contains \textbf{11 subtasks}
+(e.g., UsageSimilarity, SubEvent, etc.) at two granularity levels (i.e.,
+"\textbf{image}" and "\textbf{entity}") according to the relations in
+ConceptNet. Our experiments demonstrate that, on our MMRA benchmark, current
+mainstream LVLMs all have their own advantages and disadvantages across
+different subtasks. It is worth noting that, at the entity level, the
+performance of all models is worse than at the image level, indicating that
+the fine-grained multi-image perception task is still challenging for LVLMs.
+The tasks related to spatial perception are relatively difficult for LVLMs to
+handle. Furthermore, we find that LVLMs exhibit a good ability to perceive
+image details, and the key to enhancing their multi-image association
+capability is to strengthen the reasoning ability of their language model
+component. All our codes and data are released at
+\url{https://github.com/Wusiwei0410/MMRA}.
+
+
+ comment: VLMS, Multi-Image Association +
+
+
+
+
+ + ☆ Boosting Large Language Models with Socratic Method for Conversational + Mathematics Teaching CIKM 2024 + + +
+ With the introduction of large language models (LLMs), automatic math +reasoning has seen tremendous success. However, current methods primarily focus +on providing solutions or using techniques like Chain-of-Thought to enhance +problem-solving accuracy. In this paper, we focus on improving the capability +of mathematics teaching via a Socratic teaching-based LLM +(\texttt{SocraticLLM}), which guides learners toward profound thinking with +clarity and self-discovery via conversation. We collect and release a +high-quality mathematical teaching dataset, named \texttt{SocraticMATH}, which +provides Socratic-style conversations of problems with extra knowledge. Also, +we propose a knowledge-enhanced LLM as a strong baseline to generate reliable +responses with review, guidance/heuristic, rectification, and summarization. +Experimental results show the great advantages of \texttt{SocraticLLM} by +comparing it with several strong generative models. The codes and datasets are +available on \url{https://github.com/ECNU-ICALK/SocraticMath}. + +
+
+ comment: Accepted By CIKM 2024 +
+
+
+
+
+ + ☆ Label Alignment and Reassignment with Generalist Large Language Model + for Enhanced Cross-Domain Named Entity Recognition + + +
+ Named entity recognition in the in-domain supervised and few-shot settings
+has been extensively discussed in the NLP community and has made significant
+progress. However, cross-domain NER, a more common task in practical scenarios,
+still poses a challenge for most NER methods. Previous research efforts in that
+area primarily focus on knowledge transfer, such as correlating label
+information from source to target domains, but few works pay attention to the
+problem of label conflict. In this study, we introduce a label alignment and
+reassignment approach, namely LAR, to address this issue for enhanced
+cross-domain named entity recognition, which includes two core procedures:
+label alignment between source and target domains and label reassignment for
+type inference. The process of label reassignment can be significantly enhanced
+by integrating an advanced large-scale language model such as ChatGPT. We
+conduct an extensive range of experiments on NER datasets involving both
+supervised and zero-shot scenarios. Empirical results demonstrate the
+effectiveness of our method, with remarkable performance under the supervised
+and zero-shot out-of-domain settings compared to SOTA methods.
+
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ How Good (Or Bad) Are LLMs at Detecting Misleading Visualizations? IEEE VIS 2024 + + +
+ In this study, we address the growing issue of misleading charts, a prevalent +problem that undermines the integrity of information dissemination. Misleading +charts can distort the viewer's perception of data, leading to +misinterpretations and decisions based on false information. The development of +effective automatic detection methods for misleading charts is an urgent field +of research. The recent advancement of multimodal Large Language Models (LLMs) +has introduced a promising direction for addressing this challenge. We explored +the capabilities of these models in analyzing complex charts and assessing the +impact of different prompting strategies on the models' analyses. We utilized a +dataset of misleading charts collected from the internet by prior research and +crafted nine distinct prompts, ranging from simple to complex, to test the +ability of four different multimodal LLMs in detecting over 21 different chart +issues. Through three experiments--from initial exploration to detailed +analysis--we progressively gained insights into how to effectively prompt LLMs +to identify misleading charts and developed strategies to address the +scalability challenges encountered as we expanded our detection range from the +initial five issues to 21 issues in the final experiment. Our findings reveal +that multimodal LLMs possess a strong capability for chart comprehension and +critical thinking in data interpretation. There is significant potential in +employing multimodal LLMs to counter misleading information by supporting +critical thinking and enhancing visualization literacy. This study demonstrates +the applicability of LLMs in addressing the pressing concern of misleading +charts. + +
+
+ comment: To be presented at IEEE VIS 2024 +
+
+
+
+
+ + ☆ Improving ICD coding using Chapter based Named Entities and Attentional + Models + + +
+ Recent advancements in natural language processing (NLP) have led to +automation in various domains. However, clinical NLP often relies on benchmark +datasets that may not reflect real-world scenarios accurately. Automatic ICD +coding, a vital NLP task, typically uses outdated and imbalanced datasets like +MIMIC-III, with existing methods yielding micro-averaged F1 scores between 0.4 +and 0.7 due to many false positives. Our research introduces an enhanced +approach to ICD coding that improves F1 scores by using chapter-based named +entities and attentional models. This method categorizes discharge summaries +into ICD-9 Chapters and develops attentional models with chapter-specific data, +eliminating the need to consider external data for code identification. For +categorization, we use Chapter-IV to de-bias and influence key entities and +weights without neural networks, creating accurate thresholds and providing +interpretability for human validation. Post-validation, we develop attentional +models for three frequent and three non-frequent codes from Chapter-IV using +Bidirectional-Gated Recurrent Units (GRUs) with Attention and Transformer with +Multi-head Attention architectures. The average Micro-F1 scores of 0.79 and +0.81 from these models demonstrate significant performance improvements in ICD +coding. + +
+
+ comment: 10 Pages +
+
+
+
+
+ + ☆ LEAN-GitHub: Compiling GitHub LEAN repositories for a versatile LEAN + prover + + +
+ Recently, large language models have presented promising results in aiding
+formal mathematical reasoning. However, their performance is restricted due to
+the scarcity of formal theorem-proving data, which requires additional effort
+to be extracted from raw formal language corpora. Meanwhile, a significant
+amount of human-written formal language corpora remains underutilized. To
+address this issue, we propose LEAN-GitHub, a dataset consisting of large-scale
+formal data extracted from almost all Lean 4 repositories on GitHub. After
+fine-tuning InternLM-math-plus on this dataset, our model achieved accuracies
+of 48.8% with a single pass and 54.5% with 64 passes on the Lean 4 miniF2F
+test, surpassing the state-of-the-art method at 52%. It also achieves
+state-of-the-art results on two other Lean 4 benchmarks (ProofNet and Putnam)
+targeting different fields/levels of math. These results demonstrate that our
+proposed dataset is beneficial for formal reasoning on a wide range of math
+topics. We open-source our model at https://github.com/InternLM/InternLM-Math
+and our data at https://huggingface.co/datasets/InternLM/Lean-GitHub.
+
+
+
+
+
+ + ☆ NarrationDep: Narratives on Social Media For Automatic Depression + Detection + + +
+ Social media posts provide valuable insight into the narrative of users and
+their intentions, including providing an opportunity to automatically model
+whether a social media user is depressed or not. The challenge lies in
+faithfully modelling user narratives from their online social media posts,
+which could potentially be useful in several different applications. We have
+developed a novel and effective model called \texttt{NarrationDep}, which
+focuses on detecting narratives associated with depression. By analyzing a
+user's tweets, \texttt{NarrationDep} accurately identifies crucial narratives.
+\texttt{NarrationDep} is a deep learning framework that jointly models
+individual user tweet representations and clusters of users' tweets. As a
+result, \texttt{NarrationDep} is characterized by a novel two-layer deep
+learning model: the first layer models individual social media text posts, and
+the second layer learns semantic representations of tweets associated with a
+cluster. To faithfully model these cluster representations, the second layer
+incorporates a novel component that hierarchically learns from users' posts.
+The results demonstrate that our framework outperforms other comparative models
+including recently developed models on a variety of datasets.
+
+
+
+
+
+ + ☆ Speech Editing -- a Summary + + +
+ With the rise of video production and social media, speech editing has become +crucial for creators to address issues like mispronunciations, missing words, +or stuttering in audio recordings. This paper explores text-based speech +editing methods that modify audio via text transcripts without manual waveform +editing. These approaches ensure edited audio is indistinguishable from the +original by altering the mel-spectrogram. Recent advancements, such as +context-aware prosody correction and advanced attention mechanisms, have +improved speech editing quality. This paper reviews state-of-the-art methods, +compares key metrics, and examines widely used datasets. The aim is to +highlight ongoing issues and inspire further research and innovation in speech +editing. + +
+
+
+
+
+ + ☆ Zero-Shot vs. Few-Shot Multi-Speaker TTS Using Pre-trained Czech + SpeechT5 Model + + +
+ In this paper, we experimented with the SpeechT5 model pre-trained on +large-scale datasets. We pre-trained the foundation model from scratch and +fine-tuned it on a large-scale robust multi-speaker text-to-speech (TTS) task. +We tested the model capabilities in a zero- and few-shot scenario. Based on two +listening tests, we evaluated the synthetic audio quality and the similarity of +how synthetic voices resemble real voices. Our results showed that the SpeechT5 +model can generate a synthetic voice for any speaker using only one minute of +the target speaker's data. We successfully demonstrated the high quality and +similarity of our synthetic voices on publicly known Czech politicians and +celebrities. + +
+
+ comment: Accepted to TSD2024 +
+
+
+
+
+ + ☆ A Comparative Analysis of Bilingual and Trilingual Wav2Vec Models for + Automatic Speech Recognition in Multilingual Oral History Archives INTERSPEECH2024 + + +
+ In this paper, we are comparing monolingual Wav2Vec 2.0 models with various +multilingual models to see whether we could improve speech recognition +performance on a unique oral history archive containing a lot of mixed-language +sentences. Our main goal is to push forward research on this unique dataset, +which is an extremely valuable part of our cultural heritage. Our results +suggest that monolingual speech recognition models are, in most cases, superior +to multilingual models, even when processing the oral history archive full of +mixed-language sentences from non-native speakers. We also performed the same +experiments on the public CommonVoice dataset to verify our results. We are +contributing to the research community by releasing our pre-trained models to +the public. + +
+
+ comment: Accepted to INTERSPEECH2024 +
+
+
+
+
+ + ☆ SimCT: A Simple Consistency Test Protocol in LLMs Development Lifecycle + + +
+ In this work, we report our efforts to advance the standard operation
+procedure of developing Large Language Models (LLMs) or LLMs-based systems or
+services in industry. We introduce the concept of Large Language Model
+Development Lifecycle (LDLC) and then highlight the importance of consistency
+testing in ensuring delivery quality. A principled solution for consistency
+testing, however, is usually overlooked by industrial practitioners and not
+treated as urgent in academia, and current practical solutions are
+insufficiently rigorous and labor-intensive. We thus propose a simple yet
+effective consistency test protocol, named SimCT. SimCT proactively checks the
+consistency across different development stages of "bare metal" LLMs or
+associated services without accessing the model artifacts, in an attempt to
+expedite the delivery by reducing the back-and-forth alignment communications
+among multiple teams involved in different development stages.
+ Specifically, SimCT encompasses response-wise and model-wise tests. We
+implement the protocol with LightGBM and Student's t-test for the two
+components, respectively, and perform extensive experiments to substantiate the
+effectiveness of SimCT and the involved components.
+
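A small sketch of the model-wise part of such a consistency test — comparing per-prompt scores from two development stages with a Student's t-test — is shown below. The use of a paired test, the significance level, and the synthetic scores are illustrative choices, not the protocol's exact specification.

```python
import numpy as np
from scipy import stats

def model_wise_consistency(scores_stage_a, scores_stage_b, alpha=0.05):
    """Paired Student's t-test over per-prompt evaluation scores from two
    development stages of the same LLM-based service.

    Returns (consistent, t, p): consistent is True when we cannot reject
    equality of means at level alpha (an illustrative criterion).
    """
    t_stat, p_value = stats.ttest_rel(scores_stage_a, scores_stage_b)
    return p_value >= alpha, t_stat, p_value

# Placeholder per-prompt quality scores for the same 200 test prompts.
rng = np.random.default_rng(0)
stage_a = rng.normal(0.78, 0.05, 200)
stage_b = stage_a + rng.normal(0.00, 0.02, 200)   # nominally unchanged deployment

consistent, t, p = model_wise_consistency(stage_a, stage_b)
print(f"t={t:.3f}, p={p:.3f}, consistent={consistent}")
```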
+
+
+
+
+ + ☆ SDoH-GPT: Using Large Language Models to Extract Social Determinants of + Health (SDoH) + + +
+ Extracting social determinants of health (SDoH) from unstructured medical +notes depends heavily on labor-intensive annotations, which are typically +task-specific, hampering reusability and limiting sharing. In this study we +introduced SDoH-GPT, a simple and effective few-shot Large Language Model (LLM) +method leveraging contrastive examples and concise instructions to extract SDoH +without relying on extensive medical annotations or costly human intervention. +It achieved tenfold and twentyfold reductions in time and cost respectively, +and superior consistency with human annotators measured by Cohen's kappa of up +to 0.92. The innovative combination of SDoH-GPT and XGBoost leverages the +strengths of both, ensuring high accuracy and computational efficiency while +consistently maintaining 0.90+ AUROC scores. Testing across three distinct +datasets has confirmed its robustness and accuracy. This study highlights the +potential of leveraging LLMs to revolutionize medical note classification, +demonstrating their capability to achieve highly accurate classifications with +significantly reduced time and cost. + +
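The two quantitative ingredients mentioned here — annotator agreement via Cohen's kappa and classification with XGBoost on top of LLM-extracted labels — can be sketched as follows. The synthetic labels, features, and hyperparameters are placeholders, and the LLM prompting step itself is omitted.

```python
import numpy as np
from sklearn.metrics import cohen_kappa_score, roc_auc_score
from xgboost import XGBClassifier

rng = np.random.default_rng(0)

# Placeholder: SDoH labels (e.g. housing instability yes/no) for 500 notes,
# once from a human annotator and once extracted by the few-shot LLM prompt.
human = rng.integers(0, 2, 500)
llm = np.where(rng.random(500) < 0.93, human, 1 - human)   # ~93% agreement
print("Cohen's kappa (LLM vs human):", round(cohen_kappa_score(human, llm), 3))

# Placeholder: feed the LLM-derived indicator plus other note features into
# XGBoost, loosely mirroring the SDoH-GPT + XGBoost pipeline described above.
X = np.column_stack([llm, rng.random((500, 5))])
y = human
clf = XGBClassifier(n_estimators=200, max_depth=3, eval_metric="logloss")
clf.fit(X[:400], y[:400])
auc = roc_auc_score(y[400:], clf.predict_proba(X[400:])[:, 1])
print("AUROC on held-out notes:", round(auc, 3))
```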
+
+
+
+
+ + ☆ Behavioral Testing: Can Large Language Models Implicitly Resolve + Ambiguous Entities? + + +
+ One of the major aspects contributing to the striking performance of large
+language models (LLMs) is the vast amount of factual knowledge accumulated
+during pre-training. Yet, many LLMs suffer from self-inconsistency, which
+raises doubts about their trustworthiness and reliability. In this paper, we
+focus on entity type ambiguity and analyze current state-of-the-art LLMs for
+their proficiency and consistency in applying their factual knowledge when
+prompted for entities under ambiguity. To do so, we propose an evaluation
+protocol that disentangles knowing from applying knowledge, and test
+state-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform
+poorly with ambiguous prompts, achieving only 80% accuracy. Our results further
+demonstrate systematic discrepancies in LLM behavior and their failure to
+consistently apply information, indicating that the models can exhibit
+knowledge without being able to utilize it, significant biases for preferred
+readings, as well as self-inconsistencies. Our study highlights the importance
+of handling entity ambiguity in the future for more trustworthy LLMs.
+
+
+
+
+
+ + ☆ A Survey Forest Diagram : Gain a Divergent Insight View on a Specific + Research Topic + + +
+ With the exponential growth in the number of papers and the trend of AI +research, the use of Generative AI for information retrieval and +question-answering has become popular for conducting research surveys. However, +novice researchers unfamiliar with a particular field may not significantly +improve their efficiency in interacting with Generative AI because they have +not developed divergent thinking in that field. This study aims to develop an +in-depth Survey Forest Diagram that guides novice researchers in divergent +thinking about the research topic by indicating the citation clues among +multiple papers, to help expand the survey perspective for novice researchers. + +
+
+ comment: This paper will submit to IEEE SMC 2024 +
+
+
+
+
+ + ☆ SAFETY-J: Evaluating Safety with Critique + + +
+ The deployment of Large Language Models (LLMs) in content generation raises +significant safety concerns, particularly regarding the transparency and +interpretability of content evaluations. Current methods, primarily focused on +binary safety classifications, lack mechanisms for detailed critique, limiting +their utility for model improvement and user trust. To address these +limitations, we introduce SAFETY-J, a bilingual generative safety evaluator for +English and Chinese with critique-based judgment. SAFETY-J utilizes a robust +training dataset that includes diverse dialogues and augmented query-response +pairs to assess safety across various scenarios comprehensively. We establish +an automated meta-evaluation benchmark that objectively assesses the quality of +critiques with minimal human intervention, facilitating scalable and continuous +improvement. Additionally, SAFETY-J employs an iterative preference learning +technique to dynamically refine safety assessments based on meta-evaluations +and critiques. Our evaluations demonstrate that SAFETY-J provides more nuanced +and accurate safety evaluations, thereby enhancing both critique quality and +predictive reliability in complex content scenarios. To facilitate further +research and application, we will open-source SAFETY-J's training protocols, +datasets, and code. + +
+
+
+
+
+ + ☆ High Efficiency Image Compression for Large Visual-Language Models + + +
+ In recent years, large visual language models (LVLMs) have shown impressive
+performance and promising generalization capability in multi-modal tasks, thus
+replacing humans as receivers of visual information in various application
+scenarios. In this paper, we pioneer to propose a variable bitrate image
+compression framework consisting of a pre-editing module and an end-to-end
+codec to achieve promising rate-accuracy performance for different LVLMs. In
+particular, instead of optimizing an adaptive pre-editing network towards a
+particular task or several representative tasks, we propose a new optimization
+strategy tailored for LVLMs, which is designed based on the representation and
+discrimination capability with token-level distortion and rank. The pre-editing
+module and the variable bitrate end-to-end image codec are jointly trained by
+the losses based on semantic tokens of the large model, which introduce
+enhanced generalization capability for various data and tasks. Experimental
+results demonstrate that the proposed framework could efficiently achieve much
+better rate-accuracy performance compared to the state-of-the-art coding
+standard, Versatile Video Coding. Meanwhile, experiments with multi-modal
+tasks have revealed the robustness and generalization capability of the
+proposed framework.
+
+
+
+
+
+ + ☆ From Internal Conflict to Contextual Adaptation of Language Models + + +
+ Knowledge-intensive language understanding tasks require Language Models +(LMs) to integrate relevant context, mitigating their inherent weaknesses, such +as incomplete or outdated knowledge. Nevertheless, studies indicate that LMs +often ignore the provided context as it can conflict with the pre-existing LM's +memory learned during pre-training. Moreover, conflicting knowledge can already +be present in the LM's parameters, termed intra-memory conflict. Existing works +have studied the two types of knowledge conflicts only in isolation. We +conjecture that the (degree of) intra-memory conflicts can in turn affect LM's +handling of context-memory conflicts. To study this, we introduce the DYNAMICQA +dataset, which includes facts with a temporal dynamic nature where a fact can +change with a varying time frequency and disputable dynamic facts, which can +change depending on the viewpoint. DYNAMICQA is the first to include real-world +knowledge conflicts and provide context to study the link between the different +types of knowledge conflicts. With the proposed dataset, we assess the use of +uncertainty for measuring the intra-memory conflict and introduce a novel +Coherent Persuasion (CP) score to evaluate the context's ability to sway LM's +semantic output. Our extensive experiments reveal that static facts, which are +unlikely to change, are more easily updated with additional context, relative +to temporal and disputable facts. + +
+
+ comment: 22 pages, 15 figures +
+
+
+
+
+ + ☆ Can Language Models Evaluate Human Written Text? Case Study on Korean + Student Writing for Education + + +
+ Large language model (LLM)-based evaluation pipelines have demonstrated their +capability to robustly evaluate machine-generated text. Extending this +methodology to assess human-written text could significantly benefit +educational settings by providing direct feedback to enhance writing skills, +although this application is not straightforward. In this paper, we investigate +whether LLMs can effectively assess human-written text for educational +purposes. We collected 100 texts from 32 Korean students across 15 types of +writing and employed GPT-4-Turbo to evaluate them using grammaticality, +fluency, coherence, consistency, and relevance as criteria. Our analyses +indicate that LLM evaluators can reliably assess grammaticality and fluency, as +well as more objective types of writing, though they struggle with other +criteria and types of writing. We publicly release our dataset and feedback. + +
+
+ comment: Work In Progress +
+
+
+
+
+ + ☆ Unveiling In-Context Learning: A Coordinate System to Understand Its + Working Mechanism + + +
+ Large language models (LLMs) exhibit remarkable in-context learning (ICL) +capabilities. However, the underlying working mechanism of ICL remains poorly +understood. Recent research presents two conflicting views on ICL: One +attributes it to LLMs' inherent ability of task recognition, deeming label +correctness and shot numbers of demonstrations as not crucial; the other +emphasizes the impact of similar examples in the demonstrations, stressing the +need for label correctness and more shots. In this work, we provide a +Two-Dimensional Coordinate System that unifies both views into a systematic +framework. The framework explains the behavior of ICL through two orthogonal +variables: whether LLMs can recognize the task and whether similar examples are +presented in the demonstrations. We propose the peak inverse rank metric to +detect the task recognition ability of LLMs and study LLMs' reactions to +different definitions of similarity. Based on these, we conduct extensive +experiments to elucidate how ICL functions across each quadrant on multiple +representative classification tasks. Finally, we extend our analyses to +generation tasks, showing that our coordinate system can also be used to +interpret ICL for generation tasks effectively. + +
+
+
+
+
+ + ☆ Revisiting Who's Harry Potter: Towards Targeted Unlearning from a Causal + Intervention Perspective + + +
+ This paper investigates Who's Harry Potter (WHP), a pioneering yet +insufficiently understood method for LLM unlearning. We explore it in two +steps. First, we introduce a new task of LLM targeted unlearning, where given +an unlearning target (e.g., a person) and some unlearning documents, we aim to +unlearn only the information about the target, rather than everything in the +unlearning documents. We further argue that a successful unlearning should +satisfy criteria such as not outputting gibberish, not fabricating facts about +the unlearning target, and not releasing factual information under jailbreak +attacks. Second, we construct a causal intervention framework for targeted +unlearning, where the knowledge of the unlearning target is modeled as a +confounder between LLM input and output, and the unlearning process as a +deconfounding process. This framework justifies and extends WHP, deriving a +simple unlearning algorithm that includes WHP as a special case. Experiments on +existing and new datasets show that our approach, without explicitly optimizing +for the aforementioned criteria, achieves competitive performance in all of +them. Our code is available at +https://github.com/UCSB-NLP-Chang/causal_unlearn.git. + +
+
+
+
+
+ + ☆ A Voter-Based Stochastic Rejection-Method Framework for Asymptotically + Safe Language Model Outputs + + +
+ This paper proposes a new method for preventing unsafe or otherwise low +quality large language model (LLM) outputs, by leveraging the stochasticity of +LLMs. We propose a system whereby LLM checkers vote on the acceptability of a +generated output, regenerating it if a threshold of disapproval is reached, +until sufficient checkers approve. We further propose estimators for cost and +failure rate, and based on those estimators and experimental data tailored to +the application, we propose an algorithm that achieves a desired failure rate +at the least possible cost. We demonstrate that, under these models, failure +rate decreases exponentially as a function of cost when voter count and +threshold are chosen according to the algorithm, and that the models reasonably +estimate the actual performance of such a system in action, even with limited +data. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
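A minimal sketch of the voting-and-regeneration loop described in the abstract above, in Python. The generate and checker_disapproves callables are hypothetical stand-ins for LLM calls, and the per-checker disapproval rate, voter count, and threshold are illustrative values rather than the paper's estimator-derived choices.

import random

def generate():
    # Hypothetical stand-in for an LLM generation call.
    return "candidate output"

def checker_disapproves(output):
    # Hypothetical stand-in for one LLM checker; returns True on disapproval.
    return random.random() < 0.2  # assumed 20% per-checker disapproval rate

def vote_and_regenerate(n_voters=5, threshold=2, max_rounds=10):
    """Regenerate until fewer than `threshold` of `n_voters` checkers disapprove."""
    for round_idx in range(max_rounds):
        output = generate()
        disapprovals = sum(checker_disapproves(output) for _ in range(n_voters))
        if disapprovals < threshold:
            return output, round_idx + 1  # accepted output and rounds used
    return None, max_rounds  # give up after max_rounds

if __name__ == "__main__":
    accepted, rounds = vote_and_regenerate()
    print(f"accepted after {rounds} round(s): {accepted!r}")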
+ + ☆ Towards Aligning Language Models with Textual Feedback + + +
+ We present ALT (ALignment with Textual feedback), an approach that aligns language models with user preferences expressed in text. We argue that text offers greater expressiveness, enabling users to provide richer feedback than simple comparative preferences, and that this richer feedback can lead to more efficient and effective alignment. ALT aligns the model by conditioning its generation on the textual feedback. Our method relies solely on language modeling techniques and requires minimal hyper-parameter tuning, while still offering the main benefits of RL-based alignment algorithms and effectively learning from textual feedback. We explore the efficacy and efficiency of textual feedback across different tasks such as toxicity reduction, summarization, and dialog response generation. We find that ALT outperforms PPO for the task of toxicity reduction while matching its performance on summarization with only 20% of the samples. We also explore how ALT can be used with feedback provided by an existing LLM, considering both constrained and unconstrained textual feedback. We also outline future directions to align models with natural language feedback.
+
+
+
+
+ + ☆ Towards Transfer Unlearning: Empirical Evidence of Cross-Domain Bias + Mitigation + + +
+ Large language models (LLMs) often inherit biases from vast amounts of training corpora. Traditional debiasing methods, while effective to some extent, do not completely eliminate memorized biases and toxicity in LLMs. In this paper, we study an unlearning-based approach to debiasing in LLMs by performing gradient ascent on hate speech against minority groups, i.e., minimizing the likelihood of biased or toxic content. Specifically, we propose a masked language modeling unlearning technique, which unlearns the harmful part of the text. This method enables LLMs to selectively forget and disassociate from biased and harmful content. Experimental results demonstrate the effectiveness of our approach in diminishing bias while maintaining language modeling abilities. Surprisingly, the results also unveil an unexpected potential for cross-domain transfer unlearning: debiasing in one bias form (e.g., gender) may contribute to mitigating others (e.g., race and religion).
+
+
+
+
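A rough sketch of the gradient-ascent step on harmful text, using Hugging Face Transformers. The model name, the single-example loop, and the learning rate are placeholders, and the paper's masked-language-modeling variant would further restrict the loss to the harmful span only.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder model; the paper targets larger LLMs
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

harmful_text = "example of biased content to unlearn"  # placeholder example
batch = tok(harmful_text, return_tensors="pt")

model.train()
out = model(**batch, labels=batch["input_ids"])
# Gradient ascent: step against the language-modeling loss on the harmful text.
(-out.loss).backward()
optimizer.step()
optimizer.zero_grad()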
+ + ☆ Early screening of potential breakthrough technologies with enhanced + interpretability: A patent-specific hierarchical attention network model + + +
+ Despite the usefulness of machine learning approaches for the early screening +of potential breakthrough technologies, their practicality is often hindered by +opaque models. To address this, we propose an interpretable machine learning +approach to predicting future citation counts from patent texts using a +patent-specific hierarchical attention network (PatentHAN) model. Central to +this approach are (1) a patent-specific pre-trained language model, capturing +the meanings of technical words in patent claims, (2) a hierarchical network +structure, enabling detailed analysis at the claim level, and (3) a claim-wise +self-attention mechanism, revealing pivotal claims during the screening +process. A case study of 35,376 pharmaceutical patents demonstrates the +effectiveness of our approach in early screening of potential breakthrough +technologies while ensuring interpretability. Furthermore, we conduct +additional analyses using different language models and claim types to examine +the robustness of the approach. It is expected that the proposed approach will +enhance expert-machine collaboration in identifying breakthrough technologies, +providing new insight derived from text mining into technological value. + +
+
+
+
+
+ + ☆ ScholarChemQA: Unveiling the Power of Language Models in Chemical + Research Question Answering + + +
+ Question Answering (QA) effectively evaluates language models' reasoning and knowledge depth. While QA datasets are plentiful in areas like the general domain and biomedicine, academic chemistry is less explored. Chemical QA plays a crucial role in both education and research by translating complex chemical information into a readily understandable format. Addressing this gap, we introduce ScholarChemQA, a large-scale QA dataset constructed from chemical papers. This dataset reflects typical real-world challenges, including an imbalanced data distribution and a substantial amount of unlabeled data that can be potentially useful. Correspondingly, we introduce a QAMatch model, specifically designed to effectively answer chemical questions by fully leveraging our collected data. We first address the issue of imbalanced label distribution by re-weighting the instance-wise loss based on the inverse frequency of each class, ensuring minority classes are not dominated by majority ones during optimization. Next, we utilize the unlabeled data to enrich the learning process, generating a variety of augmentations based on a SoftMix operation and ensuring their predictions align with the same target, i.e., pseudo-labels. To ensure the quality of the pseudo-labels, we propose a calibration procedure aimed at closely aligning the pseudo-label estimates of individual samples with a desired ground truth distribution. Experiments show that our QAMatch significantly outperforms recent similar-scale baselines and Large Language Models (LLMs) not only on our ScholarChemQA dataset but also on four benchmark datasets. We hope our benchmark and model can facilitate and promote more research on chemical QA.
+
+ comment: 14 pages +
+
+
+
+
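The class re-weighting step described above is standard inverse-frequency weighting; a PyTorch sketch follows, with made-up class counts for illustration (the SoftMix augmentation and pseudo-label calibration are not shown).

import torch
import torch.nn as nn

# Assumed label counts for an imbalanced 4-way task (illustrative numbers only).
class_counts = torch.tensor([5000.0, 1200.0, 300.0, 80.0])
weights = class_counts.sum() / (len(class_counts) * class_counts)  # inverse frequency
criterion = nn.CrossEntropyLoss(weight=weights)

logits = torch.randn(16, 4)           # model outputs for a batch of 16
labels = torch.randint(0, 4, (16,))   # ground-truth classes
loss = criterion(logits, labels)      # minority classes contribute more per instance
print(loss.item())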
+ + ☆ Train-Attention: Meta-Learning Where to Focus in Continual Knowledge + Learning + + +
+ Previous studies on continual knowledge learning (CKL) in large language models (LLMs) have predominantly focused on approaches such as regularization, architectural modifications, and rehearsal techniques to mitigate catastrophic forgetting. However, these methods naively inherit the inefficiencies of standard training procedures, indiscriminately applying uniform weights across all tokens, which can lead to unnecessary parameter updates and increased forgetting. To address these shortcomings, we propose a novel CKL approach termed Train-Attention-Augmented Language Model (TAALM), which enhances learning efficiency by dynamically predicting and applying weights to tokens based on their usefulness. This method employs a meta-learning framework that optimizes token importance predictions, facilitating targeted knowledge updates and minimizing forgetting. We also observe that existing benchmarks do not clearly exhibit the trade-off between learning and retaining; we therefore propose a new benchmark, LAMA-ckl, to address this issue. Through experiments conducted on both newly introduced and established CKL benchmarks, TAALM achieves state-of-the-art performance over the baselines and also shows synergistic compatibility when integrated with previous CKL approaches.
+
+
+
+
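The core mechanism, per-token loss weighting, can be sketched as below; here the weights are arbitrary placeholders, whereas TAALM would obtain them from its meta-learned Train-Attention predictor.

import torch
import torch.nn.functional as F

def weighted_token_loss(logits, targets, token_weights):
    """Cross-entropy where each target token carries its own importance weight.

    logits: (batch, seq, vocab); targets: (batch, seq); token_weights: (batch, seq)
    """
    per_token = F.cross_entropy(
        logits.transpose(1, 2), targets, reduction="none"
    )  # (batch, seq)
    return (per_token * token_weights).sum() / token_weights.sum().clamp(min=1e-8)

# Toy shapes only; in TAALM the weights would come from a learned predictor.
logits = torch.randn(2, 5, 100)
targets = torch.randint(0, 100, (2, 5))
weights = torch.rand(2, 5)
print(weighted_token_loss(logits, targets, weights).item())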
+ + ☆ Time Matters: Examine Temporal Effects on Biomedical Language Models + + +
+ Time is intrinsic to applying language models in biomedical applications: models are trained on historical data and deployed on new or future data, which may differ from the training data. While a growing number of biomedical tasks employ state-of-the-art language models, very few studies have examined temporal effects on biomedical models when data shifts between development and deployment. This study fills the gap by statistically probing the relations between language model performance and data shifts across three biomedical tasks. We deploy diverse metrics to evaluate model performance, distance methods to measure data drift, and statistical methods to quantify temporal effects on biomedical language models. Our study shows that time matters for deploying biomedical language models, while the degree of performance degradation varies by biomedical task and statistical quantification approach. We believe this study can establish a solid benchmark to evaluate and assess temporal effects on deploying biomedical language models.
+
+ comment: Accept to AMIA 2024 Annual Symposium +
+
+
+
+
+ + ☆ IgnitionInnovators at "Discharge Me!": Chain-of-Thought Instruction + Finetuning Large Language Models for Discharge Summaries + + +
+ This paper presents our proposed approach to the Discharge Me! shared task, co-located with the 23rd Workshop on Biomedical Natural Language Processing (BioNLP). In this work, we develop an LLM-based framework for solving the Discharge Summary Documentation (DSD) task, i.e., generating the two critical target sections 'Brief Hospital Course' and 'Discharge Instructions' in the discharge summary. By streamlining the recent instruction-finetuning process on LLMs, we explore several prompting strategies for optimally adapting LLMs to the specific generation task of DSD. Experimental results show that providing a clear output structure, complemented by a set of comprehensive Chain-of-Thought (CoT) questions, effectively improves the model's reasoning capability, thereby enhancing the structural correctness and faithfulness of clinical information in the generated text. Source code is available at: https://github.com/antangrocket1312/Discharge_LLM
+
+ comment: Accepted by BioNLP2024 Workshop +
+
+
+
+
+ + ☆ Papilusion at DAGPap24: Paper or Illusion? Detecting AI-generated + Scientific Papers + + +
+ This paper presents Papilusion, an AI-generated scientific text detector developed within the DAGPap24 shared task on detecting automatically generated scientific papers. We propose an ensemble-based approach and conduct ablation studies to analyze the effect of detector configurations on performance. Papilusion ranked 6th on the leaderboard, and we improved our performance after the competition ended, achieving an F1-score of 99.46 (+9.63) on the official test set.
+
+ comment: to appear in DAGPAP 2024 proceedings +
+
+
+
+
+ + ☆ Traditional Methods Outperform Generative LLMs at Forecasting Credit + Ratings + + +
+ Large Language Models (LLMs) have been shown to perform well for many +downstream tasks. Transfer learning can enable LLMs to acquire skills that were +not targeted during pre-training. In financial contexts, LLMs can sometimes +beat well-established benchmarks. This paper investigates how well LLMs perform +in the task of forecasting corporate credit ratings. We show that while LLMs +are very good at encoding textual information, traditional methods are still +very competitive when it comes to encoding numeric and multimodal data. For our +task, current LLMs perform worse than a more traditional XGBoost architecture +that combines fundamental and macroeconomic data with high-density text-based +embedding features. + +
+
+
+
+
+ + ☆ Coupling Speech Encoders with Downstream Text Models + + +
+ We present a modular approach to building cascade speech translation (AST) models that guarantees that the resulting model performs no worse than the 1-best cascade baseline while preserving state-of-the-art speech recognition (ASR) and text translation (MT) performance for a given task. Our novel contribution is the use of an "exporter" layer that is trained under an L2 loss to ensure a strong match between ASR embeddings and the MT token embeddings for the 1-best sequence. The "exporter" output embeddings are fed directly to the MT model in lieu of 1-best token embeddings, thus guaranteeing that the resulting model performs no worse than the 1-best cascade baseline, while allowing back-propagation gradients to flow from the MT model into the ASR components. The matched-embeddings cascade architecture provides a significant improvement over its 1-best counterpart in scenarios where incremental training of the MT model is not an option and yet we seek to improve quality by leveraging (speech, transcription, translated transcription) data provided with the AST task. The gain disappears when the MT model is incrementally trained on the parallel text data available with the AST task. The approach holds promise for other scenarios that seek to couple ASR encoders with immutable text models, such as large language models (LLMs).
+
+
+
+
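A sketch of the exporter idea: a small projection trained with an L2 loss to map ASR encoder states onto the MT model's token embeddings for the 1-best sequence. The hidden sizes and the single linear layer are assumptions for illustration, not the paper's exact architecture.

import torch
import torch.nn as nn

asr_dim, mt_dim = 512, 1024  # assumed hidden sizes
exporter = nn.Linear(asr_dim, mt_dim)
optimizer = torch.optim.Adam(exporter.parameters(), lr=1e-4)

# Stand-ins: ASR states aligned to the 1-best tokens, and the embeddings the MT
# model would have produced for those same tokens (treated as frozen targets).
asr_states = torch.randn(8, 32, asr_dim)          # (batch, tokens, asr_dim)
mt_token_embeddings = torch.randn(8, 32, mt_dim)  # (batch, tokens, mt_dim)

pred = exporter(asr_states)
loss = nn.functional.mse_loss(pred, mt_token_embeddings)  # L2 matching loss
loss.backward()
optimizer.step()
# At inference, `pred` is fed to the MT model in place of 1-best token embeddings.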
+ + ☆ Exploring Domain Robust Lightweight Reward Models based on Router + Mechanism ACL 2024 + + +
+ Recent advancements in large language models have heavily relied on the large reward model from reinforcement learning from human feedback for fine-tuning. However, the use of a single reward model across various domains may not always be optimal, often requiring retraining from scratch when new domain data is introduced. To address these challenges, we explore the utilization of small language models operating in a domain-specific manner based on router mechanisms. Our three approaches are: 1) utilizing a mixture of experts to form a single reward model by modularizing an internal router and experts; 2) employing an external router to select the appropriate reward model from multiple domain-specific models; and 3) loading reward models and router adapters onto a single small language model using adapters to reduce the total parameter size. Experimental validation underscores the effectiveness of our approach, demonstrating performance comparable to baseline methods while also reducing the total parameter size.
+
+ comment: This paper is accepted for ACL 2024 +
+
+
+
+
+ + ☆ Large Language Models for Anomaly Detection in Computational Workflows: + from Supervised Fine-Tuning to In-Context Learning SC'24 + + +
+ Anomaly detection in computational workflows is critical for ensuring system +reliability and security. However, traditional rule-based methods struggle to +detect novel anomalies. This paper leverages large language models (LLMs) for +workflow anomaly detection by exploiting their ability to learn complex data +patterns. Two approaches are investigated: 1) supervised fine-tuning (SFT), +where pre-trained LLMs are fine-tuned on labeled data for sentence +classification to identify anomalies, and 2) in-context learning (ICL) where +prompts containing task descriptions and examples guide LLMs in few-shot +anomaly detection without fine-tuning. The paper evaluates the performance, +efficiency, generalization of SFT models, and explores zero-shot and few-shot +ICL prompts and interpretability enhancement via chain-of-thought prompting. +Experiments across multiple workflow datasets demonstrate the promising +potential of LLMs for effective anomaly detection in complex executions. + +
+
+ comment: 12 pages, 14 figures, paper is accepted by SC'24, source code, see: + https://github.com/PoSeiDon-Workflows/LLM_AD +
+
+
+
+
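The in-context learning variant amounts to assembling a prompt from a task description plus a few labeled examples; a sketch is below, with the workflow records and labels invented purely for illustration.

def build_icl_prompt(examples, query):
    """Few-shot prompt for labeling a workflow record as normal or anomalous."""
    lines = ["Decide whether each workflow record is NORMAL or ANOMALOUS.", ""]
    for record, label in examples:
        lines.append(f"Record: {record}\nLabel: {label}\n")
    lines.append(f"Record: {query}\nLabel:")
    return "\n".join(lines)

few_shot = [
    ("job=align_reads runtime=118s exit=0", "NORMAL"),        # illustrative only
    ("job=align_reads runtime=4732s exit=137", "ANOMALOUS"),  # illustrative only
]
prompt = build_icl_prompt(few_shot, "job=align_reads runtime=121s exit=0")
print(prompt)  # this string would then be sent to the LLM of choice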
+ + ☆ Generative artificial intelligence in dentistry: Current approaches and + future challenges + + +
+ Artificial intelligence (AI) has become a commodity for people because of the +advent of generative AI (GenAI) models that bridge the usability gap of AI by +providing a natural language interface to interact with complex models. These +GenAI models range from text generation - such as two-way chat systems - to the +generation of image or video from textual descriptions input by a user. These +advancements in AI have impacted Dentistry in multiple aspects. In dental +education, the student now has the opportunity to solve a plethora of questions +by only prompting a GenAI model and have the answer in a matter of seconds. +GenAI models can help us deliver better patient healthcare by helping +practitioners gather knowledge quickly and efficiently. Finally, GenAI can also +be used in dental research, where the applications range from new drug +discovery to assistance in academic writing. In this review, we first define +GenAI models and describe their multiple generation modalities; then, we +explain and discuss their current and potential applications in Dentistry; and +finally, we describe the challenges these new technologies impose in our area. + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Model Editing ACL 2024 + + +
+ ROME and MEMIT are largely believed to be two different model editing +algorithms, with the major difference between them being the ability to perform +batched edits. In this paper, we unify these two algorithms under a single +conceptual umbrella, optimizing for the same goal, which we call the +preservation-memorization objective. ROME uses an equality constraint to +optimize this objective to perform one edit at a time, whereas MEMIT employs a +more flexible least-square constraint that allows for batched edits. We +generalize ROME and enable batched editing with equality constraint in the form +of EMMET - an Equality-constrained Mass Model Editing algorithm for +Transformers, a new batched memory-editing algorithm. EMMET can perform +batched-edits up to a batch-size of 10,000, with very similar performance to +MEMIT across multiple dimensions. With the introduction of EMMET, we truly +unify ROME and MEMIT and show that both algorithms are equivalent in terms of +their optimization objective, their abilities (singular and batched editing), +their model editing performance and their limitations. + +
+
+ comment: Under review. To appear as poster at KnowledgeableLM Workshop + co-located with ACL 2024 +
+
+
+
+
+ + ♻ ☆ Dissecting Language Models: Machine Unlearning via Selective Pruning + + +
+ Understanding and shaping the behaviour of Large Language Models (LLMs) is +increasingly important as applications become more powerful and more frequently +adopted. This paper introduces a machine unlearning method specifically +designed for LLMs. We introduce a selective pruning method for LLMs that +removes neurons based on their relative importance on a targeted capability +compared to overall network performance. This approach is a compute- and +data-efficient method for identifying and removing neurons that enable specific +behaviours. Our findings reveal that both feed-forward and attention neurons in +LLMs are specialized; that is, for specific tasks, certain neurons are more +crucial than others. Code from all experiments is available at +https://github.com/nickypro/selective-pruning + +
+
+
+
+
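The selection rule, relative importance on the targeted capability versus overall network performance, can be sketched as follows; the activation statistics are random placeholders, whereas the method would measure them on forget and retain datasets.

import torch

def selection_scores(importance_on_target, importance_overall, eps=1e-8):
    """Neurons with high target importance but low overall importance score highest."""
    return importance_on_target / (importance_overall + eps)

def prune_mask(scores, frac=0.02):
    """Boolean mask marking the top `frac` of neurons for removal (zeroing)."""
    k = max(1, int(frac * scores.numel()))
    threshold = torch.topk(scores, k).values.min()
    return scores >= threshold

# Placeholder statistics for 1024 FFN neurons (e.g., mean |activation| per dataset).
target_stats = torch.rand(1024)
overall_stats = torch.rand(1024)
mask = prune_mask(selection_scores(target_stats, overall_stats))
print(f"pruning {int(mask.sum())} of {mask.numel()} neurons")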
+ + ♻ ☆ Consent in Crisis: The Rapid Decline of the AI Data Commons + + +
+ General-purpose artificial intelligence (AI) systems are built on massive +swathes of public web data, assembled into corpora such as C4, RefinedWeb, and +Dolma. To our knowledge, we conduct the first, large-scale, longitudinal audit +of the consent protocols for the web domains underlying AI training corpora. +Our audit of 14,000 web domains provides an expansive view of crawlable web +data and how codified data use preferences are changing over time. We observe a +proliferation of AI-specific clauses to limit use, acute differences in +restrictions on AI developers, as well as general inconsistencies between +websites' expressed intentions in their Terms of Service and their robots.txt. +We diagnose these as symptoms of ineffective web protocols, not designed to +cope with the widespread re-purposing of the internet for AI. Our longitudinal +analyses show that in a single year (2023-2024) there has been a rapid +crescendo of data restrictions from web sources, rendering ~5%+ of all tokens +in C4, or 28%+ of the most actively maintained, critical sources in C4, fully +restricted from use. For Terms of Service crawling restrictions, a full 45% of +C4 is now restricted. If respected or enforced, these restrictions are rapidly +biasing the diversity, freshness, and scaling laws for general-purpose AI +systems. We hope to illustrate the emerging crises in data consent, for both +developers and creators. The foreclosure of much of the open web will impact +not only commercial AI, but also non-commercial AI and academic research. + +
+
+ comment: 41 pages (13 main), 5 figures, 9 tables +
+
+
+
+
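An audit of this kind reduces to fetching each domain's robots.txt and checking whether known AI crawler user agents are disallowed; a minimal sketch using the Python standard library (the agent list is a small illustrative subset, not the paper's full taxonomy).

from urllib import robotparser

AI_AGENTS = ["GPTBot", "CCBot", "Google-Extended"]  # illustrative subset only

def audit_domain(domain):
    """Return which AI crawler agents are blocked from the domain root."""
    rp = robotparser.RobotFileParser()
    rp.set_url(f"https://{domain}/robots.txt")
    try:
        rp.read()
    except Exception:
        return {agent: None for agent in AI_AGENTS}  # unreachable / no policy
    return {agent: not rp.can_fetch(agent, f"https://{domain}/") for agent in AI_AGENTS}

if __name__ == "__main__":
    print(audit_domain("example.com"))  # e.g. {"GPTBot": False, ...} means not blocked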
+ + ♻ ☆ How Easily do Irrelevant Inputs Skew the Responses of Large Language + Models? + + +
+ By leveraging the retrieval of information from external knowledge databases, Large Language Models (LLMs) exhibit enhanced capabilities for accomplishing many knowledge-intensive tasks. However, due to the inherent flaws of current retrieval systems, irrelevant information may exist within the retrieved top-ranked passages. In this work, we present a comprehensive investigation into the robustness of LLMs to different types of irrelevant information under various conditions. We initially introduce a framework to construct high-quality irrelevant information that ranges from semantically unrelated to partially related and related to the question. Furthermore, our analysis demonstrates that the constructed irrelevant information not only scores highly on similarity metrics, being highly retrieved by existing systems, but also bears semantic connections to the context. Our investigation reveals that current LLMs still face challenges in discriminating highly semantically related information and can be easily distracted by this irrelevant yet misleading content. Besides, we also find that current solutions for handling irrelevant information have limitations in improving the robustness of LLMs to such distractions. All the resources are available on GitHub at https://github.com/Di-viner/LLM-Robustness-to-Irrelevant-Information.
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ MM-Soc: Benchmarking Multimodal Large Language Models in Social Media + Platforms ACL 2024 + + +
+ Social media platforms are hubs for multimodal information exchange, encompassing text, images, and videos, making it challenging for machines to comprehend the information or emotions associated with interactions in online spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising solution to these challenges, yet they struggle to accurately interpret human emotions and complex content such as misinformation. This paper introduces MM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of multimodal social media content. MM-Soc compiles prominent multimodal datasets and incorporates a novel large-scale YouTube tagging dataset, covering tasks that range from misinformation detection and hate speech detection to social context generation. Through our exhaustive evaluation of ten size variants of four open-source MLLMs, we have identified significant performance disparities, highlighting the need for advancements in models' social understanding capabilities. Our analysis reveals that, in a zero-shot setting, various types of MLLMs generally exhibit difficulties in handling social media tasks. However, MLLMs demonstrate performance improvements after fine-tuning, suggesting potential pathways for improvement. Our code and data are available at https://github.com/claws-lab/MMSoc.git.
+
+ comment: In Proceedings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ AMONGAGENTS: Evaluating Large Language Models in the Interactive + Text-Based Social Deduction Game ACL 2024 + + +
+ Strategic social deduction games serve as valuable testbeds for evaluating +the understanding and inference skills of language models, offering crucial +insights into social science, artificial intelligence, and strategic gaming. +This paper focuses on creating proxies of human behavior in simulated +environments, with Among Us utilized as a tool for studying simulated human +behavior. The study introduces a text-based game environment, named +AmongAgents, that mirrors the dynamics of Among Us. Players act as crew members +aboard a spaceship, tasked with identifying impostors who are sabotaging the +ship and eliminating the crew. Within this environment, the behavior of +simulated language agents is analyzed. The experiments involve diverse game +sequences featuring different configurations of Crewmates and Impostor +personality archetypes. Our work demonstrates that state-of-the-art large +language models (LLMs) can effectively grasp the game rules and make decisions +based on the current context. This work aims to promote further exploration of +LLMs in goal-oriented games with incomplete information and complex action +spaces, as these settings offer valuable opportunities to assess language model +performance in socially driven scenarios. + +
+
+ comment: Wordplay @ ACL 2024 +
+
+
+
+
+ + ♻ ☆ Description-Based Text Similarity + + +
+ Identifying texts with a given semantics is central to many information seeking scenarios. Similarity search over vector embeddings appears to be central to this ability, yet the similarity reflected in current text embeddings is corpus-driven, and is inconsistent and sub-optimal for many use cases. What, then, is a good notion of similarity for effective retrieval of text?
 We identify the need to search for texts based on abstract descriptions of their content, and the corresponding notion of description-based similarity. We demonstrate the inadequacy of current text embeddings and propose an alternative model that performs significantly better when used in standard nearest neighbor search. The model is trained using positive and negative pairs sourced through prompting an LLM, demonstrating how data from LLMs can be used for creating new capabilities not immediately possible using the original model.
+
+ comment: Accepted in COLM 2024 +
+
+
+
+
+ + ♻ ☆ Overview of AI-Debater 2023: The Challenges of Argument Generation Tasks + + +
+ In this paper we present the results of the AI-Debater 2023 Challenge held by +the Chinese Conference on Affect Computing (CCAC 2023), and introduce the +related datasets. We organize two tracks to handle the argumentative generation +tasks in different scenarios, namely, Counter-Argument Generation (Track 1) and +Claim-based Argument Generation (Track 2). Each track is equipped with its +distinct dataset and baseline model respectively. In total, 32 competing teams +register for the challenge, from which we received 11 successful submissions. +In this paper, we will present the results of the challenge and a summary of +the systems, highlighting commonalities and innovations among participating +systems. Datasets and baseline models of the AI-Debater 2023 Challenge have +been already released and can be accessed through the official website of the +challenge. + +
+
+
+
+
+ + ♻ ☆ Q-Sparse: All Large Language Models can be Fully Sparsely-Activated + + +
+ We introduce Q-Sparse, a simple yet effective approach to training sparsely-activated large language models (LLMs). Q-Sparse enables full sparsity of activations in LLMs, which can bring significant efficiency gains in inference. This is achieved by applying top-K sparsification to the activations and the straight-through estimator to the training. We also introduce Block Q-Sparse for batch training and inference. The key results from this work are: (1) Q-Sparse can achieve results comparable to those of baseline LLMs while being much more efficient at inference time; (2) we present an inference-optimal scaling law for sparsely-activated LLMs; (3) Q-Sparse is effective in different settings, including training-from-scratch, continue-training of off-the-shelf LLMs, and finetuning; (4) Q-Sparse works for both full-precision and 1-bit LLMs (e.g., BitNet b1.58). In particular, the synergy of BitNet b1.58 and Q-Sparse (which can be equipped with MoE) provides the cornerstone and a clear path to revolutionize the efficiency, including cost and energy consumption, of future LLMs.
+
+ comment: Work in progress +
+
+
+
+
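The two ingredients named in the abstract, top-K sparsification of activations and a straight-through estimator for its gradient, can be sketched as follows; K and the tensor shapes are illustrative.

import torch

class TopKSparsify(torch.autograd.Function):
    """Keep the K largest-magnitude activations; pass gradients straight through."""

    @staticmethod
    def forward(ctx, x, k):
        threshold = torch.topk(x.abs(), k, dim=-1).values[..., -1:]
        return torch.where(x.abs() >= threshold, x, torch.zeros_like(x))

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output, None  # straight-through estimator for x; no grad for k

x = torch.randn(4, 1024, requires_grad=True)
y = TopKSparsify.apply(x, 256)   # roughly 25% of activations kept (illustrative)
y.sum().backward()               # gradient flows to all entries via the STE
print((y != 0).float().mean().item())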
+ + ♻ ☆ Large Language Models as Topological Structure Enhancers for + Text-Attributed Graphs + + +
+ The latest advancements in large language models (LLMs) have revolutionized +the field of natural language processing (NLP). Inspired by the success of LLMs +in NLP tasks, some recent work has begun investigating the potential of +applying LLMs in graph learning tasks. However, most of the existing work +focuses on utilizing LLMs as powerful node feature augmenters, leaving +employing LLMs to enhance graph topological structures an understudied problem. +In this work, we explore how to leverage the information retrieval and text +generation capabilities of LLMs to refine/enhance the topological structure of +text-attributed graphs (TAGs) under the node classification setting. First, we +propose using LLMs to help remove unreliable edges and add reliable ones in the +TAG. Specifically, we first let the LLM output the semantic similarity between +node attributes through delicate prompt designs, and then perform edge deletion +and edge addition based on the similarity. Second, we propose using +pseudo-labels generated by the LLM to improve graph topology, that is, we +introduce the pseudo-label propagation as a regularization to guide the graph +neural network (GNN) in learning proper edge weights. Finally, we incorporate +the two aforementioned LLM-based methods for graph topological refinement into +the process of GNN training, and perform extensive experiments on four +real-world datasets. The experimental results demonstrate the effectiveness of +LLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain +on public benchmarks). + +
+
+ comment: 10 pages +
+
+
+
+
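The edge-editing step reduces to thresholding LLM-judged similarity between node texts; a sketch with a stand-in llm_similarity function follows (a crude lexical proxy here, where the paper would prompt an LLM, and the thresholds are assumptions).

def llm_similarity(text_a, text_b):
    # Hypothetical stand-in: the paper prompts an LLM to rate the semantic
    # similarity of two node attributes; here a crude Jaccard proxy instead.
    a, b = set(text_a.lower().split()), set(text_b.lower().split())
    return len(a & b) / max(len(a | b), 1)

def refine_edges(node_text, edges, candidate_pairs, drop_below=0.2, add_above=0.5):
    """Delete low-similarity existing edges; add high-similarity candidate edges."""
    kept = [(u, v) for u, v in edges
            if llm_similarity(node_text[u], node_text[v]) >= drop_below]
    added = [(u, v) for u, v in candidate_pairs
             if llm_similarity(node_text[u], node_text[v]) >= add_above]
    return kept + added

texts = {0: "graph neural networks", 1: "neural networks for graph data", 2: "cooking recipes"}
print(refine_edges(texts, edges=[(0, 2)], candidate_pairs=[(0, 1)]))  # [(0, 1)]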
+ + ♻ ☆ Arrows of Time for Large Language Models + + +
+ We study the probabilistic modeling performed by Autoregressive Large +Language Models (LLMs) through the angle of time directionality, addressing a +question first raised in (Shannon, 1951). For large enough models, we +empirically find a time asymmetry in their ability to learn natural language: a +difference in the average log-perplexity when trying to predict the next token +versus when trying to predict the previous one. This difference is at the same +time subtle and very consistent across various modalities (language, model +size, training time, ...). Theoretically, this is surprising: from an +information-theoretic point of view, there should be no such difference. We +provide a theoretical framework to explain how such an asymmetry can appear +from sparsity and computational complexity considerations, and outline a number +of perspectives opened by our results. + +
+
+ comment: Corrected typos in Table 2. Added links. 12 figures, 20 pages +
+
+
+
+
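The measurement itself is simple bookkeeping: compare the average negative log-likelihood of text read forward against the same text read backward. The sketch below reuses a single forward-trained model on reversed token ids purely to show the computation; the paper instead trains separate forward and backward models.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")        # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def mean_nll(input_ids):
    with torch.no_grad():
        out = model(input_ids=input_ids, labels=input_ids)
    return out.loss.item()  # average next-token negative log-likelihood

ids = tok("The ball rolled down the hill and into the river.", return_tensors="pt").input_ids
forward_nll = mean_nll(ids)
backward_nll = mean_nll(torch.flip(ids, dims=[1]))  # crude proxy for a backward model
print(f"forward {forward_nll:.3f} vs reversed {backward_nll:.3f}")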
+ + ♻ ☆ Investigating Low-Rank Training in Transformer Language Models: + Efficiency and Scaling Analysis ICML 2024 + + +
+ State-of-the-art LLMs often rely on scale with high computational costs, +which has sparked a research agenda to reduce parameter counts and costs +without significantly impacting performance. Our study focuses on +Transformer-based LLMs, specifically applying low-rank parametrization to the +computationally intensive feedforward networks (FFNs), which are less studied +than attention blocks. In contrast to previous works, (i) we explore low-rank +parametrization at scale, up to 1.3B parameters; (ii) within Transformer +language models rather than convolutional architectures; and (iii) starting +from training from scratch. Experiments on the large RefinedWeb dataset show +that low-rank parametrization is both efficient (e.g., 2.6$\times$ FFN speed-up +with 32\% parameters) and effective during training. Interestingly, these +structured FFNs exhibit steeper scaling curves than the original models. +Motivated by this finding, we develop the wide and structured networks +surpassing the current medium-sized and large-sized Transformer in perplexity +and throughput performance. Our code is available at +https://github.com/CLAIRE-Labo/StructuredFFN/tree/main. + +
+
+ comment: Accepted by ICML 2024 Next Generation of Sequence Modeling + Architectures Workshop. Short version of arXiv:2406.16450 +
+
+
+
+
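The structural change amounts to replacing each dense FFN projection with a product of two thin matrices; a sketch follows, with the model width, FFN width, and rank chosen only to illustrate the parameter savings.

import torch.nn as nn

class LowRankFFN(nn.Module):
    """Feedforward block whose projections are factorized through a low rank r."""

    def __init__(self, d_model=1024, d_ff=4096, rank=256):
        super().__init__()
        self.up = nn.Sequential(nn.Linear(d_model, rank, bias=False),
                                nn.Linear(rank, d_ff, bias=False))
        self.act = nn.GELU()
        self.down = nn.Sequential(nn.Linear(d_ff, rank, bias=False),
                                  nn.Linear(rank, d_model, bias=False))

    def forward(self, x):
        return self.down(self.act(self.up(x)))

# Parameter comparison against a dense FFN of the same shape.
dense = 2 * 1024 * 4096
low_rank = 2 * (1024 * 256 + 256 * 4096)
print(f"dense {dense:,} vs low-rank {low_rank:,} parameters")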
+ + ♻ ☆ Tree-Planner: Efficient Close-loop Task Planning with Large Language + Models ICLR 2024 + + +
+ This paper studies closed-loop task planning, which refers to the process of generating a sequence of skills (a plan) to accomplish a specific goal while adapting the plan based on real-time observations. Recently, prompting Large Language Models (LLMs) to generate actions iteratively has become a prevalent paradigm due to its superior performance and user-friendliness. However, this paradigm is plagued by two inefficiencies: high token consumption and redundant error correction, both of which hinder its scalability for large-scale testing and applications. To address these issues, we propose Tree-Planner, which reframes task planning with LLMs into three distinct phases: plan sampling, action tree construction, and grounded deciding. Tree-Planner starts by using an LLM to sample a set of potential plans before execution, followed by aggregating them to form an action tree. Finally, the LLM performs a top-down decision-making process on the tree, taking into account real-time environmental information. Experiments show that Tree-Planner achieves state-of-the-art performance while maintaining high efficiency. By decomposing LLM queries into a single plan-sampling call and multiple grounded-deciding calls, a considerable part of the prompt is less likely to be repeatedly consumed. As a result, token consumption is reduced by 92.2% compared to the previously best-performing model. Additionally, by enabling backtracking on the action tree as needed, the correction process becomes more flexible, leading to a 40.5% decrease in error corrections.
+
+ comment: Published in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ A Bounding Box is Worth One Token: Interleaving Layout and Text in a + Large Language Model for Document Understanding + + +
+ Recently, many studies have demonstrated that exclusively incorporating OCR-derived text and spatial layouts with large language models (LLMs) can be highly effective for document understanding tasks. However, existing methods that integrate spatial layouts with text have limitations, such as producing overly long text sequences or failing to fully leverage the autoregressive traits of LLMs. In this work, we introduce Interleaving Layout and Text in a Large Language Model (LayTextLLM) for document understanding. In particular, LayTextLLM projects each bounding box to a single embedding and interleaves it with text, efficiently avoiding long sequence issues while leveraging autoregressive traits of LLMs. LayTextLLM not only streamlines the interaction of layout and textual data but also shows enhanced performance in Key Information Extraction (KIE) and Visual Question Answering (VQA). Comprehensive benchmark evaluations reveal significant improvements, with a 27.2% increase on KIE tasks and 12.0% on VQA tasks compared to previous state-of-the-art document understanding MLLMs, as well as a 15.1% improvement over other SOTA OCR-based LLMs on KIE tasks.
+
+
+
+
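The central trick, projecting each bounding box to one embedding and interleaving it with the text embeddings, can be sketched as follows; the embedding width, the plain linear projector, and the span-level interleaving granularity are assumptions for illustration.

import torch
import torch.nn as nn

d_model = 768                      # assumed LLM embedding width
box_proj = nn.Linear(4, d_model)   # (x1, y1, x2, y2) -> one layout token

def interleave(text_embeds, boxes):
    """Place one layout embedding before each OCR text span's embeddings.

    text_embeds: list of (span_len, d_model) tensors, one per OCR span
    boxes: (num_spans, 4) normalized bounding boxes
    """
    box_embeds = box_proj(boxes)                       # (num_spans, d_model)
    pieces = []
    for span, box_embed in zip(text_embeds, box_embeds):
        pieces.append(box_embed.unsqueeze(0))          # single layout token
        pieces.append(span)                            # the span's text tokens
    return torch.cat(pieces, dim=0)                    # ready as LLM input embeddings

spans = [torch.randn(3, d_model), torch.randn(5, d_model)]
boxes = torch.rand(2, 4)
print(interleave(spans, boxes).shape)  # torch.Size([10, 768])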
+ + ♻ ☆ Efficient Tuning and Inference for Large Language Models on Textual + Graphs IJCAI2024 + + +
+ The rich textual and topological information of textual graphs needs to be modeled in real-world applications such as webpages, e-commerce, and academic articles. Practitioners have long followed the path of adopting a shallow text encoder and a subsequent graph neural network (GNN) to solve this problem. In light of recent advancements in large language models (LLMs), it is apparent that integrating LLMs for enhanced textual encoding can substantially improve the performance of textual graphs. Nevertheless, the efficiency of these methods poses a significant challenge. In this paper, we propose ENGINE, a parameter- and memory-efficient fine-tuning method for textual graphs with an LLM encoder. The key insight is to combine the LLMs and GNNs through a tunable side structure, which significantly reduces the training complexity without impairing the joint model's capacity. Extensive experiments on textual graphs demonstrate our method's effectiveness by achieving the best model performance while having the lowest training cost compared to previous methods. Moreover, we introduce two variants with caching and dynamic early exit to further enhance training and inference speed. Specifically, caching accelerates ENGINE's training by 12x, and dynamic early exit achieves up to 5x faster inference with a negligible performance drop (at most a 1.17% relative drop across 7 datasets). Our codes are available at: https://github.com/ZhuYun97/ENGINE
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Learning a Patent-Informed Biomedical Knowledge Graph Reveals + Technological Potential of Drug Repositioning Candidates + + +
+ Drug repositioning-a promising strategy for discovering new therapeutic uses +for existing drugs-has been increasingly explored in the computational science +literature using biomedical databases. However, the technological potential of +drug repositioning candidates has often been overlooked. This study presents a +novel protocol to comprehensively analyse various sources such as +pharmaceutical patents and biomedical databases, and identify drug +repositioning candidates with both technological potential and scientific +evidence. To this end, first, we constructed a scientific biomedical knowledge +graph (s-BKG) comprising relationships between drugs, diseases, and genes +derived from biomedical databases. Our protocol involves identifying drugs that +exhibit limited association with the target disease but are closely located in +the s-BKG, as potential drug candidates. We constructed a patent-informed +biomedical knowledge graph (p-BKG) by adding pharmaceutical patent information. +Finally, we developed a graph embedding protocol to ascertain the structure of +the p-BKG, thereby calculating the relevance scores of those candidates with +target disease-related patents to evaluate their technological potential. Our +case study on Alzheimer's disease demonstrates its efficacy and feasibility, +while the quantitative outcomes and systematic methods are expected to bridge +the gap between computational discoveries and successful market applications in +drug repositioning research. + +
+
+ comment: We are sorry to withdraw this paper. We found some critical errors in the introduction and results sections. Specifically, the first author had wrongly inserted citations for background works and made mistakes in the graph embedding methods, so the relevant results were wrongly calculated. In this regard, we are revising the paper and withdraw the current version. Thank you
+
+
+
+
+ + ♻ ☆ Multimodal Detection of Bots on X (Twitter) using Transformers + + +
+ Although not all bots are malicious, the vast majority of them are +responsible for spreading misinformation and manipulating the public opinion +about several issues, i.e., elections and many more. Therefore, the early +detection of bots is crucial. Although there have been proposed methods for +detecting bots in social media, there are still substantial limitations. For +instance, existing research initiatives still extract a large number of +features and train traditional machine learning algorithms or use GloVe +embeddings and train LSTMs. However, feature extraction is a tedious procedure +demanding domain expertise. Also, language models based on transformers have +been proved to be better than LSTMs. Other approaches create large graphs and +train graph neural networks requiring in this way many hours for training and +access to computational resources. To tackle these limitations, this is the +first study employing only the user description field and images of three +channels denoting the type and content of tweets posted by the users. Firstly, +we create digital DNA sequences, transform them to 3d images, and apply +pretrained models of the vision domain, including EfficientNet, AlexNet, VGG16, +etc. Next, we propose a multimodal approach, where we use TwHIN-BERT for +getting the textual representation of the user description field and employ +VGG16 for acquiring the visual representation for the image modality. We +propose three different fusion methods, namely concatenation, gated multimodal +unit, and crossmodal attention, for fusing the different modalities and compare +their performances. Finally, we present a qualitative analysis of the behavior +of our best performing model. Extensive experiments conducted on the Cresci'17 +and TwiBot-20 datasets demonstrate valuable advantages of our introduced +approaches over state-of-the-art ones. + +
+
+ comment: IEEE Transactions on Information Forensics and Security (Accepted) +
+
+
+
+
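Of the three fusion methods listed, the gated multimodal unit is the least standard; below is a sketch of a common formulation (in the style of Arevalo et al., 2017), with hidden sizes chosen arbitrarily rather than taken from the paper.

import torch
import torch.nn as nn

class GatedMultimodalUnit(nn.Module):
    """Fuse text and image features with a learned gate."""

    def __init__(self, text_dim=768, image_dim=512, hidden_dim=256):
        super().__init__()
        self.text_proj = nn.Linear(text_dim, hidden_dim)
        self.image_proj = nn.Linear(image_dim, hidden_dim)
        self.gate = nn.Linear(text_dim + image_dim, hidden_dim)

    def forward(self, text_feat, image_feat):
        h_t = torch.tanh(self.text_proj(text_feat))
        h_i = torch.tanh(self.image_proj(image_feat))
        z = torch.sigmoid(self.gate(torch.cat([text_feat, image_feat], dim=-1)))
        return z * h_t + (1 - z) * h_i   # gate decides per-dimension mixing

fused = GatedMultimodalUnit()(torch.randn(4, 768), torch.randn(4, 512))
print(fused.shape)  # torch.Size([4, 256])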
+ + ♻ ☆ Building Intelligence Identification System via Large Language Model + Watermarking: A Survey and Beyond + + +
+ Large Language Models (LLMs) are increasingly integrated into diverse +industries, posing substantial security risks due to unauthorized replication +and misuse. To mitigate these concerns, robust identification mechanisms are +widely acknowledged as an effective strategy. Identification systems for LLMs +now rely heavily on watermarking technology to manage and protect intellectual +property and ensure data security. However, previous studies have primarily +concentrated on the basic principles of algorithms and lacked a comprehensive +analysis of watermarking theory and practice from the perspective of +intelligent identification. To bridge this gap, firstly, we explore how a +robust identity recognition system can be effectively implemented and managed +within LLMs by various participants using watermarking technology. Secondly, we +propose a mathematical framework based on mutual information theory, which +systematizes the identification process to achieve more precise and customized +watermarking. Additionally, we present a comprehensive evaluation of +performance metrics for LLM watermarking, reflecting participant preferences +and advancing discussions on its identification applications. Lastly, we +outline the existing challenges in current watermarking technologies and +theoretical frameworks, and provide directional guidance to address these +challenges. Our systematic classification and detailed exposition aim to +enhance the comparison and evaluation of various methods, fostering further +research and development toward a transparent, secure, and equitable LLM +ecosystem. + +
+
+ comment: 59 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Artificial Agency and Large Language Models + + +
+ The arrival of Large Language Models (LLMs) has stirred up philosophical +debates about the possibility of realizing agency in an artificial manner. In +this work we contribute to the debate by presenting a theoretical model that +can be used as a threshold conception for artificial agents. The model defines +agents as systems whose actions and goals are always influenced by a dynamic +framework of factors that consists of the agent's accessible history, its +adaptive repertoire and its external environment. This framework, in turn, is +influenced by the actions that the agent takes and the goals that it forms. We +show with the help of the model that state-of-the-art LLMs are not agents yet, +but that there are elements to them that suggest a way forward. The paper +argues that a combination of the agent architecture presented in Park et al. +(2023) together with the use of modules like the Coscientist in Boiko et al. +(2023) could potentially be a way to realize agency in an artificial manner. We +end the paper by reflecting on the obstacles one might face in building such an +artificial agent and by presenting possible directions for future research. + +
+
+ comment: Accepted for publication in journal Intellectica, special issue + "Philosophies of AI: thinking and writing with LLMs" (Intellectica, issue 81) +
+
+
+
+
+ + ♻ ☆ RefuteBench: Evaluating Refuting Instruction-Following for Large + Language Models ACL 2024 + + +
+ The application scope of large language models (LLMs) is increasingly expanding. In practical use, users might provide feedback based on the model's output, hoping for a responsive model that can complete responses according to their feedback. Whether the model can appropriately respond to users' refuting feedback and consistently follow through with execution has not been thoroughly analyzed. In light of this, this paper proposes a comprehensive benchmark, RefuteBench, covering tasks such as question answering, machine translation, and email writing. The evaluation aims to assess whether models can positively accept feedback in the form of refuting instructions and whether they can consistently adhere to user demands throughout the conversation. We conduct evaluations on numerous LLMs and find that LLMs are stubborn, i.e., they exhibit an inclination toward their internal knowledge and often fail to comply with user feedback. Additionally, as the length of the conversation increases, models gradually forget the user's stated feedback and roll back to their own responses. We further propose recall-and-repeat prompts as a simple and effective way to enhance the model's responsiveness to feedback.
+
+ comment: ACL 2024 final version +
+
+
+
+
+ + ♻ ☆ Performance of large language models in numerical vs. semantic medical + knowledge: Benchmarking on evidence-based Q&As + + +
+ Clinical problem-solving requires processing of semantic medical knowledge, such as illness scripts, and numerical medical knowledge of diagnostic tests for evidence-based decision-making. As large language models (LLMs) show promising results in many aspects of language-based clinical practice, their ability to generate non-language, evidence-based answers to clinical questions is inherently limited by tokenization. Therefore, we evaluated LLMs' performance on two question types: numeric (correlating findings) and semantic (differentiating entities), while examining differences within and between LLMs in medical aspects and comparing their performance to humans. To generate straightforward multi-choice questions and answers (QAs) based on evidence-based medicine (EBM), we used a comprehensive medical knowledge graph (encompassing data from more than 50,000 peer-reviewed articles) and created the "EBMQA". EBMQA contains 105,000 QAs labeled with medical and non-medical topics and classified into numerical or semantic questions. We benchmarked this dataset using more than 24,500 QAs on two state-of-the-art LLMs: Chat-GPT4 and Claude3-Opus. We evaluated the LLMs' accuracy on semantic and numerical question types and according to sub-labeled topics. For validation, six medical experts were tested on 100 numerical EBMQA questions. We found that both LLMs excelled more in semantic than numerical QAs, with Claude3 surpassing GPT4 in numerical QAs. However, both LLMs showed inter- and intra-model gaps in different medical aspects and remained inferior to humans. Thus, their medical advice should be used with caution.
+
+
+
+
+ + ♻ ☆ MCFEND: A Multi-source Benchmark Dataset for Chinese Fake News Detection WWW 2024 + + +
+ The prevalence of fake news across various online sources has had a +significant influence on the public. Existing Chinese fake news detection +datasets are limited to news sourced solely from Weibo. However, fake news +originating from multiple sources exhibits diversity in various aspects, +including its content and social context. Methods trained on purely one single +news source can hardly be applicable to real-world scenarios. Our pilot +experiment demonstrates that the F1 score of the state-of-the-art method that +learns from a large Chinese fake news detection dataset, Weibo-21, drops +significantly from 0.943 to 0.470 when the test data is changed to multi-source +news data, failing to identify more than one-third of the multi-source fake +news. To address this limitation, we constructed the first multi-source +benchmark dataset for Chinese fake news detection, termed MCFEND, which is +composed of news we collected from diverse sources such as social platforms, +messaging apps, and traditional online news outlets. Notably, such news has +been fact-checked by 14 authoritative fact-checking agencies worldwide. In +addition, various existing Chinese fake news detection methods are thoroughly +evaluated on our proposed dataset in cross-source, multi-source, and unseen +source ways. MCFEND, as a benchmark dataset, aims to advance Chinese fake news +detection approaches in real-world scenarios. + +
+
+ comment: Accepted by the ACM Web Conference 2024 (WWW 2024) oral, dataset + available: https://github.com/TrustworthyComp +
+
+
+
+
+ + ♻ ☆ Probing the Decision Boundaries of In-context Learning in Large Language + Models + + +
+ In-context learning is a key paradigm in large language models (LLMs) that +enables them to generalize to new tasks and domains by simply prompting these +models with a few exemplars without explicit parameter updates. Many attempts +have been made to understand in-context learning in LLMs as a function of model +scale, pretraining data, and other factors. In this work, we propose a new +mechanism to probe and understand in-context learning from the lens of decision +boundaries for in-context binary classification. Decision boundaries are +straightforward to visualize and provide important information about the +qualitative behavior of the inductive biases of standard classifiers. To our +surprise, we find that the decision boundaries learned by current LLMs in +simple binary classification tasks are often irregular and non-smooth, +regardless of linear separability in the underlying task. This paper +investigates the factors influencing these decision boundaries and explores +methods to enhance their generalizability. We assess various approaches, +including training-free and fine-tuning methods for LLMs, the impact of model +architecture, and the effectiveness of active prompting techniques for +smoothing decision boundaries in a data-efficient manner. Our findings provide +a deeper understanding of in-context learning dynamics and offer practical +improvements for enhancing robustness and generalizability of in-context +learning. + +
+
+ comment: 18 pages, code at https://github.com/siyan-zhao/ICL_decision_boundary +
+
+
+
+
+ + ♻ ☆ The Honorific Effect: Exploring the Impact of Japanese Linguistic + Formalities on AI-Generated Physics Explanations + + +
+ This study investigates the influence of Japanese honorifics on the responses +of large language models (LLMs) when explaining the law of conservation of +momentum. We analyzed the outputs of six state-of-the-art AI models, including +variations of ChatGPT, Coral, and Gemini, using 14 different honorific forms. +Our findings reveal that honorifics significantly affect the quality, +consistency, and formality of AI-generated responses, demonstrating LLMs' +ability to interpret and adapt to social context cues embedded in language. +Notable variations were observed across different models, with some emphasizing +historical context and derivations, while others focused on intuitive +explanations. The study highlights the potential for using honorifics to adjust +the depth and complexity of AI-generated explanations in educational contexts. +Furthermore, the responsiveness of AI models to cultural linguistic elements +underscores the importance of considering cultural factors in AI development +for educational applications. These results open new avenues for research in +AI-assisted education and cultural adaptation in AI systems, with significant +implications for personalizing learning experiences and developing culturally +sensitive AI tools for global education. + +
+
+
+
+
+ + ♻ ☆ Video Understanding with Large Language Models: A Survey + + +
+ With the burgeoning growth of online video platforms and the escalating +volume of video content, the demand for proficient video understanding tools +has intensified markedly. Given the remarkable capabilities of large language +models (LLMs) in language and multimodal tasks, this survey provides a detailed +overview of recent advancements in video understanding that harness the power +of LLMs (Vid-LLMs). The emergent capabilities of Vid-LLMs are surprisingly +advanced, particularly their ability for open-ended multi-granularity (general, +temporal, and spatiotemporal) reasoning combined with commonsense knowledge, +suggesting a promising path for future video understanding. We examine the +unique characteristics and capabilities of Vid-LLMs, categorizing the +approaches into three main types: Video Analyzer x LLM, Video Embedder x LLM, +and (Analyzer + Embedder) x LLM. Furthermore, we identify five sub-types based +on the functions of LLMs in Vid-LLMs: LLM as Summarizer, LLM as Manager, LLM as +Text Decoder, LLM as Regressor, and LLM as Hidden Layer. Furthermore, this +survey presents a comprehensive study of the tasks, datasets, benchmarks, and +evaluation methodologies for Vid-LLMs. Additionally, it explores the expansive +applications of Vid-LLMs across various domains, highlighting their remarkable +scalability and versatility in real-world video understanding challenges. +Finally, it summarizes the limitations of existing Vid-LLMs and outlines +directions for future research. For more information, readers are recommended +to visit the repository at +https://github.com/yunlong10/Awesome-LLMs-for-Video-Understanding. + +
+
+
+
+
+ + ♻ ☆ A Survey of Prompt Engineering Methods in Large Language Models for + Different NLP Tasks + + +
+ Large language models (LLMs) have shown remarkable performance on many different Natural Language Processing (NLP) tasks.
+Prompt engineering builds on the existing abilities of LLMs to achieve significant performance gains on various NLP tasks.
+It requires composing natural language instructions, called prompts, to elicit knowledge from LLMs in a structured way.
+Unlike previous state-of-the-art (SoTA) models, prompt engineering does not require extensive parameter re-training or fine-tuning for the given NLP task and thus operates solely on the embedded knowledge of LLMs.
+Additionally, LLM enthusiasts can extract LLMs' knowledge through a basic natural language conversational exchange or prompt engineering, allowing more and more people, even without a deep mathematical machine learning background, to experiment with LLMs.
+With prompt engineering gaining popularity over the last two years, researchers have devised numerous techniques for designing prompts to improve the accuracy of information extraction from LLMs.
+In this paper, we summarize different prompting techniques and group them by the NLP tasks they have been used for.
+We further highlight, at a granular level, the performance of these prompting strategies on the various datasets belonging to each NLP task, discuss the corresponding LLMs used, present a taxonomy diagram, and discuss the possible SoTA for specific datasets.
+In total, we survey 44 research papers covering 39 different prompting methods on 29 different NLP tasks, most of which have been published in the last two years.
+
+
+
+
+
+
+ + ♻ ☆ LLM as Dataset Analyst: Subpopulation Structure Discovery with Large + Language Model ECCV24 + + +
+ The distribution of subpopulations is an important property hidden within a dataset.
+Uncovering and analyzing the subpopulation distribution within datasets provides a comprehensive understanding of the datasets, standing as a powerful tool beneficial to various downstream tasks, including Dataset Subpopulation Organization, Subpopulation Shift, and Slice Discovery.
+Despite its importance, to our knowledge there has been no work that systematically explores the subpopulation distribution of datasets.
+To address this limitation and solve all the mentioned tasks in a unified way, we introduce a novel concept of subpopulation structures to represent, analyze, and utilize subpopulation distributions within datasets.
+To characterize the structures in an interpretable manner, we propose the Subpopulation Structure Discovery with Large Language Models (SSD-LLM) framework, which employs the world knowledge and instruction-following capabilities of Large Language Models (LLMs) to linguistically analyze informative image captions and summarize the structures.
+Furthermore, we propose complete workflows to address downstream tasks, named Task-specific Tuning, showcasing the application of the discovered structure to a spectrum of subpopulation-related tasks, including dataset subpopulation organization, subpopulation shift, and slice discovery.
+
+
+
+ comment: ECCV24 Camera Ready +
+
+
+
+
+ + ♻ ☆ CHATATC: Large Language Model-Driven Conversational Agents for + Supporting Strategic Air Traffic Flow Management ICRA + + +
+ Generative artificial intelligence (AI) and large language models (LLMs) have gained rapid popularity through publicly available tools such as ChatGPT.
+The adoption of LLMs for personal and professional use is fueled by the natural interactions between human users and computer applications such as ChatGPT, along with powerful summarization and text generation capabilities.
+Given the widespread use of such generative AI tools, in this work we investigate how these tools can be deployed in a non-safety critical, strategic traffic flow management setting.
+Specifically, we train an LLM, CHATATC, based on a large historical data set of Ground Delay Program (GDP) issuances, spanning 2000-2023 and consisting of over 80,000 GDP implementations, revisions, and cancellations.
+We test the query and response capabilities of CHATATC, documenting successes (e.g., providing correct GDP rates, durations, and reasons) and shortcomings (e.g., superlative questions).
+We also detail the design of a graphical user interface for future users to interact and collaborate with the CHATATC conversational agent.
+
+
+
+ comment: 8 pages, 5 figures; minor revisions to address reviewer feedback for + final submission to the 11th International Conference on Research in Air + Transportation (ICRAT) +
+
+
+
+
+ + ♻ ☆ Multi-Convformer: Extending Conformer with Multiple Convolution Kernels INTERSPEECH 2024 + + +
+ Convolutions have become essential in state-of-the-art end-to-end Automatic Speech Recognition (ASR) systems due to their efficient modelling of local context.
+Notably, their use in Conformers has led to superior performance compared to vanilla Transformer-based ASR systems.
+While components other than the convolution module in the Conformer have been reexamined, altering the convolution module itself has been far less explored.
+Towards this, we introduce Multi-Convformer that uses multiple convolution kernels within the convolution module of the Conformer in conjunction with gating.
+This helps in improved modelling of local dependencies at varying granularities.
+Our model rivals existing Conformer variants such as CgMLP and E-Branchformer in performance, while being more parameter efficient.
+We empirically compare our approach with Conformer and its variants across four different datasets and three different modelling paradigms and show up to 8% relative word error rate (WER) improvements.
+
+
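+ As a rough illustration of the idea (not the paper's exact module), a PyTorch sketch of parallel depthwise convolutions with different kernel sizes mixed by a learned gate could look as follows; the kernel sizes and gating form are assumptions:
+
+import torch
+import torch.nn as nn
+
+class MultiKernelConvModule(nn.Module):
+    """Rough sketch: parallel depthwise convolutions with different kernel
+    sizes, mixed by a learned per-utterance gate (not the paper's exact design)."""
+    def __init__(self, channels, kernel_sizes=(3, 7, 15, 31)):
+        super().__init__()
+        self.branches = nn.ModuleList([
+            nn.Conv1d(channels, channels, k, padding=k // 2, groups=channels)
+            for k in kernel_sizes
+        ])
+        self.gate = nn.Linear(channels, len(kernel_sizes))
+
+    def forward(self, x):            # x: (batch, time, channels)
+        g = torch.softmax(self.gate(x.mean(dim=1)), dim=-1)       # (batch, branches)
+        h = x.transpose(1, 2)                                     # (batch, channels, time)
+        outs = torch.stack([b(h) for b in self.branches], dim=1)  # (B, n, C, T)
+        mixed = (g[:, :, None, None] * outs).sum(dim=1)           # gated weighted sum
+        return mixed.transpose(1, 2)                              # back to (B, T, C)
+
+x = torch.randn(2, 100, 64)
+print(MultiKernelConvModule(64)(x).shape)   # torch.Size([2, 100, 64])
+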
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ♻ ☆ Two-stage Generative Question Answering on Temporal Knowledge Graph + Using Large Language Models ACL + + +
+ Temporal knowledge graph question answering (TKGQA) poses a significant challenge due to the temporal constraints hidden in questions and the answers sought from dynamic structured knowledge.
+Although large language models (LLMs) have made considerable progress in their reasoning ability over structured data, their application to the TKGQA task is a relatively unexplored area.
+This paper first proposes a novel generative temporal knowledge graph question answering framework, GenTKGQA, which guides LLMs to answer temporal questions through two phases: Subgraph Retrieval and Answer Generation.
+First, we exploit LLMs' intrinsic knowledge to mine temporal constraints and structural links in the questions without extra training, thus narrowing down the subgraph search space in both temporal and structural dimensions.
+Next, we design virtual knowledge indicators to fuse the graph neural network signals of the subgraph and the text representations of the LLM in a non-shallow way, which helps the open-source LLM deeply understand the temporal order and structural dependencies among the retrieved facts through instruction tuning.
+Experimental results on two widely used datasets demonstrate the superiority of our model.
+
+
+
+ comment: Accepted by ACL(Findings) 2024 +
+
+
+
+
+ + ♻ ☆ PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation + in non-English Text-to-Image Generation ECCV 2024 + + +
+ Text-to-image diffusion models are well-known for their ability to generate realistic images based on textual prompts.
+However, existing works have predominantly focused on English, lacking support for non-English text-to-image models.
+The most commonly used translation methods cannot solve generation problems tied to language-specific culture, while training from scratch on a specific language dataset is prohibitively expensive.
+In this paper, we propose a simple plug-and-play language transfer method based on knowledge distillation.
+All we need to do is train a lightweight MLP-like parameter-efficient adapter (PEA) with only 6M parameters under teacher knowledge distillation along with a small parallel data corpus.
+We are surprised to find that freezing the parameters of UNet can still achieve remarkable performance on the language-specific prompt evaluation set, demonstrating that PEA can stimulate the potential generation ability of the original UNet.
+Additionally, it closely approaches the performance of the English text-to-image model on a general prompt evaluation set.
+Furthermore, our adapter can be used as a plugin to achieve significant results in downstream tasks in cross-lingual text-to-image generation.
+Code will be available at: https://github.com/OPPO-Mente-Lab/PEA-Diffusion
+
+
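+ A minimal sketch of what an MLP-like adapter trained by distillation could look like in PyTorch; the dimensions, architecture details, and distillation target below are illustrative assumptions rather than the paper's exact design:
+
+import torch
+import torch.nn as nn
+
+class PEAdapter(nn.Module):
+    """Sketch of an MLP-like parameter-efficient adapter that maps embeddings
+    from a non-English text encoder into the space expected by the frozen UNet.
+    Hidden sizes are illustrative, chosen only to keep the parameter count small."""
+    def __init__(self, in_dim=1024, out_dim=768, hidden=1024):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(in_dim, hidden), nn.GELU(), nn.Linear(hidden, out_dim)
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+adapter = PEAdapter()
+student_emb = torch.randn(4, 77, 1024)   # non-English encoder output (assumed shape)
+teacher_emb = torch.randn(4, 77, 768)    # English teacher embedding for parallel text
+# Knowledge-distillation style objective: match the teacher's embedding space.
+loss = nn.functional.mse_loss(adapter(student_emb), teacher_emb)
+loss.backward()
+print(loss.item())
+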
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Tailoring Vaccine Messaging with Common-Ground Opinions NAACL + + +
+ One way to personalize chatbot interactions is by establishing common ground with the intended reader.
+A domain where establishing mutual understanding could be particularly impactful is vaccine concerns and misinformation.
+Vaccine interventions are forms of messaging which aim to answer concerns expressed about vaccination.
+Tailoring responses in this domain is difficult, since opinions often have seemingly little ideological overlap.
+We define the task of tailoring vaccine interventions to a Common-Ground Opinion (CGO).
+Tailoring responses to a CGO involves meaningfully improving the answer by relating it to an opinion or belief the reader holds.
+In this paper we introduce TAILOR-CGO, a dataset for evaluating how well responses are tailored to provided CGOs.
+We benchmark several major LLMs on this task, finding that GPT-4-Turbo performs significantly better than the others.
+We also build automatic evaluation metrics, including an efficient and accurate BERT model that outperforms finetuned LLMs, investigate how to successfully tailor vaccine messaging to CGOs, and provide actionable recommendations from this investigation.
+ Code and model weights: https://github.com/rickardstureborg/tailor-cgo
+Dataset: https://huggingface.co/datasets/DukeNLP/tailor-cgo
+
+
+
+ comment: NAACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Data Mixture Inference: What do BPE Tokenizers Reveal about their + Training Data? + + +
+ The pretraining data of today's strongest language models is opaque; in particular, little is known about the proportions of various domains or languages represented.
+In this work, we tackle a task which we call data mixture inference, which aims to uncover the distributional make-up of training data.
+We introduce a novel attack based on a previously overlooked source of information -- byte-pair encoding (BPE) tokenizers, used by the vast majority of modern language models.
+Our key insight is that the ordered list of merge rules learned by a BPE tokenizer naturally reveals information about the token frequencies in its training data: the first merge is the most common byte pair, the second is the most common pair after merging the first token, and so on.
+Given a tokenizer's merge list along with data samples for each category of interest, we formulate a linear program that solves for the proportion of each category in the tokenizer's training set.
+Importantly, to the extent that the tokenizer's training data is representative of the pretraining data, we indirectly learn about the pretraining data.
+In controlled experiments, we show that our attack recovers mixture ratios with high precision for tokenizers trained on known mixtures of natural languages, programming languages, and data sources.
+We then apply our approach to off-the-shelf tokenizers released with recent LMs.
+We confirm much publicly disclosed information about these models, and also make several new inferences: GPT-4o's tokenizer is much more multilingual than its predecessors, training on 39% non-English data; Llama3 extends GPT-3.5's tokenizer primarily for multilingual (48%) use; GPT-3.5's and Claude's tokenizers are trained predominantly on code (~60%).
+We hope our work sheds light on current design practices for pretraining data, and inspires continued research into data mixture inference for LMs.
+
+
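+ The paper formulates a linear program over the ordered merge list; as a much-simplified, illustrative stand-in, one can recover non-negative mixture weights from per-category byte-pair frequencies with constrained least squares (all numbers below are made up):
+
+import numpy as np
+from scipy.optimize import nnls
+
+# Toy stand-in for the paper's linear program: suppose we measured, for each
+# candidate category (e.g. English, code, French), how often a handful of byte
+# pairs occur, and we observe the pair statistics implied by the tokenizer's
+# merge list. We then solve for non-negative mixture weights that best explain
+# the observation.
+pair_freq_per_category = np.array([
+    [0.30, 0.05, 0.20],   # pair "th": common in English, rare in code
+    [0.02, 0.40, 0.03],   # pair "()": common in code
+    [0.10, 0.05, 0.35],   # pair "es": common in French
+    [0.25, 0.10, 0.15],
+])                         # shape: (num_pairs, num_categories)
+observed_tokenizer_freq = np.array([0.21, 0.15, 0.16, 0.19])
+
+weights, _ = nnls(pair_freq_per_category, observed_tokenizer_freq)
+mixture = weights / weights.sum()
+print({"english": mixture[0], "code": mixture[1], "french": mixture[2]})
+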
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Understanding and Mitigating the Threat of Vec2Text to Dense Retrieval + Systems + + +
+ The emergence of Vec2Text -- a method for text embedding inversion -- has +raised serious privacy concerns for dense retrieval systems which use text +embeddings, such as those offered by OpenAI and Cohere. This threat comes from +the ability for a malicious attacker with access to embeddings to reconstruct +the original text. In this paper, we investigate various factors related to +embedding models that may impact text recoverability via Vec2Text. We explore +factors such as distance metrics, pooling functions, bottleneck pre-training, +training with noise addition, embedding quantization, and embedding dimensions, +which were not considered in the original Vec2Text paper. Through a +comprehensive analysis of these factors, our objective is to gain a deeper +understanding of the key elements that affect the trade-offs between the text +recoverability and retrieval effectiveness of dense retrieval systems, offering +insights for practitioners designing privacy-aware dense retrieval systems. We +also propose a simple embedding transformation fix that guarantees equal +ranking effectiveness while mitigating the recoverability risk. Overall, this +study reveals that Vec2Text could pose a threat to current dense retrieval +systems, but there are some effective methods to patch such systems. + +
+
+
+
+
+ + ♻ ☆ Can GPT-4 learn to analyze moves in research article abstracts? + + +
+ One of the most powerful and enduring ideas in written discourse analysis is +that genres can be described in terms of the moves which structure a writer's +purpose. Considerable research has sought to identify these distinct +communicative acts, but analyses have been beset by problems of subjectivity, +reliability and the time-consuming need for multiple coders to confirm +analyses. In this paper we employ the affordances of GPT-4 to automate the +annotation process by using natural language prompts. Focusing on abstracts +from articles in four applied linguistics journals, we devise prompts which +enable the model to identify moves effectively. The annotated outputs of these +prompts were evaluated by two assessors with a third addressing disagreements. +The results show that an 8-shot prompt was more effective than one using two, +confirming that the inclusion of examples illustrating areas of variability can +enhance GPT-4's ability to recognize multiple moves in a single sentence and +reduce bias related to textual position. We suggest that GPT-4 offers +considerable potential in automating this annotation process, when human actors +with domain specific linguistic expertise inform the prompting process. + +
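+ For illustration only, an n-shot move-annotation request of this kind could be issued through the OpenAI Python client roughly as follows; the move labels, example sentences, and prompt wording are placeholders, not the prompts used in the study:
+
+from openai import OpenAI
+
+client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
+
+# Illustrative few-shot prompt for rhetorical move annotation; the labels and
+# example sentences here are placeholders, not the prompts devised in the paper.
+system = ("You label each sentence of a research-article abstract with one or "
+          "more rhetorical moves: Background, Purpose, Method, Result, Conclusion.")
+shots = (
+    "Sentence: Little is known about X.\nMoves: Background\n"
+    "Sentence: We collected 200 abstracts and annotated them manually.\nMoves: Method\n"
+)
+abstract_sentence = "Our findings suggest that the new approach improves recall."
+
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[
+        {"role": "system", "content": system},
+        {"role": "user", "content": shots + f"Sentence: {abstract_sentence}\nMoves:"},
+    ],
+)
+print(response.choices[0].message.content)
+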
+
+
+
+
+ + ♻ ☆ Cascaded Cross-Modal Transformer for Audio-Textual Classification + + +
+ Speech classification tasks often require powerful language understanding +models to grasp useful features, which becomes problematic when limited +training data is available. To attain superior classification performance, we +propose to harness the inherent value of multimodal representations by +transcribing speech using automatic speech recognition (ASR) models and +translating the transcripts into different languages via pretrained translation +models. We thus obtain an audio-textual (multimodal) representation for each +data sample. Subsequently, we combine language-specific Bidirectional Encoder +Representations from Transformers (BERT) with Wav2Vec2.0 audio features via a +novel cascaded cross-modal transformer (CCMT). Our model is based on two +cascaded transformer blocks. The first one combines text-specific features from +distinct languages, while the second one combines acoustic features with +multilingual features previously learned by the first transformer block. We +employed our system in the Requests Sub-Challenge of the ACM Multimedia 2023 +Computational Paralinguistics Challenge. CCMT was declared the winning +solution, obtaining an unweighted average recall (UAR) of 65.41% and 85.87% for +complaint and request detection, respectively. Moreover, we applied our +framework on the Speech Commands v2 and HarperValleyBank dialog data sets, +surpassing previous studies reporting results on these benchmarks. Our code is +freely available for download at: https://github.com/ristea/ccmt. + +
+
+ comment: Accepted for publication in Artificial Intelligence Review +
+
+
+
+
+ + ♻ ☆ Distilling Robustness into Natural Language Inference Models with + Domain-Targeted Augmentation ACL + + +
+ Knowledge distillation optimises a smaller student model to behave similarly +to a larger teacher model, retaining some of the performance benefits. While +this method can improve results on in-distribution examples, it does not +necessarily generalise to out-of-distribution (OOD) settings. We investigate +two complementary methods for improving the robustness of the resulting student +models on OOD domains. The first approach augments the distillation with +generated unlabelled examples that match the target distribution. The second +method upsamples data points among the training set that are similar to the +target distribution. When applied on the task of natural language inference +(NLI), our experiments on MNLI show that distillation with these modifications +outperforms previous robustness solutions. We also find that these methods +improve performance on OOD domains even beyond the target domain. + +
+
+ comment: Accepted at ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Distilling System 2 into System 1 + + +
+ Large language models (LLMs) can spend extra compute during inference to +generate intermediate thoughts, which helps to produce better final responses. +Since Chain-of-Thought (Wei et al., 2022), many such System 2 techniques have +been proposed such as Rephrase and Respond (Deng et al., 2023a), System 2 +Attention (Weston and Sukhbaatar, 2023) and Branch-Solve-Merge (Saha et al., +2023). In this work we investigate self-supervised methods to ``compile'' +(distill) higher quality outputs from System 2 techniques back into LLM +generations without intermediate reasoning token sequences, as this reasoning +has been distilled into System 1. We show that several such techniques can be +successfully distilled, resulting in improved results compared to the original +System 1 performance, and with less inference cost than System 2. We posit that +such System 2 distillation will be an important feature of future continually +learning AI systems, enabling them to focus System 2 capabilities on the +reasoning tasks that they cannot yet do well. + +
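+ One simple form of such a self-supervised filter is majority voting over sampled System 2 outputs, keeping only the (input, final answer) pair for fine-tuning; the sketch below uses a stub in place of the actual LLM call and is an illustration, not the paper's exact recipe:
+
+from collections import Counter
+
+def system2_answer(question, seed):
+    # Stub standing in for a System 2 pipeline (e.g. chain-of-thought sampling);
+    # a real implementation would prompt an LLM and parse its final answer.
+    return "42" if seed % 4 != 0 else "41"   # mostly consistent, occasionally noisy
+
+def build_distillation_pair(question, num_samples=8):
+    """Sample several System 2 outputs, keep the majority answer, and discard
+    the intermediate reasoning: the (question, answer) pair is what System 1
+    is later fine-tuned on. Majority voting is one simple self-supervised
+    consistency criterion; others are possible."""
+    answers = [system2_answer(question, s) for s in range(num_samples)]
+    answer, count = Counter(answers).most_common(1)[0]
+    if count / num_samples < 0.5:        # low agreement: skip unreliable examples
+        return None
+    return {"input": question, "target": answer}
+
+print(build_distillation_pair("What is 17 + 25?"))
+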
+
+
+
+
+ + ♻ ☆ Multi-step Inference over Unstructured Data + + +
+ The advent of Large Language Models (LLMs) and Generative AI has +revolutionized natural language applications across various domains. However, +high-stakes decision-making tasks in fields such as medical, legal and finance +require a level of precision, comprehensiveness, and logical consistency that +pure LLM or Retrieval-Augmented-Generation (RAG) approaches often fail to +deliver. At Elemental Cognition (EC), we have developed a neuro-symbolic AI +platform to tackle these problems. The platform integrates fine-tuned LLMs for +knowledge extraction and alignment with a robust symbolic reasoning engine for +logical inference, planning and interactive constraint solving. We describe +Cora, a Collaborative Research Assistant built on this platform, that is +designed to perform complex research and discovery tasks in high-stakes +domains. This paper discusses the multi-step inference challenges inherent in +such domains, critiques the limitations of existing LLM-based methods, and +demonstrates how Cora's neuro-symbolic approach effectively addresses these +issues. We provide an overview of the system architecture, key algorithms for +knowledge extraction and formal reasoning, and present preliminary evaluation +results that highlight Cora's superior performance compared to well-known LLM +and RAG baselines. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 140 + +
+
+
+ + ☆ SV4D: Dynamic 3D Content Generation with Multi-Frame and Multi-View + Consistency + + +
+ We present Stable Video 4D (SV4D), a latent video diffusion model for +multi-frame and multi-view consistent dynamic 3D content generation. Unlike +previous methods that rely on separately trained generative models for video +generation and novel view synthesis, we design a unified diffusion model to +generate novel view videos of dynamic 3D objects. Specifically, given a +monocular reference video, SV4D generates novel views for each video frame that +are temporally consistent. We then use the generated novel view videos to +optimize an implicit 4D representation (dynamic NeRF) efficiently, without the +need for cumbersome SDS-based optimization used in most prior works. To train +our unified novel view video generation model, we curated a dynamic 3D object +dataset from the existing Objaverse dataset. Extensive experimental results on +multiple datasets and user studies demonstrate SV4D's state-of-the-art +performance on novel-view video synthesis as well as 4D generation compared to +prior works. + +
+
+ comment: Project page: https://sv4d.github.io/ +
+
+
+
+
+ + ☆ SoNIC: Safe Social Navigation with Adaptive Conformal Inference and + Constrained Reinforcement Learning + + +
+ Reinforcement Learning (RL) has enabled social robots to generate trajectories without human-designed rules or interventions, which makes it more effective than hard-coded systems for generalizing to complex real-world scenarios.
+However, social navigation is a safety-critical task that requires robots to avoid collisions with pedestrians, yet previous RL-based solutions fall short in safety performance in complex environments.
+To enhance the safety of RL policies, we propose SoNIC, to the best of our knowledge the first algorithm that integrates adaptive conformal inference (ACI) with constrained reinforcement learning (CRL) to learn safe policies for social navigation.
+More specifically, our method augments RL observations with ACI-generated nonconformity scores and provides explicit guidance for agents to leverage the uncertainty metrics to avoid safety-critical areas by incorporating safety constraints with spatial relaxation.
+Our method outperforms state-of-the-art baselines in terms of both safety and adherence to social norms by a large margin and demonstrates much stronger robustness to out-of-distribution scenarios.
+Our code and video demos are available on our project website: https://sonic-social-nav.github.io/.
+
+
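+ For context, a generic online adaptive conformal inference update (in the style of Gibbs and Candès) that produces per-step nonconformity thresholds, the kind of uncertainty signal that can be appended to an RL observation, is sketched below; the scores, target miscoverage, and learning rate are toy assumptions:
+
+import numpy as np
+
+def adaptive_conformal_thresholds(scores, target_alpha=0.1, gamma=0.05):
+    """Generic online adaptive conformal inference: maintain a running
+    miscoverage level alpha_t and emit, at each step, the (1 - alpha_t)
+    quantile of past nonconformity scores. The emitted threshold (and how far
+    the current score exceeds it) is the kind of uncertainty metric that can
+    be fed to a policy as an extra observation."""
+    alpha = target_alpha
+    thresholds, history = [], []
+    for s in scores:
+        q = np.quantile(history, 1 - np.clip(alpha, 0.0, 1.0)) if history else np.inf
+        thresholds.append(q)
+        err = float(s > q)                        # 1 if the new score was not covered
+        alpha = alpha + gamma * (target_alpha - err)
+        history.append(s)
+    return np.array(thresholds)
+
+rng = np.random.default_rng(0)
+scores = rng.exponential(scale=1.0, size=200)     # toy nonconformity scores
+print(adaptive_conformal_thresholds(scores)[-5:])
+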
+
+ comment: Project website: https://sonic-social-nav.github.io/ +
+
+
+
+
+ + ☆ CSCPR: Cross-Source-Context Indoor RGB-D Place Recognition + + +
+ We present a new algorithm, Cross-Source-Context Place Recognition (CSCPR), for RGB-D indoor place recognition that integrates global retrieval and reranking into a single end-to-end model.
+Unlike prior approaches that primarily focus on the RGB domain, CSCPR is designed to handle RGB-D data.
+We extend the Context-of-Clusters (CoCs) for handling noisy colorized point clouds and introduce two novel modules for reranking: the Self-Context Cluster (SCC) and Cross Source Context Cluster (CSCC), which enhance feature representation and match query-database pairs based on local features, respectively.
+We also present two new datasets, ScanNetIPR and ARKitIPR.
+Our experiments demonstrate that CSCPR significantly outperforms state-of-the-art models by at least 36.5% in Recall@1 on the ScanNet-PR dataset and by 44% on the new datasets.
+Code and datasets will be released.
+
+
+
+
+
+
+ + ☆ $VILA^2$: VILA Augmented VILA + + +
+ Visual language models (VLMs) have rapidly progressed, driven by the success of large language models (LLMs).
+While model architectures and training infrastructures advance rapidly, data curation remains under-explored.
+When data quantity and quality become a bottleneck, existing work either directly crawls more raw data from the Internet, with no guarantee of data quality, or distills from black-box commercial models (e.g., GPT-4V / Gemini), which upper-bounds performance by that of the teacher model.
+In this work, we introduce a novel approach that includes a self-augment step and a specialist-augment step to iteratively improve data quality and model performance.
+In the self-augment step, a VLM recaptions its own pretraining data to enhance data quality, and then retrains from scratch using this refined dataset to improve model performance.
+This process can iterate for several rounds.
+Once self-augmentation saturates, we employ several specialist VLMs finetuned from the self-augmented VLM with domain-specific expertise, to further infuse specialist knowledge into the generalist VLM through task-oriented recaptioning and retraining.
+With the combined self-augmented and specialist-augmented training, we introduce $VILA^2$ (VILA-augmented-VILA), a VLM family that consistently improves the accuracy on a wide range of tasks over prior art, and achieves new state-of-the-art results on the MMMU leaderboard among open-sourced models.
+
+
+
+
+
+
+ + ☆ Looking at Model Debiasing through the Lens of Anomaly Detection + + +
+ It is widely recognized that deep neural networks are sensitive to bias in +the data. This means that during training these models are likely to learn +spurious correlations between data and labels, resulting in limited +generalization abilities and low performance. In this context, model debiasing +approaches can be devised aiming at reducing the model's dependency on such +unwanted correlations, either leveraging the knowledge of bias information or +not. In this work, we focus on the latter and more realistic scenario, showing +the importance of accurately predicting the bias-conflicting and bias-aligned +samples to obtain compelling performance in bias mitigation. On this ground, we +propose to conceive the problem of model bias from an out-of-distribution +perspective, introducing a new bias identification method based on anomaly +detection. We claim that when data is mostly biased, bias-conflicting samples +can be regarded as outliers with respect to the bias-aligned distribution in +the feature space of a biased model, thus allowing for precisely detecting them +with an anomaly detection method. Coupling the proposed bias identification +approach with bias-conflicting data upsampling and augmentation in a two-step +strategy, we reach state-of-the-art performance on synthetic and real benchmark +datasets. Ultimately, our proposed approach shows that the data bias issue does +not necessarily require complex debiasing methods, given that an accurate bias +identification procedure is defined. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ AHMF: Adaptive Hybrid-Memory-Fusion Model for Driver Attention + Prediction + + +
+ Accurate driver attention prediction can serve as a critical reference for +intelligent vehicles in understanding traffic scenes and making informed +driving decisions. Though existing studies on driver attention prediction +improved performance by incorporating advanced saliency detection techniques, +they overlooked the opportunity to achieve human-inspired prediction by +analyzing driving tasks from a cognitive science perspective. During driving, +drivers' working memory and long-term memory play crucial roles in scene +comprehension and experience retrieval, respectively. Together, they form +situational awareness, facilitating drivers to quickly understand the current +traffic situation and make optimal decisions based on past driving experiences. +To explicitly integrate these two types of memory, this paper proposes an +Adaptive Hybrid-Memory-Fusion (AHMF) driver attention prediction model to +achieve more human-like predictions. Specifically, the model first encodes +information about specific hazardous stimuli in the current scene to form +working memories. Then, it adaptively retrieves similar situational experiences +from the long-term memory for final prediction. Utilizing domain adaptation +techniques, the model performs parallel training across multiple datasets, +thereby enriching the accumulated driving experience within the long-term +memory module. Compared to existing models, our model demonstrates significant +improvements across various metrics on multiple public datasets, proving the +effectiveness of integrating hybrid memories in driver attention prediction. + +
+
+
+
+
+ + ☆ HumanVid: Demystifying Training Data for Camera-controllable Human Image + Animation + + +
+ Human image animation involves generating videos from a character photo, allowing user control and unlocking potential for video and movie production.
+While recent approaches yield impressive results using high-quality training data, the inaccessibility of these datasets hampers fair and transparent benchmarking.
+Moreover, these approaches prioritize 2D human motion and overlook the significance of camera motions in videos, leading to limited control and unstable video generation.
+To demystify the training data, we present HumanVid, the first large-scale high-quality dataset tailored for human image animation, which combines crafted real-world and synthetic data.
+For the real-world data, we compile a vast collection of copyright-free real-world videos from the internet.
+Through a carefully designed rule-based filtering strategy, we ensure the inclusion of high-quality videos, resulting in a collection of 20K human-centric videos in 1080P resolution.
+Human and camera motion annotation is accomplished using a 2D pose estimator and a SLAM-based method.
+For the synthetic data, we gather 2,300 copyright-free 3D avatar assets to augment existing available 3D assets.
+Notably, we introduce a rule-based camera trajectory generation method, enabling the synthetic pipeline to incorporate diverse and precise camera motion annotation, which can rarely be found in real-world data.
+To verify the effectiveness of HumanVid, we establish a baseline model named CamAnimate, short for Camera-controllable Human Animation, that considers both human and camera motions as conditions.
+Through extensive experimentation, we demonstrate that such simple baseline training on our HumanVid achieves state-of-the-art performance in controlling both human pose and camera motions, setting a new benchmark.
+Code and data will be publicly available at https://github.com/zhenzhiwang/HumanVid/.
+
+
+
+ comment: camera controllable human image animation, a dataset and a baseline +
+
+
+
+
+ + ☆ 3D Gaussian Splatting: Survey, Technologies, Challenges, and + Opportunities + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a prominent technique with the +potential to become a mainstream method for 3D representations. It can +effectively transform multi-view images into explicit 3D Gaussian +representations through efficient training, and achieve real-time rendering of +novel views. This survey aims to analyze existing 3DGS-related works from +multiple intersecting perspectives, including related tasks, technologies, +challenges, and opportunities. The primary objective is to provide newcomers +with a rapid understanding of the field and to assist researchers in +methodically organizing existing technologies and challenges. Specifically, we +delve into the optimization, application, and extension of 3DGS, categorizing +them based on their focuses or motivations. Additionally, we summarize and +classify nine types of technical modules and corresponding improvements +identified in existing works. Based on these analyses, we further examine the +common challenges and technologies across various tasks, proposing potential +research opportunities. + +
+
+
+
+
+ + ☆ (PASS) Visual Prompt Locates Good Structure Sparsity through a Recurrent + HyperNetwork + + +
+ Large-scale neural networks have demonstrated remarkable performance in different domains like vision and language processing, although at the cost of massive computation resources.
+As illustrated by compression literature, structural model pruning is a prominent algorithm to encourage model efficiency, thanks to its acceleration-friendly sparsity patterns.
+One of the key questions of structural pruning is how to estimate the channel significance.
+In parallel, work on data-centric AI has shown that prompting-based techniques enable impressive generalization of large language models across diverse downstream tasks.
+In this paper, we investigate a charming possibility - leveraging visual prompts to capture the channel importance and derive high-quality structural sparsity.
+To this end, we propose a novel algorithmic framework, namely PASS.
+It is a tailored hyper-network to take both visual prompts and network weight statistics as input, and output layer-wise channel sparsity in a recurrent manner.
+Such designs consider the intrinsic channel dependency between layers.
+Comprehensive experiments across multiple network architectures and six datasets demonstrate the superiority of PASS in locating good structural sparsity.
+For example, at the same FLOPs level, PASS subnetworks achieve 1%-3% better accuracy on the Food101 dataset; or with a similar performance of 80% accuracy, PASS subnetworks obtain 0.35x more speedup than the baselines.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ Generation of Training Data from HD Maps in the Lanelet2 Framework + + +
+ Using HD maps directly as training data for machine learning tasks has seen a massive surge in popularity and shown promising results, e.g. in the field of map perception.
+Despite that, a standardized HD map framework supporting all parts of map-based automated driving and training label generation from map data does not exist.
+Furthermore, feeding map perception models with map data as part of the input during real-time inference is not addressed by the research community.
+In order to fill this gap, we present lanelet2_ml_converter, an integrated extension to the HD map framework Lanelet2, widely used in automated driving systems by academia and industry.
+With this addition Lanelet2 unifies map based automated driving, machine learning inference and training, all from a single source of map data and format.
+Requirements for a unified framework are analyzed and the implementation of these requirements is described.
+The usability of labels in state-of-the-art machine learning is demonstrated with application examples from the field of map perception.
+The source code is available embedded in the Lanelet2 framework under https://github.com/fzi-forschungszentrum-informatik/Lanelet2/tree/feature_ml_converter
+
+
+
+
+
+
+ + ☆ Self-Calibrated Variance-Stabilizing Transformations for Real-World + Image Denoising + + +
+ Supervised deep learning has become the method of choice for image denoising.
+It involves the training of neural networks on large datasets composed of pairs of noisy and clean images.
+However, the necessity of training data that are specific to the targeted application constrains the widespread use of denoising networks.
+Recently, several approaches have been developed to overcome this difficulty by either artificially generating realistic clean/noisy image pairs or training exclusively on noisy images.
+In this paper, we show that, contrary to popular belief, denoising networks specialized in the removal of Gaussian noise can be efficiently leveraged in favor of real-world image denoising, even without additional training.
+For this to happen, an appropriate variance-stabilizing transform (VST) has to be applied beforehand.
+We propose an algorithm termed Noise2VST for the learning of such a model-free VST.
+Our approach requires only the input noisy image and an off-the-shelf Gaussian denoiser.
+We demonstrate through extensive experiments the efficiency and superiority of Noise2VST in comparison to existing methods trained in the absence of specific clean/noisy pairs.
+
+
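+ The overall recipe, a VST followed by Gaussian denoising and the inverse transform, can be illustrated with the classical Anscombe transform for Poisson noise; the paper instead learns a model-free VST, and a real Gaussian denoiser would replace the stub below:
+
+import numpy as np
+
+def anscombe(x):
+    # Classical VST for Poisson noise: makes the variance approximately unit.
+    return 2.0 * np.sqrt(x + 3.0 / 8.0)
+
+def inverse_anscombe(y):
+    # Simple algebraic inverse (an exact unbiased inverse also exists).
+    return (y / 2.0) ** 2 - 3.0 / 8.0
+
+def gaussian_denoiser(img):
+    # Stub for an off-the-shelf Gaussian denoiser (e.g. a pretrained network);
+    # here a 3x3 box blur stands in so the example runs without extra dependencies.
+    padded = np.pad(img, 1, mode="reflect")
+    return sum(np.roll(np.roll(padded, i, 0), j, 1)
+               for i in (-1, 0, 1) for j in (-1, 0, 1))[1:-1, 1:-1] / 9.0
+
+rng = np.random.default_rng(0)
+clean = rng.uniform(5, 50, size=(64, 64))
+noisy = rng.poisson(clean).astype(float)          # signal-dependent noise
+
+# VST -> Gaussian denoising -> inverse VST, the pipeline the paper builds on
+# (with the fixed Anscombe transform replaced by a learned, model-free VST).
+denoised = inverse_anscombe(gaussian_denoiser(anscombe(noisy)))
+print(float(np.mean((denoised - clean) ** 2)))
+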
+
+
+
+
+ + ☆ 3D Question Answering for City Scene Understanding + + +
+ 3D multimodal question answering (MQA) plays a crucial role in scene understanding by enabling intelligent agents to comprehend their surroundings in 3D environments.
+While existing research has primarily focused on indoor household tasks and outdoor roadside autonomous driving tasks, there has been limited exploration of city-level scene understanding tasks.
+Furthermore, existing research faces challenges in understanding city scenes, due to the absence of spatial semantic information and human-environment interaction information at the city level.
+To address these challenges, we investigate 3D MQA from both dataset and method perspectives.
+From the dataset perspective, we introduce a novel 3D MQA dataset named City-3DQA for city-level scene understanding, which is the first dataset to incorporate scene semantic and human-environment interactive tasks within the city.
+From the method perspective, we propose a Scene graph enhanced City-level Understanding method (Sg-CityU), which utilizes the scene graph to introduce spatial semantics.
+A new benchmark is reported and our proposed Sg-CityU achieves accuracies of 63.94% and 63.76% in different settings of City-3DQA.
+Compared to indoor 3D MQA methods and zero-shot using advanced large language models (LLMs), Sg-CityU demonstrates state-of-the-art (SOTA) performance in robustness and generalization.
+
+
+
+
+
+
+ + ☆ 2D and 3D Deep Learning Models for MRI-based Parkinson's Disease + Classification: A Comparative Analysis of Convolutional Kolmogorov-Arnold + Networks, Convolutional Neural Networks, and Graph Convolutional Networks + + +
+ Early and accurate diagnosis of Parkinson's Disease (PD) remains challenging. +This study compares deep learning architectures for MRI-based PD +classification, introducing the first three-dimensional (3D) implementation of +Convolutional Kolmogorov-Arnold Networks (ConvKANs), a new approach that +combines convolution layers with adaptive, spline-based activations. We +evaluated Convolutional Neural Networks (CNNs), ConvKANs, and Graph +Convolutional Networks (GCNs) using three open-source datasets; a total of 142 +participants (75 with PD and 67 age-matched healthy controls). For 2D analysis, +we extracted 100 axial slices centred on the midbrain from each T1-weighted +scan. For 3D analysis, we used the entire volumetric scans. ConvKANs integrate +learnable B-spline functions with convolutional layers. GCNs represent MRI data +as graphs, theoretically capturing structural relationships that may be +overlooked by traditional approaches. Interpretability visualizations, +including the first ConvKAN spline activation maps, and projections of graph +node embeddings, were depicted. ConvKANs demonstrated high performance across +datasets and dimensionalities, achieving the highest 2D AUROC (0.98) in one +dataset and matching CNN peak 3D performance (1.00). CNN models performed well, +while GCN models improved in 3D analyses, reaching up to 0.97 AUROC. 3D +implementations yielded higher AUROC values compared to 2D counterparts across +all models. ConvKAN implementation shows promise for MRI analysis in PD +classification, particularly in the context of early diagnosis. The improvement +in 3D analyses highlights the value of volumetric data in capturing subtle +PD-related changes. While MRI is not currently used for PD diagnosis, these +findings suggest its potential as a component of a multimodal diagnostic +approach, especially for early detection. + +
+
+ comment: 19 Pages, 5 figures +
+
+
+
+
+ + ☆ MMRA: A Benchmark for Multi-granularity Multi-image Relational + Association + + +
+ Given the remarkable success that large visual language models (LVLMs) have achieved in image perception tasks, the endeavor to make LVLMs perceive the world like humans is drawing increasing attention.
+Current multi-modal benchmarks mainly focus on objective facts or potential knowledge related to a certain topic within an image, but overlook the associative relations between multiple images.
+Therefore, we define a multi-image relation association task, and meticulously curate the MMRA benchmark, a Multi-granularity Multi-image Relational Association benchmark, consisting of 1,026 samples.
+In order to systematically and comprehensively evaluate mainstream LVLMs, we establish an associational relation system among images that contains 11 subtasks (e.g., UsageSimilarity, SubEvent, etc.) at two granularity levels (i.e., "image" and "entity") according to the relations in ConceptNet.
+Our experiments demonstrate that, on our MMRA benchmark, current mainstream LVLMs all have their own advantages and disadvantages across different subtasks.
+It is worth noting that, at the entity level, the performance of all models is worse than that at the image level, indicating that the fine-grained multi-image perception task is still challenging for LVLMs.
+The tasks related to spatial perception are relatively difficult for LVLMs to handle.
+Furthermore, we find that LVLMs exhibit a good ability to perceive image details, and the key to enhancing their multi-image association capability is to strengthen the reasoning ability of their language model component.
+All our codes and data are released at https://github.com/Wusiwei0410/MMRA.
+
+
+
+ comment: VLMS, Multi-Image Association +
+
+
+
+
+ + ☆ PrevPredMap: Exploring Temporal Modeling with Previous Predictions for + Online Vectorized HD Map Construction + + +
+ Temporal information is crucial for detecting occluded instances. Existing +temporal representations have progressed from BEV or PV features to more +compact query features. Compared to these aforementioned features, predictions +offer the highest level of abstraction, providing explicit information. In the +context of online vectorized HD map construction, this unique characteristic of +predictions is potentially advantageous for long-term temporal modeling and the +integration of map priors. This paper introduces PrevPredMap, a pioneering +temporal modeling framework that leverages previous predictions for +constructing online vectorized HD maps. We have meticulously crafted two +essential modules for PrevPredMap: the previous-predictions-based query +generator and the dynamic-position-query decoder. Specifically, the +previous-predictions-based query generator is designed to separately encode +different types of information from previous predictions, which are then +effectively utilized by the dynamic-position-query decoder to generate current +predictions. Furthermore, we have developed a dual-mode strategy to ensure +PrevPredMap's robust performance across both single-frame and temporal modes. +Extensive experiments demonstrate that PrevPredMap achieves state-of-the-art +performance on the nuScenes and Argoverse2 datasets. Code will be available at +https://github.com/pnnnnnnn/PrevPredMap. + +
+
+
+
+
+ + ☆ ViPer: Visual Personalization of Generative Models via Individual + Preference Learning + + +
+ Different users find different images generated for the same prompt +desirable. This gives rise to personalized image generation which involves +creating images aligned with an individual's visual preference. Current +generative models are, however, unpersonalized, as they are tuned to produce +outputs that appeal to a broad audience. Using them to generate images aligned +with individual users relies on iterative manual prompt engineering by the user +which is inefficient and undesirable. We propose to personalize the image +generation process by first capturing the generic preferences of the user in a +one-time process by inviting them to comment on a small selection of images, +explaining why they like or dislike each. Based on these comments, we infer a +user's structured liked and disliked visual attributes, i.e., their visual +preference, using a large language model. These attributes are used to guide a +text-to-image model toward producing images that are tuned towards the +individual user's visual preference. Through a series of user studies and large +language model guided evaluations, we demonstrate that the proposed method +results in generations that are well aligned with individual users' visual +preferences. + +
+
+ comment: Project page at https://viper.epfl.ch/ +
+
+
+
+
+ + ☆ MuST: Multi-Scale Transformers for Surgical Phase Recognition + + +
+ Phase recognition in surgical videos is crucial for enhancing computer-aided +surgical systems as it enables automated understanding of sequential procedural +stages. Existing methods often rely on fixed temporal windows for video +analysis to identify dynamic surgical phases. Thus, they struggle to +simultaneously capture short-, mid-, and long-term information necessary to +fully understand complex surgical procedures. To address these issues, we +propose Multi-Scale Transformers for Surgical Phase Recognition (MuST), a novel +Transformer-based approach that combines a Multi-Term Frame encoder with a +Temporal Consistency Module to capture information across multiple temporal +scales of a surgical video. Our Multi-Term Frame Encoder computes +interdependencies across a hierarchy of temporal scales by sampling sequences +at increasing strides around the frame of interest. Furthermore, we employ a +long-term Transformer encoder over the frame embeddings to further enhance +long-term reasoning. MuST achieves higher performance than previous +state-of-the-art methods on three different public benchmarks. + +
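+ The multi-scale sampling idea can be sketched as generating one index sequence per temporal stride around the frame of interest; the strides and window size below are illustrative assumptions, not the paper's configuration:
+
+import numpy as np
+
+def multi_scale_windows(center, num_frames, strides=(1, 4, 16), half_window=8):
+    """For a frame of interest, sample one index sequence per temporal scale:
+    small strides capture short-term context, large strides capture long-term
+    context. Strides and window size here are illustrative choices."""
+    windows = {}
+    for s in strides:
+        idx = center + s * np.arange(-half_window, half_window + 1)
+        windows[s] = np.clip(idx, 0, num_frames - 1)   # clamp to the video length
+    return windows
+
+for stride, idx in multi_scale_windows(center=500, num_frames=3000).items():
+    print(stride, idx[:5], "...", idx[-3:])
+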
+
+
+
+
+ + ☆ Deep Spherical Superpixels + + +
+ Over the years, the use of superpixel segmentation has become very popular in various applications, serving as a preprocessing step to reduce data size by adapting to the content of the image, regardless of its semantic content.
+While the superpixel segmentation of standard planar images, captured with a 90° field of view, has been extensively studied, there has been limited focus on dedicated methods for omnidirectional or spherical images, captured with a 360° field of view.
+In this study, we introduce the first deep learning-based superpixel segmentation approach tailored for omnidirectional images, called DSS (for Deep Spherical Superpixels).
+Our methodology leverages spherical CNN architectures and the differentiable K-means clustering paradigm for superpixels, to generate superpixels that follow the spherical geometry.
+Additionally, we propose to use data augmentation techniques specifically designed for 360° images, enabling our model to efficiently learn from a limited set of annotated omnidirectional data.
+Our extensive validation across two datasets demonstrates that incorporating the inherent circular geometry of such images into our framework improves the segmentation performance over traditional and deep learning-based superpixel methods.
+Our code is available online.
+
+
+
+
+
+
+ + ☆ Preliminary study on artificial intelligence methods for cybersecurity + threat detection in computer networks based on raw data packets + + +
+ Most of the intrusion detection methods in computer networks are based on +traffic flow characteristics. However, this approach may not fully exploit the +potential of deep learning algorithms to directly extract features and patterns +from raw packets. Moreover, it impedes real-time monitoring due to the +necessity of waiting for the processing pipeline to complete and introduces +dependencies on additional software components. + In this paper, we investigate deep learning methodologies capable of +detecting attacks in real-time directly from raw packet data within network +traffic. We propose a novel approach where packets are stacked into windows and +separately recognised, with a 2D image representation suitable for processing +with computer vision models. Our investigation utilizes the CIC IDS-2017 +dataset, which includes both benign traffic and prevalent real-world attacks, +providing a comprehensive foundation for our research. + +
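+ A minimal sketch of the packet-to-image idea: stack a window of raw packets, each truncated or zero-padded to a fixed byte length, into a 2D array that a vision model can consume; the window size, byte length, and random stand-in packets below are assumptions:
+
+import numpy as np
+
+def packets_to_image(packets, window=16, packet_len=256):
+    """Stack `window` consecutive packets into a (window, packet_len) uint8
+    array: each row is one packet, truncated or zero-padded to a fixed length.
+    The resulting 2D array can be fed to an image classifier. Window and
+    packet length are illustrative choices, not the paper's exact settings."""
+    rows = []
+    for raw in packets[:window]:
+        buf = np.frombuffer(raw[:packet_len], dtype=np.uint8)
+        rows.append(np.pad(buf, (0, packet_len - len(buf))))
+    while len(rows) < window:                      # pad short windows with empty rows
+        rows.append(np.zeros(packet_len, dtype=np.uint8))
+    return np.stack(rows)
+
+# Random bytes stand in for packets captured from a network interface.
+rng = np.random.default_rng(0)
+fake_packets = [rng.integers(0, 256, size=rng.integers(60, 400), dtype=np.uint8).tobytes()
+                for _ in range(16)]
+print(packets_to_image(fake_packets).shape)        # (16, 256)
+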
+
+ comment: Submitted to Computer Science Journal +
+
+
+
+
+ + ☆ Cascaded Light Propagation Volumes using Spherical Radial Basis + Functions + + +
+ This paper introduces a contribution made to one of the newest methods for simulating indirect lighting in dynamic scenes, the cascaded light propagation volumes.
+Our contribution consists of using Spherical Radial Basis Functions instead of Spherical Harmonics, since the former achieve much better results when many coefficients are used.
+We explain how to integrate Spherical Radial Basis Functions with the cascaded light propagation volumes, and evaluate our technique against the same implementation, but with Spherical Harmonics.
+
+
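+ The abstract does not spell out the kernel; a common choice of spherical radial basis function is the spherical Gaussian exp(lambda * (d . p - 1)), and a small expansion of directional radiance in such lobes can be evaluated as in the sketch below (lobe directions, weights, and sharpness values are made up):
+
+import numpy as np
+
+def spherical_gaussian(directions, axis, sharpness):
+    # Common SRBF kernel: exp(lambda * (d . p - 1)); peaks along `axis` and
+    # falls off smoothly over the sphere as `sharpness` (lambda) grows.
+    return np.exp(sharpness * (directions @ axis - 1.0))
+
+def evaluate_srbf_radiance(directions, lobes):
+    """Evaluate a directional function represented as a weighted sum of
+    spherical Gaussians, the kind of expansion that can replace a low-order
+    spherical-harmonics representation. Lobe parameters are illustrative."""
+    out = np.zeros(len(directions))
+    for weight, axis, sharpness in lobes:
+        out += weight * spherical_gaussian(directions, axis, sharpness)
+    return out
+
+dirs = np.array([[0, 0, 1.0], [0, 1.0, 0], [1.0, 0, 0]])      # unit query directions
+lobes = [(1.0, np.array([0, 0, 1.0]), 8.0),                   # bright lobe towards +z
+         (0.3, np.array([0, 1.0, 0]), 2.0)]                   # dim, broad lobe towards +y
+print(evaluate_srbf_radiance(dirs, lobes))
+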
+
+
+
+
+ + ☆ Multi-label Cluster Discrimination for Visual Representation Learning ECCV2024 + + +
+ Contrastive Language Image Pre-training (CLIP) has recently demonstrated +success across various tasks due to superior feature representation empowered +by image-text contrastive learning. However, the instance discrimination method +used by CLIP can hardly encode the semantic structure of training data. To +handle this limitation, cluster discrimination has been proposed through +iterative cluster assignment and classification. Nevertheless, most cluster +discrimination approaches only define a single pseudo-label for each image, +neglecting multi-label signals in the image. In this paper, we propose a novel +Multi-Label Cluster Discrimination method named MLCD to enhance representation +learning. In the clustering step, we first cluster the large-scale LAION-400M +dataset into one million centers based on off-the-shelf embedding features. +Considering that natural images frequently contain multiple visual objects or +attributes, we select the multiple closest centers as auxiliary class labels. +In the discrimination step, we design a novel multi-label classification loss, +which elegantly separates losses from positive classes and negative classes, +and alleviates ambiguity on decision boundary. We validate the proposed +multi-label cluster discrimination method with experiments on different scales +of models and pre-training datasets. Experimental results show that our method +achieves state-of-the-art performance on multiple downstream tasks including +linear probe, zero-shot classification, and image-text retrieval. + +
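+ The exact MLCD loss is not reproduced here; as a generic illustration of a multi-label objective with separate positive-class and negative-class terms, a PyTorch sketch might look like this (the weighting and label construction are assumptions):
+
+import torch
+import torch.nn.functional as F
+
+def decoupled_multilabel_loss(logits, positive_mask, neg_weight=1.0):
+    """Generic multi-label objective with separate positive and negative terms
+    (not the paper's exact formulation): each selected cluster centre is pulled
+    up, all remaining centres are pushed down, and the two sums are balanced."""
+    pos_term = F.softplus(-logits[positive_mask]).mean()        # -log sigmoid(z)
+    neg_term = F.softplus(logits[~positive_mask]).mean()        # -log(1 - sigmoid(z))
+    return pos_term + neg_weight * neg_term
+
+logits = torch.randn(8, 1000, requires_grad=True)               # scores for 1000 centres
+positive_mask = torch.zeros(8, 1000, dtype=torch.bool)
+positive_mask[torch.arange(8).repeat_interleave(3),
+              torch.randint(0, 1000, (24,))] = True             # ~3 auxiliary labels each
+loss = decoupled_multilabel_loss(logits, positive_mask)
+loss.backward()
+print(loss.item())
+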
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ DarSwin-Unet: Distortion Aware Encoder-Decoder Architecture + + +
+ Wide-angle fisheye images are becoming increasingly common for perception +tasks in applications such as robotics, security, and mobility (e.g. drones, +avionics). However, current models often either ignore the distortions in +wide-angle images or are not suitable to perform pixel-level tasks. In this +paper, we present an encoder-decoder model based on a radial transformer +architecture that adapts to distortions in wide-angle lenses by leveraging the +physical characteristics defined by the radial distortion profile. In contrast +to the original model, which only performs classification tasks, we introduce a +U-Net architecture, DarSwin-Unet, designed for pixel level tasks. Furthermore, +we propose a novel strategy that minimizes sparsity when sampling the image for +creating its input tokens. Our approach enhances the model capability to handle +pixel-level tasks in wide-angle fisheye images, making it more effective for +real-world applications. Compared to other baselines, DarSwin-Unet achieves the +best results across different datasets, with significant gains when trained on +bounded levels of distortions (very low, low, medium, and high) and tested on +all, including out-of-distribution distortions. We demonstrate its performance +on depth estimation and show through extensive experiments that DarSwin-Unet +can perform zero-shot adaptation to unseen distortions of different wide-angle +lenses. + +
+
+
+
+
+ + ☆ Enhanced Deep Learning Methodologies and MRI Selection Techniques for + Dementia Diagnosis in the Elderly Population + + +
+ Dementia, a debilitating neurological condition affecting millions worldwide, presents significant diagnostic challenges.
+In this work, we introduce a novel methodology for the classification of demented and non-demented elderly patients using 3D brain Magnetic Resonance Imaging (MRI) scans.
+Our approach features a unique technique for selectively processing MRI slices, focusing on the most relevant brain regions and excluding less informative sections.
+This methodology is complemented by a confidence-based classification committee composed of three custom deep learning models: Dem3D ResNet, Dem3D CNN, and Dem3D EfficientNet.
+These models work synergistically to enhance decision-making accuracy, leveraging their collective strengths.
+Tested on the Open Access Series of Imaging Studies (OASIS) dataset, our method achieved an impressive accuracy of 94.12%, surpassing existing methodologies.
+Furthermore, validation on the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset confirmed the robustness and generalizability of our approach.
+The use of explainable AI (XAI) techniques and comprehensive ablation studies further substantiate the effectiveness of our techniques, providing insights into the decision-making process and the importance of our methodology.
+This research offers a significant advancement in dementia diagnosis, providing a highly accurate and efficient tool for clinical applications.
+
+
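+ One plausible reading of a confidence-based committee is to let the most confident member decide each case; the combination rule below is an assumption for illustration, not necessarily the paper's exact scheme:
+
+import numpy as np
+
+def committee_predict(probabilities):
+    """Combine per-model class probabilities (shape: models x classes) for one
+    scan by trusting the most confident member; a plausible reading of a
+    'confidence-based classification committee', not the paper's exact rule."""
+    probabilities = np.asarray(probabilities)
+    confidences = probabilities.max(axis=1)           # each model's peak probability
+    best_model = int(confidences.argmax())
+    return int(probabilities[best_model].argmax()), best_model
+
+# Softmax outputs (demented vs. non-demented) from three hypothetical models.
+resnet_p, cnn_p, effnet_p = [0.35, 0.65], [0.20, 0.80], [0.55, 0.45]
+label, chosen = committee_predict([resnet_p, cnn_p, effnet_p])
+print(f"predicted class {label}, using model index {chosen}")
+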
+
+
+
+
+ + ☆ Physical Adversarial Attack on Monocular Depth Estimation via + Shape-Varying Patches + + +
+ Adversarial attacks against monocular depth estimation (MDE) systems pose significant challenges, particularly in safety-critical applications such as autonomous driving.
+Existing patch-based adversarial attacks for MDE are confined to the vicinity of the patch, making it difficult to affect the entire target.
+To address this limitation, we propose a physics-based adversarial attack on monocular depth estimation, employing a framework called Attack with Shape-Varying Patches (ASP), aiming to optimize patch content, shape, and position to maximize effectiveness.
+We introduce various mask shapes, including quadrilateral, rectangular, and circular masks, to enhance the flexibility and efficiency of the attack.
+Furthermore, we propose a new loss function to extend the influence of the patch beyond the overlapping regions.
+Experimental results demonstrate that our attack method generates an average depth error of 18 meters on the target car with a patch area of 1/9, affecting over 98% of the target area.
+
+
+
+
+
+
+ + ☆ LangOcc: Self-Supervised Open Vocabulary Occupancy Estimation via Volume + Rendering + + +
+ Semantic occupancy has recently gained significant traction as a prominent method for 3D scene representation.
+However, most existing camera-based methods rely on costly datasets with fine-grained 3D voxel labels or LiDAR scans for training, which limits their practicality and scalability, raising the need for self-supervised approaches in this domain.
+Moreover, most methods are tied to a predefined set of classes which they can detect.
+In this work we present a novel approach for open vocabulary occupancy estimation called LangOcc, that is trained only via camera images, and can detect arbitrary semantics via vision-language alignment.
+In particular, we distill the knowledge of the strong vision-language aligned encoder CLIP into a 3D occupancy model via differentiable volume rendering.
+Our model estimates vision-language aligned features in a 3D voxel grid using only images.
+It is trained in a self-supervised manner by rendering our estimations back to 2D space, where ground-truth features can be computed.
+This training mechanism automatically supervises the scene geometry, allowing for a straightforward and powerful training method without any explicit geometry supervision.
+LangOcc outperforms LiDAR-supervised competitors in open vocabulary occupancy by a large margin, solely relying on vision-based training.
+We also achieve state-of-the-art results in self-supervised semantic occupancy estimation on the Occ3D-nuScenes dataset, despite not being limited to a specific set of categories, thus demonstrating the effectiveness of our proposed vision-language training.
+
+
+
+
+
+
+ + ☆ How Good (Or Bad) Are LLMs at Detecting Misleading Visualizations? IEEE VIS 2024 + + +
+ In this study, we address the growing issue of misleading charts, a prevalent +problem that undermines the integrity of information dissemination. Misleading +charts can distort the viewer's perception of data, leading to +misinterpretations and decisions based on false information. The development of +effective automatic detection methods for misleading charts is an urgent field +of research. The recent advancement of multimodal Large Language Models (LLMs) +has introduced a promising direction for addressing this challenge. We explored +the capabilities of these models in analyzing complex charts and assessing the +impact of different prompting strategies on the models' analyses. We utilized a +dataset of misleading charts collected from the internet by prior research and +crafted nine distinct prompts, ranging from simple to complex, to test the +ability of four different multimodal LLMs in detecting over 21 different chart +issues. Through three experiments--from initial exploration to detailed +analysis--we progressively gained insights into how to effectively prompt LLMs +to identify misleading charts and developed strategies to address the +scalability challenges encountered as we expanded our detection range from the +initial five issues to 21 issues in the final experiment. Our findings reveal +that multimodal LLMs possess a strong capability for chart comprehension and +critical thinking in data interpretation. There is significant potential in +employing multimodal LLMs to counter misleading information by supporting +critical thinking and enhancing visualization literacy. This study demonstrates +the applicability of LLMs in addressing the pressing concern of misleading +charts. + +
+
+ comment: To be presented at IEEE VIS 2024 +
+
+
+
+
+ + ☆ Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken + Generation + + +
+ Text-to-image retrieval is a fundamental task in multimedia processing, +aiming to retrieve semantically relevant cross-modal content. Traditional +studies have typically approached this task as a discriminative problem, +matching the text and image via the cross-attention mechanism (one-tower +framework) or in a common embedding space (two-tower framework). Recently, +generative cross-modal retrieval has emerged as a new research line, which +assigns images with unique string identifiers and generates the target +identifier as the retrieval target. Despite its great potential, existing +generative approaches are limited due to the following issues: insufficient +visual information in identifiers, misalignment with high-level semantics, and +learning gap towards the retrieval target. To address the above issues, we +propose an autoregressive voken generation method, named AVG. AVG tokenizes +images into vokens, i.e., visual tokens, and innovatively formulates the +text-to-image retrieval task as a token-to-voken generation problem. AVG +discretizes an image into a sequence of vokens as the identifier of the image, +while maintaining the alignment with both the visual information and high-level +semantics of the image. Additionally, to bridge the learning gap between +generative training and the retrieval target, we incorporate discriminative +training to modify the learning direction during token-to-voken training. +Extensive experiments demonstrate that AVG achieves superior results in both +effectiveness and efficiency. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ DenseTrack: Drone-based Crowd Tracking via Density-aware + Motion-appearance Synergy + + +
+ Drone-based crowd tracking faces difficulties in accurately identifying and
+monitoring objects from an aerial perspective, largely due to their small size
+and close proximity to each other, which complicates both localization and
+tracking. To address these challenges, we present the Density-aware Tracking
+(DenseTrack) framework. DenseTrack capitalizes on crowd counting to precisely
+determine object locations, blending visual and motion cues to improve the
+tracking of small-scale objects. It specifically addresses the problem of
+cross-frame motion to enhance tracking accuracy and dependability. DenseTrack
+employs crowd density estimates as anchors for exact object localization within
+video frames. These estimates are merged with motion and position information
+from the tracking network, with motion offsets serving as key tracking cues.
+Moreover, DenseTrack enhances the ability to distinguish small-scale objects
+using insights from the visual-language model, integrating appearance with
+motion cues. The framework utilizes the Hungarian algorithm to ensure the
+accurate matching of individuals across frames. Demonstrated on the DroneCrowd
+dataset, our approach exhibits superior performance, confirming its
+effectiveness in scenarios captured by drones.
+
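+ A minimal sketch of the cross-frame association step with the Hungarian
+algorithm, assuming a cost built from predicted motion offsets; DenseTrack's
+actual cost combines more cues:
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def associate(prev_positions, curr_positions, motion_offsets):
+    """prev_positions, motion_offsets: (N, 2); curr_positions: (M, 2)."""
+    predicted = prev_positions + motion_offsets      # where each track should be now
+    cost = np.linalg.norm(predicted[:, None, :] - curr_positions[None, :, :], axis=-1)
+    rows, cols = linear_sum_assignment(cost)         # optimal one-to-one matching
+    return list(zip(rows.tolist(), cols.tolist()))
+
+prev = np.array([[10., 10.], [50., 40.]])
+curr = np.array([[52., 41.], [11., 12.]])
+offsets = np.array([[1., 1.], [2., 1.]])
+print(associate(prev, curr, offsets))                # -> [(0, 1), (1, 0)]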
+
+
+
+
+ + ☆ M4: Multi-Proxy Multi-Gate Mixture of Experts Network for Multiple + Instance Learning in Histopathology Image Analysis + + +
+ Multiple instance learning (MIL) has been successfully applied to whole
+slide image (WSI) analysis in computational pathology, enabling a wide range
+of prediction tasks from tumor subtyping to inferring genetic mutations and
+multi-omics biomarkers. However, existing MIL methods predominantly focus on
+single-task learning, resulting not only in low overall efficiency but also in
+the overlooking of inter-task relatedness. To address these issues, we propose
+an adapted architecture of Multi-gate Mixture-of-experts with Multi-proxy for
+Multiple instance learning (M4), and apply this framework for the simultaneous
+prediction of multiple genetic mutations from WSIs. The proposed M4 model has
+two main innovations: (1) utilizing a mixture of experts with multiple gating
+strategies for multi-genetic mutation prediction on a single pathological
+slide; (2) constructing multi-proxy expert and gate networks for comprehensive
+and effective modeling of pathological image information. Our model achieved
+significant improvements across five tested TCGA datasets in comparison to
+current state-of-the-art single-task methods. The code is available at:
+https://github.com/Bigyehahaha/M4.
+
+
+ comment: 25pages,5figures +
+
+
+
+
+ + ☆ SCIsegV2: A Universal Tool for Segmentation of Intramedullary Lesions in + Spinal Cord Injury MICCAI + + +
+ Spinal cord injury (SCI) is a devastating event leading to permanent
+paralysis and loss of sensory-motor function, potentially resulting in the
+formation of lesions within the spinal cord. Imaging biomarkers obtained from
+magnetic resonance imaging (MRI) scans can predict the functional recovery of
+individuals with SCI and help choose the optimal treatment strategy. Currently,
+most studies employ manual quantification of these MRI-derived biomarkers,
+which is a subjective and tedious task. In this work, we propose (i) a
+universal tool for the automatic segmentation of intramedullary SCI lesions,
+dubbed \texttt{SCIsegV2}, and (ii) a method to automatically compute the width
+of the tissue bridges from the segmented lesion. Tissue bridges represent the
+spared spinal tissue adjacent to the lesion, which is associated with
+functional recovery in SCI patients. The tool was trained and validated on a
+heterogeneous dataset from 7 sites comprising patients from different SCI
+phases (acute, sub-acute, and chronic) and etiologies (traumatic SCI, ischemic
+SCI, and degenerative cervical myelopathy). Tissue bridges quantified
+automatically did not significantly differ from those computed manually,
+suggesting that the proposed automatic tool can be used to derive relevant MRI
+biomarkers. \texttt{SCIsegV2} and the automatic tissue bridges computation are
+open-source and available in Spinal Cord Toolbox (v6.4 and above) via the
+\texttt{sct\_deepseg -task seg\_sc\_lesion\_t2w\_sci} and
+\texttt{sct\_analyze\_lesion} functions, respectively.
+
+
+ comment: Accepted at MICCAI AMAI 2024 workshop +
+
+
+
+
+ + ☆ Embedding-Free Transformer with Inference Spatial Reduction for + Efficient Semantic Segmentation ECCV 2024 + + +
+ We present an Encoder-Decoder Attention Transformer, EDAFormer, which
+consists of the Embedding-Free Transformer (EFT) encoder and the all-attention
+decoder leveraging our Embedding-Free Attention (EFA) structure. The proposed
+EFA is a novel global context modeling mechanism that focuses on the global
+non-linearity rather than the specific roles of the query, key and value. For
+the decoder, we explore the optimized structure for considering the globality,
+which can improve the semantic segmentation performance. In addition, we
+propose a novel Inference Spatial Reduction (ISR) method for computational
+efficiency. Different from previous spatial reduction attention methods, our
+ISR method further reduces the key-value resolution at the inference phase,
+which can mitigate the computation-performance trade-off gap for efficient
+semantic segmentation. Our EDAFormer shows state-of-the-art performance with
+efficient computation compared to existing transformer-based semantic
+segmentation models on three public benchmarks, including ADE20K, Cityscapes
+and COCO-Stuff. Furthermore, our ISR method reduces the computational cost by
+up to 61% with minimal mIoU performance degradation on the Cityscapes dataset.
+The code is available at https://github.com/hyunwoo137/EDAFormer.
+
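+ A minimal sketch of the inference-time spatial reduction idea: keys and values
+are pooled to a lower resolution only at inference, shrinking the attention
+cost. The pooling ratio and single-head attention below are simplifying
+assumptions, not the EDAFormer implementation:
+
+import torch
+import torch.nn.functional as F
+
+def attention_with_isr(q, kv, hw, reduction=2, inference=True):
+    """q, kv: (B, N, C) token sequences; hw: (H, W) spatial size of the kv tokens."""
+    B, N, C = kv.shape
+    H, W = hw
+    if inference and reduction > 1:
+        kv = kv.transpose(1, 2).reshape(B, C, H, W)
+        kv = F.avg_pool2d(kv, reduction)             # reduce key-value resolution
+        kv = kv.flatten(2).transpose(1, 2)           # (B, N / r^2, C)
+    attn = torch.softmax(q @ kv.transpose(1, 2) / C ** 0.5, dim=-1)
+    return attn @ kv
+
+q, kv = torch.randn(1, 64, 32), torch.randn(1, 64, 32)
+print(attention_with_isr(q, kv, (8, 8)).shape)       # torch.Size([1, 64, 32])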
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ LPGen: Enhancing High-Fidelity Landscape Painting Generation through + Diffusion Model + + +
+ Generating landscape paintings expands the possibilities of artistic +creativity and imagination. Traditional landscape painting methods involve +using ink or colored ink on rice paper, which requires substantial time and +effort. These methods are susceptible to errors and inconsistencies and lack +precise control over lines and colors. This paper presents LPGen, a +high-fidelity, controllable model for landscape painting generation, +introducing a novel multi-modal framework that integrates image prompts into +the diffusion model. We extract its edges and contours by computing canny edges +from the target landscape image. These, along with natural language text +prompts and drawing style references, are fed into the latent diffusion model +as conditions. We implement a decoupled cross-attention strategy to ensure +compatibility between image and text prompts, facilitating multi-modal image +generation. A decoder generates the final image. Quantitative and qualitative +analyses demonstrate that our method outperforms existing approaches in +landscape painting generation and exceeds the current state-of-the-art. The +LPGen network effectively controls the composition and color of landscape +paintings, generates more accurate images, and supports further research in +deep learning-based landscape painting generation. + +
+
+
+
+
+ + ☆ Graph Neural Networks: A suitable Alternative to MLPs in Latent 3D + Medical Image Classification? MICCAI 2024 + + +
+ Recent studies have underscored the capabilities of natural imaging +foundation models to serve as powerful feature extractors, even in a zero-shot +setting for medical imaging data. Most commonly, a shallow multi-layer +perceptron (MLP) is appended to the feature extractor to facilitate end-to-end +learning and downstream prediction tasks such as classification, thus +representing the de facto standard. However, as graph neural networks (GNNs) +have become a practicable choice for various tasks in medical research in the +recent past, we direct attention to the question of how effective GNNs are +compared to MLP prediction heads for the task of 3D medical image +classification, proposing them as a potential alternative. In our experiments, +we devise a subject-level graph for each volumetric dataset instance. Therein +latent representations of all slices in the volume, encoded through a DINOv2 +pretrained vision transformer (ViT), constitute the nodes and their respective +node features. We use public datasets to compare the classification heads +numerically and evaluate various graph construction and graph convolution +methods in our experiments. Our findings show enhancements of the GNN in +classification performance and substantial improvements in runtime compared to +an MLP prediction head. Additional robustness evaluations further validate the +promising performance of the GNN, promoting them as a suitable alternative to +traditional MLP classification heads. Our code is publicly available at: +https://github.com/compai-lab/2024-miccai-grail-kiechle + +
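+ A minimal sketch of turning a volume's slice embeddings into a subject-level
+graph; a k-nearest-neighbour construction is assumed here, whereas the paper
+compares several construction and convolution schemes:
+
+import numpy as np
+
+def knn_graph(slice_embeddings, k=4):
+    """slice_embeddings: (n_slices, dim) features, e.g. from a pretrained ViT."""
+    x = slice_embeddings / np.linalg.norm(slice_embeddings, axis=1, keepdims=True)
+    sim = x @ x.T
+    np.fill_diagonal(sim, -np.inf)                   # no self-loops
+    neighbours = np.argsort(-sim, axis=1)[:, :k]     # k most similar slices per node
+    return [(i, int(j)) for i in range(len(x)) for j in neighbours[i]]
+
+emb = np.random.rand(16, 384)                        # 16 slices, hypothetical embedding size
+print(len(knn_graph(emb)))                           # -> 64 directed edges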
+
+ comment: Accepted at MICCAI 2024 - GRAIL Workshop +
+
+
+
+
+ + ☆ Nonverbal Immediacy Analysis in Education: A Multimodal Computational + Model + + +
+ This paper introduces a novel computational approach for analyzing nonverbal +social behavior in educational settings. Integrating multimodal behavioral +cues, including facial expressions, gesture intensity, and spatial dynamics, +the model assesses the nonverbal immediacy (NVI) of teachers from RGB classroom +videos. A dataset of 400 30-second video segments from German classrooms was +constructed for model training and validation. The gesture intensity regressor +achieved a correlation of 0.84, the perceived distance regressor 0.55, and the +NVI model 0.44 with median human ratings. The model demonstrates the potential +to provide a valuable support in nonverbal behavior assessment, approximating +the accuracy of individual human raters. Validated against both questionnaire +data and trained observer ratings, our models show moderate to strong +correlations with relevant educational outcomes, indicating their efficacy in +reflecting effective teaching behaviors. This research advances the objective +assessment of nonverbal communication behaviors, opening new pathways for +educational research. + +
+
+ comment: 12 pages, 3 figures. Camera-ready version for the SAB 2024: 17th + International Conference on the Simulation of Adaptive Behavior +
+
+
+
+
+ + ☆ ALPI: Auto-Labeller with Proxy Injection for 3D Object Detection using + 2D Labels Only + + +
+ 3D object detection plays a crucial role in various applications such as +autonomous vehicles, robotics and augmented reality. However, training 3D +detectors requires a costly precise annotation, which is a hindrance to scaling +annotation to large datasets. To address this challenge, we propose a weakly +supervised 3D annotator that relies solely on 2D bounding box annotations from +images, along with size priors. One major problem is that supervising a 3D +detection model using only 2D boxes is not reliable due to ambiguities between +different 3D poses and their identical 2D projection. We introduce a simple yet +effective and generic solution: we build 3D proxy objects with annotations by +construction and add them to the training dataset. Our method requires only +size priors to adapt to new classes. To better align 2D supervision with 3D +detection, our method ensures depth invariance with a novel expression of the +2D losses. Finally, to detect more challenging instances, our annotator follows +an offline pseudo-labelling scheme which gradually improves its 3D +pseudo-labels. Extensive experiments on the KITTI dataset demonstrate that our +method not only performs on-par or above previous works on the Car category, +but also achieves performance close to fully supervised methods on more +challenging classes. We further demonstrate the effectiveness and robustness of +our method by being the first to experiment on the more challenging nuScenes +dataset. We additionally propose a setting where weak labels are obtained from +a 2D detector pre-trained on MS-COCO instead of human annotations. + +
+
+
+
+
+ + ☆ Unpaired Photo-realistic Image Deraining with Energy-informed Diffusion + Model + + +
+ Existing unpaired image deraining approaches face challenges in accurately
+capturing the distinguishing characteristics of the rainy and clean domains,
+resulting in residual degradation and color distortion within the reconstructed
+images. To address this, we propose an energy-informed diffusion model for
+unpaired photo-realistic image deraining (UPID-EDM). Initially, we delve into
+the intricate visual-language priors embedded within the contrastive
+language-image pre-training model (CLIP), and demonstrate that the CLIP priors
+aid in the discrimination of rainy and clean images. Furthermore, we introduce
+a dual-consistent energy function (DEF) that retains the rain-irrelevant
+characteristics while eliminating the rain-relevant features. This energy
+function is trained on non-corresponding rainy and clean images. In addition,
+we employ the rain-relevance discarding energy function (RDEF) and the
+rain-irrelevance preserving energy function (RPEF) to direct the reverse
+sampling procedure of a pre-trained diffusion model, effectively removing the
+rain streaks while preserving the image contents. Extensive experiments
+demonstrate that our energy-informed model surpasses existing unpaired learning
+approaches in terms of both supervised and no-reference metrics.
+
+
+
+
+
+ + ☆ Trans2Unet: Neural fusion for Nuclei Semantic Segmentation + + +
+ Nuclei segmentation, despite its fundamental role in histopathological image
+analysis, remains a challenging task. The main challenge of this task is the
+existence of overlapping areas, which makes separating independent nuclei more
+complicated. In this paper, we propose a new two-branch architecture that
+combines the Unet and TransUnet networks for the nuclei segmentation task. In
+the proposed architecture, namely Trans2Unet, the input image is first sent
+into the Unet branch, whose last convolution layer is removed. This branch
+makes the network combine features from different spatial regions of the input
+image and localize the regions of interest more precisely. The input image is
+also fed into the second branch, called the TransUnet branch, where it is
+divided into image patches. With a Vision Transformer (ViT) in its
+architecture, TransUnet can serve as a powerful encoder for medical image
+segmentation tasks and enhance image details by recovering localized spatial
+information. To boost Trans2Unet's efficiency and performance, we propose to
+infuse TransUnet with a computationally efficient variation called the
+"Waterfall" Atrous Spatial Pooling with Skip Connection (WASP-KC) module, which
+is inspired by the "Waterfall" Atrous Spatial Pooling (WASP) module.
+Experimental results on the 2018 Data Science Bowl benchmark show the
+effectiveness and performance of the proposed architecture when compared with
+previous segmentation models.
+
+
+ comment: ICCAIS 2022 +
+
+
+
+
+ + ☆ Domain Generalized Recaptured Screen Image Identification Using SWIN + Transformer + + +
+ An increasing number of classification approaches have been developed to
+address the issue of image rebroadcast and recapturing, a standard attack
+strategy in insurance fraud, face spoofing, and video piracy. However, most of
+them neglect scale variations and domain generalization scenarios, performing
+poorly in instances involving domain shifts, typically made worse by
+inter-domain and cross-domain scale variances. To overcome these issues, we
+propose a cascaded data augmentation and SWIN transformer domain generalization
+framework (DAST-DG) in this work. Initially, we examine the disparity in
+dataset representation. A feature generator is trained to make authentic images
+from various domains indistinguishable. This process is then applied to
+recaptured images, creating a dual adversarial learning setup. Extensive
+experiments demonstrate that our approach is practical and surpasses
+state-of-the-art methods across different databases. Our model achieves an
+accuracy of approximately 82\% with a precision of 95\% on high-variance
+datasets.
+
+
+ comment: 11 pages, 10 figures, 9 tables +
+
+
+
+
+ + ☆ Context-aware Multi-task Learning for Pedestrian Intent and Trajectory + Prediction + + +
+ The advancement of socially-aware autonomous vehicles hinges on precise +modeling of human behavior. Within this broad paradigm, the specific challenge +lies in accurately predicting pedestrian's trajectory and intention. +Traditional methodologies have leaned heavily on historical trajectory data, +frequently overlooking vital contextual cues such as pedestrian-specific traits +and environmental factors. Furthermore, there's a notable knowledge gap as +trajectory and intention prediction have largely been approached as separate +problems, despite their mutual dependence. To bridge this gap, we introduce +PTINet (Pedestrian Trajectory and Intention Prediction Network), which jointly +learns the trajectory and intention prediction by combining past trajectory +observations, local contextual features (individual pedestrian behaviors), and +global features (signs, markings etc.). The efficacy of our approach is +evaluated on widely used public datasets: JAAD and PIE, where it has +demonstrated superior performance over existing state-of-the-art models in +trajectory and intention prediction. The results from our experiments and +ablation studies robustly validate PTINet's effectiveness in jointly exploring +intention and trajectory prediction for pedestrian behaviour modelling. The +experimental evaluation indicates the advantage of using global and local +contextual features for pedestrian trajectory and intention prediction. The +effectiveness of PTINet in predicting pedestrian behavior paves the way for the +development of automated systems capable of seamlessly interacting with +pedestrians in urban settings. + +
+
+
+
+
+ + ☆ Establishing Truly Causal Relationship Between Whole Slide Image + Predictions and Diagnostic Evidence Subregions in Deep Learning + + +
+ In the field of deep learning-driven Whole Slide Image (WSI) classification,
+Multiple Instance Learning (MIL) has gained significant attention due to its
+ability to be trained using only slide-level diagnostic labels. Previous MIL
+research has primarily focused on enhancing feature aggregators for globally
+analyzing WSIs, but overlooks a causal relationship in diagnosis: a model's
+prediction should ideally stem solely from regions of the image that contain
+diagnostic evidence (such as tumor cells), which usually occupy relatively
+small areas. To address this limitation and establish a truly causal
+relationship between model predictions and diagnostic evidence regions, we
+propose Causal Inference Multiple Instance Learning (CI-MIL). CI-MIL integrates
+feature distillation with a novel patch decorrelation mechanism, employing a
+two-stage causal inference approach to distill and process patches with high
+diagnostic value. Initially, CI-MIL leverages feature distillation to identify
+patches likely containing tumor cells and extracts their corresponding feature
+representations. These features are then mapped to a random Fourier feature
+space, where a learnable weighting scheme is employed to minimize inter-feature
+correlations, effectively reducing redundancy from homogeneous patches and
+mitigating data bias. These processes strengthen the causal relationship
+between model predictions and diagnostically relevant regions, making the
+prediction more direct and reliable. Experimental results demonstrate that
+CI-MIL outperforms state-of-the-art methods. Additionally, CI-MIL exhibits
+superior interpretability, as its selected regions demonstrate high consistency
+with ground truth annotations, promising more reliable diagnostic assistance
+for pathologists.
+
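+ A minimal sketch of mapping patch features into a random Fourier feature space
+and down-weighting highly correlated (redundant) patches; the weighting rule
+and pooling below are simplifications, not the exact CI-MIL objective:
+
+import math
+import torch
+import torch.nn as nn
+
+class RFFWeighting(nn.Module):
+    def __init__(self, in_dim, rff_dim=128):
+        super().__init__()
+        self.register_buffer("proj", torch.randn(in_dim, rff_dim))    # fixed random projection
+        self.register_buffer("phase", 2 * math.pi * torch.rand(rff_dim))
+        self.scale = nn.Parameter(torch.zeros(1))                     # learnable sharpness
+
+    def forward(self, patch_feats):                                   # (n_patches, in_dim)
+        rff = torch.cos(patch_feats @ self.proj + self.phase)         # random Fourier features
+        corr = torch.corrcoef(rff)                                    # patch-patch correlation
+        redundancy = (corr.abs().sum(dim=1) - 1) / (len(rff) - 1)     # mean |corr| to others
+        scores = torch.softmax(-redundancy * torch.exp(self.scale), dim=0)
+        return (scores.unsqueeze(1) * patch_feats).sum(0)             # redundancy-aware pooling
+
+pool = RFFWeighting(in_dim=64)
+print(pool(torch.randn(10, 64)).shape)                                # torch.Size([64])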
+
+
+
+
+ + ☆ FIIH: Fully Invertible Image Hiding for Secure and Robust + + +
+ Image hiding is the study of techniques for covert storage and transmission,
+which embed a secret image into a container image and generate a stego image
+that appears similar to a normal image. However, existing image hiding methods
+have a serious problem: the hiding and revealing process is not fully
+invertible, so the revealing network cannot recover the secret image
+losslessly, which makes it impossible to simultaneously achieve high fidelity
+and secure transmission of the secret image in an insecure network environment.
+To solve this problem, this paper proposes a fully invertible image hiding
+architecture based on an invertible neural network, aiming to realize
+invertible hiding of secret images, where invertibility holds for both the data
+and the network. Based on this architecture, the method can withstand deep
+learning based image steganalysis. In addition, we propose a new method for
+enhancing the robustness of stego images after interference during
+transmission. Experiments demonstrate that the proposed FIIH significantly
+outperforms other state-of-the-art image hiding methods in hiding a single
+image, and also significantly outperforms other state-of-the-art methods in
+robustness and security.
+
+
+
+
+
+ + ☆ XMeCap: Meme Caption Generation with Sub-Image Adaptability + + +
+ Humor, deeply rooted in societal meanings and cultural details, poses a +unique challenge for machines. While advances have been made in natural +language processing, real-world humor often thrives in a multi-modal context, +encapsulated distinctively by memes. This paper poses a particular emphasis on +the impact of multi-images on meme captioning. After that, we introduce the +\textsc{XMeCap} framework, a novel approach that adopts supervised fine-tuning +and reinforcement learning based on an innovative reward model, which factors +in both global and local similarities between visuals and text. Our results, +benchmarked against contemporary models, manifest a marked improvement in +caption generation for both single-image and multi-image memes, as well as +different meme categories. \textsc{XMeCap} achieves an average evaluation score +of 75.85 for single-image memes and 66.32 for multi-image memes, outperforming +the best baseline by 3.71\% and 4.82\%, respectively. This research not only +establishes a new frontier in meme-related studies but also underscores the +potential of machines in understanding and generating humor in a multi-modal +setting. + +
+
+ comment: Accepted to MM 2024 +
+
+
+
+
+ + ☆ RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time + Detection Transformer + + +
+ In this report, we present RT-DETRv2, an improved Real-Time DEtection +TRansformer (RT-DETR). RT-DETRv2 builds upon the previous state-of-the-art +real-time detector, RT-DETR, and opens up a set of bag-of-freebies for +flexibility and practicality, as well as optimizing the training strategy to +achieve enhanced performance. To improve the flexibility, we suggest setting a +distinct number of sampling points for features at different scales in the +deformable attention to achieve selective multi-scale feature extraction by the +decoder. To enhance practicality, we propose an optional discrete sampling +operator to replace the grid_sample operator that is specific to RT-DETR +compared to YOLOs. This removes the deployment constraints typically associated +with DETRs. For the training strategy, we propose dynamic data augmentation and +scale-adaptive hyperparameters customization to improve performance without +loss of speed. Source code and pre-trained models will be available at +https://github.com/lyuwenyu/RT-DETR. + +
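+ A minimal sketch of what a discrete sampling operator can look like:
+normalized sampling locations are rounded to the nearest feature-map index
+instead of being interpolated by grid_sample (an illustration only, not the
+RT-DETRv2 code):
+
+import torch
+
+def discrete_sample(feature_map, points):
+    """feature_map: (C, H, W); points: (N, 2) with (x, y) in [-1, 1] as in grid_sample."""
+    C, H, W = feature_map.shape
+    x = ((points[:, 0] + 1) * 0.5 * (W - 1)).round().long().clamp(0, W - 1)
+    y = ((points[:, 1] + 1) * 0.5 * (H - 1)).round().long().clamp(0, H - 1)
+    return feature_map[:, y, x].transpose(0, 1)      # (N, C) sampled features
+
+fm = torch.arange(2 * 4 * 4, dtype=torch.float32).reshape(2, 4, 4)
+pts = torch.tensor([[-1.0, -1.0], [1.0, 1.0]])       # top-left and bottom-right corners
+print(discrete_sample(fm, pts))                      # -> [[0., 16.], [15., 31.]]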
+
+
+
+
+ + ☆ A Self-Supervised Image Registration Approach for Measuring Local + Response Patterns in Metastatic Ovarian Cancer + + +
+ High-grade serous ovarian carcinoma (HGSOC) is characterised by significant +spatial and temporal heterogeneity, typically manifesting at an advanced +metastatic stage. A major challenge in treating advanced HGSOC is effectively +monitoring localised change in tumour burden across multiple sites during +neoadjuvant chemotherapy (NACT) and predicting long-term pathological response +and overall patient survival. In this work, we propose a self-supervised +deformable image registration algorithm that utilises a general-purpose image +encoder for image feature extraction to co-register contrast-enhanced +computerised tomography scan images acquired before and after neoadjuvant +chemotherapy. This approach addresses challenges posed by highly complex tumour +deformations and longitudinal lesion matching during treatment. Localised +tumour changes are calculated using the Jacobian determinant maps of the +registration deformation at multiple disease sites and their macroscopic areas, +including hypo-dense (i.e., cystic/necrotic), hyper-dense (i.e., calcified), +and intermediate density (i.e., soft tissue) portions. A series of experiments +is conducted to understand the role of a general-purpose image encoder and its +application in quantifying change in tumour burden during neoadjuvant +chemotherapy in HGSOC. This work is the first to demonstrate the feasibility of +a self-supervised image registration approach in quantifying NACT-induced +localised tumour changes across the whole disease burden of patients with +complex multi-site HGSOC, which could be used as a potential marker for ovarian +cancer patient's long-term pathological response and survival. + +
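+ A minimal sketch of the Jacobian-determinant map used to quantify localised
+change from a registration deformation, shown in 2D with finite differences
+(the paper works with 3D fields): values below 1 indicate local shrinkage and
+values above 1 indicate expansion.
+
+import numpy as np
+
+def jacobian_determinant_2d(disp):
+    """disp: (H, W, 2) displacement field u; returns the (H, W) determinant of I + grad(u)."""
+    dudy, dudx = np.gradient(disp[..., 0])           # derivatives of the x-displacement
+    dvdy, dvdx = np.gradient(disp[..., 1])           # derivatives of the y-displacement
+    return (1 + dudx) * (1 + dvdy) - dudy * dvdx
+
+disp = np.zeros((32, 32, 2))                         # identity transform
+print(np.allclose(jacobian_determinant_2d(disp), 1.0))   # True: no local volume change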
+
+
+
+
+ + ☆ PiPa++: Towards Unification of Domain Adaptive Semantic Segmentation via + Self-supervised Learning + + +
+ Unsupervised domain adaptive segmentation aims to improve the segmentation +accuracy of models on target domains without relying on labeled data from those +domains. This approach is crucial when labeled target domain data is scarce or +unavailable. It seeks to align the feature representations of the source domain +(where labeled data is available) and the target domain (where only unlabeled +data is present), thus enabling the model to generalize well to the target +domain. Current image- and video-level domain adaptation have been addressed +using different and specialized frameworks, training strategies and +optimizations despite their underlying connections. In this paper, we propose a +unified framework PiPa++, which leverages the core idea of ``comparing'' to (1) +explicitly encourage learning of discriminative pixel-wise features with +intraclass compactness and inter-class separability, (2) promote the robust +feature learning of the identical patch against different contexts or +fluctuations, and (3) enable the learning of temporal continuity under dynamic +environments. With the designed task-smart contrastive sampling strategy, +PiPa++ enables the mining of more informative training samples according to the +task demand. Extensive experiments demonstrate the effectiveness of our method +on both image-level and video-level domain adaption benchmarks. Moreover, the +proposed method is compatible with other UDA approaches to further improve the +performance without introducing extra parameters. + +
+
+ comment: This study is under IEEE TMM review. arXiv admin note: substantial + text overlap with arXiv:2211.07609 +
+
+
+
+
+ + ☆ MemBench: Memorized Image Trigger Prompt Dataset for Diffusion Models + + +
+ Diffusion models have achieved remarkable success in Text-to-Image generation +tasks, leading to the development of many commercial models. However, recent +studies have reported that diffusion models often generate replicated images in +train data when triggered by specific prompts, potentially raising social +issues ranging from copyright to privacy concerns. To sidestep the +memorization, there have been recent studies for developing memorization +mitigation methods for diffusion models. Nevertheless, the lack of benchmarks +impedes the assessment of the true effectiveness of these methods. In this +work, we present MemBench, the first benchmark for evaluating image +memorization mitigation methods. Our benchmark includes a large number of +memorized image trigger prompts in Stable Diffusion, the most popularly used +model nowadays. Furthermore, in contrast to the prior work evaluating +mitigation performance only on trigger prompts, we present metrics evaluating +on both trigger prompts and general prompts, so that we can see whether +mitigation methods address the memorization issue while maintaining performance +for general prompts. This is an important development considering the practical +applications which previous works have overlooked. Through evaluation on +MemBench, we verify that the performance of existing image memorization +mitigation methods is still insufficient for application to diffusion models. + +
+
+
+
+
+ + ☆ OVR: A Dataset for Open Vocabulary Temporal Repetition Counting in + Videos + + +
+ We introduce a dataset of annotations of temporal repetitions in videos. The +dataset, OVR (pronounced as over), contains annotations for over 72K videos, +with each annotation specifying the number of repetitions, the start and end +time of the repetitions, and also a free-form description of what is repeating. +The annotations are provided for videos sourced from Kinetics and Ego4D, and +consequently cover both Exo and Ego viewing conditions, with a huge variety of +actions and activities. Moreover, OVR is almost an order of magnitude larger +than previous datasets for video repetition. We also propose a baseline +transformer-based counting model, OVRCounter, that can localise and count +repetitions in videos that are up to 320 frames long. The model is trained and +evaluated on the OVR dataset, and its performance assessed with and without +using text to specify the target class to count. The performance is also +compared to a prior repetition counting model. The dataset is available for +download at: https://sites.google.com/view/openvocabreps/ + +
+
+
+
+
+ + ☆ When Text and Images Don't Mix: Bias-Correcting Language-Image + Similarity Scores for Anomaly Detection + + +
+ Contrastive Language-Image Pre-training (CLIP) achieves remarkable +performance in various downstream tasks through the alignment of image and text +input embeddings and holds great promise for anomaly detection. However, our +empirical experiments show that the embeddings of text inputs unexpectedly +tightly cluster together, far away from image embeddings, contrary to the +model's contrastive training objective to align image-text input pairs. We show +that this phenomenon induces a `similarity bias' - in which false negative and +false positive errors occur due to bias in the similarities between images and +the normal label text embeddings. To address this bias, we propose a novel +methodology called BLISS which directly accounts for this similarity bias +through the use of an auxiliary, external set of text inputs. BLISS is simple, +it does not require strong inductive biases about anomalous behaviour nor an +expensive training process, and it significantly outperforms baseline methods +on benchmark image datasets, even when access to normal data is extremely +limited. + +
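+ A minimal sketch of referencing the raw image-text similarity against an
+auxiliary set of external texts so that the shared text-cluster offset cancels
+out; a simplified reading of the idea, not the exact BLISS scoring:
+
+import numpy as np
+
+def bias_corrected_score(image_emb, normal_text_emb, auxiliary_text_embs):
+    """All inputs are L2-normalised embeddings; higher output = more normal."""
+    s_normal = float(image_emb @ normal_text_emb)
+    s_aux = auxiliary_text_embs @ image_emb          # similarities to auxiliary texts
+    return s_normal - float(s_aux.mean())            # subtract the auxiliary baseline
+
+rng = np.random.default_rng(0)
+img = rng.normal(size=512); img /= np.linalg.norm(img)
+normal = rng.normal(size=512); normal /= np.linalg.norm(normal)
+aux = rng.normal(size=(32, 512)); aux /= np.linalg.norm(aux, axis=1, keepdims=True)
+print(bias_corrected_score(img, normal, aux))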
+
+
+
+
+ + ☆ AI-based Density Recognition + + +
+ Learning-based analysis of images is commonly used in the fields of mobility +and robotics for safe environmental motion and interaction. This requires not +only object recognition but also the assignment of certain properties to them. +With the help of this information, causally related actions can be adapted to +different circumstances. Such logical interactions can be optimized by +recognizing object-assigned properties. Density as a physical property offers +the possibility to recognize how heavy an object is, which material it is made +of, which forces are at work, and consequently which influence it has on its +environment. Our approach introduces an AI-based concept for assigning physical +properties to objects through the use of associated images. Based on +synthesized data, we derive specific patterns from 2D images using a neural +network to extract further information such as volume, material, or density. +Accordingly, we discuss the possibilities of property-based feature extraction +to improve causally related logics. + +
+
+
+
+
+ + ☆ High Efficiency Image Compression for Large Visual-Language Models + + +
+ In recent years, large visual language models (LVLMs) have shown impressive
+performance and promising generalization capability in multi-modal tasks, thus
+replacing humans as receivers of visual information in various application
+scenarios. In this paper, we pioneer a variable bitrate image compression
+framework consisting of a pre-editing module and an end-to-end codec to achieve
+promising rate-accuracy performance for different LVLMs. In particular, instead
+of optimizing an adaptive pre-editing network towards a particular task or
+several representative tasks, we propose a new optimization strategy tailored
+for LVLMs, which is designed based on the representation and discrimination
+capability with token-level distortion and rank. The pre-editing module and the
+variable bitrate end-to-end image codec are jointly trained by losses based on
+the semantic tokens of the large model, which introduce enhanced generalization
+capability for various data and tasks. Experimental results demonstrate that
+the proposed framework efficiently achieves much better rate-accuracy
+performance compared to the state-of-the-art coding standard, Versatile Video
+Coding. Meanwhile, experiments with multi-modal tasks have revealed the
+robustness and generalization capability of the proposed framework.
+
+
+
+
+
+ + ☆ DiffCD: A Symmetric Differentiable Chamfer Distance for Neural Implicit + Surface Fitting + + +
+ Neural implicit surfaces can be used to recover accurate 3D geometry from +imperfect point clouds. In this work, we show that state-of-the-art techniques +work by minimizing an approximation of a one-sided Chamfer distance. This shape +metric is not symmetric, as it only ensures that the point cloud is near the +surface but not vice versa. As a consequence, existing methods can produce +inaccurate reconstructions with spurious surfaces. Although one approach +against spurious surfaces has been widely used in the literature, we +theoretically and experimentally show that it is equivalent to regularizing the +surface area, resulting in over-smoothing. As a more appealing alternative, we +propose DiffCD, a novel loss function corresponding to the symmetric Chamfer +distance. In contrast to previous work, DiffCD also assures that the surface is +near the point cloud, which eliminates spurious surfaces without the need for +additional regularization. We experimentally show that DiffCD reliably recovers +a high degree of shape detail, substantially outperforming existing work across +varying surface complexity and noise levels. Project code is available at +https://github.com/linusnie/diffcd. + +
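+ A minimal sketch of the symmetric Chamfer distance between two point sets;
+DiffCD optimises an equivalent objective through the implicit function itself,
+which this brute-force point-to-point version does not show:
+
+import torch
+
+def symmetric_chamfer(points_a, points_b):
+    """points_a: (N, 3), points_b: (M, 3); mean nearest-neighbour distance both ways."""
+    d = torch.cdist(points_a, points_b)              # (N, M) pairwise distances
+    a_to_b = d.min(dim=1).values.mean()              # every point of A near B
+    b_to_a = d.min(dim=0).values.mean()              # every point of B near A
+    return a_to_b + b_to_a                           # one-sided variants drop one term
+
+a, b = torch.rand(100, 3), torch.rand(120, 3)
+print(symmetric_chamfer(a, b))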
+
+
+
+
+ + ☆ Q-Ground: Image Quality Grounding with Large Multi-modality Models + + +
+ Recent advances of large multi-modality models (LMM) have greatly improved +the ability of image quality assessment (IQA) method to evaluate and explain +the quality of visual content. However, these advancements are mostly focused +on overall quality assessment, and the detailed examination of local quality, +which is crucial for comprehensive visual understanding, is still largely +unexplored. In this work, we introduce Q-Ground, the first framework aimed at +tackling fine-scale visual quality grounding by combining large multi-modality +models with detailed visual quality analysis. Central to our contribution is +the introduction of the QGround-100K dataset, a novel resource containing 100k +triplets of (image, quality text, distortion segmentation) to facilitate deep +investigations into visual quality. The dataset comprises two parts: one with +human-labeled annotations for accurate quality assessment, and another labeled +automatically by LMMs such as GPT4V, which helps improve the robustness of +model training while also reducing the costs of data collection. With the +QGround-100K dataset, we propose a LMM-based method equipped with multi-scale +feature learning to learn models capable of performing both image quality +answering and distortion segmentation based on text prompts. This +dual-capability approach not only refines the model's understanding of +region-aware image quality but also enables it to interactively respond to +complex, text-based queries about image quality and specific distortions. +Q-Ground takes a step towards sophisticated visual quality analysis in a finer +scale, establishing a new benchmark for future research in the area. Codes and +dataset are available at https://github.com/Q-Future/Q-Ground. + +
+
+ comment: ACM Multimedia 2024 (Oral) +
+
+
+
+
+ + ☆ Enhancing Environmental Monitoring through Multispectral Imaging: The + WasteMS Dataset for Semantic Segmentation of Lakeside Waste + + +
+ Environmental monitoring of lakeside green areas is crucial for environmental +protection. Compared to manual inspections, computer vision technologies offer +a more efficient solution when deployed on-site. Multispectral imaging provides +diverse information about objects under different spectrums, aiding in the +differentiation between waste and lakeside lawn environments. This study +introduces WasteMS, the first multispectral dataset established for the +semantic segmentation of lakeside waste. WasteMS includes a diverse range of +waste types in lawn environments, captured under various lighting conditions. +We implemented a rigorous annotation process to label waste in images. +Representative semantic segmentation frameworks were used to evaluate +segmentation accuracy using WasteMS. Challenges encountered when using WasteMS +for segmenting waste on lakeside lawns were discussed. The WasteMS dataset is +available at https://github.com/zhuqinfeng1999/WasteMS. + +
+
+
+
+
+ + ☆ EAFormer: Scene Text Segmentation with Edge-Aware Transformers ECCV 2024 + + +
+ Scene text segmentation aims at cropping texts from scene images, which is +usually used to help generative models edit or remove texts. The existing text +segmentation methods tend to involve various text-related supervisions for +better performance. However, most of them ignore the importance of text edges, +which are significant for downstream applications. In this paper, we propose +Edge-Aware Transformers, termed EAFormer, to segment texts more accurately, +especially at the edge of texts. Specifically, we first design a text edge +extractor to detect edges and filter out edges of non-text areas. Then, we +propose an edge-guided encoder to make the model focus more on text edges. +Finally, an MLP-based decoder is employed to predict text masks. We have +conducted extensive experiments on commonly-used benchmarks to verify the +effectiveness of EAFormer. The experimental results demonstrate that the +proposed method can perform better than previous methods, especially on the +segmentation of text edges. Considering that the annotations of several +benchmarks (e.g., COCO_TS and MLT_S) are not accurate enough to fairly evaluate +our methods, we have relabeled these datasets. Through experiments, we observe +that our method can achieve a higher performance improvement when more accurate +annotations are used for training. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Progressive Query Refinement Framework for Bird's-Eye-View Semantic + Segmentation from Surrounding Images IROS 2024 + + +
+ Expressing images with Multi-Resolution (MR) features has been widely adopted +in many computer vision tasks. In this paper, we introduce the MR concept into +Bird's-Eye-View (BEV) semantic segmentation for autonomous driving. This +introduction enhances our model's ability to capture both global and local +characteristics of driving scenes through our proposed residual learning. +Specifically, given a set of MR BEV query maps, the lowest resolution query map +is initially updated using a View Transformation (VT) encoder. This updated +query map is then upscaled and merged with a higher resolution query map to +undergo further updates in a subsequent VT encoder. This process is repeated +until the resolution of the updated query map reaches the target. Finally, the +lowest resolution map is added to the target resolution to generate the final +query map. During training, we enforce both the lowest and final query maps to +align with the ground-truth BEV semantic map to help our model effectively +capture the global and local characteristics. We also propose a visual feature +interaction network that promotes interactions between features across images +and across feature levels, thus highly contributing to the performance +improvement. We evaluate our model on a large-scale real-world dataset. The +experimental results show that our model outperforms the SOTA models in terms +of IoU metric. Codes are available at +https://github.com/d1024choi/ProgressiveQueryRefineNet + +
+
+ comment: IROS 2024 +
+
+
+
+
+ + ☆ LoFormer: Local Frequency Transformer for Image Deblurring + + +
+ Due to the computational complexity of self-attention (SA), prevalent +techniques for image deblurring often resort to either adopting localized SA or +employing coarse-grained global SA methods, both of which exhibit drawbacks +such as compromising global modeling or lacking fine-grained correlation. In +order to address this issue by effectively modeling long-range dependencies +without sacrificing fine-grained details, we introduce a novel approach termed +Local Frequency Transformer (LoFormer). Within each unit of LoFormer, we +incorporate a Local Channel-wise SA in the frequency domain (Freq-LC) to +simultaneously capture cross-covariance within low- and high-frequency local +windows. These operations offer the advantage of (1) ensuring equitable +learning opportunities for both coarse-grained structures and fine-grained +details, and (2) exploring a broader range of representational properties +compared to coarse-grained global SA methods. Additionally, we introduce an MLP +Gating mechanism complementary to Freq-LC, which serves to filter out +irrelevant features while enhancing global learning capabilities. Our +experiments demonstrate that LoFormer significantly improves performance in the +image deblurring task, achieving a PSNR of 34.09 dB on the GoPro dataset with +126G FLOPs. https://github.com/DeepMed-Lab-ECNU/Single-Image-Deblur + +
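+ A minimal sketch of channel-wise self-attention computed on a frequency-domain
+representation of the feature map; the low/high-frequency window split and the
+MLP gating of LoFormer are omitted here:
+
+import torch
+import torch.nn.functional as F
+
+def freq_channel_attention(x):
+    """x: (B, C, H, W) feature map."""
+    B, C, H, W = x.shape
+    freq = torch.fft.rfft2(x, norm="ortho")                    # (B, C, H, W//2+1) complex
+    tokens = torch.view_as_real(freq).reshape(B, C, -1)        # channels as tokens
+    q = k = v = F.normalize(tokens, dim=-1)
+    attn = torch.softmax(q @ k.transpose(1, 2), dim=-1)        # (B, C, C) cross-covariance
+    out = (attn @ v).reshape(B, C, H, W // 2 + 1, 2)
+    return torch.fft.irfft2(torch.view_as_complex(out.contiguous()), s=(H, W), norm="ortho")
+
+print(freq_channel_attention(torch.randn(1, 8, 16, 16)).shape)  # torch.Size([1, 8, 16, 16])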
+
+
+
+
+ + ☆ DreamCar: Leveraging Car-specific Prior for in-the-wild 3D Car + Reconstruction + + +
+ The self-driving industry usually employs professional artists to build
+exquisite 3D cars. However, it is expensive to craft large-scale digital
+assets. Since there are already numerous datasets available that contain a vast
+number of images of cars, we focus on reconstructing high-quality 3D car models
+from these datasets. However, these datasets only contain one side of cars in
+forward-moving scenes. We try to use existing generative models to provide more
+supervision information, but they struggle to generalize well to cars since
+they are trained on synthetic datasets that are not car-specific. In addition,
+the reconstructed 3D car texture misaligns due to large errors in camera pose
+estimation when dealing with in-the-wild images. These restrictions make it
+challenging for previous methods to reconstruct complete 3D cars. To address
+these problems, we propose a novel method, named DreamCar, which can
+reconstruct high-quality 3D cars given a few images or even a single image. To
+generalize the generative model, we collect a car dataset, named Car360, with
+over 5,600 vehicles. With this dataset, we make the generative model more
+robust to cars. We use this car-specific generative prior to guide the
+reconstruction via Score Distillation Sampling. To further complement the
+supervision information, we utilize the geometric and appearance symmetry of
+cars. Finally, we propose a pose optimization method that rectifies poses to
+tackle texture misalignment. Extensive experiments demonstrate that our method
+significantly outperforms existing methods in reconstructing high-quality 3D
+cars. \href{https://xiaobiaodu.github.io/dreamcar-project/}{Our code is
+available.}
+
+
+ comment: Projet Page: https://xiaobiaodu.github.io/dreamcar-project/ +
+
+
+
+
+ + ☆ Diffree: Text-Guided Shape Free Object Inpainting with Diffusion Model + + +
+ This paper addresses an important problem of object addition for images with +only text guidance. It is challenging because the new object must be integrated +seamlessly into the image with consistent visual context, such as lighting, +texture, and spatial location. While existing text-guided image inpainting +methods can add objects, they either fail to preserve the background +consistency or involve cumbersome human intervention in specifying bounding +boxes or user-scribbled masks. To tackle this challenge, we introduce Diffree, +a Text-to-Image (T2I) model that facilitates text-guided object addition with +only text control. To this end, we curate OABench, an exquisite synthetic +dataset by removing objects with advanced image inpainting techniques. OABench +comprises 74K real-world tuples of an original image, an inpainted image with +the object removed, an object mask, and object descriptions. Trained on OABench +using the Stable Diffusion model with an additional mask prediction module, +Diffree uniquely predicts the position of the new object and achieves object +addition with guidance from only text. Extensive experiments demonstrate that +Diffree excels in adding new objects with a high success rate while maintaining +background consistency, spatial appropriateness, and object relevance and +quality. + +
+
+
+
+
+ + ☆ Case-Enhanced Vision Transformer: Improving Explanations of Image + Similarity with a ViT-based Similarity Metric + + +
+ This short paper presents preliminary research on the Case-Enhanced Vision +Transformer (CEViT), a similarity measurement method aimed at improving the +explainability of similarity assessments for image data. Initial experimental +results suggest that integrating CEViT into k-Nearest Neighbor (k-NN) +classification yields classification accuracy comparable to state-of-the-art +computer vision models, while adding capabilities for illustrating differences +between classes. CEViT explanations can be influenced by prior cases, to +illustrate aspects of similarity relevant to those cases. + +
+
+
+
+
+ + ☆ Selective Vision-Language Subspace Projection for Few-shot CLIP + + +
+ Vision-language models such as CLIP are capable of mapping the different +modality data into a unified feature space, enabling zero/few-shot inference by +measuring the similarity of given images and texts. However, most existing +methods overlook modality gaps in CLIP's encoded features, which is shown as +the text and image features lie far apart from each other, resulting in limited +classification performance. To tackle this issue, we introduce a method called +Selective Vision-Language Subspace Projection (SSP), which incorporates local +image features and utilizes them as a bridge to enhance the alignment between +image-text pairs. Specifically, our SSP framework comprises two parallel +modules: a vision projector and a language projector. Both projectors utilize +local image features to span the respective subspaces for image and texts, +thereby projecting the image and text features into their respective subspaces +to achieve alignment. Moreover, our approach entails only training-free matrix +calculations and can be seamlessly integrated into advanced CLIP-based few-shot +learning frameworks. Extensive experiments on 11 datasets have demonstrated +SSP's superior text-image alignment capabilities, outperforming the +state-of-the-art alignment methods. The code is available at +https://github.com/zhuhsingyuu/SSP + +
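+ A minimal sketch of a training-free subspace projection: text and image
+features are projected onto the subspace spanned by a few local image features
+before being compared; a simplified view of the SSP projectors:
+
+import numpy as np
+
+def projection_matrix(local_feats):
+    """local_feats: (k, d) local image features spanning the subspace."""
+    a = local_feats.T                                # (d, k)
+    return a @ np.linalg.pinv(a.T @ a) @ a.T         # (d, d) orthogonal projector
+
+def projected_similarity(image_feat, text_feat, local_feats):
+    p = projection_matrix(local_feats)
+    u, v = p @ image_feat, p @ text_feat             # project both modalities
+    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))
+
+rng = np.random.default_rng(0)
+local = rng.normal(size=(8, 256))
+print(projected_similarity(rng.normal(size=256), rng.normal(size=256), local))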
+
+ comment: Accepted to ACM MultiMedia 2024 +
+
+
+
+
+ + ☆ Toward an Integrated Decision Making Framework for Optimized Stroke + Diagnosis with DSA and Treatment under Uncertainty + + +
+ This study addresses the challenge of stroke diagnosis and treatment under
+uncertainty, a critical issue given the rapid progression and severe
+consequences of stroke conditions such as aneurysms, arteriovenous
+malformations (AVM), and occlusions. Current diagnostic methods, including
+Digital Subtraction Angiography (DSA), face limitations due to high costs and
+their invasive nature. To overcome these challenges, we propose a novel
+approach using a Partially Observable Markov Decision Process (POMDP)
+framework. Our model integrates advanced diagnostic tools and treatment
+approaches with a decision-making algorithm that accounts for the inherent
+uncertainties in stroke diagnosis. Our approach combines noisy observations
+from CT scans, Siriraj scores, and DSA reports to inform the subsequent
+treatment options. We utilize the online solver DESPOT, which employs
+tree-search methods and particle filters, to simulate potential future
+scenarios and guide our strategies. The results indicate that our POMDP
+framework balances diagnostic and treatment objectives, striking a tradeoff
+between the need for precise stroke identification via invasive procedures like
+DSA and the constraints of limited healthcare resources that necessitate more
+cost-effective strategies, such as in-hospital or at-home observation, while
+relying only on simulation rollouts and not imposing any prior knowledge. Our
+study offers a significant contribution by presenting a systematic framework
+that optimally integrates diagnostic and treatment processes for stroke and
+accounts for various uncertainties, thereby improving care and outcomes in
+stroke management.
+
+
+
+
+
+ + ☆ Pose Estimation from Camera Images for Underwater Inspection + + +
+ High-precision localization is pivotal in underwater reinspection missions. +Traditional localization methods like inertial navigation systems, Doppler +velocity loggers, and acoustic positioning face significant challenges and are +not cost-effective for some applications. Visual localization is a +cost-effective alternative in such cases, leveraging the cameras already +equipped on inspection vehicles to estimate poses from images of the +surrounding scene. Amongst these, machine learning-based pose estimation from +images shows promise in underwater environments, performing efficient +relocalization using models trained based on previously mapped scenes. We +explore the efficacy of learning-based pose estimators in both clear and turbid +water inspection missions, assessing the impact of image formats, model +architectures and training data diversity. We innovate by employing novel view +synthesis models to generate augmented training data, significantly enhancing +pose estimation in unexplored regions. Moreover, we enhance localization +accuracy by integrating pose estimator outputs with sensor data via an extended +Kalman filter, demonstrating improved trajectory smoothness and accuracy. + +
+
+ comment: Submitted to IEEE Journal of Oceanic Engineering +
+
+
+
+
+ + ☆ Raindrop Clarity: A Dual-Focused Dataset for Day and Night Raindrop + Removal ECCV2024 + + +
+ Existing raindrop removal datasets have two shortcomings. First, they consist +of images captured by cameras with a focus on the background, leading to the +presence of blurry raindrops. To our knowledge, none of these datasets include +images where the focus is specifically on raindrops, which results in a blurry +background. Second, these datasets predominantly consist of daytime images, +thereby lacking nighttime raindrop scenarios. Consequently, algorithms trained +on these datasets may struggle to perform effectively in raindrop-focused or +nighttime scenarios. The absence of datasets specifically designed for +raindrop-focused and nighttime raindrops constrains research in this area. In +this paper, we introduce a large-scale, real-world raindrop removal dataset +called Raindrop Clarity. Raindrop Clarity comprises 15,186 high-quality +pairs/triplets (raindrops, blur, and background) of images with raindrops and +the corresponding clear background images. There are 5,442 daytime raindrop +images and 9,744 nighttime raindrop images. Specifically, the 5,442 daytime +images include 3,606 raindrop- and 1,836 background-focused images. While the +9,744 nighttime images contain 4,838 raindrop- and 4,906 background-focused +images. Our dataset will enable the community to explore background-focused and +raindrop-focused images, including challenges unique to daytime and nighttime +conditions. Our data and code are available at: +\url{https://github.com/jinyeying/RaindropClarity} + +
+
+ comment: Accepted to ECCV2024, dataset and benchmark at: + \url{https://github.com/jinyeying/RaindropClarity} +
+
+
+
+
+ + ☆ DVPE: Divided View Position Embedding for Multi-View 3D Object Detection + + +
+ Sparse query-based paradigms have achieved significant success in multi-view 3D detection for autonomous vehicles. Current research faces challenges in balancing enlarging receptive fields against reducing interference when aggregating multi-view features. Moreover, different poses of cameras present challenges in training global attention models. To address these problems, this paper proposes a divided view method, in which features are modeled globally via the visibility cross-attention mechanism, but interact only with partial features in a divided local virtual space. This effectively reduces interference from other irrelevant features and alleviates the training difficulties of the transformer by decoupling the position embedding from camera poses. Additionally, 2D historical RoI features are incorporated into the object-centric temporal modeling to utilize high-level visual semantic information. The model is trained using a one-to-many assignment strategy to facilitate stability. Our framework, named DVPE, achieves state-of-the-art performance (57.2% mAP and 64.5% NDS) on the nuScenes test set. Codes will be available at https://github.com/dop0/DVPE.
+
+
+
+
+ + ☆ Open Challenges on Fairness of Artificial Intelligence in Medical + Imaging Applications + + +
+ Recently, the research community of computerized medical imaging has started +to discuss and address potential fairness issues that may emerge when +developing and deploying AI systems for medical image analysis. This chapter +covers some of the pressing challenges encountered when doing research in this +area, and it is intended to raise questions and provide food for thought for +those aiming to enter this research field. The chapter first discusses various +sources of bias, including data collection, model training, and clinical +deployment, and their impact on the fairness of machine learning algorithms in +medical image computing. We then turn to discussing open challenges that we +believe require attention from researchers and practitioners, as well as +potential pitfalls of naive application of common methods in the field. We +cover a variety of topics including the impact of biased metrics when auditing +for fairness, the leveling down effect, task difficulty variations among +subgroups, discovering biases in unseen populations, and explaining biases +beyond standard demographic attributes. + +
+
+ comment: Published as part of the book "Trustworthy AI in Medical Imaging" + (Elsevier, 2024) available at + https://shop.elsevier.com/books/trustworthy-ai-in-medical-imaging/lorenzi/978-0-443-23761-4 +
+
+
+
+
+ + ☆ Affective Behaviour Analysis via Progressive Learning + + +
+ Affective Behavior Analysis aims to develop emotionally intelligent +technology that can recognize and respond to human emotions. To advance this, +the 7th Affective Behavior Analysis in-the-wild (ABAW) competition establishes +two tracks: i.e., the Multi-task Learning (MTL) Challenge and the Compound +Expression (CE) challenge based on Aff-Wild2 and C-EXPR-DB datasets. In this +paper, we present our methods and experimental results for the two competition +tracks. Specifically, it can be summarized in the following four aspects: 1) To +attain high-quality facial features, we train a Masked-Auto Encoder in a +self-supervised manner. 2) We devise a temporal convergence module to capture +the temporal information between video frames and explore the impact of window +size and sequence length on each sub-task. 3) To facilitate the joint +optimization of various sub-tasks, we explore the impact of sub-task joint +training and feature fusion from individual tasks on each task performance +improvement. 4) We utilize curriculum learning to transition the model from +recognizing single expressions to recognizing compound expressions, thereby +improving the accuracy of compound expression recognition. Extensive +experiments demonstrate the superiority of our designs. + +
+
+ comment: Technical Report for the 7th ABAW Competition
+
+
+
+
+ + ☆ McGAN: Generating Manufacturable Designs by Embedding Manufacturing + Rules into Conditional Generative Adversarial Network + + +
+ Generative design (GD) methods aim to automatically generate a wide variety of designs that satisfy functional or aesthetic design requirements. However, research to date generally lacks considerations of manufacturability of the generated designs. To this end, we propose a novel GD approach by using deep neural networks to encode design for manufacturing (DFM) rules, thereby modifying part designs to make them manufacturable by a given manufacturing process. Specifically, a three-step approach is proposed: first, an instance segmentation method, Mask R-CNN, is used to decompose a part design into subregions. Second, a conditional generative adversarial neural network (cGAN), Pix2Pix, transforms unmanufacturable decomposed subregions into manufacturable subregions. The transformed subregions of designs are subsequently reintegrated into a unified manufacturable design. These three steps, Mask R-CNN, Pix2Pix, and reintegration, form the basis of the proposed Manufacturable conditional GAN (McGAN) framework. Experimental results show that McGAN can automatically transform existing unmanufacturable designs into corresponding manufacturable counterparts that realize the specified manufacturing rules, in an efficient and robust manner. The effectiveness of McGAN is demonstrated through two-dimensional design case studies of an injection molding process.
+
+
+
+
+ + ☆ SAR to Optical Image Translation with Color Supervised Diffusion Model + + +
+ Synthetic Aperture Radar (SAR) offers all-weather, high-resolution imaging capabilities, but its complex imaging mechanism often poses challenges for interpretation. In response to these limitations, this paper introduces an innovative generative model designed to transform SAR images into more intelligible optical images, thereby enhancing the interpretability of SAR images. Specifically, our model backbone is based on the recent diffusion models, which have powerful generative capabilities. We employ SAR images as conditional guides in the sampling process and integrate color supervision to counteract color shift issues effectively. We conducted experiments on the SEN12 dataset and employed quantitative evaluations using peak signal-to-noise ratio, structural similarity, and Fréchet inception distance. The results demonstrate that our model not only surpasses previous methods in quantitative assessments but also significantly enhances the visual quality of the generated images.
+
+
+
+
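The evaluation protocol (PSNR, SSIM, FID) is standard; below is a minimal sketch of computing PSNR and SSIM for a generated/reference image pair with scikit-image. The arrays here are random placeholders, and FID is omitted because it requires an Inception feature extractor.

```python
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

# Placeholder images; in practice these would be the generated optical image and the reference.
rng = np.random.default_rng(0)
reference = rng.random((256, 256, 3))
generated = np.clip(reference + rng.normal(scale=0.05, size=reference.shape), 0, 1)

psnr = peak_signal_noise_ratio(reference, generated, data_range=1.0)
ssim = structural_similarity(reference, generated, data_range=1.0, channel_axis=-1)
print(f"PSNR: {psnr:.2f} dB, SSIM: {ssim:.4f}")
```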
+ + ☆ CRASAR-U-DROIDs: A Large Scale Benchmark Dataset for Building Alignment + and Damage Assessment in Georectified sUAS Imagery + + +
+ This document presents the Center for Robot Assisted Search And Rescue - Uncrewed Aerial Systems - Disaster Response Overhead Inspection Dataset (CRASAR-U-DROIDs) for building damage assessment and spatial alignment collected from small uncrewed aerial systems (sUAS) geospatial imagery. This dataset is motivated by the increasing use of sUAS in disaster response, the lack of previous work in utilizing high-resolution geospatial sUAS imagery for machine learning and computer vision models, the lack of alignment with operational use cases, and the hope of enabling further investigation between sUAS and satellite imagery. The CRASAR-U-DROIDs dataset consists of fifty-two (52) orthomosaics from ten (10) federally declared disasters (Hurricane Ian, Hurricane Ida, Hurricane Harvey, Hurricane Idalia, Hurricane Laura, Hurricane Michael, Musset Bayou Fire, Mayfield Tornado, Kilauea Eruption, and Champlain Towers Collapse) spanning 67.98 square kilometers (26.245 square miles), containing 21,716 building polygons and damage labels, and 7,880 adjustment annotations. The imagery was tiled and presented in conjunction with overlaid building polygons to a pool of 130 annotators who provided human judgments of damage according to the Joint Damage Scale. These annotations were then reviewed via a two-stage review process in which building polygon damage labels were first reviewed individually and then again by committee. Additionally, the building polygons have been aligned spatially to precisely overlap with the imagery to enable more performant machine learning models to be trained. It appears that CRASAR-U-DROIDs is the largest labeled dataset of sUAS orthomosaic imagery.
+
+ comment: 16 Pages, 7 Figures, 6 Tables +
+
+
+
+
+ + ☆ Unsqueeze [CLS] Bottleneck to Learn Rich Representations + + +
+ Distillation-based self-supervised learning typically leads to more +compressed representations due to its radical clustering process and the +implementation of a sharper target distribution. To overcome this limitation +and preserve more information from input, we introduce UDI, conceptualized as +Unsqueezed Distillation-based self-supervised learning (SSL). UDI enriches the +learned representation by encouraging multimodal prediction distilled from a +consolidated profile of local predictions that are derived via stratified +sampling. Our evaluations show that UDI not only promotes semantically +meaningful representations at instance level, delivering superior or +competitive results to state-of-the-art SSL methods in image classification, +but also effectively preserves the nuisance of input, which yields significant +improvement in dense prediction tasks, including object detection and +segmentation. Additionally, UDI performs competitively in low-shot image +classification, improving the scalability of joint-embedding pipelines. Various +visualizations and ablation studies are presented to further elucidate the +mechanisms behind UDI. Our source code is available at +https://github.com/ISL-CV/udi. + +
+
+
+
+
+ + ☆ SDLNet: Statistical Deep Learning Network for Co-Occurring Object + Detection and Identification ICML + + +
+ With the growing advances in deep learning-based technologies, the detection and identification of co-occurring objects is a challenging task with many applications in areas such as security and surveillance. In this paper, we propose a novel framework called SDLNet - Statistical analysis with Deep Learning Network - that identifies co-occurring objects in conjunction with base objects in multilabel object categories. The pipeline of the proposed work is implemented in two stages: in the first stage of SDLNet we deal with multilabel detectors for discovering labels, and in the second stage we perform co-occurrence matrix analysis. In co-occurrence matrix analysis, we learn co-occurrence statistics by setting base classes and frequently occurring classes; following this, we build association rules and generate frequent patterns. The crucial part of SDLNet is recognizing base classes and taking co-occurring classes into consideration. Finally, the generated co-occurrence matrix based on frequent patterns shows base classes and their corresponding co-occurring classes. SDLNet is evaluated on two publicly available datasets: Pascal VOC and MS-COCO. The experimental results on these benchmark datasets are reported in Sec. 4.
+
+ comment: 8 pages, 3 figures, ICMLT-2024. arXiv admin note: text overlap with + arXiv:2403.17223 +
+
+
+
+
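A minimal sketch, under our own assumptions, of the second-stage co-occurrence analysis described above: counting how often labels co-occur with chosen base classes in a multilabel dataset and keeping the frequent pairs. The labels, base classes, and support threshold are illustrative, not the paper's settings.

```python
from collections import Counter
from itertools import combinations

# Hypothetical multilabel annotations (one label set per image).
annotations = [
    {"person", "bicycle", "backpack"},
    {"person", "car"},
    {"person", "bicycle"},
    {"car", "traffic light"},
]
base_classes = {"person", "car"}      # assumed base classes
min_support = 2                       # assumed frequency threshold

pair_counts = Counter()
for labels in annotations:
    for a, b in combinations(sorted(labels), 2):
        pair_counts[(a, b)] += 1

# Keep only pairs that involve a base class and occur frequently enough.
co_occurring = {
    pair: count for pair, count in pair_counts.items()
    if count >= min_support and (pair[0] in base_classes or pair[1] in base_classes)
}
print(co_occurring)   # e.g. {('bicycle', 'person'): 2}
```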
+ + ☆ Revising the Problem of Partial Labels from the Perspective of CNNs' + Robustness + + +
+ Convolutional neural networks (CNNs) have gained increasing popularity and versatility in recent decades, finding applications in diverse domains. These remarkable achievements are greatly attributed to the support of extensive datasets with precise labels. However, annotating image datasets is intricate and complex, particularly in the case of multi-label datasets. Hence, the concept of the partial-label setting has been proposed to reduce annotation costs, and numerous corresponding solutions have been introduced. The evaluation methods for these existing solutions have been primarily based on accuracy. That is, their performance is assessed by their predictive accuracy on the test set. However, we argue that such an evaluation is insufficient and one-sided. On one hand, since the quality of the test set has not been evaluated, the assessment results are unreliable. On the other hand, the partial-label problem may also arise from adversarial attacks. Therefore, incorporating robustness into the evaluation system is crucial. For this purpose, we first propose two attack models to generate multiple partial-label datasets with varying label missing rates. Subsequently, we introduce a lightweight partial-label solution using pseudo-labeling techniques and a designed loss function. Then, we employ D-Score to analyze both the proposed and existing methods to determine whether they can enhance robustness while improving accuracy. Extensive experimental results demonstrate that while certain methods may improve accuracy, the enhancement in robustness is not significant, and in some cases, it even diminishes.
+
+
+
+
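A minimal sketch, under our own assumptions, of a pseudo-labeling loss for multi-label learning with missing labels: observed entries use the ground truth, confident predictions on unobserved entries become pseudo-labels, and the rest are ignored. The paper's loss design differs in detail; thresholds and shapes here are illustrative.

```python
import torch
import torch.nn.functional as F

def partial_label_loss(logits, targets, observed_mask, threshold=0.9):
    """BCE over observed labels plus BCE over confident pseudo-labels on missing entries.

    logits, targets, observed_mask: tensors of shape (batch, num_classes);
    targets holds 0/1 where observed_mask == 1 and is ignored elsewhere.
    """
    probs = torch.sigmoid(logits)
    # Pseudo-labels only where the model is confident and the true label is missing.
    confident = ((probs > threshold) | (probs < 1 - threshold)) & (observed_mask == 0)
    pseudo = (probs > 0.5).float()

    loss_obs = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    loss_pseudo = F.binary_cross_entropy_with_logits(logits, pseudo, reduction="none")

    total = loss_obs * observed_mask + loss_pseudo * confident.float()
    denom = (observed_mask + confident.float()).sum().clamp(min=1.0)
    return total.sum() / denom

logits = torch.randn(4, 5)
targets = torch.randint(0, 2, (4, 5)).float()
observed = (torch.rand(4, 5) > 0.5).float()   # simulate 50% missing labels
print(partial_label_loss(logits, targets, observed))
```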
+ + ☆ PEEKABOO: Hiding parts of an image for unsupervised object localization + + +
+ Localizing objects in an unsupervised manner poses significant challenges due +to the absence of key visual information such as the appearance, type and +number of objects, as well as the lack of labeled object classes typically +available in supervised settings. While recent approaches to unsupervised +object localization have demonstrated significant progress by leveraging +self-supervised visual representations, they often require computationally +intensive training processes, resulting in high resource demands in terms of +computation, learnable parameters, and data. They also lack explicit modeling +of visual context, potentially limiting their accuracy in object localization. +To tackle these challenges, we propose a single-stage learning framework, +dubbed PEEKABOO, for unsupervised object localization by learning context-based +representations at both the pixel- and shape-level of the localized objects +through image masking. The key idea is to selectively hide parts of an image +and leverage the remaining image information to infer the location of objects +without explicit supervision. The experimental results, both quantitative and +qualitative, across various benchmark datasets, demonstrate the simplicity, +effectiveness and competitive performance of our approach compared to +state-of-the-art methods in both single object discovery and unsupervised +salient object detection tasks. Code and pre-trained models are available at: +https://github.com/hasibzunair/peekaboo + +
+
+
+
+
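A minimal sketch, under our own assumptions, of the image-masking idea: randomly hiding square patches of an image so a model must infer object location from the remaining context. The patch size and masking ratio are illustrative, not PEEKABOO's actual settings.

```python
import torch

def mask_patches(images, patch=16, mask_ratio=0.5, generator=None):
    """Zero out a random subset of non-overlapping patches in a batch of images (B, C, H, W)."""
    b, c, h, w = images.shape
    gh, gw = h // patch, w // patch
    num_patches = gh * gw
    num_masked = int(mask_ratio * num_patches)

    keep = torch.ones(b, num_patches, device=images.device)
    for i in range(b):
        idx = torch.randperm(num_patches, generator=generator)[:num_masked]
        keep[i, idx] = 0.0

    # Expand the patch-level mask to pixel resolution and apply it.
    mask = keep.view(b, 1, gh, gw)
    mask = mask.repeat_interleave(patch, dim=2).repeat_interleave(patch, dim=3)
    return images * mask, mask

images = torch.rand(2, 3, 224, 224)
masked, mask = mask_patches(images)
print(masked.shape, mask.mean().item())   # about half of the pixels are kept
```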
+ + ☆ CoMoTo: Unpaired Cross-Modal Lesion Distillation Improves Breast Lesion + Detection in Tomosynthesis MICCAI 2024 + + +
+ Digital Breast Tomosynthesis (DBT) is an advanced breast imaging modality +that offers superior lesion detection accuracy compared to conventional +mammography, albeit at the trade-off of longer reading time. Accelerating +lesion detection from DBT using deep learning is hindered by limited data +availability and huge annotation costs. A possible solution to this issue could +be to leverage the information provided by a more widely available modality, +such as mammography, to enhance DBT lesion detection. In this paper, we present +a novel framework, CoMoTo, for improving lesion detection in DBT. Our framework +leverages unpaired mammography data to enhance the training of a DBT model, +improving practicality by eliminating the need for mammography during +inference. Specifically, we propose two novel components, Lesion-specific +Knowledge Distillation (LsKD) and Intra-modal Point Alignment (ImPA). LsKD +selectively distills lesion features from a mammography teacher model to a DBT +student model, disregarding background features. ImPA further enriches LsKD by +ensuring the alignment of lesion features within the teacher before distilling +knowledge to the student. Our comprehensive evaluation shows that CoMoTo is +superior to traditional pretraining and image-level KD, improving performance +by 7% Mean Sensitivity under low-data setting. Our code is available at +https://github.com/Muhammad-Al-Barbary/CoMoTo . + +
+
+ comment: ADSMI @ MICCAI 2024 +
+
+
+
+
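A minimal sketch, under our own assumptions, of lesion-focused feature distillation: teacher and student feature maps are compared only inside a lesion mask so background features are disregarded. The shapes and the distance function are illustrative and do not reproduce the LsKD/ImPA components exactly.

```python
import torch

def lesion_distillation_loss(student_feat, teacher_feat, lesion_mask):
    """MSE between student and teacher features restricted to lesion regions.

    student_feat, teacher_feat: (B, C, H, W); lesion_mask: (B, 1, H, W) with 1 inside lesions.
    """
    teacher_feat = teacher_feat.detach()                 # no gradient flows to the teacher
    diff = (student_feat - teacher_feat) ** 2 * lesion_mask
    denom = lesion_mask.sum().clamp(min=1.0) * student_feat.shape[1]
    return diff.sum() / denom

student = torch.randn(2, 64, 32, 32, requires_grad=True)
teacher = torch.randn(2, 64, 32, 32)
mask = (torch.rand(2, 1, 32, 32) > 0.8).float()          # sparse hypothetical lesion mask
loss = lesion_distillation_loss(student, teacher, mask)
loss.backward()
print(loss.item())
```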
+ + ☆ Quality Assured: Rethinking Annotation Strategies in Imaging AI ECCV 2024 + + +
+ This paper does not describe a novel method. Instead, it studies an essential foundation for reliable benchmarking and ultimately real-world application of AI-based image analysis: generating high-quality reference annotations. Previous research has focused on crowdsourcing as a means of outsourcing annotations. However, little attention has so far been given to annotation companies, specifically regarding their internal quality assurance (QA) processes. Therefore, our aim is to evaluate the influence of QA employed by annotation companies on annotation quality and devise methodologies for maximizing data annotation efficacy. Based on 57,648 instance-segmented images obtained from 924 annotators and 34 QA workers from four annotation companies and Amazon Mechanical Turk (MTurk), we derived the following insights: (1) Annotation companies perform better both in terms of quantity and quality compared to the widely used platform MTurk. (2) Annotation companies' internal QA only provides marginal improvements, if any. However, improving labeling instructions instead of investing in QA can substantially boost annotation performance. (3) The benefit of internal QA depends on specific image characteristics. Our work could enable researchers to derive substantially more value from a fixed annotation budget and change the way annotation companies conduct internal QA.
+
+ comment: Accepted at ECCV 2024, preprint, Computer Vision, Data Annotation +
+
+
+
+
+ + ♻ ☆ SwinSF: Image Reconstruction from Spatial-Temporal Spike Streams + + +
+ The spike camera, with its high temporal resolution, low latency, and high dynamic range, addresses high-speed imaging challenges like motion blur. It captures photons at each pixel independently, creating binary spike streams rich in temporal information but challenging for image reconstruction. Current algorithms, both traditional and deep learning-based, still fall short in exploiting this rich temporal detail and in restoring fine details of the reconstructed image. To overcome this, we introduce Swin Spikeformer (SwinSF), a novel model for dynamic scene reconstruction from spike streams. SwinSF is composed of Spike Feature Extraction, Spatial-Temporal Feature Extraction, and Final Reconstruction modules. It combines shifted window self-attention and the proposed temporal spike attention, ensuring a comprehensive feature extraction that encapsulates both spatial and temporal dynamics, leading to a more robust and accurate reconstruction of spike streams. Furthermore, we build a new synthesized dataset for spike image reconstruction which matches the resolution of the latest spike camera, ensuring its relevance and applicability to the latest developments in spike camera imaging. Experimental results demonstrate that the proposed network SwinSF sets a new benchmark, achieving state-of-the-art performance across a series of datasets, including both real-world and synthesized data across various resolutions. Our codes and proposed dataset will be available soon.
+
+
+
+
+ + ♻ ☆ FDS: Feedback-guided Domain Synthesis with Multi-Source Conditional + Diffusion Models for Domain Generalization + + +
+ Domain Generalization techniques aim to enhance model robustness by +simulating novel data distributions during training, typically through various +augmentation or stylization strategies. However, these methods frequently +suffer from limited control over the diversity of generated images and lack +assurance that these images span distinct distributions. To address these +challenges, we propose FDS, Feedback-guided Domain Synthesis, a novel strategy +that employs diffusion models to synthesize novel, pseudo-domains by training a +single model on all source domains and performing domain mixing based on +learned features. By incorporating images that pose classification challenges +to models trained on original samples, alongside the original dataset, we +ensure the generation of a training set that spans a broad distribution +spectrum. Our comprehensive evaluations demonstrate that this methodology sets +new benchmarks in domain generalization performance across a range of +challenging datasets, effectively managing diverse types of domain shifts. The +implementation is available at: \url{https://github.com/Mehrdad-Noori/FDS.git}. + +
+
+
+
+
+ + ♻ ☆ MetaCap: Meta-learning Priors from Multi-View Imagery for Sparse-view + Human Performance Capture and Rendering + + +
+ Faithful human performance capture and free-view rendering from sparse RGB +observations is a long-standing problem in Vision and Graphics. The main +challenges are the lack of observations and the inherent ambiguities of the +setting, e.g. occlusions and depth ambiguity. As a result, radiance fields, +which have shown great promise in capturing high-frequency appearance and +geometry details in dense setups, perform poorly when naively supervising them +on sparse camera views, as the field simply overfits to the sparse-view inputs. +To address this, we propose MetaCap, a method for efficient and high-quality +geometry recovery and novel view synthesis given very sparse or even a single +view of the human. Our key idea is to meta-learn the radiance field weights +solely from potentially sparse multi-view videos, which can serve as a prior +when fine-tuning them on sparse imagery depicting the human. This prior +provides a good network weight initialization, thereby effectively addressing +ambiguities in sparse-view capture. Due to the articulated structure of the +human body and motion-induced surface deformations, learning such a prior is +non-trivial. Therefore, we propose to meta-learn the field weights in a +pose-canonicalized space, which reduces the spatial feature range and makes +feature learning more effective. Consequently, one can fine-tune our field +parameters to quickly generalize to unseen poses, novel illumination conditions +as well as novel and sparse (even monocular) camera views. For evaluating our +method under different scenarios, we collect a new dataset, WildDynaCap, which +contains subjects captured in, both, a dense camera dome and in-the-wild sparse +camera rigs, and demonstrate superior results compared to recent +state-of-the-art methods on, both, public and WildDynaCap dataset. + +
+
+ comment: Project page: https://vcai.mpi-inf.mpg.de/projects/MetaCap/ +
+
+
+
+
+ + ♻ ☆ MM-Soc: Benchmarking Multimodal Large Language Models in Social Media + Platforms ACL 2024 + + +
+ Social media platforms are hubs for multimodal information exchange, encompassing text, images, and videos, making it challenging for machines to comprehend the information or emotions associated with interactions in online spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising solution to these challenges, yet they struggle to accurately interpret human emotions and complex content such as misinformation. This paper introduces MM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of multimodal social media content. MM-Soc compiles prominent multimodal datasets and incorporates a novel large-scale YouTube tagging dataset, targeting a range of tasks including misinformation detection, hate speech detection, and social context generation. Through our exhaustive evaluation on ten size-variants of four open-source MLLMs, we have identified significant performance disparities, highlighting the need for advancements in models' social understanding capabilities. Our analysis reveals that, in a zero-shot setting, various types of MLLMs generally exhibit difficulties in handling social media tasks. However, MLLMs demonstrate performance improvements post fine-tuning, suggesting potential pathways for improvement. Our code and data are available at https://github.com/claws-lab/MMSoc.git.
+
+ comment: In Proceedings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ MutDet: Mutually Optimizing Pre-training for Remote Sensing Object + Detection ECCV 2024 + + +
+ Detection pre-training methods for the DETR series of detectors have been extensively studied in natural scenes, e.g., DETReg. However, detection pre-training remains unexplored in remote sensing scenes. In existing pre-training methods, alignment between object embeddings extracted from a pre-trained backbone and detector features is significant. However, due to differences in feature extraction methods, a pronounced feature discrepancy still exists and hinders the pre-training performance. Remote sensing images, with complex environments and more densely distributed objects, exacerbate the discrepancy. In this work, we propose a novel Mutually optimizing pre-training framework for remote sensing object Detection, dubbed MutDet. In MutDet, we propose a systematic solution against this challenge. Firstly, we propose a mutual enhancement module, which fuses the object embeddings and detector features bidirectionally in the last encoder layer, enhancing their information interaction. Secondly, a contrastive alignment loss is employed to guide this alignment process softly and simultaneously enhance the discriminativity of detector features. Finally, we design an auxiliary siamese head to mitigate the task gap arising from the introduction of the enhancement module. Comprehensive experiments on various settings show new state-of-the-art transfer performance. The improvement is particularly pronounced when data quantity is limited. When using 10% of the DIOR-R data, MutDet improves DetReg by 6.1% in AP50. Codes and models are available at: https://github.com/floatingstarZ/MutDet.
+
+ comment: 14 pages, 4 figures; Accept to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Multimodal Query-guided Object Localization + + +
+ Consider a scenario in one-shot query-guided object localization where neither an image of the object nor the object category name is available as a query. In such a scenario, a hand-drawn sketch of the object could be a choice for a query. However, hand-drawn crude sketches alone, when used as queries, might be ambiguous for object localization, e.g., a sketch of a laptop could be confused for a sofa. On the other hand, a linguistic definition of the category, e.g., "a small portable computer small enough to use in your lap", along with the sketch query, gives better visual and semantic cues for object localization. In this work, we present a multimodal query-guided object localization approach under the challenging open-set setting. In particular, we use queries from two modalities, namely, hand-drawn sketch and description of the object (also known as gloss), to perform object localization. Multimodal query-guided object localization is a challenging task, especially when a large domain gap exists between the queries and the natural images, as well as due to the challenge of combining the complementary and minimal information present across the queries. For example, hand-drawn crude sketches contain abstract shape information of an object, while the text descriptions often capture partial semantic information about a given object category. To address the aforementioned challenges, we present a novel cross-modal attention scheme that guides the region proposal network to generate object proposals relevant to the input queries and a novel orthogonal projection-based proposal scoring technique that scores each proposal with respect to the queries, thereby yielding the final localization results. ...
+
+ comment: Accepted to MMTA +
+
+
+
+
+ + ♻ ☆ TLControl: Trajectory and Language Control for Human Motion Synthesis + + +
+ Controllable human motion synthesis is essential for applications in AR/VR, +gaming and embodied AI. Existing methods often focus solely on either language +or full trajectory control, lacking precision in synthesizing motions aligned +with user-specified trajectories, especially for multi-joint control. To +address these issues, we present TLControl, a novel method for realistic human +motion synthesis, incorporating both low-level Trajectory and high-level +Language semantics controls, through the integration of neural-based and +optimization-based techniques. Specifically, we begin with training a VQ-VAE +for a compact and well-structured latent motion space organized by body parts. +We then propose a Masked Trajectories Transformer (MTT) for predicting a motion +distribution conditioned on language and trajectory. Once trained, we use MTT +to sample initial motion predictions given user-specified partial trajectories +and text descriptions as conditioning. Finally, we introduce a test-time +optimization to refine these coarse predictions for precise trajectory control, +which offers flexibility by allowing users to specify various optimization +goals and ensures high runtime efficiency. Comprehensive experiments show that +TLControl significantly outperforms the state-of-the-art in trajectory accuracy +and time efficiency, making it practical for interactive and high-quality +animation generation. + +
+
+
+
+
+ + ♻ ☆ EventBind: Learning a Unified Representation to Bind Them All for + Event-based Open-world Understanding ECCV 2024 + + +
+ In this paper, we propose EventBind, a novel and effective framework that unleashes the potential of vision-language models (VLMs) for event-based recognition to compensate for the lack of large-scale event-based datasets. In particular, due to the distinct modality gap with the image-text data and the lack of large-scale datasets, learning a common representation space for images, texts, and events is non-trivial. Intuitively, we need to address two key challenges: 1) how to generalize CLIP's visual encoder to event data while fully leveraging events' unique properties, e.g., sparsity and high temporal resolution; 2) how to effectively align the multi-modal embeddings, i.e., image, text, and events. Accordingly, we first introduce a novel event encoder that subtly models the temporal information from events and, meanwhile, generates event prompts for modality bridging. We then design a text encoder that generates content prompts and utilizes hybrid text prompts to enhance EventBind's generalization ability across diverse datasets. With the proposed event encoder, text encoder, and image encoder, a novel Hierarchical Triple Contrastive Alignment (HTCA) module is introduced to jointly optimize the correlation and enable efficient knowledge transfer among the three modalities. We evaluate various settings, including fine-tuning and few-shot, on three benchmarks, and our EventBind achieves new state-of-the-art accuracy compared with the previous methods, such as on N-Caltech101 (+5.34% and +1.70%) and N-Imagenet (+5.65% and +1.99%) with fine-tuning and 20-shot settings, respectively. Moreover, our EventBind can be flexibly extended to the event retrieval task using text or image queries, showing plausible performance. Project page: https://vlislab22.github.io/EventBind/.
+
+ comment: ECCV 2024 Accepted. Camera-ready version with supplementary +
+
+
+
+
+ + ♻ ☆ SRFNet: Monocular Depth Estimation with Fine-grained Structure via + Spatial Reliability-oriented Fusion of Frames and Events ICRA 2024 + + +
+ Monocular depth estimation is a crucial task to measure distance relative to a camera, which is important for applications such as robot navigation and self-driving. Traditional frame-based methods suffer from performance drops due to the limited dynamic range and motion blur. Therefore, recent works leverage novel event cameras to complement or guide the frame modality via frame-event feature fusion. However, event streams exhibit spatial sparsity, leaving some areas unperceived, especially in regions with marginal light changes. Therefore, direct fusion methods, e.g., RAMNet, often ignore the contribution of the most confident regions of each modality. This leads to structural ambiguity in the modality fusion process, thus degrading the depth estimation performance. In this paper, we propose a novel Spatial Reliability-oriented Fusion Network (SRFNet), that can estimate depth with fine-grained structure at both daytime and nighttime. Our method consists of two key technical components. Firstly, we propose an attention-based interactive fusion (AIF) module that applies spatial priors of events and frames as the initial masks and learns the consensus regions to guide the inter-modal feature fusion. The fused features are then fed back to enhance the frame and event feature learning. Meanwhile, it utilizes an output head to generate a fused mask, which is iteratively updated for learning consensual spatial priors. Secondly, we propose the Reliability-oriented Depth Refinement (RDR) module to estimate dense depth with the fine-grained structure based on the fused features and masks. We evaluate the effectiveness of our method on the synthetic and real-world datasets, which shows that, even without pretraining, our method outperforms the prior methods, e.g., RAMNet, especially in night scenes. Our project homepage: https://vlislab22.github.io/SRFNet.
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Trackastra: Transformer-based cell tracking for live-cell microscopy ECCV 2024 + + +
+ Cell tracking is a ubiquitous image analysis task in live-cell microscopy. +Unlike multiple object tracking (MOT) for natural images, cell tracking +typically involves hundreds of similar-looking objects that can divide in each +frame, making it a particularly challenging problem. Current state-of-the-art +approaches follow the tracking-by-detection paradigm, i.e. first all cells are +detected per frame and successively linked in a second step to form +biologically consistent cell tracks. Linking is commonly solved via discrete +optimization methods, which require manual tuning of hyperparameters for each +dataset and are therefore cumbersome to use in practice. Here we propose +Trackastra, a general purpose cell tracking approach that uses a simple +transformer architecture to directly learn pairwise associations of cells +within a temporal window from annotated data. Importantly, unlike existing +transformer-based MOT pipelines, our learning architecture also accounts for +dividing objects such as cells and allows for accurate tracking even with +simple greedy linking, thus making strides towards removing the requirement for +a complex linking step. The proposed architecture operates on the full +spatio-temporal context of detections within a time window by avoiding the +computational burden of processing dense images. We show that our tracking +approach performs on par with or better than highly tuned state-of-the-art cell +tracking algorithms for various biological datasets, such as bacteria, cell +cultures and fluorescent particles. We provide code at +https://github.com/weigertlab/trackastra. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
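A minimal sketch, under our own assumptions, of the greedy linking step: given a matrix of learned pairwise association scores between detections in frame t and frame t+1, repeatedly take the highest-scoring remaining pair above a threshold. Handling of cell divisions and the temporal window is simplified away here.

```python
import numpy as np

def greedy_link(scores, threshold=0.5):
    """Link detections greedily from an association score matrix (n_t x n_t1)."""
    scores = scores.copy()
    links = []
    while True:
        i, j = np.unravel_index(np.argmax(scores), scores.shape)
        if scores[i, j] < threshold:
            break
        links.append((i, j))
        scores[i, :] = -np.inf     # each detection in frame t links at most once
        scores[:, j] = -np.inf     # likewise for frame t+1 (divisions would relax this)
        if len(links) == min(scores.shape):
            break
    return links

scores = np.array([[0.9, 0.1, 0.2],
                   [0.3, 0.8, 0.1],
                   [0.2, 0.1, 0.05]])
print(greedy_link(scores))   # [(0, 0), (1, 1)]
```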
+ + ♻ ☆ DarSwin: Distortion Aware Radial Swin Transformer + + +
+ Wide-angle lenses are commonly used in perception tasks requiring a large +field of view. Unfortunately, these lenses produce significant distortions, +making conventional models that ignore the distortion effects unable to adapt +to wide-angle images. In this paper, we present a novel transformer-based model +that automatically adapts to the distortion produced by wide-angle lenses. Our +proposed image encoder architecture, dubbed DarSwin, leverages the physical +characteristics of such lenses analytically defined by the radial distortion +profile. In contrast to conventional transformer-based architectures, DarSwin +comprises a radial patch partitioning, a distortion-based sampling technique +for creating token embeddings, and an angular position encoding for radial +patch merging. Compared to other baselines, DarSwin achieves the best results +on different datasets with significant gains when trained on bounded levels of +distortions (very low, low, medium, and high) and tested on all, including +out-of-distribution distortions. While the base DarSwin architecture requires +knowledge of the radial distortion profile, we show it can be combined with a +self-calibration network that estimates such a profile from the input image +itself, resulting in a completely uncalibrated pipeline. Finally, we also +present DarSwin-Unet, which extends DarSwin, to an encoder-decoder architecture +suitable for pixel-level tasks. We demonstrate its performance on depth +estimation and show through extensive experiments that DarSwin-Unet can perform +zero-shot adaptation to unseen distortions of different wide-angle lenses. The +code and models are publicly available at https://lvsn.github.io/darswin/ + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions CVPR 2024 + + +
+ Recently, impressive results have been achieved in 3D scene editing with text +instructions based on a 2D diffusion model. However, current diffusion models +primarily generate images by predicting noise in the latent space, and the +editing is usually applied to the whole image, which makes it challenging to +perform delicate, especially localized, editing for 3D scenes. Inspired by +recent 3D Gaussian splatting, we propose a systematic framework, named +GaussianEditor, to edit 3D scenes delicately via 3D Gaussians with text +instructions. Benefiting from the explicit property of 3D Gaussians, we design +a series of techniques to achieve delicate editing. Specifically, we first +extract the region of interest (RoI) corresponding to the text instruction, +aligning it to 3D Gaussians. The Gaussian RoI is further used to control the +editing process. Our framework can achieve more delicate and precise editing of +3D scenes than previous methods while enjoying much faster training speed, i.e. +within 20 minutes on a single V100 GPU, more than twice as fast as +Instruct-NeRF2NeRF (45 minutes -- 2 hours). + +
+
+ comment: CVPR 2024, Project page: https://GaussianEditor.github.io +
+
+
+
+
+ + ♻ ☆ CycleMix: Mixing Source Domains for Domain Generalization in + Style-Dependent Data + + +
+ As deep learning-based systems have become an integral part of everyday life, +limitations in their generalization ability have begun to emerge. Machine +learning algorithms typically rely on the i.i.d. assumption, meaning that their +training and validation data are expected to follow the same distribution, +which does not necessarily hold in practice. In the case of image +classification, one frequent reason that algorithms fail to generalize is that +they rely on spurious correlations present in training data, such as +associating image styles with target classes. These associations may not be +present in the unseen test data, leading to significant degradation of their +effectiveness. In this work, we attempt to mitigate this Domain Generalization +(DG) problem by training a robust feature extractor which disregards features +attributed to image-style but infers based on style-invariant image +representations. To achieve this, we train CycleGAN models to learn the +different styles present in the training data and randomly mix them together to +create samples with novel style attributes to improve generalization. +Experimental results on the PACS DG benchmark validate the proposed method. + +
+
+
+
+
+ + ♻ ☆ Inter and Intra Prior Learning-based Hyperspectral Image Reconstruction + Using Snapshot SWIR Metasurface + + +
+ Shortwave-infrared (SWIR) spectral information, ranging from 1 μm to 2.5 μm, overcomes the limitations of traditional color cameras in acquiring scene information. However, conventional SWIR hyperspectral imaging systems face challenges due to their bulky setups and low acquisition speeds. This work introduces a snapshot SWIR hyperspectral imaging system based on a metasurface filter and a corresponding filter selection method to achieve the lowest correlation coefficient among these filters. This system offers the advantages of compact size and snapshot imaging. We propose a novel inter and intra prior learning unfolding framework to achieve high-quality SWIR hyperspectral image reconstruction, which bridges the gap between prior learning and cross-stage information interaction. Additionally, we design an adaptive feature transfer mechanism to adaptively transfer the contextual correlation of multi-scale encoder features to prevent detailed information loss in the decoder. Experimental results demonstrate that our method can reconstruct hyperspectral images with high speed and superior performance over existing methods.
+
+ comment: 12 pages,9 figures +
+
+
+
+
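A minimal sketch, under our own assumptions, of selecting a filter subset with low mutual correlation: greedily add the candidate transmission spectrum whose worst-case absolute correlation with the already-selected filters is smallest. The candidate spectra here are random placeholders, and the paper's actual selection criterion may differ.

```python
import numpy as np

rng = np.random.default_rng(0)
candidates = rng.random((100, 64))          # 100 hypothetical filter transmission spectra, 64 bands

def select_filters(spectra, k):
    """Greedy selection minimizing the maximum pairwise correlation among chosen filters."""
    selected = [0]                          # start from an arbitrary filter
    corr = np.abs(np.corrcoef(spectra))     # pairwise |correlation| between all candidates
    while len(selected) < k:
        remaining = [i for i in range(len(spectra)) if i not in selected]
        # For each remaining filter, its worst-case correlation with the selected set.
        worst = [corr[i, selected].max() for i in remaining]
        selected.append(remaining[int(np.argmin(worst))])
    return selected

print(select_filters(candidates, k=8))
```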
+ + ♻ ☆ Velocity Driven Vision: Asynchronous Sensor Fusion Birds Eye View Models + for Autonomous Vehicles + + +
+ Fusing different sensor modalities can be a difficult task, particularly if they are asynchronous. Asynchronisation may arise due to long processing times or improper synchronisation during calibration, and there must exist a way to still utilise this previous information for safe driving and for object detection in ego-vehicle/multi-agent trajectory prediction. Difficulties arise from the fact that the sensor modalities have captured information at different times and also at different positions in space. Therefore, they are neither spatially nor temporally aligned. This paper investigates the challenge of radar and LiDAR sensors being asynchronous relative to the camera sensors, for various time latencies. The spatial alignment will be resolved before lifting into BEV space via the transformation of the radar/LiDAR point clouds into the new ego frame coordinate system. Only after this can we concatenate the radar/LiDAR point cloud and lifted camera features. Temporal alignment will be remedied for radar data only; we will implement a novel method of inferring the future radar point positions using the velocity information. Our approach to resolving the issue of sensor asynchrony yields promising results. We demonstrate that velocity information can drastically improve IoU for asynchronous datasets: for a time latency of 360 milliseconds (ms), IoU improves from 49.54 to 53.63. Additionally, for a time latency of 550 ms, the camera+radar (C+R) model outperforms the camera+LiDAR (C+L) model by 0.18 IoU. This is an advancement in utilising the often-neglected radar sensor modality, which is less favoured than LiDAR for autonomous driving purposes.
+
+
+
+
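A minimal sketch, under our own assumptions, of the core temporal-alignment idea: propagating radar points forward by their measured velocity over the known latency so they line up with the camera timestamp. Coordinate frames and velocity conventions are simplified placeholders.

```python
import numpy as np

def propagate_radar_points(points_xy, velocities_xy, latency_s):
    """Shift radar detections to the camera timestamp using their velocity.

    points_xy: (N, 2) positions in the ego frame at the radar timestamp.
    velocities_xy: (N, 2) per-point velocities (m/s).
    latency_s: time offset between radar capture and camera capture, in seconds.
    """
    return points_xy + velocities_xy * latency_s

points = np.array([[10.0, 2.0], [25.0, -1.5]])
velocities = np.array([[5.0, 0.0], [-2.0, 0.5]])
print(propagate_radar_points(points, velocities, latency_s=0.36))
# 360 ms of latency moves the first point 1.8 m forward along x.
```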
+ + ♻ ☆ PhenoBench -- A Large Dataset and Benchmarks for Semantic Image + Interpretation in the Agricultural Domain + + +
+ The production of food, feed, fiber, and fuel is a key task of agriculture, +which has to cope with many challenges in the upcoming decades, e.g., a higher +demand, climate change, lack of workers, and the availability of arable land. +Vision systems can support making better and more sustainable field management +decisions, but also support the breeding of new crop varieties by allowing +temporally dense and reproducible measurements. Recently, agricultural robotics +got an increasing interest in the vision and robotics communities since it is a +promising avenue for coping with the aforementioned lack of workers and +enabling more sustainable production. While large datasets and benchmarks in +other domains are readily available and enable significant progress, +agricultural datasets and benchmarks are comparably rare. We present an +annotated dataset and benchmarks for the semantic interpretation of real +agricultural fields. Our dataset recorded with a UAV provides high-quality, +pixel-wise annotations of crops and weeds, but also crop leaf instances at the +same time. Furthermore, we provide benchmarks for various tasks on a hidden +test set comprised of different fields: known fields covered by the training +data and a completely unseen field. Our dataset, benchmarks, and code are +available at \url{https://www.phenobench.org}. + +
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence (T-PAMI) +
+
+
+
+
+ + ♻ ☆ Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching CVPR 2024 + + +
+ Non-isometric shape correspondence remains a fundamental challenge in +computer vision. Traditional methods using Laplace-Beltrami operator (LBO) +eigenmodes face limitations in characterizing high-frequency extrinsic shape +changes like bending and creases. We propose a novel approach of combining the +non-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell +hessian with the intrinsic ones of the LBO, creating a hybrid spectral space in +which we construct functional maps. To this end, we present a theoretical +framework to effectively integrate non-orthogonal basis functions into +descriptor- and learning-based functional map methods. Our approach can be +incorporated easily into existing functional map pipelines across varying +applications and is able to handle complex deformations beyond isometries. We +show extensive evaluations across various supervised and unsupervised settings +and demonstrate significant improvements. Notably, our approach achieves up to +15% better mean geodesic error for non-isometric correspondence settings and up +to 45% improvement in scenarios with topological noise. + +
+
+ comment: Presented at CVPR 2024. This version contains two additional figures + in the main paper and generalization experiments in the appendix. Please cite + the official IEEE CVPR publication +
+
+
+
+
+ + ♻ ☆ Copyright Protection in Generative AI: A Technical Perspective + + +
+ Generative AI has witnessed rapid advancement in recent years, expanding its capabilities to create synthesized content such as text, images, audio, and code. The high fidelity and authenticity of contents generated by these Deep Generative Models (DGMs) have sparked significant copyright concerns. There have been various legal debates on how to effectively safeguard copyrights in DGMs. This work delves into this issue by providing a comprehensive overview of copyright protection from a technical perspective. We examine from two distinct viewpoints: the copyrights pertaining to the source data held by the data owners and those of the generative models maintained by the model builders. For data copyright, we delve into methods by which data owners can protect their content and by which DGMs can be utilized without infringing upon these rights. For model copyright, our discussion extends to strategies for preventing model theft and identifying outputs generated by specific models. Finally, we highlight the limitations of existing techniques and identify areas that remain unexplored. Furthermore, we discuss prospective directions for the future of copyright protection, underscoring its importance for the sustainable and ethical development of Generative AI.
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ MeshVPR: Citywide Visual Place Recognition Using 3D Meshes + + +
+ Mesh-based scene representation offers a promising direction for simplifying +large-scale hierarchical visual localization pipelines, combining a visual +place recognition step based on global features (retrieval) and a visual +localization step based on local features. While existing work demonstrates the +viability of meshes for visual localization, the impact of using synthetic +databases rendered from them in visual place recognition remains largely +unexplored. In this work we investigate using dense 3D textured meshes for +large-scale Visual Place Recognition (VPR). We identify a significant +performance drop when using synthetic mesh-based image databases compared to +real-world images for retrieval. To address this, we propose MeshVPR, a novel +VPR pipeline that utilizes a lightweight features alignment framework to bridge +the gap between real-world and synthetic domains. MeshVPR leverages pre-trained +VPR models and is efficient and scalable for city-wide deployments. We +introduce novel datasets with freely available 3D meshes and manually collected +queries from Berlin, Paris, and Melbourne. Extensive evaluations demonstrate +that MeshVPR achieves competitive performance with standard VPR pipelines, +paving the way for mesh-based localization systems. Data, code, and interactive +visualizations are available at https://meshvpr.github.io/ + +
+
+ comment: Website: https://mesh-vpr.github.io/ +
+
+
+
+
+ + ♻ ☆ EXACT: How to Train Your Accuracy + + +
+ Classification tasks are usually evaluated in terms of accuracy. However, +accuracy is discontinuous and cannot be directly optimized using gradient +ascent. Popular methods minimize cross-entropy, hinge loss, or other surrogate +losses, which can lead to suboptimal results. In this paper, we propose a new +optimization framework by introducing stochasticity to a model's output and +optimizing expected accuracy, i.e. accuracy of the stochastic model. Extensive +experiments on linear models and deep image classification show that the +proposed optimization method is a powerful alternative to widely used +classification losses. + +
+
+ comment: Pattern Recognition Letters (2024) +
+
+
+
+
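A minimal sketch, under our own assumptions, of the objective being described: perturb the model's scores with Gaussian noise and estimate the expected accuracy of the resulting stochastic classifier by Monte Carlo. The paper derives how to optimize this quantity; here we only evaluate it on placeholder scores.

```python
import numpy as np

def expected_accuracy(scores, labels, sigma=0.5, n_samples=1000, seed=0):
    """Monte Carlo estimate of E[accuracy] when logits are perturbed by N(0, sigma^2) noise."""
    rng = np.random.default_rng(seed)
    correct = 0.0
    for _ in range(n_samples):
        noisy = scores + rng.normal(scale=sigma, size=scores.shape)
        correct += (noisy.argmax(axis=1) == labels).mean()
    return correct / n_samples

scores = np.array([[2.0, 1.5, 0.1],    # narrow margin: the stochastic model is sometimes wrong
                   [0.2, 3.0, 0.1]])
labels = np.array([0, 1])
print(expected_accuracy(scores, labels))   # typically around 0.85-0.9
```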
+ + ♻ ☆ FipTR: A Simple yet Effective Transformer Framework for Future Instance + Prediction in Autonomous Driving + + +
+ The future instance prediction from a Bird's Eye View(BEV) perspective is a +vital component in autonomous driving, which involves future instance +segmentation and instance motion prediction. Existing methods usually rely on a +redundant and complex pipeline which requires multiple auxiliary outputs and +post-processing procedures. Moreover, estimated errors on each of the auxiliary +predictions will lead to degradation of the prediction performance. In this +paper, we propose a simple yet effective fully end-to-end framework named +Future Instance Prediction Transformer(FipTR), which views the task as BEV +instance segmentation and prediction for future frames. We propose to adopt +instance queries representing specific traffic participants to directly +estimate the corresponding future occupied masks, and thus get rid of complex +post-processing procedures. Besides, we devise a flow-aware BEV predictor for +future BEV feature prediction composed of a flow-aware deformable attention +that takes backward flow guiding the offset sampling. A novel future instance +matching strategy is also proposed to further improve the temporal coherence. +Extensive experiments demonstrate the superiority of FipTR and its +effectiveness under different temporal BEV encoders. The code is available at +https://github.com/TabGuigui/FipTR . + +
+
+
+
+
+ + ♻ ☆ Efficient 3D-Aware Facial Image Editing via Attribute-Specific Prompt + Learning ECCV + + +
+ Drawing upon StyleGAN's expressivity and disentangled latent space, existing +2D approaches employ textual prompting to edit facial images with different +attributes. In contrast, 3D-aware approaches that generate faces at different +target poses require attribute-specific classifiers, learning separate model +weights for each attribute, and are not scalable for novel attributes. In this +work, we propose an efficient, plug-and-play, 3D-aware face editing framework +based on attribute-specific prompt learning, enabling the generation of facial +images with controllable attributes across various target poses. To this end, +we introduce a text-driven learnable style token-based latent attribute editor +(LAE). The LAE harnesses a pre-trained vision-language model to find +text-guided attribute-specific editing direction in the latent space of any +pre-trained 3D-aware GAN. It utilizes learnable style tokens and style mappers +to learn and transform this editing direction to 3D latent space. To train LAE +with multiple attributes, we use directional contrastive loss and style token +loss. Furthermore, to ensure view consistency and identity preservation across +different poses and attributes, we employ several 3D-aware identity and pose +preservation losses. Our experiments show that our proposed framework generates +high-quality images with 3D awareness and view consistency while maintaining +attribute-specific features. We demonstrate the effectiveness of our method on +different facial attributes, including hair color and style, expression, and +others. + +
+
+ comment: Accepted at ECCV, 2024. Amandeep Kumar and Muhammad Awais are joint + first authors. More details are available at + https://awaisrauf.github.io/3d_face_editing +
+
+
+
+
+ + ♻ ☆ Multi-HMR: Multi-Person Whole-Body Human Mesh Recovery in a Single Shot ECCV'24 + + +
+ We present Multi-HMR, a strong single-shot model for multi-person 3D human mesh recovery from a single RGB image. Predictions encompass the whole body, i.e., including hands and facial expressions, using the SMPL-X parametric model and 3D location in the camera coordinate system. Our model detects people by predicting coarse 2D heatmaps of person locations, using features produced by a standard Vision Transformer (ViT) backbone. It then predicts their whole-body pose, shape and 3D location using a new cross-attention module called the Human Prediction Head (HPH), with one query attending to the entire set of features for each detected person. As direct prediction of fine-grained hands and facial poses in a single shot, i.e., without relying on explicit crops around body parts, is hard to learn from existing data, we introduce CUFFS, the Close-Up Frames of Full-Body Subjects dataset, containing humans close to the camera with diverse hand poses. We show that incorporating it into the training data further enhances predictions, particularly for hands. Multi-HMR also optionally accounts for camera intrinsics, if available, by encoding camera ray directions for each image token. This simple design achieves strong performance on whole-body and body-only benchmarks simultaneously: a ViT-S backbone on $448{\times}448$ images already yields a fast and competitive model, while larger models and higher resolutions obtain state-of-the-art results.
+
+ comment: Accepted at ECCV'24 - Code: https://github.com/naver/multi-hmr +
+
+
+
+
+ + ♻ ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception (CP) is essential to enhance the efficiency and safety of future transportation systems, requiring extensive data sharing among vehicles on the road, which raises significant privacy concerns. Federated learning offers a promising solution by enabling data privacy-preserving collaborative enhancements in perception, decision-making, and planning among connected and autonomous vehicles (CAVs). However, federated learning is impeded by significant challenges arising from data heterogeneity across diverse clients, potentially diminishing model accuracy and prolonging convergence periods. This study introduces a specialized federated learning framework for CP, termed the federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by a dynamic adjusting loss (DALoss) function. This framework employs dynamic client weighting to direct model convergence and integrates a novel loss function that utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental effects of non-independently and identically distributed (Non-IID) and unbalanced data. Utilizing the BEV transformer as the primary model, our rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, demonstrates significant improvements in the average intersection over union (IoU). These results highlight the substantial potential of our federated learning framework to address data heterogeneity challenges in CP, thereby enhancing the accuracy of environmental perception models and facilitating more robust and efficient collaborative learning solutions in the transportation sector.
+
+
+
+
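A minimal sketch, under our own assumptions, of dynamically weighted federated aggregation: client updates are averaged with weights derived from per-client statistics (here, inverse validation loss) rather than plain sample counts. This illustrates the aggregation pattern only, not the exact FedDWA/DALoss formulation.

```python
import numpy as np

def aggregate(client_params, client_losses):
    """Weighted average of client parameter dicts; lower-loss clients get larger weights."""
    losses = np.array(client_losses, dtype=float)
    weights = 1.0 / (losses + 1e-8)
    weights /= weights.sum()
    global_params = {}
    for key in client_params[0]:
        global_params[key] = sum(w * p[key] for w, p in zip(weights, client_params))
    return global_params

# Two hypothetical clients with a single weight matrix each.
clients = [{"w": np.ones((2, 2))}, {"w": np.zeros((2, 2))}]
losses = [0.5, 2.0]          # the first client fits the data better
print(aggregate(clients, losses)["w"])   # closer to the first client's parameters (0.8 everywhere)
```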
+ + ♻ ☆ SOAP: Enhancing Spatio-Temporal Relation and Motion Information + Capturing for Few-Shot Action Recognition ACM MM 2024 + + +
+ High frame-rate (HFR) videos for action recognition improve fine-grained +expression while reducing the spatio-temporal relation and motion information +density. Thus, large numbers of video samples are continuously required for +traditional data-driven training. However, samples are not always sufficient in +real-world scenarios, promoting few-shot action recognition (FSAR) research. We +observe that most recent FSAR works build the spatio-temporal relation of video +samples via temporal alignment after spatial feature extraction, cutting apart +spatial and temporal features within samples. They also capture motion +information via narrow perspectives between adjacent frames without considering +density, leading to insufficient motion information capture. Therefore, we +propose a novel plug-and-play architecture for FSAR called Spatio-tempOral +frAme tuPle enhancer (SOAP) in this paper. We refer to the model designed with this +architecture as SOAP-Net. Temporal connections between different feature +channels and the spatio-temporal relation of features are considered instead of +simple feature extraction. Comprehensive motion information is also captured, +using frame tuples with multiple frames containing more motion information than +adjacent frames. Combining frame tuples of diverse frame counts further +provides a broader perspective. SOAP-Net achieves new state-of-the-art +performance across well-known benchmarks such as SthSthV2, Kinetics, UCF101, +and HMDB51. Extensive empirical evaluations underscore the competitiveness, +pluggability, generalization, and robustness of SOAP. The code is released at +https://github.com/wenbohuang1002/SOAP. + 
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ In My Perspective, In My Hands: Accurate Egocentric 2D Hand Pose and + Action Recognition + + +
+ Action recognition is essential for egocentric video understanding, allowing +automatic and continuous monitoring of Activities of Daily Living (ADLs) +without user effort. Existing literature focuses on 3D hand pose input, which +requires computationally intensive depth estimation networks or wearing an +uncomfortable depth sensor. In contrast, there has been insufficient research +in understanding 2D hand pose for egocentric action recognition, despite the +availability of user-friendly smart glasses in the market capable of capturing +a single RGB image. Our study aims to fill this research gap by exploring the +field of 2D hand pose estimation for egocentric action recognition, making two +contributions. Firstly, we introduce two novel approaches for 2D hand pose +estimation, namely EffHandNet for single-hand estimation and EffHandEgoNet, +tailored for an egocentric perspective, capturing interactions between hands +and objects. Both methods outperform state-of-the-art models on H2O and FPHA +public benchmarks. Secondly, we present a robust action recognition +architecture from 2D hand and object poses. This method incorporates +EffHandEgoNet, and a transformer-based action recognition method. Evaluated on +H2O and FPHA datasets, our architecture has a faster inference time and +achieves an accuracy of 91.32% and 94.43%, respectively, surpassing state of +the art, including 3D-based methods. Our work demonstrates that using 2D +skeletal data is a robust approach for egocentric action understanding. +Extensive evaluation and ablation studies show the impact of the hand pose +estimation approach, and how each input affects the overall performance. + +
+
+ comment: Accepted at: The 18th IEEE International Conference on Automatic Face + and Gesture Recognition +
+
+
+
+
+ + ♻ ☆ Bilateral Reference for High-Resolution Dichotomous Image Segmentation + + +
+ We introduce a novel bilateral reference framework (BiRefNet) for +high-resolution dichotomous image segmentation (DIS). It comprises two +essential components: the localization module (LM) and the reconstruction +module (RM) with our proposed bilateral reference (BiRef). The LM aids in +object localization using global semantic information. Within the RM, we +utilize BiRef for the reconstruction process, where hierarchical patches of +images provide the source reference and gradient maps serve as the target +reference. These components collaborate to generate the final predicted maps. +We also introduce auxiliary gradient supervision to enhance focus on regions +with finer details. Furthermore, we outline practical training strategies +tailored for DIS to improve map quality and training process. To validate the +general applicability of our approach, we conduct extensive experiments on four +tasks to evince that BiRefNet exhibits remarkable performance, outperforming +task-specific cutting-edge methods across all benchmarks. Our codes are +available at https://github.com/ZhengPeng7/BiRefNet. + +
+
+ comment: Version 6, the final version of the journal with a fixed institute +
+
+
+
+
+ + ♻ ☆ GPSFormer: A Global Perception and Local Structure Fitting-based + Transformer for Point Cloud Understanding ECCV 2024 + + +
+ Despite the significant advancements in pre-training methods for point cloud +understanding, directly capturing intricate shape information from irregular +point clouds without reliance on external data remains a formidable challenge. +To address this problem, we propose GPSFormer, an innovative Global Perception +and Local Structure Fitting-based Transformer, which learns detailed shape +information from point clouds with remarkable precision. The core of GPSFormer +is the Global Perception Module (GPM) and the Local Structure Fitting +Convolution (LSFConv). Specifically, GPM utilizes Adaptive Deformable Graph +Convolution (ADGConv) to identify short-range dependencies among similar +features in the feature space and employs Multi-Head Attention (MHA) to learn +long-range dependencies across all positions within the feature space, +ultimately enabling flexible learning of contextual representations. Inspired +by Taylor series, we design LSFConv, which learns both low-order fundamental +and high-order refinement information from explicitly encoded local geometric +structures. Integrating the GPM and LSFConv as fundamental components, we +construct GPSFormer, a cutting-edge Transformer that effectively captures +global and local structures of point clouds. Extensive experiments validate +GPSFormer's effectiveness in three point cloud tasks: shape classification, +part segmentation, and few-shot learning. The code of GPSFormer is available at +\url{https://github.com/changshuowang/GPSFormer}. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Transform Dynamically for Better Adversarial Transferability CVPR 2024 + + +
+ Adversarial examples, crafted by adding perturbations imperceptible to +humans, can deceive neural networks. Recent studies identify the adversarial +transferability across various models, \textit{i.e.}, the cross-model attack +ability of adversarial samples. To enhance such adversarial transferability, +existing input transformation-based methods diversify input data with +transformation augmentation. However, their effectiveness is limited by the +finite number of available transformations. In our study, we introduce a novel +approach named Learning to Transform (L2T). L2T increases the diversity of +transformed images by selecting the optimal combination of operations from a +pool of candidates, consequently improving adversarial transferability. We +conceptualize the selection of optimal transformation combinations as a +trajectory optimization problem and employ a reinforcement learning strategy to +effectively solve the problem. Comprehensive experiments on the ImageNet +dataset, as well as practical tests with Google Vision and GPT-4V, reveal that +L2T surpasses current methodologies in enhancing adversarial transferability, +thereby confirming its effectiveness and practical significance. The code is +available at https://github.com/RongyiZhu/L2T. + +
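+ The core mechanism being optimized here, applying a combination of input transformations before each attack step to diversify gradients, can be sketched as below. The operation pool and the purely random sampling are illustrative assumptions; L2T itself learns which combinations to apply via reinforcement learning.
+ ```python
+ import random
+ import torchvision.transforms as T
+
+ # A small pool of candidate input transformations (illustrative choices only).
+ OP_POOL = [
+     T.RandomResizedCrop(224, scale=(0.7, 1.0)),
+     T.RandomHorizontalFlip(p=1.0),
+     T.ColorJitter(brightness=0.3, contrast=0.3),
+     T.RandomRotation(15),
+     T.GaussianBlur(kernel_size=5),
+ ]
+
+ def sample_transform(num_ops=3):
+     """Pick a random combination of operations; L2T would instead learn the choice."""
+     return T.Compose(random.sample(OP_POOL, k=num_ops))
+
+ # Usage: diversify the input before each gradient step of a transfer attack, e.g.
+ # x_diverse = sample_transform()(x)  # x: a (3, H, W) image tensor in [0, 1]
+ ```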
+
+ comment: accepted as a poster in CVPR 2024 +
+
+
+
+
+ + ♻ ☆ 3D Adaptive Structural Convolution Network for Domain-Invariant Point + Cloud Recognition + + +
+ Adapting deep learning networks for point cloud data recognition in +self-driving vehicles faces challenges due to the variability in datasets and +sensor technologies, emphasizing the need for adaptive techniques to maintain +accuracy across different conditions. In this paper, we introduce the 3D +Adaptive Structural Convolution Network (3D-ASCN), a cutting-edge framework for +3D point cloud recognition. It combines 3D convolution kernels, a structural +tree structure, and adaptive neighborhood sampling for effective geometric +feature extraction. This method obtains domain-invariant features and +demonstrates robust, adaptable performance on a variety of point cloud +datasets, ensuring compatibility across diverse sensor configurations without +the need for parameter adjustments. This highlights its potential to +significantly enhance the reliability and efficiency of self-driving vehicle +technology. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ MSD: A Benchmark Dataset for Floor Plan Generation of Building Complexes ECCV 2024 + + +
+ Diverse and realistic floor plan data are essential for the development of +useful computer-aided methods in architectural design. Today's large-scale +floor plan datasets predominantly feature simple floor plan layouts, typically +representing single-apartment dwellings only. To compensate for the mismatch +between current datasets and the real world, we develop \textbf{Modified Swiss +Dwellings} (MSD) -- the first large-scale floor plan dataset that contains a +significant share of layouts of multi-apartment dwellings. MSD features over +5.3K floor plans of medium- to large-scale building complexes, covering over +18.9K distinct apartments. We validate that existing approaches for floor plan +generation, while effective in simpler scenarios, cannot yet seamlessly address +the challenges posed by MSD. Our benchmark calls for new research in floor plan +machine understanding. Code and data are open. + +
+
+ comment: ECCV 2024 (incl. Suppl. Mat.) +
+
+
+
+
+ + ♻ ☆ P-MSDiff: Parallel Multi-Scale Diffusion for Remote Sensing Image + Segmentation + + +
+ Diffusion models and multi-scale features are essential components in +semantic segmentation tasks that deal with remote-sensing images. They +contribute to improved segmentation boundaries and offer significant contextual +information. U-net-like architectures are frequently employed in diffusion +models for segmentation tasks. These architectural designs include dense skip +connections that may pose challenges for interpreting intermediate features. +Consequently, they might not efficiently convey semantic information throughout +various layers of the encoder-decoder architecture. To address these +challenges, we propose a new model for semantic segmentation known as the +diffusion model with parallel multi-scale branches. This model consists of +Parallel Multiscale Diffusion modules (P-MSDiff) and a Cross-Bridge Linear +Attention mechanism (CBLA). P-MSDiff enhances the understanding of semantic +information across multiple levels of granularity and detects repetitive +distribution data through the integration of recursive denoising branches. It +further facilitates the amalgamation of data by connecting relevant branches to +the primary framework to enable concurrent denoising. Furthermore, within the +interconnected transformer architecture, the LA module has been substituted +with the CBLA module. This module integrates a semidefinite matrix linked to +the query into the dot product computation of keys and values. This integration +enables the adaptation of queries within the LA framework. This adjustment +strengthens the structure for multi-head attention computation, leading to +improved network performance; CBLA itself is a plug-and-play module. Our model +demonstrates superior performance based on the J1 metric on both the UAVid and +Vaihingen Building datasets, showing improvements of 1.60% and 1.40% over +strong baseline models, respectively. + 
+
+
+
+
+ + ♻ ☆ Unifying 3D Vision-Language Understanding via Promptable Queries ECCV 2024 + + +
+ A unified model for 3D vision-language (3D-VL) understanding is expected to +take various scene representations and perform a wide range of tasks in a 3D +scene. However, a considerable gap exists between existing methods and such a +unified model, due to the independent application of representation and +insufficient exploration of 3D multi-task training. In this paper, we introduce +PQ3D, a unified model capable of using Promptable Queries to tackle a wide +range of 3D-VL tasks, from low-level instance segmentation to high-level +reasoning and planning. This is achieved through three key innovations: (1) +unifying various 3D scene representations (i.e., voxels, point clouds, +multi-view images) into a shared 3D coordinate space by segment-level grouping, +(2) an attention-based query decoder for task-specific information retrieval +guided by prompts, and (3) universal output heads for different tasks to +support multi-task training. Tested across ten diverse 3D-VL datasets, PQ3D +demonstrates impressive performance on these tasks, setting new records on most +benchmarks. Particularly, PQ3D improves the state-of-the-art on ScanNet200 by +4.9% (AP25), ScanRefer by 5.4% (acc@0.5), Multi3DRefer by 11.7% (F1@0.5), and +Scan2Cap by 13.4% (CIDEr@0.5). Moreover, PQ3D supports flexible inference with +individual or combined forms of available 3D representations, e.g., solely +voxel input. + +
+
+ comment: ECCV 2024. Project page: https://pq3d.github.io +
+
+
+
+
+ + ♻ ☆ DisControlFace: Adding Disentangled Control to Diffusion Autoencoder for + One-shot Explicit Facial Image Editing + + +
+ In this work, we focus on exploring explicit fine-grained control of +generative facial image editing, all while generating faithful facial +appearances and consistent semantic details, which, however, is quite +challenging and has not been extensively explored, especially under a one-shot +scenario. We identify the key challenge as the exploration of disentangled +conditional control between high-level semantics and explicit parameters (e.g., +3DMM) in the generation process, and accordingly propose a novel +diffusion-based editing framework, named DisControlFace. Specifically, we +leverage a Diffusion Autoencoder (Diff-AE) as the semantic reconstruction +backbone. To enable explicit face editing, we construct an Exp-FaceNet that is +compatible with Diff-AE to generate spatial-wise explicit control conditions +based on estimated 3DMM parameters. Different from current diffusion-based +editing methods that train the whole conditional generative model from scratch, +we freeze the pre-trained weights of the Diff-AE to maintain its semantically +deterministic conditioning capability and accordingly propose a random semantic +masking (RSM) strategy to effectively achieve an independent training of +Exp-FaceNet. This setting endows the model with disentangled face control +while reducing semantic information shift in editing. Our model can be +trained using 2D in-the-wild portrait images without requiring 3D or video data +and perform robust editing on any new facial image through a simple one-shot +fine-tuning. Comprehensive experiments demonstrate that DisControlFace can +generate realistic facial images with better editing accuracy and identity +preservation over state-of-the-art methods. Project page: +https://discontrolface.github.io/ + 
+
+
+
+
+ + ♻ ☆ Aggregated Attributions for Explanatory Analysis of 3D Segmentation + Models + + +
+ Analysis of 3D segmentation models, especially in the context of medical +imaging, is often limited to segmentation performance metrics that overlook the +crucial aspect of explainability and bias. Currently, effectively explaining +these models with saliency maps is challenging due to the high dimensions of +input images multiplied by the ever-growing number of segmented class labels. +To this end, we introduce Agg^2Exp, a methodology for aggregating fine-grained +voxel attributions of the segmentation model's predictions. Unlike classical +explanation methods that primarily focus on the local feature attribution, +Agg^2Exp enables a more comprehensive global view on the importance of +predicted segments in 3D images. Our benchmarking experiments show that +gradient-based voxel attributions are more faithful to the model's predictions +than perturbation-based explanations. As a concrete use-case, we apply Agg^2Exp +to discover knowledge acquired by the Swin UNEt TRansformer model trained on +the TotalSegmentator v2 dataset for segmenting anatomical structures in +computed tomography medical images. Agg^2Exp facilitates the explanatory +analysis of large segmentation models beyond their predictive performance. + +
+
+ comment: Added Acknowledgments +
+
+
+
+
+ + ♻ ☆ VAAD: Visual Attention Analysis Dashboard applied to e-Learning + + +
+ In this paper, we present an approach in the Multimodal Learning Analytics +field. Within this approach, we have developed a tool to visualize and analyze +eye movement data collected during learning sessions in online courses. The +tool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These +eye movement data have been gathered using an eye-tracker and subsequently +processed and visualized for interpretation. The purpose of the tool is to +conduct a descriptive analysis of the data by facilitating its visualization, +enabling the identification of differences and learning patterns among various +learner populations. Additionally, it integrates a predictive module capable of +anticipating learner activities during a learning session. Consequently, VAAD +holds the potential to offer valuable insights into online learning behaviors +from both descriptive and predictive perspectives. + +
+
+ comment: Accepted in CEDI 2024 (VII Congreso Espa\~nol de Inform\'atica), A + Coru\~na, Spain +
+
+
+
+
+ + ♻ ☆ Asynchronous Large Language Model Enhanced Planner for Autonomous + Driving ECCV 2024 + + +
+ Despite real-time planners exhibiting remarkable performance in autonomous +driving, the growing exploration of Large Language Models (LLMs) has opened +avenues for enhancing the interpretability and controllability of motion +planning. Nevertheless, LLM-based planners continue to encounter significant +challenges, including elevated resource consumption and extended inference +times, which pose substantial obstacles to practical deployment. In light of +these challenges, we introduce AsyncDriver, a new asynchronous LLM-enhanced +closed-loop framework designed to leverage scene-associated instruction +features produced by the LLM to guide real-time planners in making precise and +controllable trajectory predictions. On one hand, our method highlights the +prowess of LLMs in comprehending and reasoning with vectorized scene data and a +series of routing instructions, demonstrating its effective assistance to +real-time planners. On the other hand, the proposed framework decouples the +inference processes of the LLM and real-time planners. By capitalizing on the +asynchronous nature of their inference frequencies, our approach has +successfully reduced the computational cost introduced by the LLM, while +maintaining comparable performance. Experiments show that our approach achieves +superior closed-loop evaluation performance on nuPlan's challenging scenarios. + 
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Early Detection of Late Blight Tomato Disease using Histogram Oriented + Gradient based Support Vector Machine + + +
+ The tomato is one of the most important fruits on earth. It plays an +important and useful role in the agricultural production of any country. This +research proposes a novel smart technique for early detection of late blight +diseases in tomatoes. This work improves the dataset by adding field images to +the Plant Village dataset and proposes a hybrid algorithm +composed of support vector machines (SVM) and histogram-oriented gradients +(HOG) for real-time detection of late blight tomato disease. The objectives are to propose a +HOG-based SVM model for early detection of late blight tomato leaf disease and to +evaluate the performance of the proposed model in terms of MSE, accuracy, +precision, and recall as compared to Decision Tree and KNN. The integration of +advanced technology in agriculture has the potential to revolutionize the +industry, making it more efficient, sustainable, and profitable. This research +work on the early detection of tomato diseases contributes to the growing +importance of smart farming, the need for climate-smart agriculture, the rising +need to more efficiently utilize natural resources, and the demand for higher +crop yields. The proposed hybrid algorithm of SVM and HOG has significant +potential for the early detection of late blight disease in tomato plants. The +performance of the proposed model is compared against decision tree and KNN algorithms, and +the results may assist in selecting the best algorithm for future applications. +The research work can help farmers make data-driven decisions to optimize crop +yield and quality while also reducing the environmental impact of farming +practices. + 
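+ The described pipeline is a standard feature-engineering recipe (HOG descriptors fed to an SVM classifier). A minimal sketch with scikit-image and scikit-learn is given below; the image size, HOG parameters, and SVM settings are illustrative assumptions, and loading the actual Plant Village images is omitted.
+ ```python
+ import numpy as np
+ from skimage.feature import hog
+ from skimage.transform import resize
+ from sklearn.svm import SVC
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, precision_score, recall_score
+
+ def hog_features(image, size=(128, 128)):
+     """Resize a leaf image and extract histogram-of-oriented-gradients features."""
+     img = resize(image, size, anti_aliasing=True)
+     return hog(img, orientations=9, pixels_per_cell=(8, 8),
+                cells_per_block=(2, 2), channel_axis=-1)
+
+ def train_hog_svm(images, labels):
+     """images: list of RGB arrays; labels: 0 = healthy, 1 = late blight (assumed encoding)."""
+     X = np.stack([hog_features(im) for im in images])
+     X_tr, X_te, y_tr, y_te = train_test_split(X, labels, test_size=0.2, stratify=labels)
+     clf = SVC(kernel="rbf", C=1.0).fit(X_tr, y_tr)
+     y_pred = clf.predict(X_te)
+     return clf, {"accuracy": accuracy_score(y_te, y_pred),
+                  "precision": precision_score(y_te, y_pred),
+                  "recall": recall_score(y_te, y_pred)}
+ ```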
+
+ comment: The article titled "Early Detection of Late Blight Tomato Disease + using Histogram Oriented Gradient based Support Vector Machine" needs to be + withdrawn; there are other contributors to the improvement of this article
+
+
+
+
+ + ♻ ☆ Surf-D: Generating High-Quality Surfaces of Arbitrary Topologies Using + Diffusion Models ECCV 2024 + + +
+ We present Surf-D, a novel method for generating high-quality 3D shapes as +Surfaces with arbitrary topologies using Diffusion models. Previous methods +explored shape generation with different representations and they suffer from +limited topologies and poor geometry details. To generate high-quality surfaces +of arbitrary topologies, we use the Unsigned Distance Field (UDF) as our +surface representation to accommodate arbitrary topologies. Furthermore, we +propose a new pipeline that employs a point-based AutoEncoder to learn a +compact and continuous latent space for accurately encoding UDF and support +high-resolution mesh extraction. We further show that our new pipeline +significantly outperforms the prior approaches to learning the distance fields, +such as the grid-based AutoEncoder, which is not scalable and incapable of +learning accurate UDF. In addition, we adopt a curriculum learning strategy to +efficiently embed various surfaces. With the pretrained shape latent space, we +employ a latent diffusion model to acquire the distribution of various shapes. +Extensive experiments are presented on using Surf-D for unconditional +generation, category conditional generation, image conditional generation, and +text-to-shape tasks. The experiments demonstrate the superior performance of +Surf-D in shape generation across multiple modalities as conditions. Visit our +project page at https://yzmblog.github.io/projects/SurfD/. + +
+
+ comment: Accepted to ECCV 2024. Project Page: + https://yzmblog.github.io/projects/SurfD/ +
+
+
+
+
+ + ♻ ☆ The Surprising Effectiveness of Multimodal Large Language Models for + Video Moment Retrieval + + +
+ Recent studies have shown promising results in utilizing multimodal large +language models (MLLMs) for computer vision tasks such as object detection and +semantic segmentation. However, many challenging video tasks remain +under-explored. Video-language tasks necessitate spatial and temporal +comprehension and require significant compute. Therefore, prior works have +developed complex, highly specialized architectures or leveraged additional +input signals such as video transcripts to best encode contextual and temporal +information, which limits their generality and can be impractical. One +particularly challenging task is video moment retrieval, which requires precise +temporal and contextual grounding. This work demonstrates the surprising +effectiveness of leveraging image-text pretrained MLLMs for moment retrieval. +We introduce Mr. BLIP (Mr. as in Moment Retrieval), a multimodal, single-stage +model that requires no expensive video-language pretraining, no additional +input signal (e.g., no transcript or audio), and has a simpler and more +versatile design than prior state-of-the-art methods. We achieve a new +state-of-the-art in moment retrieval on the widely used benchmarks +Charades-STA, QVHighlights, and ActivityNet Captions. Notably, we attain over +9% (absolute) higher Recall (at 0.5 and 0.7 IoU) on the challenging long-video +multi-moment QVHighlights benchmark. Our code is publicly available. + +
+
+ comment: Code: https://github.com/sudo-Boris/mr-Blip +
+
+
+
+
+ + ♻ ☆ MovePose: A High-performance Human Pose Estimation Algorithm on Mobile + and Edge Devices ICANN 2024 + + +
+ We present MovePose, an optimized lightweight convolutional neural network +designed specifically for real-time body pose estimation on CPU-based mobile +devices. The current solutions do not provide satisfactory accuracy and speed +for human posture estimation, and MovePose addresses this gap. It aims to +maintain real-time performance while improving the accuracy of human posture +estimation for mobile devices. Our MovePose algorithm has attained a Mean +Average Precision (mAP) score of 68.0 on the COCO \cite{cocodata} validation +dataset. The MovePose algorithm displayed efficiency with a performance of 69+ +frames per second (fps) when run on an Intel i9-10920x CPU. Additionally, it +showcased an increased performance of 452+ fps on an NVIDIA RTX3090 GPU. On an +Android phone equipped with a Snapdragon 8 + 4G processor, the fps reached +above 11. To enhance accuracy, we incorporated three techniques: deconvolution, +large kernel convolution, and coordinate classification methods. Compared to +basic upsampling, deconvolution is trainable, improves model capacity, and +enhances the receptive field. Large kernel convolution strengthens these +properties at a decreased computational cost. In summary, MovePose provides +high accuracy and real-time performance, marking it as a potential tool for a +variety of applications, including those focused on mobile-side human posture +estimation. The code and models for this algorithm will be made publicly +accessible. + 
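+ Two of the three accuracy techniques mentioned (trainable deconvolution upsampling and large-kernel convolution) can be sketched as a small PyTorch head that outputs per-keypoint heatmaps. The channel counts, kernel sizes, and keypoint count are illustrative assumptions and do not reproduce the MovePose architecture.
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class LightPoseHead(nn.Module):
+     """Deconvolution upsampling plus a depthwise large-kernel conv (illustrative sizes)."""
+
+     def __init__(self, in_channels=256, num_keypoints=17):
+         super().__init__()
+         self.up = nn.ConvTranspose2d(in_channels, 128, kernel_size=4, stride=2, padding=1)
+         self.large_kernel = nn.Conv2d(128, 128, kernel_size=7, padding=3, groups=128)
+         self.pointwise = nn.Conv2d(128, num_keypoints, kernel_size=1)
+         self.act = nn.ReLU(inplace=True)
+
+     def forward(self, features):            # features: (B, 256, H, W)
+         x = self.act(self.up(features))     # (B, 128, 2H, 2W) -- learned upsampling
+         x = self.act(self.large_kernel(x))  # 7x7 depthwise conv enlarges the receptive field cheaply
+         return self.pointwise(x)            # (B, K, 2H, 2W) keypoint heatmaps
+
+ heatmaps = LightPoseHead()(torch.randn(1, 256, 32, 32))  # -> (1, 17, 64, 64)
+ ```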
+
+ comment: This paper has been accepted by ICANN 2024 and is an oral + presentation +
+
+
+
+
+ + ♻ ☆ Spatiotemporal Graph Guided Multi-modal Network for Livestreaming + Product Retrieval + + +
+ With the rapid expansion of e-commerce, more consumers have become accustomed +to making purchases via livestreaming. Accurately identifying the products +being sold by salespeople, i.e., livestreaming product retrieval (LPR), poses a +fundamental and daunting challenge. The LPR task encompasses three primary +dilemmas in real-world scenarios: 1) the recognition of intended products from +distractor products present in the background; 2) the video-image heterogeneity +that the appearance of products showcased in live streams often deviates +substantially from standardized product images in stores; 3) there are numerous +confusing products with subtle visual nuances in the shop. To tackle these +challenges, we propose the Spatiotemporal Graphing Multi-modal Network (SGMN). +First, we employ a text-guided attention mechanism that leverages the spoken +content of salespeople to guide the model to focus toward intended products, +emphasizing their salience over cluttered background products. Second, a +long-range spatiotemporal graph network is further designed to achieve both +instance-level interaction and frame-level matching, solving the misalignment +caused by video-image heterogeneity. Third, we propose a multi-modal hard +example mining, assisting the model in distinguishing highly similar products +with fine-grained features across the video-image-text domain. Through +extensive quantitative and qualitative experiments, we demonstrate the superior +performance of our proposed SGMN model, surpassing the state-of-the-art methods +by a substantial margin. The code is available at +https://github.com/Huxiaowan/SGMN. + +
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Efficient Temporal Sentence Grounding in Videos with Multi-Teacher + Knowledge Distillation + + +
+ Temporal Sentence Grounding in Videos (TSGV) aims to detect the event +timestamps described by the natural language query from untrimmed videos. This +paper discusses the challenge of achieving efficient computation in TSGV models +while maintaining high performance. Most existing approaches exquisitely design +complex architectures to improve accuracy with extra layers and losses, suffering +from inefficiency and heaviness. Although some works have noticed this, they +only address the feature fusion layers, which contribute little speed-up within +an otherwise heavy network. To tackle this problem, we propose +a novel efficient multi-teacher model (EMTM) based on knowledge distillation to +transfer diverse knowledge from both heterogeneous and isomorphic networks. +Specifically, we first unify different outputs of the heterogeneous models into +one single form. Next, a Knowledge Aggregation Unit (KAU) is built to acquire +high-quality integrated soft labels from multiple teachers. After that, the KAU +module leverages the multi-scale video and global query information to +adaptively determine the weights of different teachers. A Shared Encoder +strategy is then proposed to solve the problem that the student shallow layers +hardly benefit from teachers, in which an isomorphic teacher is collaboratively +trained with the student to align their hidden states. Extensive experimental +results on three popular TSGV benchmarks demonstrate that our method is both +effective and efficient without bells and whistles. + 
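+ As a minimal sketch of the distillation mechanism, the loss below matches a student against a weighted mixture of teacher soft labels. How the weights are produced (the KAU, which uses multi-scale video and query information) is not implemented here; the temperature and all names are illustrative assumptions.
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def multi_teacher_kd_loss(student_logits, teacher_logits_list, teacher_weights, T=2.0):
+     """KL distillation against a weighted mixture of teacher soft labels.
+
+     teacher_weights: (num_teachers,) non-negative weights assumed to sum to one,
+     e.g. produced by some aggregation unit.
+     """
+     soft_targets = sum(w * F.softmax(t / T, dim=-1)
+                        for w, t in zip(teacher_weights, teacher_logits_list))
+     log_p_student = F.log_softmax(student_logits / T, dim=-1)
+     return F.kl_div(log_p_student, soft_targets, reduction="batchmean") * (T * T)
+
+ # Example: three teachers, a batch of 4 samples, 10 output classes.
+ student = torch.randn(4, 10)
+ teachers = [torch.randn(4, 10) for _ in range(3)]
+ weights = torch.softmax(torch.randn(3), dim=0)
+ loss = multi_teacher_kd_loss(student, teachers, weights)
+ ```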
+
+
+
+
+ + ♻ ☆ AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics + Perception + + +
+ The highly abstract nature of image aesthetics perception (IAP) poses a +significant challenge for current multimodal large language models (MLLMs). The +lack of human-annotated multi-modality aesthetic data further exacerbates this +dilemma, resulting in MLLMs falling short of aesthetics perception +capabilities. To address the above challenge, we first introduce a +comprehensively annotated Aesthetic Multi-Modality Instruction Tuning (AesMMIT) +dataset, which serves as the cornerstone for building multi-modality aesthetics +foundation models. Specifically, to align MLLMs with human aesthetics +perception, we construct a corpus-rich aesthetic critique database with 21,904 +diverse-sourced images and 88K items of human natural language feedback, which are +collected via progressive questions, ranging from coarse-grained aesthetic +grades to fine-grained aesthetic descriptions. To ensure that MLLMs can handle +diverse queries, we further prompt GPT to refine the aesthetic critiques and +assemble the large-scale aesthetic instruction tuning dataset, i.e., AesMMIT, +which consists of 409K multi-typed instructions to activate stronger aesthetic +capabilities. Based on the AesMMIT database, we fine-tune the open-sourced +general foundation models, achieving multi-modality Aesthetic Expert models, +dubbed AesExpert. Extensive experiments demonstrate that the proposed AesExpert +models deliver significantly better aesthetic perception performances than the +state-of-the-art MLLMs, including the most advanced GPT-4V and +Gemini-Pro-Vision. Project homepage: https://yipoh.github.io/aes-expert/. + 
+
+ comment: Accepted by ACMMM24 +
+
+
+
+
+ + ♻ ☆ Leveraging Temporal Contextualization for Video Action Recognition ECCV'24 + + +
+ We propose a novel framework for video understanding, called Temporally +Contextualized CLIP (TC-CLIP), which leverages essential temporal information +through global interactions in a spatio-temporal domain within a video. To be +specific, we introduce Temporal Contextualization (TC), a layer-wise temporal +information infusion mechanism for videos, which 1) extracts core information +from each frame, 2) connects relevant information across frames for the +summarization into context tokens, and 3) leverages the context tokens for +feature encoding. Furthermore, the Video-conditional Prompting (VP) module +processes context tokens to generate informative prompts in the text modality. +Extensive experiments in zero-shot, few-shot, base-to-novel, and +fully-supervised action recognition validate the effectiveness of our model. +Ablation studies for TC and VP support our design choices. Our project page +with the source code is available at https://github.com/naver-ai/tc-clip + +
+
+ comment: 26 pages, 11 figures, 16 tables. To be presented at ECCV'24 +
+
+
+
+
+ + ♻ ☆ The Platonic Representation Hypothesis + + +
+ We argue that representations in AI models, particularly deep networks, are +converging. First, we survey many examples of convergence in the literature: +over time and across multiple domains, the ways by which different neural +networks represent data are becoming more aligned. Next, we demonstrate +convergence across data modalities: as vision models and language models get +larger, they measure distance between datapoints in a more and more alike way. +We hypothesize that this convergence is driving toward a shared statistical +model of reality, akin to Plato's concept of an ideal reality. We term such a +representation the platonic representation and discuss several possible +selective pressures toward it. Finally, we discuss the implications of these +trends, their limitations, and counterexamples to our analysis. + +
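+ One standard way to quantify the kind of representational convergence discussed here is to compare the geometry of two models' embeddings of the same inputs, for example with linear Centered Kernel Alignment. The metric below is a common choice used for illustration and is not necessarily the alignment measure used in the paper.
+ ```python
+ import torch
+
+ def linear_cka(X, Y):
+     """Linear CKA between representation matrices X (n, d1) and Y (n, d2).
+
+     Both matrices embed the same n inputs; the score lies in [0, 1] and is
+     higher when the two models induce similar pairwise structure.
+     """
+     X = X - X.mean(dim=0, keepdim=True)
+     Y = Y - Y.mean(dim=0, keepdim=True)
+     hsic = (X.t() @ Y).norm() ** 2
+     return (hsic / ((X.t() @ X).norm() * (Y.t() @ Y).norm())).item()
+
+ # Example: features of the same 100 inputs from two hypothetical models.
+ feats_a = torch.randn(100, 512)
+ feats_b = torch.randn(100, 768)
+ print(linear_cka(feats_a, feats_b))
+ ```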
+
+ comment: Equal contributions. Project: https://phillipi.github.io/prh/ Code: + https://github.com/minyoungg/platonic-rep +
+
+
+
+
+ + ♻ ☆ Video Understanding with Large Language Models: A Survey + + +
+ With the burgeoning growth of online video platforms and the escalating +volume of video content, the demand for proficient video understanding tools +has intensified markedly. Given the remarkable capabilities of large language +models (LLMs) in language and multimodal tasks, this survey provides a detailed +overview of recent advancements in video understanding that harness the power +of LLMs (Vid-LLMs). The emergent capabilities of Vid-LLMs are surprisingly +advanced, particularly their ability for open-ended multi-granularity (general, +temporal, and spatiotemporal) reasoning combined with commonsense knowledge, +suggesting a promising path for future video understanding. We examine the +unique characteristics and capabilities of Vid-LLMs, categorizing the +approaches into three main types: Video Analyzer x LLM, Video Embedder x LLM, +and (Analyzer + Embedder) x LLM. Furthermore, we identify five sub-types based +on the functions of LLMs in Vid-LLMs: LLM as Summarizer, LLM as Manager, LLM as +Text Decoder, LLM as Regressor, and LLM as Hidden Layer. Furthermore, this +survey presents a comprehensive study of the tasks, datasets, benchmarks, and +evaluation methodologies for Vid-LLMs. Additionally, it explores the expansive +applications of Vid-LLMs across various domains, highlighting their remarkable +scalability and versatility in real-world video understanding challenges. +Finally, it summarizes the limitations of existing Vid-LLMs and outlines +directions for future research. For more information, readers are recommended +to visit the repository at +https://github.com/yunlong10/Awesome-LLMs-for-Video-Understanding. + +
+
+
+
+
+ + ♻ ☆ Continuous Memory Representation for Anomaly Detection + + +
+ There have been significant advancements in unsupervised anomaly detection, +where only normal images are available for training. +Several recent methods aim to detect anomalies based on a memory, comparing or +reconstructing the input with directly stored normal features (or trained +features with normal images). However, such memory-based approaches operate on +a discrete feature space implemented by the nearest neighbor or attention +mechanism, suffering from poor generalization or an identity shortcut issue +of outputting the input unchanged, respectively. Furthermore, the majority of +existing methods are designed to detect single-class anomalies, resulting in +unsatisfactory performance when presented with multiple classes of objects. To +tackle all of the above challenges, we propose CRAD, a novel anomaly detection +method for representing normal features within a "continuous" memory, enabled +by transforming spatial features into coordinates and mapping them to +continuous grids. Furthermore, we carefully design the grids tailored for +anomaly detection, representing both local and global normal features and +fusing them effectively. Our extensive experiments demonstrate that CRAD +successfully generalizes the normal features and mitigates the identity +shortcut; furthermore, CRAD effectively handles diverse classes in a single +model thanks to the high-granularity continuous representation. In an +evaluation using the MVTec AD dataset, CRAD significantly outperforms the +previous state-of-the-art method by reducing 65.0% of the error for multi-class +unified anomaly detection. The project page is available at +https://tae-mo.github.io/crad/. + 
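+ A minimal sketch of one way to realize a "continuous" memory: store normal features on a learnable grid and read it at continuous coordinates with bilinear interpolation, rather than performing a discrete nearest-neighbour lookup. The shapes and module names below are illustrative assumptions, not CRAD's actual design.
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ContinuousMemory(nn.Module):
+     """A learnable 2D feature grid read out at continuous coordinates."""
+
+     def __init__(self, channels=256, grid_size=32):
+         super().__init__()
+         self.grid = nn.Parameter(torch.randn(1, channels, grid_size, grid_size))
+
+     def forward(self, coords):
+         # coords: (B, N, 2) in [-1, 1]; bilinear interpolation makes the memory
+         # continuous instead of a discrete nearest-neighbour table.
+         B, N, _ = coords.shape
+         grid = self.grid.expand(B, -1, -1, -1)
+         sampled = F.grid_sample(grid, coords.view(B, N, 1, 2),
+                                 mode="bilinear", align_corners=True)
+         return sampled.squeeze(-1).transpose(1, 2)  # (B, N, C) retrieved normal features
+
+ features = ContinuousMemory()(torch.rand(2, 64, 2) * 2 - 1)  # -> (2, 64, 256)
+ ```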
+
+ comment: Project page: https://tae-mo.github.io/crad/ +
+
+
+
+
+ + ♻ ☆ Magic Clothing: Controllable Garment-Driven Image Synthesis + + +
+ We propose Magic Clothing, a latent diffusion model (LDM)-based network +architecture for an unexplored garment-driven image synthesis task. Aiming at +generating customized characters wearing the target garments with diverse text +prompts, the image controllability is the most critical issue, i.e., to +preserve the garment details and maintain faithfulness to the text prompts. To +this end, we introduce a garment extractor to capture the detailed garment +features, and employ self-attention fusion to incorporate them into the +pretrained LDMs, ensuring that the garment details remain unchanged on the +target character. Then, we leverage the joint classifier-free guidance to +balance the control of garment features and text prompts over the generated +results. Meanwhile, the proposed garment extractor is a plug-in module +applicable to various finetuned LDMs, and it can be combined with other +extensions like ControlNet and IP-Adapter to enhance the diversity and +controllability of the generated characters. Furthermore, we design +Matched-Points-LPIPS (MP-LPIPS), a robust metric for evaluating the consistency +of the target image to the source garment. Extensive experiments demonstrate +that our Magic Clothing achieves state-of-the-art results under various +conditional controls for garment-driven image synthesis. Our source code is +available at https://github.com/ShineChen1024/MagicClothing. + +
+
+
+
+
+ + ♻ ☆ MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal + Dataset with One Trillion Tokens + + +
+ Multimodal interleaved datasets featuring free-form interleaved sequences of +images and text are crucial for training frontier large multimodal models +(LMMs). Despite the rapid progression of open-source LMMs, there remains a +pronounced scarcity of large-scale, diverse open-source multimodal interleaved +datasets. In response, we introduce MINT-1T, the most extensive and diverse +open-source Multimodal INTerleaved dataset to date. MINT-1T comprises one +trillion text tokens and 3.4 billion images, a 10x scale-up from existing +open-source datasets. Additionally, we include previously untapped sources such +as PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires +substantial engineering effort, sharing the data curation process and releasing +the dataset greatly benefits the community. Our experiments show that LMMs +trained on MINT-1T rival the performance of models trained on the previous +leading dataset, OBELICS. Our data and code will be released at +https://github.com/mlfoundations/MINT-1T. + +
+
+
+
+
+ + ♻ ☆ SvANet: A Scale-variant Attention-based Network for Small Medical Object + Segmentation + + +
+ Early detection and accurate diagnosis can predict the risk of malignant +disease transformation, thereby increasing the probability of effective +treatment. A mild syndrome with small infected regions is an ominous warning +and is foremost in the early diagnosis of diseases. Deep learning algorithms, +such as convolutional neural networks (CNNs), have been used to segment natural +or medical objects, showing promising results. However, analyzing medical +objects of small areas in images remains a challenge due to information losses +and compression defects caused by convolution and pooling operations in CNNs. +These losses and defects become increasingly significant as the network +deepens, particularly for small medical objects. To address these challenges, +we propose a novel scale-variant attention-based network (SvANet) for accurate +small-scale object segmentation in medical images. The SvANet consists of Monte +Carlo attention, scale-variant attention, and vision transformer, which +incorporates cross-scale features and alleviates compression artifacts for +enhancing the discrimination of small medical objects. Quantitative +experimental results demonstrate the superior performance of SvANet, achieving +96.12%, 96.11%, 89.79%, 84.15%, 80.25%, 73.05%, and 72.58% in mean Dice +coefficient for segmenting kidney tumors, skin lesions, hepatic tumors, polyps, +surgical excision cells, retinal vasculatures, and sperms, which occupy less +than 1% of the image areas in KiTS23, ISIC 2018, ATLAS, PolypGen, TissueNet, +FIVES, and SpermHealth datasets, respectively. + +
+
+ comment: 14 pages, 9 figures, under review +
+
+
+
+
+ + ♻ ☆ Edge Detectors Can Make Deep Convolutional Neural Networks More Robust + + +
+ Deep convolutional neural networks (DCNN for short) are vulnerable to +examples with small perturbations. Improving DCNN robustness is of great +significance for safety-critical applications, such as autonomous driving +and industry automation. Inspired by the principal way that human eyes +recognize objects, i.e., largely relying on the shape features, this paper +first employs the edge detectors as layer kernels and designs a binary edge +feature branch (BEFB for short) to learn the binary edge features, which can be +easily integrated into any popular backbone. The four edge detectors can learn +the horizontal, vertical, positive diagonal, and negative diagonal edge +features, respectively, and the branch is stacked by multiple Sobel layers +(using edge detectors as kernels) and one threshold layer. The binary edge +features learned by the branch, concatenated with the texture features learned +by the backbone, are fed into the fully connected layers for classification. We +integrate the proposed branch into VGG16 and ResNet34, respectively, and +conduct experiments on multiple datasets. Experimental results demonstrate that the +BEFB is lightweight and has no side effects on training. The accuracy of +the BEFB-integrated models is better than that of the original ones on all datasets +when facing FGSM, PGD, and C\&W attacks. Moreover, BEFB-integrated models +equipped with the robustness enhancing techniques can achieve better +classification accuracy compared to the original models. This work +for the first time shows it is feasible to enhance the robustness of DCNNs +through combining both shape-like features and texture features. + 
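+ A minimal sketch of the branch structure described above: a convolution whose kernels are fixed edge detectors in four orientations, followed by a hard threshold that binarizes the responses. The kernel values, the crude grayscale conversion, and the threshold are illustrative assumptions.
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class BinaryEdgeBranch(nn.Module):
+     """Fixed edge-detector convolutions followed by a hard threshold."""
+
+     def __init__(self, threshold=0.5):
+         super().__init__()
+         sobel_h = torch.tensor([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]])
+         sobel_v = sobel_h.t()
+         diag_p = torch.tensor([[0., 1., 2.], [-1., 0., 1.], [-2., -1., 0.]])
+         diag_n = torch.flip(diag_p, dims=[1])
+         kernels = torch.stack([sobel_h, sobel_v, diag_p, diag_n]).unsqueeze(1)  # (4, 1, 3, 3)
+         self.conv = nn.Conv2d(1, 4, kernel_size=3, padding=1, bias=False)
+         self.conv.weight = nn.Parameter(kernels, requires_grad=False)  # kernels stay fixed
+         self.threshold = threshold
+
+     def forward(self, x):                    # x: (B, 3, H, W) RGB image
+         gray = x.mean(dim=1, keepdim=True)   # crude grayscale conversion
+         edges = self.conv(gray).abs()
+         return (edges > self.threshold).float()  # binary edge maps, (B, 4, H, W)
+
+ # These binary maps would then be concatenated with the backbone's texture
+ # features before the classifier, as the abstract describes.
+ ```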
+
+ comment: 26 pages, 18 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ TLRN: Temporal Latent Residual Networks For Large Deformation Image + Registration MICCAI 2024 + + +
+ This paper presents a novel approach, termed {\em Temporal Latent Residual +Network (TLRN)}, to predict a sequence of deformation fields in time-series +image registration. The challenge of registering time-series images often lies +in the occurrence of large motions, especially when images differ significantly +from a reference (e.g., the start of a cardiac cycle compared to the peak +stretching phase). To achieve accurate and robust registration results, we +leverage the nature of motion continuity and exploit the temporal smoothness in +consecutive image frames. Our proposed TLRN highlights a temporal residual +network with residual blocks carefully designed in latent deformation spaces, +which are parameterized by time-sequential initial velocity fields. We treat a +sequence of residual blocks over time as a dynamic training system, where each +block is designed to learn the residual function between desired deformation +features and current input accumulated from previous time frames. We validate +the effectiveness of TLRN on both synthetic data and real-world cine cardiac +magnetic resonance (CMR) image videos. Our experimental results show that TLRN +is able to achieve substantially improved registration accuracy compared to the +state-of-the-art. Our code is publicly available at +https://github.com/nellie689/TLRN. + 
+
+ comment: 10 pages. Accepted by MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ LLM as Dataset Analyst: Subpopulation Structure Discovery with Large + Language Model ECCV24 + + +
+ The distribution of subpopulations is an important property hidden within a +dataset. Uncovering and analyzing the subpopulation distribution within +datasets provides a comprehensive understanding of the datasets, standing as a +powerful tool beneficial to various downstream tasks, including Dataset +Subpopulation Organization, Subpopulation Shift, and Slice Discovery. Despite +its importance, there has been no work that systematically explores the +subpopulation distribution of datasets to our knowledge. To address the +limitation and solve all the mentioned tasks in a unified way, we introduce a +novel concept of subpopulation structures to represent, analyze, and utilize +subpopulation distributions within datasets. To characterize the structures in +an interpretable manner, we propose the Subpopulation Structure Discovery with +Large Language Models (SSD-LLM) framework, which employs world knowledge and +instruction-following capabilities of Large Language Models (LLMs) to +linguistically analyze informative image captions and summarize the structures. +Furthermore, we propose complete workflows to address downstream tasks, named +Task-specific Tuning, showcasing the application of the discovered structure to +a spectrum of subpopulation-related tasks, including dataset subpopulation +organization, subpopulation shift, and slice discovery. + 
+
+ comment: ECCV24 Camera Ready +
+
+
+
+
+ + ♻ ☆ PEA-Diffusion: Parameter-Efficient Adapter with Knowledge Distillation + in non-English Text-to-Image Generation ECCV 2024 + + +
+ Text-to-image diffusion models are well-known for their ability to generate +realistic images based on textual prompts. However, the existing works have +predominantly focused on English, lacking support for non-English text-to-image +models. The most commonly used translation methods cannot solve the generation +problem related to language culture, while training from scratch on a specific +language dataset is prohibitively expensive. In this paper, we are inspired to +propose a simple plug-and-play language transfer method based on knowledge +distillation. All we need to do is train a lightweight MLP-like +parameter-efficient adapter (PEA) with only 6M parameters under teacher +knowledge distillation along with a small parallel data corpus. We are +surprised to find that freezing the parameters of UNet can still achieve +remarkable performance on the language-specific prompt evaluation set, +demonstrating that PEA can stimulate the potential generation ability of the +original UNet. Additionally, it closely approaches the performance of the +English text-to-image model on a general prompt evaluation set. Furthermore, +our adapter can be used as a plugin to achieve significant results in +downstream tasks in cross-lingual text-to-image generation. Code will be +available at: https://github.com/OPPO-Mente-Lab/PEA-Diffusion + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ MicroEmo: Time-Sensitive Multimodal Emotion Recognition with + Micro-Expression Dynamics in Video Dialogues + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated remarkable +multimodal emotion recognition capabilities, integrating multimodal cues from +visual, acoustic, and linguistic contexts in the video to recognize human +emotional states. However, existing methods neglect the local facial features that +capture the temporal dynamics of micro-expressions and do not leverage the +contextual dependencies of the utterance-aware temporal segments in the video, +thereby limiting their expected effectiveness to a certain extent. In this +work, we propose MicroEmo, a time-sensitive MLLM aimed at directing attention +to the local facial micro-expression dynamics and the contextual dependencies +of utterance-aware video clips. Our model incorporates two key architectural +contributions: (1) a global-local attention visual encoder that integrates +global frame-level timestamp-bound image features with local facial features +capturing the temporal dynamics of micro-expressions; (2) an utterance-aware video Q-Former +that captures multi-scale and contextual dependencies by generating visual +token sequences for each utterance segment and for the entire video, and then +combining them. Preliminary qualitative experiments demonstrate that in a new +Explainable Multimodal Emotion Recognition (EMER) task that exploits +multi-modal and multi-faceted clues to predict emotions in an open-vocabulary +(OV) manner, MicroEmo demonstrates its effectiveness compared with the latest +methods. + 
+
+
+
+
+ + ♻ ☆ Rasterized Edge Gradients: Handling Discontinuities Differentiably + + +
+ Computing the gradients of a rendering process is paramount for diverse +applications in computer vision and graphics. However, accurate computation of +these gradients is challenging due to discontinuities and rendering +approximations, particularly for surface-based representations and +rasterization-based rendering. We present a novel method for computing +gradients at visibility discontinuities for rasterization-based differentiable +renderers. Our method elegantly simplifies the traditionally complex problem +through a carefully designed approximation strategy, allowing for a +straightforward, effective, and performant solution. We introduce a novel +concept of micro-edges, which allows us to treat the rasterized images as +outcomes of a differentiable, continuous process aligned with the inherently +non-differentiable, discrete-pixel rasterization. This technique eliminates the +necessity for rendering approximations or other modifications to the forward +pass, preserving the integrity of the rendered image, which makes it applicable +to rasterized masks, depth, and normals images where filtering is prohibitive. +Utilizing micro-edges simplifies gradient interpretation at discontinuities and +enables handling of geometry intersections, offering an advantage over the +prior art. We showcase our method in dynamic human head scene reconstruction, +demonstrating effective handling of camera images and segmentation masks. + +
+
+
+
+
+ + ♻ ☆ FCNR: Fast Compressive Neural Representation of Visualization Images + + +
+ We present FCNR, a fast compressive neural representation for tens of +thousands of visualization images under varying viewpoints and timesteps. The +existing NeRVI solution, albeit enjoying a high compression ratio, incurs slow +speeds in encoding and decoding. Built on the recent advances in stereo image +compression, FCNR assimilates stereo context modules and joint context transfer +modules to compress image pairs. Our solution significantly improves encoding +and decoding speed while maintaining high reconstruction quality and satisfying +compression ratio. To demonstrate its effectiveness, we compare FCNR with +state-of-the-art neural compression methods, including E-NeRV, HNeRV, NeRVI, +and ECSIC. The source code can be found at +https://github.com/YunfeiLu0112/FCNR. + +
+
+
+
+
+ + ♻ ☆ Interactive Text-to-Image Retrieval with Large Language Models: A + Plug-and-Play Approach ACL 2024 + + +
+ In this paper, we primarily address the issue of dialogue-form context query +within the interactive text-to-image retrieval task. Our methodology, PlugIR, +actively utilizes the general instruction-following capability of LLMs in two +ways. First, by reformulating the dialogue-form context, we eliminate the +necessity of fine-tuning a retrieval model on existing visual dialogue data, +thereby enabling the use of any arbitrary black-box model. Second, we construct +the LLM questioner to generate non-redundant questions about the attributes of +the target image, based on the information of retrieval candidate images in the +current context. This approach mitigates the issues of noisiness and redundancy +in the generated questions. Beyond our methodology, we propose a novel +evaluation metric, Best log Rank Integral (BRI), for a comprehensive assessment +of the interactive retrieval system. PlugIR demonstrates superior performance +compared to both zero-shot and fine-tuned baselines in various benchmarks. +Additionally, the two methodologies comprising PlugIR can be flexibly applied +together or separately in various situations. Our codes are available at +https://github.com/Saehyung-Lee/PlugIR. + +
+
+ comment: ACL 2024 Oral +
+
+
+
+
+ + ♻ ☆ Deep Hybrid Camera Deblurring for Smartphone Cameras SIGGRAPH 2024 + + +
+ Mobile cameras, despite their significant advancements, still have difficulty +in low-light imaging due to compact sensors and lenses, leading to longer +exposures and motion blur. Traditional blind deconvolution methods and +learning-based deblurring methods can be potential solutions to remove blur. +However, achieving practical performance still remains a challenge. To address +this, we propose a learning-based deblurring framework for smartphones, +utilizing wide and ultra-wide cameras as a hybrid camera system. We +simultaneously capture a long-exposure wide image and short-exposure burst +ultra-wide images, and utilize the burst images to deblur the wide image. To +fully exploit burst ultra-wide images, we present HCDeblur, a practical +deblurring framework that includes novel deblurring networks, HC-DNet and +HC-FNet. HC-DNet utilizes motion information extracted from burst images to +deblur a wide image, and HC-FNet leverages burst images as reference images to +further enhance a deblurred output. For training and evaluating the proposed +method, we introduce the HCBlur dataset, which consists of synthetic and +real-world datasets. Our experiments demonstrate that HCDeblur achieves +state-of-the-art deblurring quality. Code and datasets are available at +https://cg.postech.ac.kr/research/HCDeblur. + +
+
+ comment: SIGGRAPH 2024, Project page: + http://cg.postech.ac.kr/research/HCDeblur +
+
+
+
+
+ + ♻ ☆ Self-supervised Visualisation of Medical Image Datasets + + +
+ Self-supervised learning methods based on data augmentations, such as SimCLR, +BYOL, or DINO, allow obtaining semantically meaningful representations of image +datasets and are widely used prior to supervised fine-tuning. A recent +self-supervised learning method, $t$-SimCNE, uses contrastive learning to +directly train a 2D representation suitable for visualisation. When applied to +natural image datasets, $t$-SimCNE yields 2D visualisations with semantically +meaningful clusters. In this work, we used $t$-SimCNE to visualise medical +image datasets, including examples from dermatology, histology, and blood +microscopy. We found that increasing the set of data augmentations to include +arbitrary rotations improved the results in terms of class separability, +compared to data augmentations used for natural images. Our 2D representations +show medically relevant structures and can be used to aid data exploration and +annotation, improving on common approaches for data visualisation. + +
+
+
+
+
+ + ♻ ☆ Towards Adaptive Pseudo-label Learning for Semi-Supervised Temporal + Action Localization ECCV 2024 + + +
+ Alleviating noisy pseudo labels remains a key challenge in Semi-Supervised +Temporal Action Localization (SS-TAL). Existing methods often filter pseudo +labels based on strict conditions, but they typically assess classification and +localization quality separately, leading to suboptimal pseudo-label ranking and +selection. In particular, there might be inaccurate pseudo labels within +selected positives, alongside reliable counterparts erroneously assigned to +negatives. To tackle these problems, we propose a novel Adaptive Pseudo-label +Learning (APL) framework to facilitate better pseudo-label selection. +Specifically, to improve the ranking quality, Adaptive Label Quality Assessment +(ALQA) is proposed to jointly learn classification confidence and localization +reliability, followed by dynamically selecting pseudo labels based on the joint +score. Additionally, we propose an Instance-level Consistency Discriminator +(ICD) for eliminating ambiguous positives and mining potential positives +simultaneously based on inter-instance intrinsic consistency, thereby leading +to a more precise selection. We further introduce a general unsupervised +Action-aware Contrastive Pre-training (ACP) to enhance the discrimination both +within actions and between actions and backgrounds, which benefits SS-TAL. +Extensive experiments on THUMOS14 and ActivityNet v1.3 demonstrate that our +method achieves state-of-the-art performance under various semi-supervised +settings. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Low Latency Instance Segmentation by Continuous Clustering for LiDAR + Sensors + + +
+ Low-latency instance segmentation of LiDAR point clouds is crucial in
+real-world applications because it serves as an initial and frequently-used
+building block in a robot's perception pipeline, where every task adds further
+delay. Particularly in dynamic environments, this total delay can result in
+significant positional offsets of dynamic objects, as seen in highway
+scenarios. To address this issue, we employ a new technique, which we call
+continuous clustering. Unlike most existing clustering approaches, which use a
+full revolution of the LiDAR sensor, we process the data stream in a continuous
+and seamless fashion. Our approach does not rely on the concept of complete or
+partial sensor rotations with multiple discrete range images; instead, it views
+the range image as a single and infinitely horizontally growing entity. Each
+new column of this continuous range image is processed as soon as it is
+available. Obstacle points are clustered to existing instances in real time,
+and it is checked at a high frequency which instances are completed in order to
+publish them without waiting for the completion of the revolution or some other
+integration period. In the case of rotating sensors, no problematic
+discontinuities between the points of the end and the start of a scan are
+observed. In this work, we describe the two-layered data structure and the
+corresponding algorithm for continuous clustering. It is able to achieve an
+average latency of just 5 ms with respect to the latest timestamp of all points
+in the cluster. We are publishing the source code at
+https://github.com/UniBwTAS/continuous_clustering.
+
+
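+ As a rough illustration of the column-by-column processing described above,
+the sketch below associates the obstacle points of each newly arriving
+range-image column with nearby open clusters and publishes clusters that have
+not grown for a few columns. It is a simplified Euclidean-threshold stand-in,
+not the authors' two-layered data structure, and all names and thresholds are
+hypothetical.
+
+import numpy as np
+
+def process_column(points, clusters, col_idx, radius=0.5, max_gap=3):
+    """Assign one new column of obstacle points (N, 3) to open clusters.
+
+    clusters is a list of dicts {"points": [xyz, ...], "last_col": int}.
+    Returns the clusters considered complete (untouched for max_gap columns).
+    """
+    for p in points:
+        best, best_d = None, radius
+        for c in clusters:
+            d = np.linalg.norm(np.asarray(c["points"][-1]) - p)
+            if d < best_d:
+                best, best_d = c, d
+        if best is None:
+            clusters.append({"points": [p], "last_col": col_idx})
+        else:
+            best["points"].append(p)
+            best["last_col"] = col_idx
+    finished = [c for c in clusters if col_idx - c["last_col"] >= max_gap]
+    for c in finished:
+        clusters.remove(c)
+    return finished
+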
+
+ comment: Accompanying Video: https://www.youtube.com/watch?v=ex4qcR2bkWs +
+
+
+
+
+ + ♻ ☆ Npix2Cpix: A GAN-based Image-to-Image Translation Network with + Retrieval-Classification Integration for Watermark Retrieval from Historical + Document Images + + +
+ The identification and restoration of ancient watermarks have long been a +major topic in codicology and history. Classifying historical documents based +on watermarks is challenging due to their diversity, noisy samples, multiple +representation modes, and minor distinctions between classes and intra-class +variations. This paper proposes a modified U-net-based conditional generative +adversarial network (GAN) named Npix2Cpix to translate noisy raw historical +watermarked images into clean, handwriting-free watermarked images by +performing image translation from degraded (noisy) pixels to clean pixels. +Using image-to-image translation and adversarial learning, the network creates +clutter-free images for watermark restoration and categorization. The generator +and discriminator of the proposed GAN are trained using two separate loss +functions, each based on the distance between images, to learn the mapping from +the input noisy image to the output clean image. After using the proposed GAN +to pre-process noisy watermarked images, Siamese-based one-shot learning is +employed for watermark classification. Experimental results on a large-scale +historical watermark dataset demonstrate that cleaning the noisy watermarked +images can help to achieve high one-shot classification accuracy. The +qualitative and quantitative evaluation of the retrieved watermarked image +highlights the effectiveness of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ Beyond Aesthetics: Cultural Competence in Text-to-Image Models + + +
+ Text-to-Image (T2I) models are being increasingly adopted in diverse global +communities where they create visual representations of their unique cultures. +Current T2I benchmarks primarily focus on faithfulness, aesthetics, and realism +of generated images, overlooking the critical dimension of cultural competence. +In this work, we introduce a framework to evaluate cultural competence of T2I +models along two crucial dimensions: cultural awareness and cultural diversity, +and present a scalable approach using a combination of structured knowledge +bases and large language models to build a large dataset of cultural artifacts +to enable this evaluation. In particular, we apply this approach to build CUBE +(CUltural BEnchmark for Text-to-Image models), a first-of-its-kind benchmark to +evaluate cultural competence of T2I models. CUBE covers cultural artifacts +associated with 8 countries across different geo-cultural regions and along 3 +concepts: cuisine, landmarks, and art. CUBE consists of 1) CUBE-1K, a set of +high-quality prompts that enable the evaluation of cultural awareness, and 2) +CUBE-CSpace, a larger dataset of cultural artifacts that serves as grounding to +evaluate cultural diversity. We also introduce cultural diversity as a novel +T2I evaluation component, leveraging quality-weighted Vendi score. Our +evaluations reveal significant gaps in the cultural awareness of existing +models across countries and provide valuable insights into the cultural +diversity of T2I outputs for under-specified prompts. Our methodology is +extendable to other cultural regions and concepts, and can facilitate the +development of T2I models that better cater to the global population. + +
+
+ comment: 30 pages, 10 figures, preprint +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ BlueTempNet: A Temporal Multi-network Dataset of Social Interactions in + Bluesky Social + + +
+ Decentralized social media platforms like Bluesky Social (Bluesky) have made +it possible to publicly disclose some user behaviors with millisecond-level +precision. Embracing Bluesky's principles of open-source and open-data, we +present the first collection of the temporal dynamics of user-driven social +interactions. BlueTempNet integrates multiple types of networks into a single +multi-network, including user-to-user interactions (following and blocking +users) and user-to-community interactions (creating and joining communities). +Communities are user-formed groups in custom Feeds, where users subscribe to +posts aligned with their interests. Following Bluesky's public data policy, we +collect existing Bluesky Feeds, including the users who liked and generated +these Feeds, and provide tools to gather users' social interactions within a +date range. This data-collection strategy captures past user behaviors and +supports the future data collection of user behavior. + +
+
+ comment: to appear in IEEE Data Description +
+
+
+
+
+ + ☆ A Novel Two-Step Fine-Tuning Pipeline for Cold-Start Active Learning in + Text Classification Tasks + + +
+ This is the first work to investigate the effectiveness of BERT-based
+contextual embeddings in active learning (AL) tasks on cold-start scenarios,
+where traditional fine-tuning is infeasible due to the absence of labeled data.
+Our primary contribution is the proposal of a more robust fine-tuning pipeline
+- DoTCAL - that diminishes the reliance on labeled data in AL using two steps:
+(1) fully leveraging unlabeled data through domain adaptation of the embeddings
+via masked language modeling and (2) further adjusting model weights using
+labeled data selected by AL. Our evaluation contrasts BERT-based embeddings
+with other prevalent text representation paradigms, including Bag of Words
+(BoW), Latent Semantic Indexing (LSI), and FastText, at two critical stages of
+the AL process: instance selection and classification. Experiments conducted on
+eight ATC benchmarks with varying AL budgets (number of labeled instances) and
+number of instances (about 5,000 to 300,000) demonstrate DoTCAL's superior
+effectiveness, achieving up to a 33% improvement in Macro-F1 while reducing
+labeling efforts by half compared to the traditional one-step method. We also
+found that in several tasks, BoW and LSI (due to information aggregation)
+produce results superior (up to 59%) to BERT, especially in low-budget
+scenarios and hard-to-classify tasks, which is quite surprising.
+
+
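+ The AL selection mentioned above is typically driven by model uncertainty; a
+minimal uncertainty-sampling loop is sketched below with scikit-learn, where
+TF-IDF features and logistic regression stand in for the BERT-based embeddings
+and classifier, and the oracle labels are assumed to be available on request.
+This is a generic active-learning skeleton, not the DoTCAL pipeline itself.
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+def uncertainty_sampling(texts, oracle_labels, seed_size=10, budget=100, batch=10):
+    X = TfidfVectorizer(max_features=20000).fit_transform(texts)
+    rng = np.random.default_rng(0)
+    labeled = list(rng.choice(len(texts), size=seed_size, replace=False))
+    pool = [i for i in range(len(texts)) if i not in set(labeled)]
+    clf = LogisticRegression(max_iter=1000)
+    while len(labeled) < budget and pool:
+        clf.fit(X[labeled], [oracle_labels[i] for i in labeled])
+        probs = np.sort(clf.predict_proba(X[pool]), axis=1)
+        margin = probs[:, -1] - probs[:, -2]        # small margin = uncertain
+        picks = set(np.argsort(margin)[:batch])
+        labeled += [pool[j] for j in picks]
+        pool = [i for j, i in enumerate(pool) if j not in picks]
+    return clf, labeled
+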
+
+ comment: 11 pages, 4 figures, 2 Tables, and 1 algorithm +
+
+
+
+
+ + ☆ Intent-Guided Heterogeneous Graph Contrastive Learning for + Recommendation + + +
+ Contrastive Learning (CL)-based recommender systems have gained prominence in
+the context of Heterogeneous Graph (HG) due to their capacity to enhance the
+consistency of representations across different views. Nonetheless, existing
+frameworks often neglect the fact that user-item interactions within HG are
+governed by diverse latent intents (for instance, preferences towards specific
+brands or the demographic characteristics of item audiences), which are pivotal
+in capturing fine-grained relations. The exploration of these underlying
+intents, particularly through the lens of meta-paths in HGs, presents us with
+two principal challenges: i) How to integrate CL mechanisms with latent
+intents; ii) How to mitigate the noise associated with these complicated
+intents. To address these challenges, we propose an innovative framework termed
+Intent-Guided Heterogeneous Graph Contrastive Learning (IHGCL), which is
+designed to enhance CL-based recommendation by capturing the intents contained
+within meta-paths. Specifically, the IHGCL framework: i) employs a
+meta-path-based dual contrastive learning approach to effectively integrate
+intents into the recommendation, constructing meta-path contrast and view
+contrast; ii) uses a bottlenecked autoencoder that combines mask propagation
+with the information bottleneck principle to significantly reduce noise
+perturbations introduced by meta-paths. Empirical evaluations conducted
+across six distinct datasets demonstrate the superior performance of our IHGCL
+framework relative to conventional baseline methods. Our model implementation
+is available at https://github.com/wangyu0627/IHGCL.
+
+
+
+ comment: 14pages, 11figures +
+
+
+
+
+ + ☆ Reinforced Prompt Personalization for Recommendation with Large Language + Models + + +
+ Designing effective prompts can empower LLMs to understand user preferences +and provide recommendations by leveraging LLMs' intent comprehension and +knowledge utilization capabilities. However, existing research predominantly +concentrates on task-wise prompting, developing fixed prompt templates composed +of four patterns (i.e., role-playing, history records, reasoning guidance, and +output format) and applying them to all users for a given task. Although +convenient, task-wise prompting overlooks individual user differences, leading +to potential mismatches in capturing user preferences. To address it, we +introduce the concept of instance-wise prompting to personalize discrete +prompts for individual users and propose Reinforced Prompt Personalization +(RPP) to optimize the four patterns in prompts using multi-agent reinforcement +learning (MARL). To boost efficiency, RPP formulates prompt personalization as +selecting optimal sentences holistically across the four patterns, rather than +optimizing word-by-word. To ensure the quality of prompts, RPP meticulously +crafts diverse expressions for each of the four patterns, considering multiple +analytical perspectives for specific recommendation tasks. In addition to RPP, +our proposal of RPP+ aims to enhance the scalability of action space by +dynamically refining actions with LLMs throughout the iterative process. We +evaluate the effectiveness of RPP/RPP+ in ranking tasks over various datasets. +Experimental results demonstrate the superiority of RPP/RPP+ over traditional +recommender models, few-shot methods, and other prompt-based methods, +underscoring the significance of instance-wise prompting for LLMs in +recommendation tasks and validating the effectiveness of RPP/RPP+. Our code is +available at https://github.com/maowenyu-11/RPP. + +
+
+
+
+
+ + ☆ scGHSOM: Hierarchical clustering and visualization of single-cell and + CRISPR data using growing hierarchical SOM KDD + + +
+ High-dimensional single-cell data poses significant challenges in identifying +underlying biological patterns due to the complexity and heterogeneity of +cellular states. We propose a comprehensive gene-cell dependency visualization +via unsupervised clustering, Growing Hierarchical Self-Organizing Map (GHSOM), +specifically designed for analyzing high-dimensional single-cell data like +single-cell sequencing and CRISPR screens. GHSOM is applied to cluster samples +in a hierarchical structure such that the self-growth structure of clusters +satisfies the required variations between and within. We propose a novel +Significant Attributes Identification Algorithm to identify features that +distinguish clusters. This algorithm pinpoints attributes with minimal +variation within a cluster but substantial variation between clusters. These +key attributes can then be used for targeted data retrieval and downstream +analysis. Furthermore, we present two innovative visualization tools: Cluster +Feature Map and Cluster Distribution Map. The Cluster Feature Map highlights +the distribution of specific features across the hierarchical structure of +GHSOM clusters. This allows for rapid visual assessment of cluster uniqueness +based on chosen features. The Cluster Distribution Map depicts leaf clusters as +circles on the GHSOM grid, with circle size reflecting cluster data size and +color customizable to visualize features like cell type or other attributes. We +apply our analysis to three single-cell datasets and one CRISPR dataset +(cell-gene database) and evaluate clustering methods with internal and external +CH and ARI scores. GHSOM performs well, being the best performer in internal +evaluation (CH=4.2). In external evaluation, GHSOM has the third-best +performance of all methods. + +
+
+ comment: Abstract presentation at BIOKDD@ACM KDD 2024 +
+
+
+
+
+ + ☆ BLAZE: Cross-Language and Cross-Project Bug Localization via Dynamic + Chunking and Hard Example Learning + + +
+ Software bugs require developers to exert significant effort to identify and +resolve them, often consuming about one-third of their time. Bug localization, +the process of pinpointing the exact source code files that need modification, +is crucial in reducing this effort. Existing bug localization tools, typically +reliant on deep learning techniques, face limitations in cross-project +applicability and effectiveness in multi-language environments. Recent +advancements with Large Language Models (LLMs) offer detailed representations +for bug localization. However, they encounter challenges with limited context +windows and mapping accuracy. To address these issues, we propose BLAZE, an +approach that employs dynamic chunking and hard example learning. First, BLAZE +dynamically segments source code to minimize continuity loss. Then, BLAZE +fine-tunes a GPT-based model using challenging bug cases, in order to enhance +cross-project and cross-language bug localization. To support the capability of +BLAZE, we create the BEETLEBOX dataset, which comprises 26,321 bugs from 29 +large and thriving open-source projects across five different programming +languages (Java, C++, Python, Go, and JavaScript). Our evaluations of BLAZE on +three benchmark datasets BEETLEBOX, SWE-Bench, and Ye et al. demonstrate +substantial improvements compared to six state-of-the-art baselines. +Specifically, BLAZE achieves up to an increase of 120% in Top 1 accuracy, 144% +in Mean Average Precision (MAP), and 100% in Mean Reciprocal Rank (MRR). An +extensive ablation study confirms the contributions of our pipeline components +to the overall performance enhancement. + +
+
+
+
+
+ + ☆ What Matters in Explanations: Towards Explainable Fake Review Detection + Focusing on Transformers + + +
+ Customers' reviews and feedback play a crucial role on electronic
+commerce~(E-commerce) platforms like Amazon, Zalando, and eBay in influencing
+other customers' purchasing decisions. However, there is a prevailing concern
+that sellers often post fake or spam reviews to deceive potential customers and
+manipulate their opinions about a product. Over the past decade, there has been
+considerable interest in using machine learning (ML) and deep learning (DL)
+models to identify such fraudulent reviews. Unfortunately, the decisions made
+by complex ML and DL models - which often function as \emph{black-boxes} - can
+be surprising and difficult for general users to comprehend. In this paper, we
+propose an explainable framework for detecting fake reviews with high precision
+in identifying fraudulent content with explanations and investigate what
+information matters most for explaining particular decisions by conducting an
+empirical user evaluation. Initially, we develop fake review detection models
+using DL and transformer models including XLNet and DistilBERT. We then
+introduce the layer-wise relevance propagation (LRP) technique for generating
+explanations that can map the contributions of words toward the predicted
+class. The experimental results on two benchmark fake review detection datasets
+demonstrate that our predictive models achieve state-of-the-art performance and
+outperform several existing methods. Furthermore, the empirical user evaluation
+of the generated explanations identifies which information needs to be
+considered when generating explanations in the context of fake review
+identification.
+
+
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Unravelling Local Government Data Sharing Barriers in + Estonia and Beyond + + +
+ Open Government Data (OGD) plays a crucial role in transforming smart cities
+into sustainable and intelligent entities by providing data for analytics,
+real-time monitoring, and informed decision-making. This data is increasingly
+used in urban digital twins, enhancing city management through stakeholder
+collaboration. However, local administrative data remains underutilized even in
+digitally advanced countries like Estonia. This study explores barriers
+preventing Estonian municipalities from sharing OGD, using a qualitative
+approach through interviews with Estonian municipalities and drawing on the
+OGD-adapted Innovation Resistance Theory model (IRT). Interviews with local
+government officials highlight ongoing issues in data provision and quality.
+By addressing overlooked weaknesses in the Estonian open data ecosystem and
+providing actionable recommendations, this research contributes to a more
+resilient and sustainable open data ecosystem. Additionally, by validating the
+OGD-adapted Innovation Resistance Theory model and proposing a revised version
+tailored for local government contexts, the study advances theoretical
+frameworks for understanding data sharing resistance. Ultimately, this study
+serves as a call to action for policymakers and practitioners to prioritize
+local OGD initiatives, ensuring the full utilization of OGD in smart city
+development.
+
+
+
+
+
+
+ + ♻ ☆ Description-Based Text Similarity + + +
+ Identifying texts with a given semantics is central for many information
+seeking scenarios. Similarity search over vector embeddings appears to be
+central to this ability, yet the similarity reflected in current text
+embeddings is corpus-driven, and is inconsistent and sub-optimal for many use
+cases. What, then, is a good notion of similarity for effective retrieval of
+text?
+ We identify the need to search for texts based on abstract descriptions of
+their content, and the corresponding notion of \emph{description based
+similarity}. We demonstrate the inadequacy of current text embeddings and
+propose an alternative model that significantly improves when used in standard
+nearest neighbor search. The model is trained using positive and negative pairs
+sourced through prompting an LLM, demonstrating how data from LLMs can be used
+for creating new capabilities not immediately possible using the original
+model.
+
+
+
+ comment: Accepted in COLM 2024 +
+
+
+
+
+ + ♻ ☆ Inter and Intra Prior Learning-based Hyperspectral Image Reconstruction + Using Snapshot SWIR Metasurface + + +
+ Shortwave-infrared (SWIR) spectral information, ranging from 1 {\mu}m to
+2.5 {\mu}m, overcomes the limitations of traditional color cameras in acquiring
+scene information. However, conventional SWIR hyperspectral imaging systems
+face challenges due to their bulky setups and low acquisition speeds. This work
+introduces a snapshot SWIR hyperspectral imaging system based on a metasurface
+filter and a corresponding filter selection method to achieve the lowest
+correlation coefficient among these filters. This system offers the advantages
+of compact size and snapshot imaging. We propose a novel inter and intra prior
+learning unfolding framework to achieve high-quality SWIR hyperspectral image
+reconstruction, which bridges the gap between prior learning and cross-stage
+information interaction. Additionally, we design an adaptive feature transfer
+mechanism to adaptively transfer the contextual correlation of multi-scale
+encoder features to prevent detailed information loss in the decoder.
+Experimental results demonstrate that our method can reconstruct hyperspectral
+images with high speed and superior performance over existing methods.
+
+
+
+ comment: 12 pages,9 figures +
+
+
+
+
+ + ♻ ☆ Pacer and Runner: Cooperative Learning Framework between Single- and + Cross-Domain Sequential Recommendation SIGIR'24 + + +
+ Cross-Domain Sequential Recommendation (CDSR) improves recommendation
+performance by utilizing information from multiple domains, which contrasts
+with Single-Domain Sequential Recommendation (SDSR) that relies on historical
+interactions within a specific domain. However, CDSR may underperform compared
+to the SDSR approach in certain domains due to negative transfer, which occurs
+when there is a lack of relation between domains or different levels of data
+sparsity. To address the issue of negative transfer, our proposed CDSR model
+estimates the degree of negative transfer of each domain and adaptively assigns
+it as a weight factor to the prediction loss, to control gradient flows through
+domains with significant negative transfer. To this end, our model compares the
+performance of a model trained on multiple domains (CDSR) with a model trained
+solely on the specific domain (SDSR) to evaluate the negative transfer of each
+domain using our asymmetric cooperative network. In addition, to facilitate the
+transfer of valuable cues between the SDSR and CDSR tasks, we developed an
+auxiliary loss that maximizes the mutual information between the representation
+pairs from both tasks on a per-domain basis. This cooperative learning between
+SDSR and CDSR tasks is similar to the collaborative dynamics between pacers and
+runners in a marathon. Our model outperformed numerous previous works in
+extensive experiments on two real-world industrial datasets across ten service
+domains. We also have deployed our model in the recommendation system of our
+personal assistant app service, resulting in a 21.4% increase in click-through
+rate compared to existing models, which is valuable to real-world business.
+
+
+
+ comment: Accepted at SIGIR'24 (Best Paper Honorable Mention) +
+
+
+
+
+ + ♻ ☆ Heterophily-Aware Fair Recommendation using Graph Convolutional Networks + + +
+ In recent years, graph neural networks (GNNs) have become a popular tool to
+improve the accuracy and performance of recommender systems. Modern recommender
+systems are not only designed to serve end users, but also to benefit other
+participants, such as items and items providers. These participants may have
+different or conflicting goals and interests, which raise the need for fairness
+and popularity bias considerations. GNN-based recommendation methods also face
+the challenges of unfairness and popularity bias, and their normalization and
+aggregation processes suffer from these challenges. In this paper, we propose a
+fair GNN-based recommender system, called HetroFair, to improve items' side
+fairness. HetroFair uses two separate components to generate fairness-aware
+embeddings: i) fairness-aware attention, which incorporates the dot product in
+the normalization process of GNNs to decrease the effect of nodes' degrees, and
+ii) heterophily feature weighting, to assign distinct weights to different
+features during the aggregation process. In order to evaluate the effectiveness
+of HetroFair, we conduct extensive experiments over six real-world datasets.
+Our experimental results reveal that HetroFair not only alleviates the
+unfairness and popularity bias on items' side, but also achieves superior
+accuracy on users' side. Our implementation is publicly available at
+https://github.com/NematGH/HetroFair.
+
+
+
+
+
+
+ + ♻ ☆ CADC: Encoding User-Item Interactions for Compressing Recommendation + Model Training Data + + +
+ Deep learning recommendation models (DLRMs) are at the heart of the current +e-commerce industry. However, the amount of training data used to train these +large models is growing exponentially, leading to substantial training hurdles. +The training dataset contains two primary types of information: content-based +information (features of users and items) and collaborative information +(interactions between users and items). One approach to reduce the training +dataset is to remove user-item interactions. But that significantly diminishes +collaborative information, which is crucial for maintaining accuracy due to its +inclusion of interaction histories. This loss profoundly impacts DLRM +performance. + This paper makes an important observation that if one can capture the +user-item interaction history to enrich the user and item embeddings, then the +interaction history can be compressed without losing model accuracy. Thus, this +work, Collaborative Aware Data Compression (CADC), takes a two-step approach to +training dataset compression. In the first step, we use matrix factorization of +the user-item interaction matrix to create a novel embedding representation for +both the users and items. Once the user and item embeddings are enriched by the +interaction history information the approach then applies uniform random +sampling of the training dataset to drastically reduce the training dataset +size while minimizing model accuracy drop. The source code of CADC is available +at +\href{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}. + +
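+ A minimal sketch of the two steps as described, using a truncated SVD as one
+possible matrix factorization of the binary interaction matrix to obtain
+enriched user and item embeddings, followed by uniform subsampling of the
+interactions kept for training. The rank and keep ratio below are placeholder
+values, not the paper's settings.
+
+import numpy as np
+from scipy.sparse import csr_matrix
+from scipy.sparse.linalg import svds
+
+def compress_interactions(user_ids, item_ids, n_users, n_items,
+                          rank=32, keep_ratio=0.1, seed=0):
+    # Step 1: factorize the user-item interaction matrix into embeddings.
+    R = csr_matrix((np.ones(len(user_ids), dtype=np.float32),
+                    (user_ids, item_ids)), shape=(n_users, n_items))
+    U, s, Vt = svds(R, k=rank)
+    user_emb = U * np.sqrt(s)        # users enriched with interaction history
+    item_emb = Vt.T * np.sqrt(s)     # items enriched with interaction history
+    # Step 2: uniform random sampling of the interactions used for training.
+    rng = np.random.default_rng(seed)
+    keep = np.flatnonzero(rng.random(len(user_ids)) < keep_ratio)
+    return user_emb, item_emb, keep
+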
+
+
+
+
+ + ♻ ☆ Large Language Models Enhanced Collaborative Filtering CIKM 2024 + + +
+ Recent advancements in Large Language Models (LLMs) have attracted
+considerable interest among researchers to leverage these models to enhance
+Recommender Systems (RSs). Existing work predominantly utilizes LLMs to
+generate knowledge-rich texts or utilizes LLM-derived embeddings as features to
+improve RSs. Although the extensive world knowledge embedded in LLMs generally
+benefits RSs, the application can only take a limited number of users and items
+as inputs, without adequately exploiting collaborative filtering information.
+Considering its crucial role in RSs, one key challenge in enhancing RSs with
+LLMs lies in providing better collaborative filtering information through LLMs.
+In this paper, drawing inspiration from the in-context learning and chain of
+thought reasoning in LLMs, we propose the Large Language Models enhanced
+Collaborative Filtering (LLM-CF) framework, which distils the world knowledge
+and reasoning capabilities of LLMs into collaborative filtering. We also
+explored a concise and efficient instruction-tuning method, which improves the
+recommendation capabilities of LLMs while preserving their general
+functionalities (e.g., not decreasing on the LLM benchmark). Comprehensive
+experiments on three real-world datasets demonstrate that LLM-CF significantly
+enhances several backbone recommendation models and consistently outperforms
+competitive baselines, showcasing its effectiveness in distilling the world
+knowledge and reasoning capabilities of LLM into collaborative filtering.
+
+
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ♻ ☆ AI-Driven Guided Response for Security Operation Centers with Microsoft + Copilot for Security + + +
+ Security operation centers contend with a constant stream of security
+incidents, ranging from straightforward to highly complex. To address this, we
+developed Copilot Guided Response (CGR), an industry-scale ML architecture that
+guides security analysts across three key tasks -- (1) investigation, providing
+essential historical context by identifying similar incidents; (2) triaging to
+ascertain the nature of the incident -- whether it is a true positive, false
+positive, or benign positive; and (3) remediation, recommending tailored
+containment actions. CGR is integrated into the Microsoft Defender XDR product
+and deployed worldwide, generating millions of recommendations across thousands
+of customers. Our extensive evaluation, incorporating internal evaluation,
+collaboration with security experts, and customer feedback, demonstrates that
+CGR delivers high-quality recommendations across all three tasks. We provide a
+comprehensive overview of the CGR architecture, setting a precedent as the
+first cybersecurity company to openly discuss these capabilities in such depth.
+Additionally, we release GUIDE, the largest public collection of real-world
+security incidents, spanning 13M evidences across 1M annotated incidents. By
+enabling researchers and practitioners to conduct research on real-world data,
+GUIDE advances the state of cybersecurity and supports the development of
+next-generation machine learning systems.
+
+
+
+
+
+
+ + ♻ ☆ Understanding and Mitigating the Threat of Vec2Text to Dense Retrieval + Systems + + +
+ The emergence of Vec2Text -- a method for text embedding inversion -- has +raised serious privacy concerns for dense retrieval systems which use text +embeddings, such as those offered by OpenAI and Cohere. This threat comes from +the ability for a malicious attacker with access to embeddings to reconstruct +the original text. In this paper, we investigate various factors related to +embedding models that may impact text recoverability via Vec2Text. We explore +factors such as distance metrics, pooling functions, bottleneck pre-training, +training with noise addition, embedding quantization, and embedding dimensions, +which were not considered in the original Vec2Text paper. Through a +comprehensive analysis of these factors, our objective is to gain a deeper +understanding of the key elements that affect the trade-offs between the text +recoverability and retrieval effectiveness of dense retrieval systems, offering +insights for practitioners designing privacy-aware dense retrieval systems. We +also propose a simple embedding transformation fix that guarantees equal +ranking effectiveness while mitigating the recoverability risk. Overall, this +study reveals that Vec2Text could pose a threat to current dense retrieval +systems, but there are some effective methods to patch such systems. + +
+
+
+
+
+
+
+
+ + Machine Learning 148 + +
+
+
+ + ☆ CMR Scaling Law: Predicting Critical Mixture Ratios for Continual + Pre-training of Language Models + + +
+ Large Language Models (LLMs) excel in diverse tasks but often underperform in
+specialized fields due to limited domain-specific or proprietary corpus.
+Continual pre-training (CPT) enhances LLM capabilities by imbuing new
+domain-specific or proprietary knowledge while replaying general corpus to
+prevent catastrophic forgetting. The data mixture ratio of general corpus and
+domain-specific corpus, however, has been chosen heuristically, leading to
+sub-optimal training efficiency in practice. In this context, we attempt to
+re-visit the scaling behavior of LLMs under the hood of CPT, and discover a
+power-law relationship between loss, mixture ratio, and training tokens scale.
+We formalize the trade-off between general and domain-specific capabilities,
+leading to a well-defined Critical Mixture Ratio (CMR) of general and domain
+data. By striking the balance, CMR maintains the model's general ability and
+achieves the desired domain transfer, ensuring the highest utilization of
+available resources. Therefore, if we value the balance between efficiency and
+effectiveness, CMR can be considered the optimal mixture ratio. Through
+extensive experiments, we ascertain the predictability of CMR, propose the CMR
+scaling law, and substantiate its generalization. These findings offer
+practical guidelines for optimizing LLM training in specialized domains,
+ensuring both general and domain-specific performance while efficiently
+managing training resources.
+
+
+
+
+
+
+ + ☆ Traversing Pareto Optimal Policies: Provably Efficient Multi-Objective + Reinforcement Learning + + +
+ This paper investigates multi-objective reinforcement learning (MORL), which +focuses on learning Pareto optimal policies in the presence of multiple reward +functions. Despite MORL's significant empirical success, there is still a lack +of satisfactory understanding of various MORL optimization targets and +efficient learning algorithms. Our work offers a systematic analysis of several +optimization targets to assess their abilities to find all Pareto optimal +policies and controllability over learned policies by the preferences for +different objectives. We then identify Tchebycheff scalarization as a favorable +scalarization method for MORL. Considering the non-smoothness of Tchebycheff +scalarization, we reformulate its minimization problem into a new min-max-max +optimization problem. Then, for the stochastic policy class, we propose +efficient algorithms using this reformulation to learn Pareto optimal policies. +We first propose an online UCB-based algorithm to achieve an $\varepsilon$ +learning error with an $\tilde{\mathcal{O}}(\varepsilon^{-2})$ sample +complexity for a single given preference. To further reduce the cost of +environment exploration under different preferences, we propose a +preference-free framework that first explores the environment without +pre-defined preferences and then generates solutions for any number of +preferences. We prove that it only requires an +$\tilde{\mathcal{O}}(\varepsilon^{-2})$ exploration complexity in the +exploration phase and demands no additional exploration afterward. Lastly, we +analyze the smooth Tchebycheff scalarization, an extension of Tchebycheff +scalarization, which is proved to be more advantageous in distinguishing the +Pareto optimal policies from other weakly Pareto optimal policies based on +entry values of preference vectors. Furthermore, we extend our algorithms and +theoretical analysis to accommodate this optimization target. + +
+
+ comment: Initially submitted in May 2024 +
+
+
+
+
+ + ☆ u-$μ$P: The Unit-Scaled Maximal Update Parametrization + + +
+ The Maximal Update Parametrization ($\mu$P) aims to make the optimal +hyperparameters (HPs) of a model independent of its size, allowing them to be +swept using a cheap proxy model rather than the full-size target model. We +present a new scheme, u-$\mu$P, which improves upon $\mu$P by combining it with +Unit Scaling, a method for designing models that makes them easy to train in +low-precision. The two techniques have a natural affinity: $\mu$P ensures that +the scale of activations is independent of model size, and Unit Scaling ensures +that activations, weights and gradients begin training with a scale of one. +This synthesis opens the door to a simpler scheme, whose default values are +near-optimal. This in turn facilitates a more efficient sweeping strategy, with +u-$\mu$P models reaching a lower loss than comparable $\mu$P models and working +out-of-the-box in FP8. + +
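+ The Unit Scaling side of the recipe can be illustrated in a few lines of
+numpy: weights are drawn with unit variance and the matmul output is divided by
+sqrt(fan_in), so activations start training with a scale of roughly one. This
+is a sketch of the general idea only, not the released u-$\mu$P code, and the
+shapes are arbitrary.
+
+import numpy as np
+
+def unit_scaled_linear(x, fan_out, rng):
+    """Forward pass of a unit-scaled linear layer (no bias)."""
+    fan_in = x.shape[-1]
+    W = rng.standard_normal((fan_in, fan_out)).astype(np.float32)
+    # Dividing by sqrt(fan_in) keeps the output scale ~1 for unit-scale inputs.
+    return (x @ W) / np.sqrt(fan_in)
+
+rng = np.random.default_rng(0)
+x = rng.standard_normal((1024, 512)).astype(np.float32)  # unit-scale input
+y = unit_scaled_linear(x, 512, rng)
+print(round(float(x.std()), 2), round(float(y.std()), 2))  # both close to 1.0
+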
+
+ comment: 48 pages +
+
+
+
+
+ + ☆ SoNIC: Safe Social Navigation with Adaptive Conformal Inference and + Constrained Reinforcement Learning + + +
+ Reinforcement Learning (RL) has enabled social robots to generate +trajectories without human-designed rules or interventions, which makes it more +effective than hard-coded systems for generalizing to complex real-world +scenarios. However, social navigation is a safety-critical task that requires +robots to avoid collisions with pedestrians while previous RL-based solutions +fall short in safety performance in complex environments. To enhance the safety +of RL policies, to the best of our knowledge, we propose the first algorithm, +SoNIC, that integrates adaptive conformal inference (ACI) with constrained +reinforcement learning (CRL) to learn safe policies for social navigation. More +specifically, our method augments RL observations with ACI-generated +nonconformity scores and provides explicit guidance for agents to leverage the +uncertainty metrics to avoid safety-critical areas by incorporating safety +constraints with spatial relaxation. Our method outperforms state-of-the-art +baselines in terms of both safety and adherence to social norms by a large +margin and demonstrates much stronger robustness to out-of-distribution +scenarios. Our code and video demos are available on our project website: +https://sonic-social-nav.github.io/. + +
+
+ comment: Project website: https://sonic-social-nav.github.io/ +
+
+
+
+
+ + ☆ Hidden or Inferred: Fair Learning-To-Rank with Unknown Demographics AAAI + + +
+ As learning-to-rank models are increasingly deployed for decision-making in +areas with profound life implications, the FairML community has been developing +fair learning-to-rank (LTR) models. These models rely on the availability of +sensitive demographic features such as race or sex. However, in practice, +regulatory obstacles and privacy concerns protect this data from collection and +use. As a result, practitioners may either need to promote fairness despite the +absence of these features or turn to demographic inference tools to attempt to +infer them. Given that these tools are fallible, this paper aims to further +understand how errors in demographic inference impact the fairness performance +of popular fair LTR strategies. In which cases would it be better to keep such +demographic attributes hidden from models versus infer them? We examine a +spectrum of fair LTR strategies ranging from fair LTR with and without +demographic features hidden versus inferred to fairness-unaware LTR followed by +fair re-ranking. We conduct a controlled empirical investigation modeling +different levels of inference errors by systematically perturbing the inferred +sensitive attribute. We also perform three case studies with real-world +datasets and popular open-source inference methods. Our findings reveal that as +inference noise grows, LTR-based methods that incorporate fairness +considerations into the learning process may increase bias. In contrast, fair +re-ranking strategies are more robust to inference errors. All source code, +data, and experimental artifacts of our experimental study are available here: +https://github.com/sewen007/hoiltr.git + +
+
+ comment: This paper has been accepted by AAAI/AIES to the AIES 2024 conference +
+
+
+
+
+ + ☆ EuroCropsML: A Time Series Benchmark Dataset For Few-Shot Crop Type + Classification + + +
+ We introduce EuroCropsML, an analysis-ready remote sensing machine learning +dataset for time series crop type classification of agricultural parcels in +Europe. It is the first dataset designed to benchmark transnational few-shot +crop type classification algorithms that supports advancements in algorithmic +development and research comparability. It comprises 706 683 multi-class +labeled data points across 176 classes, featuring annual time series of +per-parcel median pixel values from Sentinel-2 L1C data for 2021, along with +crop type labels and spatial coordinates. Based on the open-source EuroCrops +collection, EuroCropsML is publicly available on Zenodo. + +
+
+ comment: 5 pages, 5 figures +
+
+
+
+
+ + ☆ Looking at Model Debiasing through the Lens of Anomaly Detection + + +
+ It is widely recognized that deep neural networks are sensitive to bias in +the data. This means that during training these models are likely to learn +spurious correlations between data and labels, resulting in limited +generalization abilities and low performance. In this context, model debiasing +approaches can be devised aiming at reducing the model's dependency on such +unwanted correlations, either leveraging the knowledge of bias information or +not. In this work, we focus on the latter and more realistic scenario, showing +the importance of accurately predicting the bias-conflicting and bias-aligned +samples to obtain compelling performance in bias mitigation. On this ground, we +propose to conceive the problem of model bias from an out-of-distribution +perspective, introducing a new bias identification method based on anomaly +detection. We claim that when data is mostly biased, bias-conflicting samples +can be regarded as outliers with respect to the bias-aligned distribution in +the feature space of a biased model, thus allowing for precisely detecting them +with an anomaly detection method. Coupling the proposed bias identification +approach with bias-conflicting data upsampling and augmentation in a two-step +strategy, we reach state-of-the-art performance on synthetic and real benchmark +datasets. Ultimately, our proposed approach shows that the data bias issue does +not necessarily require complex debiasing methods, given that an accurate bias +identification procedure is defined. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ Fractional signature: a generalisation of the signature inspired by + fractional calculus + + +
+ In this paper, we propose a novel generalisation of the signature of a path, +motivated by fractional calculus, which is able to describe the solutions of +linear Caputo controlled FDEs. We also propose another generalisation of the +signature, inspired by the previous one, but more convenient to use in machine +learning. Finally, we test this last signature in a toy application to the +problem of handwritten digit recognition, where significant improvements in +accuracy rates are observed compared to those of the original signature. + +
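+ For readers unfamiliar with signatures, the ordinary depth-2 signature that
+the fractional version generalises can be computed from a discretised path by
+accumulating increments and iterated sums; a small numpy sketch (standard
+signature only, not the fractional construction proposed above):
+
+import numpy as np
+
+def signature_depth2(path):
+    """Depth-2 signature of a piecewise-linear path given as an (N, d) array."""
+    dX = np.diff(path, axis=0)                    # segment increments, (N-1, d)
+    level1 = dX.sum(axis=0)                       # S^(i): total increment
+    run = np.vstack([np.zeros(dX.shape[1]), np.cumsum(dX, axis=0)[:-1]])
+    # S^(i,j): running level-1 value times the new increment, plus the
+    # within-segment contribution 0.5 * dX^i * dX^j of each linear piece.
+    level2 = run.T @ dX + 0.5 * np.einsum("ki,kj->ij", dX, dX)
+    return level1, level2
+
+path = np.array([[0.0, 0.0], [1.0, 0.5], [1.5, 2.0], [0.5, 2.5]])
+S1, S2 = signature_depth2(path)
+print(S1)                     # equals path[-1] - path[0]
+print(S2[0, 1] - S2[1, 0])    # twice the signed (Levy) area swept by the path
+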
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ☆ HumanVid: Demystifying Training Data for Camera-controllable Human Image + Animation + + +
+ Human image animation involves generating videos from a character photo,
+allowing user control and unlocking potential for video and movie production.
+While recent approaches yield impressive results using high-quality training
+data, the inaccessibility of these datasets hampers fair and transparent
+benchmarking. Moreover, these approaches prioritize 2D human motion and
+overlook the significance of camera motions in videos, leading to limited
+control and unstable video generation. To demystify the training data, we
+present HumanVid, the first large-scale high-quality dataset tailored for human
+image animation, which combines crafted real-world and synthetic data. For the
+real-world data, we compile a vast collection of copyright-free real-world
+videos from the internet. Through a carefully designed rule-based filtering
+strategy, we ensure the inclusion of high-quality videos, resulting in a
+collection of 20K human-centric videos in 1080P resolution. Human and camera
+motion annotation is accomplished using a 2D pose estimator and a SLAM-based
+method. For the synthetic data, we gather 2,300 copyright-free 3D avatar assets
+to augment existing available 3D assets. Notably, we introduce a rule-based
+camera trajectory generation method, enabling the synthetic pipeline to
+incorporate diverse and precise camera motion annotation, which can rarely be
+found in real-world data. To verify the effectiveness of HumanVid, we establish
+a baseline model named CamAnimate, short for Camera-controllable Human
+Animation, that considers both human and camera motions as conditions. Through
+extensive experimentation, we demonstrate that such simple baseline training on
+our HumanVid achieves state-of-the-art performance in controlling both human
+pose and camera motions, setting a new benchmark. Code and data will be
+publicly available at \url{https://github.com/zhenzhiwang/HumanVid/}.
+
+
+
+ comment: camera controllable human image animation, a dataset and a baseline +
+
+
+
+
+ + ☆ Nerva: a Truly Sparse Implementation of Neural Networks + + +
+ We introduce Nerva, a fast neural network library under development in C++. +It supports sparsity by using the sparse matrix operations of Intel's Math +Kernel Library (MKL), which eliminates the need for binary masks. We show that +Nerva significantly decreases training time and memory usage while reaching +equivalent accuracy to PyTorch. We run static sparse experiments with an MLP on +CIFAR-10. On high sparsity levels like $99\%$, the runtime is reduced by a +factor of $4\times$ compared to a PyTorch model using masks. Similar to other +popular frameworks such as PyTorch and Keras, Nerva offers a Python interface +for users to work with. + +
+
+ comment: The Nerva library is available at https://github.com/wiegerw/nerva +
+
+
+
+
+ + ☆ Can Watermarking Large Language Models Prevent Copyrighted Text + Generation and Hide Training Data? + + +
+ Large Language Models (LLMs) have demonstrated impressive capabilities in +generating diverse and contextually rich text. However, concerns regarding +copyright infringement arise as LLMs may inadvertently produce copyrighted +material. In this paper, we first investigate the effectiveness of watermarking +LLMs as a deterrent against the generation of copyrighted texts. Through +theoretical analysis and empirical evaluation, we demonstrate that +incorporating watermarks into LLMs significantly reduces the likelihood of +generating copyrighted content, thereby addressing a critical concern in the +deployment of LLMs. Additionally, we explore the impact of watermarking on +Membership Inference Attacks (MIAs), which aim to discern whether a sample was +part of the pretraining dataset and may be used to detect copyright violations. +Surprisingly, we find that watermarking adversely affects the success rate of +MIAs, complicating the task of detecting copyrighted text in the pretraining +dataset. Finally, we propose an adaptive technique to improve the success rate +of a recent MIA under watermarking. Our findings underscore the importance of +developing adaptive methods to study critical problems in LLMs with potential +legal implications. + +
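+ The abstract does not commit to a particular watermark, but the widely used
+"green list" scheme gives a concrete picture of the detection side: generation
+pseudorandomly favours a fraction gamma of the vocabulary at each step, and a
+detector counts how many emitted tokens fall in that fraction. The hash below
+is an illustrative stand-in for the keyed hash of such schemes, not the scheme
+studied in the paper.
+
+import hashlib
+
+def is_green(prev_token_id, token_id, gamma=0.5):
+    """Pseudorandomly mark a fraction gamma of tokens as 'green', seeded by
+    the previous token (a toy stand-in for a keyed hash)."""
+    h = hashlib.sha256(f"{prev_token_id}:{token_id}".encode()).digest()
+    return int.from_bytes(h[:4], "big") < gamma * 2**32
+
+def watermark_z_score(token_ids, gamma=0.5):
+    """z-score of the green-token count; large values suggest watermarked text."""
+    hits = sum(is_green(a, b, gamma)
+               for a, b in zip(token_ids[:-1], token_ids[1:]))
+    n = len(token_ids) - 1
+    return (hits - gamma * n) / (gamma * (1 - gamma) * n) ** 0.5
+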
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ☆ Systematic Reasoning About Relational Domains With Graph Neural Networks + + +
+ Developing models that can learn to reason is a notoriously challenging
+problem. We focus on reasoning in relational domains, where the use of Graph
+Neural Networks (GNNs) seems like a natural choice. However, previous work on
+reasoning with GNNs has shown that such models tend to fail when presented with
+test examples that require longer inference chains than those seen during
+training. This suggests that GNNs lack the ability to generalize from training
+examples in a systematic way, which would fundamentally limit their reasoning
+abilities. A common solution is to instead rely on neuro-symbolic methods,
+which are capable of reasoning in a systematic way by design. Unfortunately,
+the scalability of such methods is often limited and they tend to rely on
+overly strong assumptions, e.g., that queries can be answered by inspecting a
+single relational path. In this paper, we revisit the idea of reasoning with
+GNNs, showing that systematic generalization is possible as long as the right
+inductive bias is provided. In particular, we argue that node embeddings should
+be treated as epistemic states and that GNNs should be parameterised
+accordingly. We propose a simple GNN architecture which is based on this view
+and show that it is capable of achieving state-of-the-art results. We
+furthermore introduce a benchmark which requires models to aggregate evidence
+from multiple relational paths. We show that existing neuro-symbolic approaches
+fail on this benchmark, whereas our considered GNN model learns to reason
+accurately.
+
+
+
+ comment: 10+16 pages, 2+7 figures, 4+9 tables. Preprint under review. Comments + welcome +
+
+
+
+
+ + ☆ Five reasons against assuming a data-generating distribution in Machine + Learning ICML 2024 + + +
+ Machine Learning research, as most of Statistics, heavily relies on the +concept of a data-generating probability distribution. As data points are +thought to be sampled from such a distribution, we can learn from observed data +about this distribution and, thus, predict future data points drawn from it +(with some probability of success). Drawing on scholarship across disciplines, +we here argue that this framework is not always a good model. Not only do such +true probability distributions not exist; the framework can also be misleading +and obscure both the choices made and the goals pursued in machine learning +practice. We suggest an alternative framework that focuses on finite +populations rather than abstract distributions; while classical learning theory +can be left almost unchanged, it opens new opportunities, especially to model +sampling. We compile these considerations into five reasons for modelling +machine learning -- in some settings -- with finite distributions rather than +generative distributions, both to be more faithful to practice and to provide +novel theoretical insights. + +
+
+ comment: Presented at the Humans, Algorithmic Decision-Making and Society + Workshop at ICML 2024 +
+
+
+
+
+ + ☆ Causal modelling without counterfactuals and individualised effects ICML 2024 + + +
+ The most common approach to causal modelling is the potential outcomes +framework due to Neyman and Rubin. In this framework, outcomes of +counterfactual treatments are assumed to be well-defined. This metaphysical +assumption is often thought to be problematic yet indispensable. The +conventional approach relies not only on counterfactuals, but also on abstract +notions of distributions and assumptions of independence that are not directly +testable. In this paper, we construe causal inference as treatment-wise +predictions for finite populations where all assumptions are testable; this +means that one can not only test predictions themselves (without any +fundamental problem), but also investigate sources of error when they fail. The +new framework highlights the model-dependence of causal claims as well as the +difference between statistical and scientific inference. + +
+
+ comment: Presented at the Humans, Algorithmic Decision-Making and Society + Workshop at ICML 2024 +
+
+
+
+
+ + ☆ A Comprehensive Approach to Misspelling Correction with BERT and + Levenshtein Distance + + +
+ Writing, as an omnipresent form of human communication, permeates nearly +every aspect of contemporary life. Consequently, inaccuracies or errors in +written communication can lead to profound consequences, ranging from financial +losses to potentially life-threatening situations. Spelling mistakes, among the +most prevalent writing errors, are frequently encountered due to various +factors. This research aims to identify and rectify diverse spelling errors in +text using neural networks, specifically leveraging the Bidirectional Encoder +Representations from Transformers (BERT) masked language model. To achieve this +goal, we compiled a comprehensive dataset encompassing both non-real-word and +real-word errors after categorizing different types of spelling mistakes. +Subsequently, multiple pre-trained BERT models were employed. To ensure optimal +performance in correcting misspelling errors, we propose a combined approach +utilizing the BERT masked language model and Levenshtein distance. The results +from our evaluation data demonstrate that the system presented herein exhibits +remarkable capabilities in identifying and rectifying spelling mistakes, often +surpassing existing systems tailored for the Persian language. + +
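+ The combination described, ranking masked-language-model candidates by their
+edit distance to the typed word, can be sketched without model-specific code;
+in practice the candidate list below would come from a BERT fill-mask
+prediction, and the scores and the max_dist cutoff here are illustrative.
+
+def levenshtein(a: str, b: str) -> int:
+    """Classic dynamic-programming edit distance."""
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        cur = [i]
+        for j, cb in enumerate(b, 1):
+            cur.append(min(prev[j] + 1,                  # deletion
+                           cur[j - 1] + 1,               # insertion
+                           prev[j - 1] + (ca != cb)))    # substitution
+        prev = cur
+    return prev[-1]
+
+def rerank(misspelled, candidates, max_dist=2):
+    """Keep fill-mask candidates (token, lm_score) close to the typed word,
+    then order them by language-model score."""
+    close = [(tok, s) for tok, s in candidates
+             if levenshtein(misspelled, tok) <= max_dist]
+    return sorted(close, key=lambda ts: -ts[1])
+
+print(rerank("recieve", [("receive", 0.61), ("obtain", 0.22), ("recipe", 0.05)]))
+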
+
+ comment: 12 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ Entropy Reweighted Conformal Classification + + +
+ Conformal Prediction (CP) is a powerful framework for constructing prediction
+sets with guaranteed coverage. However, recent studies have shown that
+integrating confidence calibration with CP can lead to a degradation in
+efficiency. In this paper, we propose an adaptive approach that considers the
+classifier's uncertainty and employs entropy-based reweighting to enhance the
+efficiency of prediction sets for conformal classification. Our experimental
+results demonstrate that this method significantly improves efficiency.
+
+
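+ For context, standard split conformal classification looks like the sketch
+below; the entropy-based temperature used here is only an illustrative
+stand-in for the reweighting proposed above, not its actual definition.
+
+import numpy as np
+
+def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1, beta=0.0):
+    """Split conformal prediction sets from softmax outputs of shape (n, K).
+
+    beta=0.0 recovers the standard method; beta>0 sharpens confident
+    (low-entropy) predictions, a toy version of entropy reweighting.
+    """
+    def reweight(p):
+        H = -(p * np.log(p + 1e-12)).sum(axis=1, keepdims=True)
+        w = p ** (1.0 + beta / (H + 1e-12))
+        return w / w.sum(axis=1, keepdims=True)
+
+    cal_p, test_p = reweight(cal_probs), reweight(test_probs)
+    scores = 1.0 - cal_p[np.arange(len(cal_labels)), cal_labels]
+    n = len(scores)
+    level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
+    q = np.quantile(scores, level, method="higher")
+    return [np.flatnonzero(1.0 - row <= q) for row in test_p]
+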
+
+
+
+
+ + ☆ Quantile Learn-Then-Test: Quantile-Based Risk Control for Hyperparameter + Optimization + + +
+ The increasing adoption of Artificial Intelligence (AI) in engineering +problems calls for the development of calibration methods capable of offering +robust statistical reliability guarantees. The calibration of black box AI +models is carried out via the optimization of hyperparameters dictating +architecture, optimization, and/or inference configuration. Prior work has +introduced learn-then-test (LTT), a calibration procedure for hyperparameter +optimization (HPO) that provides statistical guarantees on average performance +measures. Recognizing the importance of controlling risk-aware objectives in +engineering contexts, this work introduces a variant of LTT that is designed to +provide statistical guarantees on quantiles of a risk measure. We illustrate +the practical advantages of this approach by applying the proposed algorithm to +a radio access scheduling problem. + +
+
+
+
+
+ + ☆ Gradient-based inference of abstract task representations for + generalization in neural networks + + +
+ Humans and many animals show remarkably adaptive behavior and can respond +differently to the same input depending on their internal goals. The brain not +only represents the intermediate abstractions needed to perform a computation +but also actively maintains a representation of the computation itself (task +abstraction). Such separation of the computation and its abstraction is +associated with faster learning, flexible decision-making, and broad +generalization capacity. We investigate if such benefits might extend to neural +networks trained with task abstractions. For such benefits to emerge, one needs +a task inference mechanism that possesses two crucial abilities: First, the +ability to infer abstract task representations when no longer explicitly +provided (task inference), and second, manipulate task representations to adapt +to novel problems (task recomposition). To tackle this, we cast task inference +as an optimization problem from a variational inference perspective and ground +our approach in an expectation-maximization framework. We show that gradients +backpropagated through a neural network to a task representation layer are an +efficient heuristic to infer current task demands, a process we refer to as +gradient-based inference (GBI). Further iterative optimization of the task +representation layer allows for recomposing abstractions to adapt to novel +situations. Using a toy example, a novel image classifier, and a language +model, we demonstrate that GBI provides higher learning efficiency and +generalization to novel tasks and limits forgetting. Moreover, we show that GBI +has unique advantages such as preserving information for uncertainty estimation +and detecting out-of-distribution samples. + +
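+ The gradient-based inference step, freezing the trained network and letting
+gradients update only a task-representation vector, can be sketched in a few
+lines of PyTorch. The model(x, z) signature, optimiser, and step counts are
+placeholders for illustration, not the paper's setup.
+
+import torch
+
+def infer_task(model, task_dim, x, y, steps=50, lr=0.1):
+    """Infer a task representation z by gradient descent on a frozen model."""
+    for p in model.parameters():
+        p.requires_grad_(False)                  # weights stay fixed
+    z = torch.zeros(task_dim, requires_grad=True)
+    opt = torch.optim.Adam([z], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss = torch.nn.functional.cross_entropy(model(x, z), y)
+        loss.backward()                          # gradients flow into z only
+        opt.step()
+    return z.detach()
+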
+
+
+
+
+ + ☆ Scalify: scale propagation for efficient low-precision LLM training ICML 2024 + + +
+ Low-precision formats such as float8 have been introduced in machine learning
+accelerated hardware to improve computational efficiency for large language
+model training and inference. Nevertheless, adoption by the ML community has
+been slowed down by the complex, and sometimes brittle, techniques required to
+match higher precision training accuracy. In this work, we present Scalify, an
+end-to-end scale propagation paradigm for computational graphs, generalizing
+and formalizing existing tensor scaling methods. Experimental results show that
+Scalify supports out-of-the-box float8 matrix multiplication and gradients
+representation, as well as float16 optimizer state storage. Our JAX
+implementation of Scalify is open-sourced at
+https://github.com/graphcore-research/jax-scalify
+
+
+
+ comment: 11 pages, 5 figures, ICML 2024 WANT workshop +
+
+
+
+
+ + ☆ Mathematical programming algorithms for convex hull approximation with a + hyperplane budget + + +
+ We consider the following problem in computational geometry: given, in the
+d-dimensional real space, a set of points marked as positive and a set of
+points marked as negative, such that the convex hull of the positive set does
+not intersect the negative set, find K hyperplanes that separate, if possible,
+all the positive points from the negative ones. That is, we search for a convex
+polyhedron with at most K faces, containing all the positive points and no
+negative point. The problem is known in the literature for pure convex
+polyhedral approximation; our interest stems from its possible applications in
+constraint learning, where points are feasible or infeasible solutions of a
+Mixed Integer Program, and the K hyperplanes are linear constraints to be
+found. We cast the problem as an optimization one, minimizing the number of
+negative points inside the convex polyhedron, whenever exact separation cannot
+be achieved. We introduce models inspired by support vector machines and we
+design two mathematical programming formulations with binary variables. We
+exploit Dantzig-Wolfe decomposition to obtain extended formulations, and we
+devise column generation algorithms with ad-hoc pricing routines. We compare
+computing time and separation error values obtained by all our approaches on
+synthetic datasets, with numbers of points from hundreds up to a few thousand,
+showing our approaches to perform better than existing ones from the
+literature. Furthermore, we observe that key computational differences arise,
+depending on whether the budget K is sufficient to completely separate the
+positive points from the negative ones or not. On 8-dimensional instances (and
+above), existing convex hull algorithms become computationally inapplicable,
+while our algorithms allow us to identify good convex hull approximations in
+minutes of computation.
+
+
+
+
+
+
+ + ☆ Global and Local Confidence Based Fraud Detection Graph Neural Network + + +
+ This paper presents the Global and Local Confidence Graph Neural Network +(GLC-GNN), an innovative approach to graph-based anomaly detection that +addresses the challenges of heterophily and camouflage in fraudulent +activities. By introducing a prototype to encapsulate the global features of a +graph and calculating a Global Confidence (GC) value for each node, GLC-GNN +effectively distinguishes between benign and fraudulent nodes. The model +leverages GC to generate attention values for message aggregation, enhancing +its ability to capture both homophily and heterophily. Through extensive +experiments on four open datasets, GLC-GNN demonstrates superior performance +over state-of-the-art models in accuracy and convergence speed, while +maintaining a compact model size and expedited training process. The +integration of global and local confidence measures in GLC-GNN offers a robust +solution for detecting anomalies in graphs, with significant implications for +fraud detection across diverse domains. + +
+
+
+
+
+ + ☆ Low dimensional representation of multi-patient flow cytometry datasets + using optimal transport for minimal residual disease detection in leukemia + + +
+ Representing and quantifying Minimal Residual Disease (MRD) in Acute Myeloid
+Leukemia (AML), a type of cancer that affects the blood and bone marrow, is
+essential in the prognosis and follow-up of AML patients. As traditional
+cytological analysis cannot detect leukemia cells below 5\%, the analysis of
+flow cytometry datasets is expected to provide more reliable results. In this
+paper, we explore statistical learning methods based on optimal transport (OT)
+to achieve a relevant low-dimensional representation of multi-patient flow
+cytometry measurements (FCM) datasets considered as high-dimensional
+probability distributions. Using the framework of OT, we justify the use of the
+K-means algorithm for dimensionality reduction of multiple large-scale point
+clouds through mean measure quantization by merging all the data into a single
+point cloud. After this quantization step, the visualization of the intra- and
+inter-patient FCM variability is carried out by embedding low-dimensional
+quantized probability measures into a linear space using either Wasserstein
+Principal Component Analysis (PCA) through linearized OT or log-ratio PCA of
+compositional data. Using a publicly available FCM dataset and an FCM dataset
+from Bordeaux University Hospital, we demonstrate the benefits of our approach
+over the popular kernel mean embedding technique for statistical learning from
+multiple high-dimensional probability distributions. We also highlight the
+usefulness of our methodology for low-dimensional projection and clustering
+patient measurements according to their level of MRD in AML from FCM. In
+particular, our OT-based approach allows for a relevant and informative
+two-dimensional representation of the results of the FlowSom algorithm, a
+state-of-the-art method for the detection of MRD in AML using multi-patient
+FCM.
+
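A simplified sketch of the quantization step described above (merging all patients' point clouds and applying K-means as a mean-measure quantizer); the subsequent linearized-OT / log-ratio PCA embedding is omitted, and the function names are assumptions.

```python
import numpy as np
from sklearn.cluster import KMeans

def quantize_patients(point_clouds, n_codes=64, seed=0):
    """Pool all patients' FCM point clouds, learn a shared codebook, and
    represent each patient as a histogram over the codebook (a quantized measure)."""
    pooled = np.vstack(point_clouds)
    km = KMeans(n_clusters=n_codes, n_init=10, random_state=seed).fit(pooled)
    hists = []
    for cloud in point_clouds:
        labels = km.predict(cloud)
        hists.append(np.bincount(labels, minlength=n_codes) / len(cloud))
    return km.cluster_centers_, np.array(hists)
```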
+
+
+
+
+ + ☆ MoveLight: Enhancing Traffic Signal Control through Movement-Centric + Deep Reinforcement Learning + + +
+ This paper introduces MoveLight, a novel traffic signal control system that +enhances urban traffic management through movement-centric deep reinforcement +learning. By leveraging detailed real-time data and advanced machine learning +techniques, MoveLight overcomes the limitations of traditional traffic signal +control methods. It employs a lane-level control approach using the FRAP +algorithm to achieve dynamic and adaptive traffic signal control, optimizing +traffic flow, reducing congestion, and improving overall efficiency. Our +research demonstrates the scalability and effectiveness of MoveLight across +single intersections, arterial roads, and network levels. Experimental results +using real-world datasets from Cologne and Hangzhou show significant +improvements in metrics such as queue length, delay, and throughput compared to +existing methods. This study highlights the transformative potential of deep +reinforcement learning in intelligent traffic signal control, setting a new +standard for sustainable and efficient urban transportation systems. + +
+
+
+
+
+ + ☆ Enhanced SMC$^2$: Leveraging Gradient Information from Differentiable + Particle Filters Within Langevin Proposals + + +
+ Sequential Monte Carlo Squared (SMC$^2$) is a Bayesian method which can infer +the states and parameters of non-linear, non-Gaussian state-space models. The +standard random-walk proposal in SMC$^2$ faces challenges, particularly with +high-dimensional parameter spaces. This study outlines a novel approach by +harnessing first-order gradients derived from a Common Random Numbers - +Particle Filter (CRN-PF) using PyTorch. The resulting gradients can be +leveraged within a Langevin proposal without accept/reject. Including Langevin +dynamics within the proposal can result in a higher effective sample size and +more accurate parameter estimates when compared with the random-walk. The +resulting algorithm is parallelized on distributed memory using Message Passing +Interface (MPI) and runs in $\mathcal{O}(\log_2N)$ time complexity. Utilizing +64 computational cores we obtain a 51x speed-up when compared to a single core. +A GitHub link is given which provides access to the code. + +
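A minimal sketch of the kind of Langevin move described above; in the paper the gradient would come from the differentiable CRN particle filter, whereas here `grad_log_post` is simply an assumed callable.

```python
import numpy as np

def langevin_proposal(theta, grad_log_post, step=1e-3, rng=None):
    """Propose theta' = theta + (step/2) * grad + sqrt(step) * noise."""
    rng = rng or np.random.default_rng()
    g = grad_log_post(theta)
    return theta + 0.5 * step * g + np.sqrt(step) * rng.standard_normal(theta.shape)
```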
+
+ comment: 8 pages, 3 images. Accepted to 2024 IEEE International Conference on + Multisensor Fusion and Integration (MFI 2024). https://mfi2024.org/. arXiv + admin note: text overlap with arXiv:2311.12973 +
+
+
+
+
+ + ☆ A Novel Two-Step Fine-Tuning Pipeline for Cold-Start Active Learning in + Text Classification Tasks + + +
+ This is the first work to investigate the effectiveness of BERT-based
+contextual embeddings in active learning (AL) tasks in cold-start scenarios,
+where traditional fine-tuning is infeasible due to the absence of labeled data.
+Our primary contribution is the proposal of a more robust fine-tuning pipeline
+- DoTCAL - that diminishes the reliance on labeled data in AL using two steps:
+(1) fully leveraging unlabeled data through domain adaptation of the embeddings
+via masked language modeling and (2) further adjusting model weights using
+labeled data selected by AL. Our evaluation contrasts BERT-based embeddings
+with other prevalent text representation paradigms, including Bag of Words
+(BoW), Latent Semantic Indexing (LSI), and FastText, at two critical stages of
+the AL process: instance selection and classification. Experiments conducted on
+eight ATC benchmarks with varying AL budgets (number of labeled instances) and
+number of instances (about 5,000 to 300,000) demonstrate DoTCAL's superior
+effectiveness, achieving up to a 33% improvement in Macro-F1 while reducing
+labeling efforts by half compared to the traditional one-step method. We also
+found that in several tasks, BoW and LSI (due to information aggregation)
+produce results superior to BERT (by up to 59%), especially in low-budget
+scenarios and hard-to-classify tasks, which is quite surprising.
+
+
+ comment: 11 pages, 4 figures, 2 Tables, and 1 algorithm +
+
+
+
+
+ + ☆ Enhanced Feature Learning via Regularisation: Integrating Neural + Networks and Kernel Methods + + +
+ We propose a new method for feature learning and function estimation in +supervised learning via regularised empirical risk minimisation. Our approach +considers functions as expectations of Sobolev functions over all possible +one-dimensional projections of the data. This framework is similar to kernel +ridge regression, where the kernel is $\mathbb{E}_w ( k^{(B)}(w^\top x,w^\top +x^\prime))$, with $k^{(B)}(a,b) := \min(|a|, |b|)1_{ab>0}$ the Brownian kernel, +and the distribution of the projections $w$ is learnt. This can also be viewed +as an infinite-width one-hidden layer neural network, optimising the first +layer's weights through gradient descent and explicitly adjusting the +non-linearity and weights of the second layer. We introduce an efficient +computation method for the estimator, called Brownian Kernel Neural Network +(BKerNN), using particles to approximate the expectation. The optimisation is +principled due to the positive homogeneity of the Brownian kernel. Using +Rademacher complexity, we show that BKerNN's expected risk converges to the +minimal risk with explicit high-probability rates of $O( \min((d/n)^{1/2}, +n^{-1/6}))$ (up to logarithmic factors). Numerical experiments confirm our +optimisation intuitions, and BKerNN outperforms kernel ridge regression, and +favourably compares to a one-hidden layer neural network with ReLU activations +in various settings and real data sets. + +
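To make the kernel concrete, the sketch below implements the Brownian kernel from the abstract and a particle approximation of the expectation over projection directions; learning the distribution of the directions w (the actual contribution) is omitted, and all names are illustrative.

```python
import numpy as np

def brownian_kernel(a, b):
    # k^(B)(a, b) = min(|a|, |b|) * 1[ab > 0]
    return np.minimum(np.abs(a), np.abs(b)) * (a * b > 0)

def expected_kernel(X, Xp, W):
    """Approximate E_w[k^(B)(w^T x, w^T x')] with particles W of shape (m, d)."""
    K = np.zeros((X.shape[0], Xp.shape[0]))
    for w in W:
        K += brownian_kernel((X @ w)[:, None], (Xp @ w)[None, :])
    return K / len(W)
```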
+
+
+
+
+ + ☆ Channel-Aware Low-Rank Adaptation in Time Series Forecasting CIKM 2024 + + +
+ The balance between model capacity and generalization has been a key focus of +recent discussions in long-term time series forecasting. Two representative +channel strategies are closely associated with model expressivity and +robustness, including channel independence (CI) and channel dependence (CD). +The former adopts individual channel treatment and has been shown to be more +robust to distribution shifts, but lacks sufficient capacity to model +meaningful channel interactions. The latter is more expressive for representing +complex cross-channel dependencies, but is prone to overfitting. To balance the +two strategies, we present a channel-aware low-rank adaptation method to +condition CD models on identity-aware individual components. As a plug-in +solution, it is adaptable for a wide range of backbone architectures. Extensive +experiments show that it can consistently and significantly improve the +performance of both CI and CD models with demonstrated efficiency and +flexibility. The code is available at https://github.com/tongnie/C-LoRA. + +
+
+ comment: Accepted by CIKM 2024, short research paper track +
+
+
+
+
+ + ☆ Pretrained Visual Representations in Reinforcement Learning + + +
+ Visual reinforcement learning (RL) has made significant progress in recent +years, but the choice of visual feature extractor remains a crucial design +decision. This paper compares the performance of RL algorithms that train a +convolutional neural network (CNN) from scratch with those that utilize +pre-trained visual representations (PVRs). We evaluate the Dormant Ratio +Minimization (DRM) algorithm, a state-of-the-art visual RL method, against +three PVRs: ResNet18, DINOv2, and Visual Cortex (VC). We use the Metaworld +Push-v2 and Drawer-Open-v2 tasks for our comparison. Our results show that the +choice of training from scratch compared to using PVRs for maximising +performance is task-dependent, but PVRs offer advantages in terms of reduced +replay buffer size and faster training times. We also identify a strong +correlation between the dormant ratio and model performance, highlighting the +importance of exploration in visual RL. Our study provides insights into the +trade-offs between training from scratch and using PVRs, informing the design +of future visual RL algorithms. + +
+
+
+
+
+ + ☆ Statistical Batch-Based Bearing Fault Detection + + +
+ In the domain of rotating machinery, bearings are vulnerable to different +mechanical faults, including ball, inner, and outer race faults. Various +techniques can be used in condition-based monitoring, from classical signal +analysis to deep learning methods. Based on the complex working conditions of +rotary machines, multivariate statistical process control charts such as +Hotelling's $T^2$ and Squared Prediction Error are useful for providing early +warnings. However, these methods are rarely applied to condition monitoring of +rotating machinery due to the univariate nature of the datasets. In the present +paper, we propose a multivariate statistical process control-based fault +detection method that utilizes multivariate data composed of Fourier transform +features extracted for fixed-time batches. Our approach makes use of the +multidimensional nature of Fourier transform characteristics, which record more +detailed information about the machine's status, in an effort to enhance early +defect detection and diagnosis. Experiments with varying vibration measurement +locations (Fan End, Drive End), fault types (ball, inner, and outer race +faults), and motor loads (0-3 horsepower) are used to validate the suggested +approach. The outcomes illustrate our method's effectiveness in fault detection +and point to possible broader uses in industrial maintenance. + +
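The pipeline described above can be sketched as follows, under illustrative assumptions only (FFT-magnitude features per fixed-time batch, then Hotelling's T^2 against statistics fitted on healthy data; calibration of the control limit is omitted).

```python
import numpy as np

def batch_fft_features(signal, batch_len, n_bins=16):
    """Split a vibration signal into fixed-length batches and keep low-frequency
    FFT magnitudes as multivariate features."""
    usable = len(signal) // batch_len * batch_len
    batches = signal[:usable].reshape(-1, batch_len)
    return np.abs(np.fft.rfft(batches, axis=1))[:, :n_bins]

def hotelling_t2(features, mean, cov_inv):
    diff = features - mean
    return np.einsum("ij,jk,ik->i", diff, cov_inv, diff)

# Fit mean/covariance on healthy data, then flag batches whose T^2 exceeds the limit:
# mean = healthy.mean(axis=0); cov_inv = np.linalg.inv(np.cov(healthy, rowvar=False))
```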
+
+
+
+
+ + ☆ A Hybrid Federated Kernel Regularized Least Squares Algorithm + + +
+ Federated learning is becoming an increasingly viable and accepted strategy +for building machine learning models in critical privacy-preserving scenarios +such as clinical settings. Often, the data involved is not limited to clinical +data but also includes additional omics features (e.g. proteomics). +Consequently, data is distributed not only across hospitals but also across +omics centers, which are labs capable of generating such additional features +from biosamples. This scenario leads to a hybrid setting where data is +scattered both in terms of samples and features. In this hybrid setting, we +present an efficient reformulation of the Kernel Regularized Least Squares +algorithm, introduce two variants and validate them using well-established +datasets. Lastly, we discuss security measures to defend against possible +attacks. + +
+
+
+
+
+ + ☆ Sublinear Regret for An Actor-Critic Algorithm in Continuous-Time + Linear-Quadratic Reinforcement Learning + + +
+ We study reinforcement learning (RL) for a class of continuous-time +linear-quadratic (LQ) control problems for diffusions where volatility of the +state processes depends on both state and control variables. We apply a +model-free approach that relies neither on knowledge of model parameters nor on +their estimations, and devise an actor-critic algorithm to learn the optimal +policy parameter directly. Our main contributions include the introduction of a +novel exploration schedule and a regret analysis of the proposed algorithm. We +provide the convergence rate of the policy parameter to the optimal one, and +prove that the algorithm achieves a regret bound of $O(N^{\frac{3}{4}})$ up to +a logarithmic factor. We conduct a simulation study to validate the theoretical +results and demonstrate the effectiveness and reliability of the proposed +algorithm. We also perform numerical comparisons between our method and those +of the recent model-based stochastic LQ RL studies adapted to the state- and +control-dependent volatility setting, demonstrating a better performance of the +former in terms of regret bounds. + +
+
+ comment: 42 pages, 4 figures +
+
+
+
+
+ + ☆ An Adaptive Second-order Method for a Class of Nonconvex Nonsmooth + Composite Optimization + + +
+ This paper explores a specific type of nonconvex sparsity-promoting +regularization problems, namely those involving $\ell_p$-norm regularization, +in conjunction with a twice continuously differentiable loss function. We +propose a novel second-order algorithm designed to effectively address this +class of challenging nonconvex and nonsmooth problems, showcasing several +innovative features: (i) The use of an alternating strategy to solve a +reweighted $\ell_1$ regularized subproblem and the subspace approximate Newton +step. (ii) The reweighted $\ell_1$ regularized subproblem relies on a convex +approximation to the nonconvex regularization term, enabling a closed-form +solution characterized by the soft-thresholding operator. This feature allows +our method to be applied to various nonconvex regularization problems. (iii) +Our algorithm ensures that the iterates maintain their sign values and that +nonzero components are kept away from 0 for a sufficient number of iterations, +eventually transitioning to a perturbed Newton method. (iv) We provide +theoretical guarantees of global convergence, local superlinear convergence in +the presence of the Kurdyka-\L ojasiewicz (KL) property, and local quadratic +convergence when employing the exact Newton step in our algorithm. We also +showcase the effectiveness of our approach through experiments on a diverse set +of model prediction problems. + +
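For reference, the soft-thresholding operator mentioned above is the closed-form minimizer of 0.5*(x - v)^2 + lam*|x|; this one-liner is a generic illustration, not the paper's full reweighted-subproblem solver.

```python
import numpy as np

def soft_threshold(v, lam):
    """argmin_x 0.5 * (x - v)**2 + lam * |x|, applied elementwise."""
    return np.sign(v) * np.maximum(np.abs(v) - lam, 0.0)
```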
+
+
+
+
+ + ☆ Application of Machine Learning and Convex Limiting to Subgrid Flux + Modeling in the Shallow-Water Equations + + +
+ We propose a combination of machine learning and flux limiting for +property-preserving subgrid scale modeling in the context of flux-limited +finite volume methods for the one-dimensional shallow-water equations. The +numerical fluxes of a conservative target scheme are fitted to the coarse-mesh +averages of a monotone fine-grid discretization using a neural network to +parametrize the subgrid scale components. To ensure positivity preservation and +the validity of local maximum principles, we use a flux limiter that constrains +the intermediate states of an equivalent fluctuation form to stay in a convex +admissible set. The results of our numerical studies confirm that the proposed +combination of machine learning with monolithic convex limiting produces +meaningful closures even in scenarios for which the network was not trained. + +
+
+
+
+
+ + ☆ Spectrum-Informed Multistage Neural Networks: Multiscale Function + Approximators of Machine Precision ICML 2024 + + +
+ Deep learning frameworks have become powerful tools for approaching +scientific problems such as turbulent flow, which has wide-ranging +applications. In practice, however, existing scientific machine learning +approaches have difficulty fitting complex, multi-scale dynamical systems to +very high precision, as required in scientific contexts. We propose using the +novel multistage neural network approach with a spectrum-informed +initialization to learn the residue from the previous stage, utilizing the +spectral biases associated with neural networks to capture high frequency +features in the residue, and successfully tackle the spectral bias of neural +networks. This approach allows the neural network to fit target functions to +double floating-point machine precision $O(10^{-16})$. + +
+
+ comment: 8 pages, 3 figures, ICML 2024 workshop (AI for Science: Scaling in AI + for Scientific Discovery) +
+
+
+
+
+ + ☆ Nonverbal Immediacy Analysis in Education: A Multimodal Computational + Model + + +
+ This paper introduces a novel computational approach for analyzing nonverbal
+social behavior in educational settings. Integrating multimodal behavioral
+cues, including facial expressions, gesture intensity, and spatial dynamics,
+the model assesses the nonverbal immediacy (NVI) of teachers from RGB classroom
+videos. A dataset of 400 30-second video segments from German classrooms was
+constructed for model training and validation. The gesture intensity regressor
+achieved a correlation of 0.84, the perceived distance regressor 0.55, and the
+NVI model 0.44 with median human ratings. The model demonstrates the potential
+to provide valuable support in nonverbal behavior assessment, approximating
+the accuracy of individual human raters. Validated against both questionnaire
+data and trained observer ratings, our models show moderate to strong
+correlations with relevant educational outcomes, indicating their efficacy in
+reflecting effective teaching behaviors. This research advances the objective
+assessment of nonverbal communication behaviors, opening new pathways for
+educational research.
+
+
+ comment: 12 pages, 3 figures. Camera-ready version for the SAB 2024: 17th + International Conference on the Simulation of Adaptive Behavior +
+
+
+
+
+ + ☆ Take a Step and Reconsider: Sequence Decoding for Self-Improved Neural + Combinatorial Optimization ECAI-2024 + + +
+ The constructive approach within Neural Combinatorial Optimization (NCO) +treats a combinatorial optimization problem as a finite Markov decision +process, where solutions are built incrementally through a sequence of +decisions guided by a neural policy network. To train the policy, recent +research is shifting toward a 'self-improved' learning methodology that +addresses the limitations of reinforcement learning and supervised approaches. +Here, the policy is iteratively trained in a supervised manner, with solutions +derived from the current policy serving as pseudo-labels. The way these +solutions are obtained from the policy determines the quality of the +pseudo-labels. In this paper, we present a simple and problem-independent +sequence decoding method for self-improved learning based on sampling sequences +without replacement. We incrementally follow the best solution found and repeat +the sampling process from intermediate partial solutions. By modifying the +policy to ignore previously sampled sequences, we force it to consider only +unseen alternatives, thereby increasing solution diversity. Experimental +results for the Traveling Salesman and Capacitated Vehicle Routing Problem +demonstrate its strong performance. Furthermore, our method outperforms +previous NCO approaches on the Job Shop Scheduling Problem. + +
+
+ comment: Accepted at ECAI-2024 +
+
+
+
+
+ + ☆ Generalization Bounds of Surrogate Policies for Combinatorial + Optimization Problems + + +
+ A recent stream of structured learning approaches has improved the practical +state of the art for a range of combinatorial optimization problems with +complex objectives encountered in operations research. Such approaches train +policies that chain a statistical model with a surrogate combinatorial +optimization oracle to map any instance of the problem to a feasible solution. +The key idea is to exploit the statistical distribution over instances instead +of dealing with instances separately. However learning such policies by risk +minimization is challenging because the empirical risk is piecewise constant in +the parameters, and few theoretical guarantees have been provided so far. In +this article, we investigate methods that smooth the risk by perturbing the +policy, which eases optimization and improves generalization. Our main +contribution is a generalization bound that controls the perturbation bias, the +statistical learning error, and the optimization error. Our analysis relies on +the introduction of a uniform weak property, which captures and quantifies the +interplay of the statistical model and the surrogate combinatorial optimization +oracle. This property holds under mild assumptions on the statistical model, +the surrogate optimization, and the instance data distribution. We illustrate +the result on a range of applications such as stochastic vehicle scheduling. In +particular, such policies are relevant for contextual stochastic optimization +and our results cover this case. + +
+
+ comment: 10 pages main document, 3 pages supplement +
+
+
+
+
+ + ☆ Surrogate-guided optimization in quantum networks + + +
+ We propose an optimization algorithm to improve the design and performance of +quantum communication networks. When physical architectures become too complex +for analytical methods, numerical simulation becomes essential to study quantum +network behavior. Although highly informative, these simulations involve +complex numerical functions without known analytical forms, making traditional +optimization techniques that assume continuity, differentiability, or convexity +inapplicable. Additionally, quantum network simulations are computationally +demanding, rendering global approaches like Simulated Annealing or genetic +algorithms, + which require extensive function evaluations, impractical. We introduce a +more efficient optimization workflow using machine learning models, which serve +as surrogates for a given objective function. We demonstrate the effectiveness +of our approach by applying it to three well-known optimization problems in +quantum networking: quantum memory allocation for multiple network nodes, +tuning an experimental parameter in all physical links of a quantum +entanglement switch, and finding efficient protocol settings within a large +asymmetric quantum network. The solutions found by our algorithm consistently +outperform those obtained with our baseline approaches -- Simulated Annealing +and Bayesian optimization -- in the allotted time limit by up to 18\% and 20\%, +respectively. Our framework thus allows for more comprehensive quantum network +studies, integrating surrogate-assisted optimization with existing quantum +network simulators. + +
+
+ comment: 20 pages (including supplementary notes), 12 figures +
+
+
+
+
+ + ☆ Solving the Electrical Impedance Tomography Problem with a DeepONet Type + Neural Network: Theory and Application + + +
+ In this work, we consider the non-invasive medical imaging modality of
+Electrical Impedance Tomography, where the problem is to recover the
+conductivity in a medium from a set of data that arises out of a
+current-to-voltage map (Neumann-to-Dirichlet operator) defined on the boundary
+of the medium. We formulate this inverse problem as an operator-learning
+problem where the goal is to learn the implicitly defined operator-to-function
+map from the space of Neumann-to-Dirichlet operators to the space of
+admissible conductivities. Subsequently, we use an operator-learning
+architecture, popularly called DeepONets, to learn this operator-to-function
+map. Thus far, most of the operator learning architectures have been
+implemented to learn operators between function spaces. In this work, we
+generalize the earlier works and use a DeepONet to actually learn an
+operator-to-function map. We provide a Universal Approximation Theorem type
+result which guarantees that this implicitly defined operator-to-function map
+from the space of Neumann-to-Dirichlet operators to the space of conductivity
+functions can be approximated to an arbitrary degree using such a DeepONet.
+Furthermore, we provide a computational implementation of our proposed approach
+and compare it against a standard baseline. We show that the proposed approach
+achieves good reconstructions and outperforms the baseline method in our
+experiments.
+
+
+
+
+
+ + ☆ NarrationDep: Narratives on Social Media For Automatic Depression + Detection + + +
+ Social media posts provide valuable insight into the narrative of users and
+their intentions, including providing an opportunity to automatically model
+whether a social media user is depressed or not. The challenge lies in
+faithfully modelling user narratives from their online social media posts,
+which could potentially be useful in several different applications. We have
+developed a novel and effective model called \texttt{NarrationDep}, which
+focuses on detecting narratives associated with depression. By analyzing a
+user's tweets, \texttt{NarrationDep} accurately identifies crucial narratives.
+\texttt{NarrationDep} is a deep learning framework that jointly models
+individual user tweet representations and clusters of users' tweets. As a
+result, \texttt{NarrationDep} is characterized by a novel two-layer deep
+learning model: the first layer models individual tweet representations from
+social media text posts, and the second layer learns semantic representations
+of tweets associated with a cluster. To faithfully model these cluster
+representations, the second layer incorporates a novel component that
+hierarchically learns from users' posts. The results demonstrate that our
+framework outperforms other comparative models, including recently developed
+models, on a variety of datasets.
+
+
+
+
+
+ + ☆ Robust Deep Hawkes Process under Label Noise of Both Event and + Occurrence ECAI2024 + + +
+ Integrating deep neural networks with the Hawkes process has significantly +improved predictive capabilities in finance, health informatics, and +information technology. Nevertheless, these models often face challenges in +real-world settings, particularly due to substantial label noise. This issue is +of significant concern in the medical field, where label noise can arise from +delayed updates in electronic medical records or misdiagnoses, leading to +increased prediction risks. Our research indicates that deep Hawkes process +models exhibit reduced robustness when dealing with label noise, particularly +when it affects both event types and timing. To address these challenges, we +first investigate the influence of label noise in approximated intensity +functions and present a novel framework, the Robust Deep Hawkes Process (RDHP), +to overcome the impact of label noise on the intensity function of Hawkes +models, considering both the events and their occurrences. We tested RDHP using +multiple open-source benchmarks with synthetic noise and conducted a case study +on obstructive sleep apnea-hypopnea syndrome (OSAHS) in a real-world setting +with inherent label noise. The results demonstrate that RDHP can effectively +perform classification and regression tasks, even in the presence of noise +related to events and their timing. To the best of our knowledge, this is the +first study to successfully address both event and time label noise in deep +Hawkes process models, offering a promising solution for medical applications, +specifically in diagnosing OSAHS. + +
+
+ comment: ECAI2024 +
+
+
+
+
+ + ☆ Explainable Artificial Intelligence Techniques for Irregular Temporal + Classification of Multidrug Resistance Acquisition in Intensive Care Unit + Patients + + +
+ Antimicrobial Resistance represents a significant challenge in the Intensive +Care Unit (ICU), where patients are at heightened risk of Multidrug-Resistant +(MDR) infections-pathogens resistant to multiple antimicrobial agents. This +study introduces a novel methodology that integrates Gated Recurrent Units +(GRUs) with advanced intrinsic and post-hoc interpretability techniques for +detecting the onset of MDR in patients across time. Within interpretability +methods, we propose Explainable Artificial Intelligence (XAI) approaches to +handle irregular Multivariate Time Series (MTS), introducing Irregular Time +Shapley Additive Explanations (IT-SHAP), a modification of Shapley Additive +Explanations designed for irregular MTS with Recurrent Neural Networks focused +on temporal outputs. Our methodology aims to identify specific risk factors +associated with MDR in ICU patients. GRU with Hadamard's attention demonstrated +high initial specificity and increasing sensitivity over time, correlating with +increased nosocomial infection risks during prolonged ICU stays. XAI analysis, +enhanced by Hadamard attention and IT-SHAP, identified critical factors such as +previous non-resistant cultures, specific antibiotic usage patterns, and +hospital environment dynamics. These insights suggest that early detection of +at-risk patients can inform interventions such as preventive isolation and +customized treatments, significantly improving clinical outcomes. The proposed +GRU model for temporal classification achieved an average Receiver Operating +Characteristic Area Under the Curve of 78.27 +- 1.26 over time, indicating +strong predictive performance. In summary, this study highlights the clinical +utility of our methodology, which combines predictive accuracy with +interpretability, thereby facilitating more effective healthcare interventions +by professionals. + +
+
+
+
+
+ + ☆ dlordinal: a Python package for deep ordinal classification + + +
+ dlordinal is a new Python library that unifies many recent deep ordinal +classification methodologies available in the literature. Developed using +PyTorch as underlying framework, it implements the top performing +state-of-the-art deep learning techniques for ordinal classification problems. +Ordinal approaches are designed to leverage the ordering information present in +the target variable. Specifically, it includes loss functions, various output +layers, dropout techniques, soft labelling methodologies, and other +classification strategies, all of which are appropriately designed to +incorporate the ordinal information. Furthermore, as the performance metrics to +assess novel proposals in ordinal classification depend on the distance between +target and predicted classes in the ordinal scale, suitable ordinal evaluation +metrics are also included. dlordinal is distributed under the BSD-3-Clause +license and is available at https://github.com/ayrna/dlordinal. + +
+
+
+
+
+ + ☆ Quantum Supervised Learning + + +
+ Recent advancements in quantum computing have positioned it as a prospective +solution for tackling intricate computational challenges, with supervised +learning emerging as a promising domain for its application. Despite this +potential, the field of quantum machine learning is still in its early stages, +and there persists a level of skepticism regarding a possible near-term quantum +advantage. This paper aims to provide a classical perspective on current +quantum algorithms for supervised learning, effectively bridging traditional +machine learning principles with advancements in quantum machine learning. +Specifically, this study charts a research trajectory that diverges from the +predominant focus of quantum machine learning literature, originating from the +prerequisites of classical methodologies and elucidating the potential impact +of quantum approaches. Through this exploration, our objective is to deepen the +understanding of the convergence between classical and quantum methods, thereby +laying the groundwork for future advancements in both domains and fostering the +involvement of classical practitioners in the field of quantum machine +learning. + +
+
+ comment: 16 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ Path Following and Stabilisation of a Bicycle Model using a + Reinforcement Learning Approach + + +
+ Over the years, complex control approaches have been developed to control the +motion of a bicycle. Reinforcement Learning (RL), a branch of machine learning, +promises easy deployment of so-called agents. Deployed agents are increasingly +considered as an alternative to controllers for mechanical systems. The present +work introduces an RL approach to do path following with a virtual bicycle +model while simultaneously stabilising it laterally. The bicycle, modelled as +the Whipple benchmark model and using multibody system dynamics, has no +stabilisation aids. The agent succeeds in both path following and stabilisation +of the bicycle model exclusively by outputting steering angles, which are +converted into steering torques via a PD controller. Curriculum learning is +applied as a state-of-the-art training strategy. Different settings for the +implemented RL framework are investigated and compared to each other. The +performance of the deployed agents is evaluated using different types of paths +and measurements. The ability of the deployed agents to do path following and +stabilisation of the bicycle model travelling between 2m/s and 7m/s along +complex paths including full circles, slalom manoeuvres, and lane changes is +demonstrated. Explanatory methods for machine learning are used to analyse the +functionality of a deployed agent and link the introduced RL approach with +research in the field of bicycle dynamics. + +
+
+
+
+
+ + ☆ Behavioral Testing: Can Large Language Models Implicitly Resolve + Ambiguous Entities? + + +
+ One of the major aspects contributing to the striking performance of large
+language models (LLMs) is the vast amount of factual knowledge accumulated
+during pre-training. Yet, many LLMs suffer from self-inconsistency, which
+raises doubts about their trustworthiness and reliability. In this paper, we
+focus on entity type ambiguity and analyze current state-of-the-art LLMs for
+their proficiency and consistency in applying their factual knowledge when
+prompted for entities under ambiguity. To do so, we propose an evaluation
+protocol that disentangles knowing from applying knowledge, and test
+state-of-the-art LLMs on 49 entities. Our experiments reveal that LLMs perform
+poorly with ambiguous prompts, achieving only 80% accuracy. Our results further
+demonstrate systematic discrepancies in LLM behavior and their failure to
+consistently apply information, indicating that the models can exhibit
+knowledge without being able to utilize it, significant biases for preferred
+readings, as well as self-inconsistencies. Our study highlights the importance
+of handling entity ambiguity in the future for more trustworthy LLMs.
+
+
+
+
+
+ + ☆ Parameter-Efficient Fine-Tuning for Continual Learning: A Neural Tangent + Kernel Perspective + + +
+ Parameter-efficient fine-tuning for continual learning (PEFT-CL) has shown +promise in adapting pre-trained models to sequential tasks while mitigating +catastrophic forgetting problem. However, understanding the mechanisms that +dictate continual performance in this paradigm remains elusive. To tackle this +complexity, we undertake a rigorous analysis of PEFT-CL dynamics to derive +relevant metrics for continual scenarios using Neural Tangent Kernel (NTK) +theory. With the aid of NTK as a mathematical analysis tool, we recast the +challenge of test-time forgetting into the quantifiable generalization gaps +during training, identifying three key factors that influence these gaps and +the performance of PEFT-CL: training sample size, task-level feature +orthogonality, and regularization. To address these challenges, we introduce +NTK-CL, a novel framework that eliminates task-specific parameter storage while +adaptively generating task-relevant features. Aligning with theoretical +guidance, NTK-CL triples the feature representation of each sample, +theoretically and empirically reducing the magnitude of both task-interplay and +task-specific generalization gaps. Grounded in NTK analysis, our approach +imposes an adaptive exponential moving average mechanism and constraints on +task-level feature orthogonality, maintaining intra-task NTK forms while +attenuating inter-task NTK forms. Ultimately, by fine-tuning optimizable +parameters with appropriate regularization, NTK-CL achieves state-of-the-art +performance on established PEFT-CL benchmarks. This work provides a theoretical +foundation for understanding and improving PEFT-CL models, offering insights +into the interplay between feature representation, task orthogonality, and +generalization, contributing to the development of more efficient continual +learning systems. + +
+
+
+
+
+ + ☆ EverAdapt: Continuous Adaptation for Dynamic Machine Fault Diagnosis + Environments + + +
+ Unsupervised Domain Adaptation (UDA) has emerged as a key solution in
+data-driven fault diagnosis, addressing domain shift where models underperform
+in changing environments. However, in continually changing
+environments, UDA tends to underperform on previously seen domains when
+adapting to new ones - a problem known as catastrophic forgetting. To address
+this limitation, we introduce the EverAdapt framework, specifically designed
+for continuous model adaptation in dynamic environments. Central to EverAdapt
+is a novel Continual Batch Normalization (CBN), which leverages source domain
+statistics as a reference point to standardize feature representations across
+domains. EverAdapt not only retains statistical information from previous
+domains but also adapts effectively to new scenarios. Complementing CBN, we
+design a class-conditional domain alignment module for effective integration of
+target domains, and a Sample-efficient Replay strategy to reinforce memory
+retention. Experiments on real-world datasets demonstrate EverAdapt's superiority
+in maintaining robust fault diagnosis in dynamic environments. Our code is
+available at: https://github.com/mohamedr002/EverAdapt
+
+
+
+
+
+ + ☆ Neural Dueling Bandits ICML 2024 + + +
+ Contextual dueling bandit is used to model the bandit problems, where a +learner's goal is to find the best arm for a given context using observed noisy +preference feedback over the selected arms for the past contexts. However, +existing algorithms assume the reward function is linear, which can be complex +and non-linear in many real-life applications like online recommendations or +ranking web search results. To overcome this challenge, we use a neural network +to estimate the reward function using preference feedback for the previously +selected arms. We propose upper confidence bound- and Thompson sampling-based +algorithms with sub-linear regret guarantees that efficiently select arms in +each round. We then extend our theoretical results to contextual bandit +problems with binary feedback, which is in itself a non-trivial contribution. +Experimental results on the problem instances derived from synthetic datasets +corroborate our theoretical results. + +
+
+ comment: Accepted at ICML 2024 Workshop on Foundations of Reinforcement + Learning and Control +
+
+
+
+
+ + ☆ Towards Robust Knowledge Tracing Models via k-Sparse Attention SIGIR'2023 + + +
+ Knowledge tracing (KT) is the problem of predicting students' future
+performance based on their historical interaction sequences. With its advanced
+capability of capturing contextual long-term dependency, the attention mechanism
+has become one of the essential components in many deep learning based KT (DLKT)
+models. In spite of the impressive performance achieved by these attentional
+DLKT models, many of them often run the risk of overfitting,
+especially on small-scale educational datasets. Therefore, in this paper, we
+propose \textsc{sparseKT}, a simple yet effective framework to improve the
+robustness and generalization of attention-based DLKT approaches.
+Specifically, we incorporate a k-selection module to only pick items with the
+highest attention scores. We propose two sparsification heuristics: (1)
+soft-thresholding sparse attention and (2) top-$K$ sparse attention. We show
+that our \textsc{sparseKT} is able to help attentional KT models get rid of
+irrelevant student interactions and have comparable predictive performance when
+compared to 11 state-of-the-art KT models on three publicly available
+real-world educational datasets. To encourage reproducible research, we make
+our data and code publicly available at
+\url{https://github.com/pykt-team/pykt-toolkit}\footnote{We merged our model to
+the \textsc{pyKT} benchmark at \url{https://pykt.org/}.}.
+
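A generic sketch of the top-$K$ sparse attention heuristic in the spirit of the description above (not the sparseKT implementation): only the K largest scores per query keep probability mass after the softmax.

```python
import torch

def topk_sparse_attention(scores, values, k):
    # scores: (batch, q_len, k_len), values: (batch, k_len, dim)
    topk = torch.topk(scores, k, dim=-1)
    masked = torch.full_like(scores, float("-inf"))
    masked.scatter_(-1, topk.indices, topk.values)
    attn = torch.softmax(masked, dim=-1)   # non-top-K entries receive ~0 weight
    return attn @ values
```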
+
+ comment: Accepted at SIGIR'2023 (revised version with additional results) +
+
+
+
+
+ + ☆ Assessing Non-Nested Configurations of Multifidelity Machine Learning + for Quantum-Chemical Properties + + +
+ Multifidelity machine learning (MFML) for quantum chemical (QC) properties
+has seen strong development in recent years. The method has been shown to
+reduce the cost of generating training data for high-accuracy low-cost ML
+models. In such a set-up, the ML models are trained on molecular geometries and
+some property of interest computed at various computational chemistry
+accuracies, or fidelities. These are then combined in training the MFML models.
+In some multifidelity models, the training data is required to be nested, that
+is, the same molecular geometries are included to calculate the property across
+all the fidelities. In these multifidelity models, the requirement of a nested
+configuration restricts the kind of sampling that can be performed while
+selecting training samples at different fidelities.
+ This work assesses the use of non-nested training data for two of these
+multifidelity methods, namely MFML and optimized MFML (o-MFML). The assessment
+is carried out for the prediction of ground state energies and first vertical
+excitation energies of a diverse collection of molecules of the CheMFi dataset.
+Results indicate that the MFML method still requires a nested structure of
+training data across the fidelities. However, the o-MFML method shows promising
+results for non-nested multifidelity training data with model errors comparable
+to the nested configurations.
+
+
+
+
+
+ + ☆ OVR: A Dataset for Open Vocabulary Temporal Repetition Counting in + Videos + + +
+ We introduce a dataset of annotations of temporal repetitions in videos. The +dataset, OVR (pronounced as over), contains annotations for over 72K videos, +with each annotation specifying the number of repetitions, the start and end +time of the repetitions, and also a free-form description of what is repeating. +The annotations are provided for videos sourced from Kinetics and Ego4D, and +consequently cover both Exo and Ego viewing conditions, with a huge variety of +actions and activities. Moreover, OVR is almost an order of magnitude larger +than previous datasets for video repetition. We also propose a baseline +transformer-based counting model, OVRCounter, that can localise and count +repetitions in videos that are up to 320 frames long. The model is trained and +evaluated on the OVR dataset, and its performance assessed with and without +using text to specify the target class to count. The performance is also +compared to a prior repetition counting model. The dataset is available for +download at: https://sites.google.com/view/openvocabreps/ + +
+
+
+
+
+ + ☆ Contrastive Learning Is Not Optimal for Quasiperiodic Time Series IJCAI 2024 + + +
+ Despite recent advancements in Self-Supervised Learning (SSL) for time series +analysis, a noticeable gap persists between the anticipated achievements and +actual performance. While these methods have demonstrated formidable +generalization capabilities with minimal labels in various domains, their +effectiveness in distinguishing between different classes based on a limited +number of annotated records is notably lacking. Our hypothesis attributes this +bottleneck to the prevalent use of Contrastive Learning, a shared training +objective in previous state-of-the-art (SOTA) methods. By mandating +distinctiveness between representations for negative pairs drawn from separate +records, this approach compels the model to encode unique record-based patterns +but simultaneously neglects changes occurring across the entire record. To +overcome this challenge, we introduce Distilled Embedding for Almost-Periodic +Time Series (DEAPS) in this paper, offering a non-contrastive method tailored +for quasiperiodic time series, such as electrocardiogram (ECG) data. By +avoiding the use of negative pairs, we not only mitigate the model's blindness +to temporal changes but also enable the integration of a "Gradual Loss (Lgra)" +function. This function guides the model to effectively capture dynamic +patterns evolving throughout the record. The outcomes are promising, as DEAPS +demonstrates a notable improvement of +10% over existing SOTA methods when just +a few annotated records are presented to fit a Machine Learning (ML) model +based on the learned representation. + +
+
+ comment: Accepted to IJCAI 2024 +
+
+
+
+
+ + ☆ An Efficient Procedure for Computing Bayesian Network Structure Learning + + +
+ We propose a globally optimal Bayesian network structure discovery algorithm +based on a progressively leveled scoring approach. Bayesian network structure +discovery is a fundamental yet NP-hard problem in the field of probabilistic +graphical models, and as the number of variables increases, memory usage grows +exponentially. The simple and effective method proposed by Silander and +Myllym\"aki has been widely applied in this field, as it incrementally +calculates local scores to achieve global optimality. However, existing methods +that utilize disk storage, while capable of handling networks with a larger +number of variables, introduce issues such as latency, fragmentation, and +additional overhead associated with disk I/O operations. To avoid these +problems, we explore how to further enhance computational efficiency and reduce +peak memory usage using only memory. We introduce an efficient hierarchical +computation method that requires only a single traversal of all local +structures, retaining only the data and information necessary for the current +computation, thereby improving efficiency and significantly reducing memory +requirements. Experimental results indicate that our method, when using only +memory, not only reduces peak memory usage but also improves computational +efficiency compared to existing methods, demonstrating good scalability for +handling larger networks and exhibiting stable experimental results. +Ultimately, we successfully achieved the processing of a Bayesian network with +28 variables using only memory. + +
+
+
+
+
+ + ☆ Curriculum Negative Mining For Temporal Networks + + +
+ Temporal networks are effective in capturing the evolving interactions of +networks over time, such as social networks and e-commerce networks. In recent +years, researchers have primarily concentrated on developing specific model +architectures for Temporal Graph Neural Networks (TGNNs) in order to improve +the representation quality of temporal nodes and edges. However, limited +attention has been given to the quality of negative samples during the training +of TGNNs. When compared with static networks, temporal networks present two +specific challenges for negative sampling: positive sparsity and positive +shift. Positive sparsity refers to the presence of a single positive sample +amidst numerous negative samples at each timestamp, while positive shift +relates to the variations in positive samples across different timestamps. To +robustly address these challenges in training TGNNs, we introduce Curriculum +Negative Mining (CurNM), a model-aware curriculum learning framework that +adaptively adjusts the difficulty of negative samples. Within this framework, +we first establish a dynamically updated negative pool that balances random, +historical, and hard negatives to address the challenges posed by positive +sparsity. Secondly, we implement a temporal-aware negative selection module +that focuses on learning from the disentangled factors of recently active +edges, thus accurately capturing shifting preferences. Extensive experiments on +12 datasets and 3 TGNNs demonstrate that our method outperforms baseline +methods by a significant margin. Additionally, thorough ablation studies and +parameter sensitivity experiments verify the usefulness and robustness of our +approach. Our code is available at https://github.com/zziyue83/CurNM. + +
+
+
+
+
+ + ☆ Time Series Missing Imputation with Multivariate Radial Basis Function + Neural Network + + +
+ Researchers have been persistently working to address the issue of missing
+values in time series data. Numerous models have been proposed, striving to
+estimate the distribution of the data. The Radial Basis Functions Neural
+Network (RBFNN) has recently exhibited exceptional performance in estimating
+data distribution. In this paper, we propose a time series imputation model
+based on RBFNN. Our imputation model learns local information from timestamps
+to create a continuous function. Additionally, we incorporate time gaps so that
+learning accounts for the intervals spanned by the missing values. We name this
+model the Missing Imputation Multivariate RBFNN
+(MIM-RBFNN). However, MIM-RBFNN relies on a local information-based learning
+approach, which presents difficulties in utilizing temporal information.
+Therefore, we propose an extension called the Missing Value Imputation
+Recurrent Neural Network with Continuous Function (MIRNN-CF) using the
+continuous function generated by MIM-RBFNN. We evaluate the performance using
+two real-world datasets with non-random missing and random missing patterns,
+and conduct an ablation study comparing MIM-RBFNN and MIRNN-CF.
+
+
+
+
+
+ + ☆ Sparse Inducing Points in Deep Gaussian Processes: Enhancing Modeling + with Denoising Diffusion Variational Inference + + +
+ Deep Gaussian processes (DGPs) provide a robust paradigm for Bayesian deep +learning. In DGPs, a set of sparse integration locations called inducing points +are selected to approximate the posterior distribution of the model. This is +done to reduce computational complexity and improve model efficiency. However, +inferring the posterior distribution of inducing points is not straightforward. +Traditional variational inference approaches to posterior approximation often +lead to significant bias. To address this issue, we propose an alternative +method called Denoising Diffusion Variational Inference (DDVI) that uses a +denoising diffusion stochastic differential equation (SDE) to generate +posterior samples of inducing variables. We rely on score matching methods for +denoising diffusion model to approximate score functions with a neural network. +Furthermore, by combining classical mathematical theory of SDEs with the +minimization of KL divergence between the approximate and true processes, we +propose a novel explicit variational lower bound for the marginal likelihood +function of DGP. Through experiments on various datasets and comparisons with +baseline methods, we empirically demonstrate the effectiveness of DDVI for +posterior inference of inducing points for DGP models. + +
+
+
+
+
+ + ☆ Gymnasium: A Standard Interface for Reinforcement Learning Environments + + +
+ Gymnasium is an open-source library providing an API for reinforcement +learning environments. Its main contribution is a central abstraction for wide +interoperability between benchmark environments and training algorithms. +Gymnasium comes with various built-in environments and utilities to simplify +researchers' work along with being supported by most training libraries. This +paper outlines the main design decisions for Gymnasium, its key features, and +the differences to alternative APIs. + +
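A typical interaction loop with the core Gymnasium API (reset/step with the terminated/truncated distinction); the environment id and episode budget here are just examples.

```python
import gymnasium as gym

env = gym.make("CartPole-v1")
obs, info = env.reset(seed=42)
for _ in range(1000):
    action = env.action_space.sample()   # replace with an agent's policy
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.close()
```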
+
+ comment: 6 pages, 1 figure, preprint +
+
+
+
+
+ + ☆ Accurate and Efficient Fine-Tuning of Quantized Large Language Models + Through Optimal Balance + + +
+ Large Language Models (LLMs) have demonstrated impressive performance across +various domains. However, the enormous number of model parameters makes +fine-tuning challenging, significantly limiting their application and +deployment. Existing solutions combine parameter quantization with Low-Rank +Adaptation (LoRA), greatly reducing memory usage but resulting in noticeable +performance degradation. In this paper, we identify an imbalance in fine-tuning +quantized pre-trained models: overly complex adapter inputs and outputs versus +low effective trainability of the adaptation. We propose Quantized LLMs with +Balanced-rank Adaptation (Q-BaRA), which simplifies the adapter inputs and +outputs while increasing the adapter's rank to achieve a more suitable balance +for fine-tuning quantized LLMs. Additionally, for scenarios where fine-tuned +LLMs need to be deployed as low-precision inference models, we introduce +Quantization-Aware Fine-tuning with Higher Rank Adaptation (QA-HiRA), which +simplifies the adapter inputs and outputs to align with the pre-trained model's +block-wise quantization while employing a single matrix to achieve a higher +rank. Both Q-BaRA and QA-HiRA are easily implemented and offer the following +optimizations: (i) Q-BaRA consistently achieves the highest accuracy compared +to baselines and other variants, requiring the same number of trainable +parameters and computational effort; (ii) QA-HiRA naturally merges adapter +parameters into the block-wise quantized model after fine-tuning, achieving the +highest accuracy compared to other methods. We apply our Q-BaRA and QA-HiRA to +the LLaMA and LLaMA2 model families and validate their effectiveness across +different fine-tuning datasets and downstream scenarios. + Code will be made available at +\href{https://github.com/xiaocaigou/qbaraqahira}{https://github.com/xiaocaigou/qbaraqahira} + +
+
+
+
+
+ + ☆ SepsisLab: Early Sepsis Prediction with Uncertainty Quantification and + Active Sensing KDD 2024 + + +
+ Sepsis is the leading cause of in-hospital mortality in the USA. Early sepsis +onset prediction and diagnosis could significantly improve the survival of +sepsis patients. Existing predictive models are usually trained on high-quality +data with few missing information, while missing values widely exist in +real-world clinical scenarios (especially in the first hours of admissions to +the hospital), which causes a significant decrease in accuracy and an increase +in uncertainty for the predictive models. The common method to handle missing +values is imputation, which replaces the unavailable variables with estimates +from the observed data. The uncertainty of imputation results can be propagated +to the sepsis prediction outputs, which have not been studied in existing works +on either sepsis prediction or uncertainty quantification. In this study, we +first define such propagated uncertainty as the variance of prediction output +and then introduce uncertainty propagation methods to quantify the propagated +uncertainty. Moreover, for the potential high-risk patients with low confidence +due to limited observations, we propose a robust active sensing algorithm to +increase confidence by actively recommending clinicians to observe the most +informative variables. We validate the proposed models in both publicly +available data (i.e., MIMIC-III and AmsterdamUMCdb) and proprietary data in The +Ohio State University Wexner Medical Center (OSUWMC). The experimental results +show that the propagated uncertainty is dominant at the beginning of admissions +to hospitals and the proposed algorithm outperforms state-of-the-art active +sensing methods. Finally, we implement a SepsisLab system for early sepsis +prediction and active sensing based on our pre-trained models. Clinicians and +potential sepsis patients can benefit from the system in early prediction and +diagnosis of sepsis. + +
+
+ comment: To be published in KDD 2024 +
+
+
+
+
+ + ☆ A Voter-Based Stochastic Rejection-Method Framework for Asymptotically + Safe Language Model Outputs + + +
+ This paper proposes a new method for preventing unsafe or otherwise low +quality large language model (LLM) outputs, by leveraging the stochasticity of +LLMs. We propose a system whereby LLM checkers vote on the acceptability of a +generated output, regenerating it if a threshold of disapproval is reached, +until sufficient checkers approve. We further propose estimators for cost and +failure rate, and based on those estimators and experimental data tailored to +the application, we propose an algorithm that achieves a desired failure rate +at the least possible cost. We demonstrate that, under these models, failure +rate decreases exponentially as a function of cost when voter count and +threshold are chosen according to the algorithm, and that the models reasonably +estimate the actual performance of such a system in action, even with limited +data. + +
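For illustration, here is a minimal sketch of the generate/vote/regenerate loop described above; the generator and checker functions are placeholders for LLM calls, and the voter count, rejection threshold, and round budget are assumptions that the paper's cost and failure-rate estimators would choose.

```python
import random

def generate(prompt):                 # placeholder for a generator LLM call
    return f"response to: {prompt}"

def checker_disapproves(output):      # placeholder for a stochastic checker LLM
    return random.random() < 0.3      # assumed 30% disapproval rate

def safe_generate(prompt, n_voters=5, reject_threshold=2, max_rounds=20):
    for _ in range(max_rounds):
        out = generate(prompt)
        disapprovals = sum(checker_disapproves(out) for _ in range(n_voters))
        if disapprovals < reject_threshold:
            return out                # enough checkers approve: accept the output
    raise RuntimeError("no acceptable output within the round budget")
```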
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ Sparse Tensor PCA via Tensor Decomposition for Unsupervised Feature + Selection + + +
+ Recently, introducing Tensor Decomposition (TD) methods into unsupervised +feature selection (UFS) has been a rising research point. A tensor structure is +beneficial for mining the relations between different modes and helps relieve +the computation burden. However, while existing methods exploit TD to minimize +the reconstruction error of a data tensor, they don't fully utilize the +interpretable and discriminative information in the factor matrices. Moreover, +most methods require domain knowledge to perform feature selection. To solve +the above problems, we develop two Sparse Tensor Principal Component Analysis +(STPCA) models that utilize the projection directions in the factor matrices to +perform UFS. The first model extends Tucker Decomposition to a multiview sparse +regression form and is transformed into several alternatively solved convex +subproblems. The second model formulates a sparse version of the family of +Tensor Singular Value Decomposition (T-SVDs) and is transformed into individual +convex subproblems. For both models, we prove the optimal solution of each +subproblem falls onto the Hermitian Positive Semidefinite Cone (HPSD). +Accordingly, we design two fast algorithms based on HPSD projection and prove +their convergence. According to the experimental results on two original +synthetic datasets (Orbit and Array Signal) and five real-world datasets, the +two proposed methods are suitable for handling different data tensor scenarios +and outperform the state-of-the-art UFS methods. + +
+
+
+
+
+ + ☆ scGHSOM: Hierarchical clustering and visualization of single-cell and + CRISPR data using growing hierarchical SOM KDD + + +
+ High-dimensional single-cell data poses significant challenges in identifying underlying biological patterns due to the complexity and heterogeneity of cellular states. We propose a comprehensive gene-cell dependency visualization via unsupervised clustering, Growing Hierarchical Self-Organizing Map (GHSOM), specifically designed for analyzing high-dimensional single-cell data such as single-cell sequencing and CRISPR screens. GHSOM clusters samples in a hierarchical structure whose self-growing organization satisfies the required between-cluster and within-cluster variations. We propose a novel Significant Attributes Identification Algorithm to identify features that distinguish clusters. This algorithm pinpoints attributes with minimal variation within a cluster but substantial variation between clusters. These key attributes can then be used for targeted data retrieval and downstream analysis. Furthermore, we present two innovative visualization tools: the Cluster Feature Map and the Cluster Distribution Map. The Cluster Feature Map highlights the distribution of specific features across the hierarchical structure of GHSOM clusters, allowing rapid visual assessment of cluster uniqueness based on chosen features. The Cluster Distribution Map depicts leaf clusters as circles on the GHSOM grid, with circle size reflecting cluster data size and color customizable to visualize features such as cell type or other attributes. We apply our analysis to three single-cell datasets and one CRISPR dataset (cell-gene database) and evaluate clustering methods with internal (CH) and external (ARI) scores. GHSOM performs well, being the best performer in the internal evaluation (CH=4.2) and the third best among all methods in the external evaluation.
+
+ comment: Abstract presentation at BIOKDD@ACM KDD 2024 +
+
+
+
+
+ + ☆ On the Parameter Identifiability of Partially Observed Linear Causal + Models + + +
+ Linear causal models are important tools for modeling causal dependencies and +yet in practice, only a subset of the variables can be observed. In this paper, +we examine the parameter identifiability of these models by investigating +whether the edge coefficients can be recovered given the causal structure and +partially observed data. Our setting is more general than that of prior +research - we allow all variables, including both observed and latent ones, to +be flexibly related, and we consider the coefficients of all edges, whereas +most existing works focus only on the edges between observed variables. +Theoretically, we identify three types of indeterminacy for the parameters in +partially observed linear causal models. We then provide graphical conditions +that are sufficient for all parameters to be identifiable and show that some of +them are provably necessary. Methodologically, we propose a novel +likelihood-based parameter estimation method that addresses the variance +indeterminacy of latent variables in a specific way and can asymptotically +recover the underlying parameters up to trivial indeterminacy. Empirical +studies on both synthetic and real-world datasets validate our identifiability +theory and the effectiveness of the proposed method in the finite-sample +regime. + +
+
+
+
+
+ + ☆ Towards Aligning Language Models with Textual Feedback + + +
+ We present ALT (ALignment with Textual feedback), an approach that aligns +language models with user preferences expressed in text. We argue that text +offers greater expressiveness, enabling users to provide richer feedback than +simple comparative preferences and this richer feedback can lead to more +efficient and effective alignment. ALT aligns the model by conditioning its +generation on the textual feedback. Our method relies solely on language +modeling techniques and requires minimal hyper-parameter tuning, though it +still presents the main benefits of RL-based alignment algorithms and can +effectively learn from textual feedback. We explore the efficacy and efficiency +of textual feedback across different tasks such as toxicity reduction, +summarization, and dialog response generation. We find that ALT outperforms PPO +for the task of toxicity reduction while being able to match its performance on +summarization with only 20% of the samples. We also explore how ALT can be used +with feedback provided by an existing LLM where we explore an LLM providing +constrained and unconstrained textual feedback. We also outline future +directions to align models with natural language feedback. + +
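As a rough illustration of conditioning generation on textual feedback, the snippet below prepends feedback text to the prompt when building training examples, so the model can later be steered by placing the desired feedback in the same slot at inference time. The template and field names are assumptions for illustration, not ALT's actual data format.

```python
def make_conditioned_example(prompt, response, feedback):
    # the feedback text is placed in front of the prompt so the model learns
    # a mapping "feedback + prompt -> response consistent with that feedback"
    conditioned_prompt = f"feedback: {feedback}\nprompt: {prompt}\n"
    return {"input": conditioned_prompt, "target": response}

# at inference time, the desired feedback (e.g. "non-toxic, concise") is placed
# in the same slot to steer generation toward that attribute.
example = make_conditioned_example("Summarize the article.", "...", "concise and neutral")
```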
+
+
+
+
+ + ☆ Stochastic Variance-Reduced Iterative Hard Thresholding in Graph + Sparsity Optimization + + +
+ Stochastic optimization algorithms are widely used for large-scale data analysis due to their low per-iteration costs, but they often suffer from slow asymptotic convergence caused by inherent variance. Variance-reduced techniques have therefore been used to address this issue in structured sparse models utilizing sparsity-inducing norms or $\ell_0$-norms. However, these techniques are not directly applicable to complex (non-convex) graph sparsity models, which are essential in applications like disease outbreak monitoring and social network analysis. In this paper, we introduce two stochastic variance-reduced gradient-based methods to solve graph sparsity optimization: GraphSVRG-IHT and GraphSCSG-IHT. We provide a general framework for theoretical analysis, demonstrating that our methods enjoy a linear convergence speed. Extensive experiments validate the effectiveness of our methods.
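A hedged sketch of the variance-reduced iterative hard thresholding idea: an SVRG-style gradient estimate is followed by a hard-thresholding projection. Plain top-k thresholding stands in for the graph-structured sparsity projection used by GraphSVRG-IHT, and the step size and loop sizes are illustrative.

```python
import numpy as np

def hard_threshold(w, k):
    # keep only the k largest-magnitude entries (stand-in for a graph projection)
    out = np.zeros_like(w)
    idx = np.argsort(np.abs(w))[-k:]
    out[idx] = w[idx]
    return out

def svrg_iht(X, y, k, lr=0.01, epochs=20, inner=50, rng=np.random.default_rng(0)):
    n, d = X.shape
    w = np.zeros(d)
    for _ in range(epochs):
        full_grad = X.T @ (X @ w - y) / n          # snapshot full gradient
        w_snap = w.copy()
        for _ in range(inner):
            i = rng.integers(n)
            gi = X[i] * (X[i] @ w - y[i])          # stochastic gradient at w
            gi_snap = X[i] * (X[i] @ w_snap - y[i])  # and at the snapshot point
            w = hard_threshold(w - lr * (gi - gi_snap + full_grad), k)
    return w
```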
+
+
+
+
+ + ☆ When AI Defeats Password Deception! A Deep Learning Framework to + Distinguish Passwords and Honeywords + + +
+ "Honeywords" have emerged as a promising defense mechanism for detecting data +breaches and foiling offline dictionary attacks (ODA) by deceiving attackers +with false passwords. In this paper, we propose PassFilter, a novel deep +learning (DL) based attack framework, fundamental in its ability to identify +passwords from a set of sweetwords associated with a user account, effectively +challenging a variety of honeywords generation techniques (HGTs). The DL model +in PassFilter is trained with a set of previously collected or adversarially +generated passwords and honeywords, and carefully orchestrated to predict +whether a sweetword is the password or a honeyword. Our model can compromise +the security of state-of-the-art, heuristics-based, and representation +learning-based HGTs proposed by Dionysiou et al. Specifically, our analysis +with nine publicly available password datasets shows that PassFilter +significantly outperforms the baseline random guessing success rate of 5%, +achieving 6.10% to 52.78% on the 1st guessing attempt, considering 20 +sweetwords per account. This success rate rapidly increases with additional +login attempts before account lock-outs, often allowed on many real-world +online services to maintain reasonable usability. For example, it ranges from +41.78% to 96.80% for five attempts, and from 72.87% to 99.00% for ten attempts, +compared to 25% and 50% random guessing, respectively. We also examined +PassFilter against general-purpose language models used for honeyword +generation, like those proposed by Yu et al. These honeywords also proved +vulnerable to our attack, with success rates of 14.19% for 1st guessing +attempt, increasing to 30.23%, 41.70%, and 63.10% after 3rd, 5th, and 10th +guessing attempts, respectively. Our findings demonstrate the effectiveness of +DL model deployed in PassFilter in breaching state-of-the-art HGTs and +compromising password security based on ODA. + +
+
+
+
+
+ + ☆ Dynamic Graph Transformer with Correlated Spatial-Temporal Positional + Encoding + + +
+ Learning effective representations for Continuous-Time Dynamic Graphs (CTDGs) has garnered significant research interest, largely due to its powerful capabilities in modeling complex interactions between nodes. A fundamental and crucial requirement for representation learning in CTDGs is the appropriate estimation and preservation of proximity. However, due to the sparse and evolving characteristics of CTDGs, the spatial-temporal properties inherent in high-order proximity remain largely unexplored. Despite its importance, this property presents significant challenges due to the computationally intensive nature of personalized interaction intensity estimation and the dynamic attributes of CTDGs. To this end, we propose a novel Correlated Spatial-Temporal Positional encoding that incorporates a parameter-free personalized interaction intensity estimation under the weak assumption of the Poisson Point Process. Building on this, we introduce the Dynamic Graph Transformer with Correlated Spatial-Temporal Positional Encoding (CorDGT), which efficiently retains the evolving spatial-temporal high-order proximity for effective node representation learning in CTDGs. Extensive experiments on seven small and two large-scale datasets demonstrate the superior performance and scalability of the proposed CorDGT.
+
+
+
+
+ + ☆ Cheems: Wonderful Matrices More Efficient and More Effective + Architecture + + +
+ Recent studies have shown that relative position encoding performs well in selective state space model scanning algorithms, that an architecture balancing SSM and Attention enhances the efficiency and effectiveness of the algorithm, and that the sparse activation of a mixture of experts reduces the training cost. I studied the effectiveness of using different position encodings in structured state space dual algorithms and the more effective SSD-Attn internal and external function mixing method, and designed a more efficient cross-domain mixture of experts. I found that the same matrices work wonderfully across different algorithms, which allows us to establish a new hybrid sparse architecture: Cheems. Compared with other hybrid architectures, it is more efficient and more effective in language modeling tasks.
+
+
+
+
+ + ☆ Towards Transfer Unlearning: Empirical Evidence of Cross-Domain Bias + Mitigation + + +
+ Large language models (LLMs) often inherit biases from vast amounts of +training corpora. Traditional debiasing methods, while effective to some +extent, do not completely eliminate memorized biases and toxicity in LLMs. In +this paper, we study an unlearning-based approach to debiasing in LLMs by +performing gradient ascent on hate speech against minority groups, i.e., +minimizing the likelihood of biased or toxic content. Specifically, we propose +a mask language modeling unlearning technique, which unlearns the harmful part +of the text. This method enables LLMs to selectively forget and disassociate +from biased and harmful content. Experimental results demonstrate the +effectiveness of our approach in diminishing bias while maintaining the +language modeling abilities. Surprisingly, the results also unveil an +unexpected potential for cross-domain transfer unlearning: debiasing in one +bias form (e.g. gender) may contribute to mitigating others (e.g. race and +religion). + +
+
+
+
+
+ + ☆ An Adaptive Gradient Regularization Method + + +
+ The optimizer plays an important role in training neural networks with high efficiency and performance. Updating weights based on their gradients is the central part of the optimizer. It has been shown that normalization and standardization operations on weights and gradients can accelerate training and improve performance, as in Weight Standardization (WS), weight normalization (WN), gradient normalization (GN), and gradient centralization (GC). In this work, we introduce a new optimization technique based on the gradient magnitudes within a gradient vector, named adaptive gradient regularization (AGR): it normalizes the gradient vector across all dimensions into a coefficient vector and subtracts the product of the gradient and its coefficient vector from the vanilla gradient. It can be viewed as an adaptive gradient clipping method. We show that AGR improves the Lipschitzness of the loss function, yielding a more stable training process and better generalization performance. AGR can be embedded into vanilla optimizers such as Adan and AdamW with only three lines of code. Our experiments on image generation, image classification, and language representation show that AGR improves training results.
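A minimal sketch of the AGR step as described above, assuming the coefficient vector is the gradient's normalized magnitudes; the epsilon constant and the per-parameter application shown in the usage comment are illustrative assumptions.

```python
import torch

def agr(grad, eps=1e-12):
    # coefficient vector: magnitude of each dimension, normalized over all dimensions
    coeff = grad.abs() / (grad.abs().sum() + eps)
    # subtract the element-wise product of gradient and coefficients from the gradient
    return grad - coeff * grad   # equivalently (1 - coeff) * grad

# usage inside a training loop, just before optimizer.step():
# for p in model.parameters():
#     if p.grad is not None:
#         p.grad = agr(p.grad)
```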
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ☆ GV-Rep: A Large-Scale Dataset for Genetic Variant Representation + Learning + + +
+ Genetic variants (GVs) are defined as differences in the DNA sequences among individuals and play a crucial role in diagnosing and treating genetic diseases. The rapid decrease in next-generation sequencing cost has led to an exponential increase in patient-level GV data. This growth poses a challenge for clinicians who must efficiently prioritize patient-specific GVs and integrate them with existing genomic databases to inform patient management. To address the interpretation of GVs, genomic foundation models (GFMs) have emerged. However, these models lack standardized performance assessments, leading to considerable variability in model evaluations. This poses the question: how effectively do deep learning methods classify unknown GVs and align them with clinically verified GVs? We argue that representation learning, which transforms raw data into meaningful feature spaces, is an effective approach for addressing both indexing and classification challenges. We introduce a large-scale Genetic Variant dataset, named GV-Rep, featuring variable-length contexts and detailed annotations, designed for deep learning models to learn GV representations across various traits, diseases, tissue types, and experimental contexts. Our contributions are three-fold: (i) construction of a comprehensive dataset with 7 million records, each labeled with characteristics of the corresponding variants, alongside additional data from 17,548 gene knockout tests across 1,107 cell types, 1,808 variant combinations, and 156 unique clinically verified GVs from real-world patients; (ii) analysis of the structure and properties of the dataset; and (iii) experimentation with pre-trained GFMs on the dataset. The results show a significant gap between GFMs' current capabilities and accurate GV representation. We hope this dataset will help advance genomic deep learning to bridge this gap.
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Synthetic Trajectory Generation Through Convolutional Neural Networks + + +
+ Location trajectories provide valuable insights for applications from urban +planning to pandemic control. However, mobility data can also reveal sensitive +information about individuals, such as political opinions, religious beliefs, +or sexual orientations. Existing privacy-preserving approaches for publishing +this data face a significant utility-privacy trade-off. Releasing synthetic +trajectory data generated through deep learning offers a promising solution. +Due to the trajectories' sequential nature, most existing models are based on +recurrent neural networks (RNNs). However, research in generative adversarial +networks (GANs) largely employs convolutional neural networks (CNNs) for image +generation. This discrepancy raises the question of whether advances in +computer vision can be applied to trajectory generation. In this work, we +introduce a Reversible Trajectory-to-CNN Transformation (RTCT) that adapts +trajectories into a format suitable for CNN-based models. We integrated this +transformation with the well-known DCGAN in a proof-of-concept (PoC) and +evaluated its performance against an RNN-based trajectory GAN using four +metrics across two datasets. The PoC was superior in capturing spatial +distributions compared to the RNN model but had difficulty replicating +sequential and temporal properties. Although the PoC's utility is not +sufficient for practical applications, the results demonstrate the +transformation's potential to facilitate the use of CNNs for trajectory +generation, opening up avenues for future research. To support continued +research, all source code has been made available under an open-source license. + +
+
+ comment: To appear in the proceedings of the 21st Annual International + Conference on Privacy, Security & Trust (PST 2024) +
+
+
+
+
+ + ☆ Provable Benefit of Annealed Langevin Monte Carlo for Non-log-concave + Sampling + + +
+ We address the outstanding problem of sampling from an unnormalized density +that may be non-log-concave and multimodal. To enhance the performance of +simple Markov chain Monte Carlo (MCMC) methods, techniques of annealing type +have been widely used. However, quantitative theoretical guarantees of these +techniques are under-explored. This study takes a first step toward providing a +non-asymptotic analysis of annealed MCMC. Specifically, we establish, for the +first time, an oracle complexity of $\widetilde{O}\left(\frac{d\beta^2{\cal +A}^2}{\varepsilon^6}\right)$ for simple annealed Langevin Monte Carlo algorithm +to achieve $\varepsilon^2$ accuracy in Kullback-Leibler divergence to the +target distribution $\pi\propto{\rm e}^{-V}$ on $\mathbb{R}^d$ with +$\beta$-smooth potential $V$. Here, ${\cal A}$ represents the action of a curve +of probability measures interpolating the target distribution $\pi$ and a +readily sampleable distribution. + +
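For intuition, a minimal annealed Langevin Monte Carlo sketch: Langevin steps are run along a sequence of tempered potentials $\beta_t V$ that interpolate from an easy distribution toward the target $\pi\propto{\rm e}^{-V}$. The schedule, step size, and Gaussian initialization are illustrative and not the paper's analyzed algorithm.

```python
import numpy as np

def annealed_lmc(grad_V, d, betas=np.linspace(0.01, 1.0, 50),
                 steps_per_level=100, step=1e-2, rng=np.random.default_rng(0)):
    x = rng.standard_normal(d)               # start from a simple Gaussian
    for beta in betas:                        # anneal toward the target pi ~ exp(-V)
        for _ in range(steps_per_level):
            noise = rng.standard_normal(d)
            # unadjusted Langevin step on the tempered potential beta * V
            x = x - step * beta * grad_V(x) + np.sqrt(2 * step) * noise
    return x

# example target: a double-well potential V(x) = x^4/4 - x^2/2, so grad_V(x) = x^3 - x
sample = annealed_lmc(lambda x: x**3 - x, d=2)
```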
+
+
+
+
+ + ☆ Federated Automatic Latent Variable Selection in Multi-output Gaussian + Processes + + +
+ This paper explores a federated learning approach that automatically selects +the number of latent processes in multi-output Gaussian processes (MGPs). The +MGP has seen great success as a transfer learning tool when data is generated +from multiple sources/units/entities. A common approach in MGPs to transfer +knowledge across units involves gathering all data from each unit to a central +server and extracting common independent latent processes to express each unit +as a linear combination of the shared latent patterns. However, this approach +poses key challenges in (i) determining the adequate number of latent processes +and (ii) relying on centralized learning which leads to potential privacy risks +and significant computational burdens on the central server. To address these +issues, we propose a hierarchical model that places spike-and-slab priors on +the coefficients of each latent process. These priors help automatically select +only needed latent processes by shrinking the coefficients of unnecessary ones +to zero. To estimate the model while avoiding the drawbacks of centralized +learning, we propose a variational inference-based approach, that formulates +model inference as an optimization problem compatible with federated settings. +We then design a federated learning algorithm that allows units to jointly +select and infer the common latent processes without sharing their data. We +also discuss an efficient learning approach for a new unit within our proposed +federated framework. Simulation and case studies on Li-ion battery degradation +and air temperature data demonstrate the advantageous features of our proposed +approach. + +
+
+
+
+
+ + ☆ Deep Koopman-based Control of Quality Variation in Multistage + Manufacturing Systems + + +
+ This paper presents a modeling-control synthesis to address the quality +control challenges in multistage manufacturing systems (MMSs). A new +feedforward control scheme is developed to minimize the quality variations +caused by process disturbances in MMSs. Notably, the control framework +leverages a stochastic deep Koopman (SDK) model to capture the quality +propagation mechanism in the MMSs, highlighted by its ability to transform the +nonlinear propagation dynamics into a linear one. Two roll-to-roll case studies +are presented to validate the proposed method and demonstrate its +effectiveness. The overall method is suitable for nonlinear MMSs and does not +require extensive expert knowledge. + +
+
+ comment: The paper appeared in the proceedings of the 2024 American Control Conference. This submitted version addresses a minor correction to one equation (Eq. 14), while the results and conclusions remain the same
+
+
+
+
+ + ☆ DeepCell: A Ubiquitous Accurate Provider-side Cellular-based + Localization + + +
+ Although outdoor localization is already available to the general public and businesses through the widespread use of GPS, it is not supported by low-end phones, requires a direct line of sight to satellites, and can drain the phone battery quickly. Current fingerprinting solutions can provide high-accuracy localization but are based on the client side. This limits their ubiquitous deployment and accuracy. In this paper, we introduce DeepCell: a provider-side fingerprinting localization system that can provide high-accuracy localization for any cell phone. To build its fingerprint, DeepCell leverages the unlabeled cellular measurements recorded by the cellular provider while opportunistically synchronizing with selected client devices to get location labels. The fingerprint is then used to train a deep neural network model that is harnessed for localization. To achieve this goal, DeepCell needs to address a number of challenges including using unlabeled data from the provider side, handling noise and sparsity, scaling the data to large areas, and providing enough data for training deep models without overhead. Evaluation of DeepCell in a typical realistic environment shows that it can achieve a consistent median accuracy of 29m. This accuracy outperforms state-of-the-art client-based cellular systems by more than 75.4%. In addition, the same accuracy extends to low-end phones.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2106.13632 +
+
+
+
+
+ + ☆ Handling Device Heterogeneity for Deep Learning-based Localization + + +
+ Deep learning-based fingerprinting is one of the current promising technologies for outdoor localization in cellular networks. However, deploying such localization systems for heterogeneous phones affects their accuracy, as the cellular received signal strength (RSS) readings vary across different types of phones. In this paper, we introduce a number of techniques for addressing the phone heterogeneity problem in deep learning-based localization systems. The basic idea is either to approximate a function that maps the cellular RSS measurements between different devices or to transfer the knowledge across them. Evaluation of the proposed techniques using different Android phones on four independent testbeds shows that our techniques can improve the localization accuracy by more than 220% for the four testbeds as compared to the state-of-the-art systems. This highlights the promise of the proposed device heterogeneity handling techniques for enabling a wide deployment of deep learning-based localization systems over different devices.
+
+
+
+
+ + ☆ TelescopeML -- I. An End-to-End Python Package for Interpreting + Telescope Datasets through Training Machine Learning Models, Generating + Statistical Reports, and Visualizing Results + + +
+ We are on the verge of a revolutionary era in space exploration, thanks to +advancements in telescopes such as the James Webb Space Telescope +(\textit{JWST}). High-resolution, high signal-to-noise spectra from exoplanet +and brown dwarf atmospheres have been collected over the past few decades, +requiring the development of accurate and reliable pipelines and tools for +their analysis. Accurately and swiftly determining the spectroscopic parameters +from the observational spectra of these objects is crucial for understanding +their atmospheric composition and guiding future follow-up observations. +\texttt{TelescopeML} is a Python package developed to perform three main tasks: +1. Process the synthetic astronomical datasets for training a CNN model and +prepare the observational dataset for later use for prediction; 2. Train a CNN +model by implementing the optimal hyperparameters; and 3. Deploy the trained +CNN models on the actual observational data to derive the output spectroscopic +parameters. + +
+
+ comment: Please find the accepted paper with complete reference list at + https://joss.theoj.org/papers/10.21105/joss.06346 +
+
+
+
+
+ + ☆ Cross-Domain Policy Transfer by Representation Alignment via + Multi-Domain Behavioral Cloning + + +
+ Transferring learned skills across diverse situations remains a fundamental +challenge for autonomous agents, particularly when agents are not allowed to +interact with an exact target setup. While prior approaches have predominantly +focused on learning domain translation, they often struggle with handling +significant domain gaps or out-of-distribution tasks. In this paper, we present +a simple approach for cross-domain policy transfer that learns a shared latent +representation across domains and a common abstract policy on top of it. Our +approach leverages multi-domain behavioral cloning on unaligned trajectories of +proxy tasks and employs maximum mean discrepancy (MMD) as a regularization term +to encourage cross-domain alignment. The MMD regularization better preserves +structures of latent state distributions than commonly used +domain-discriminative distribution matching, leading to higher transfer +performance. Moreover, our approach involves training only one multi-domain +policy, which makes extension easier than existing methods. Empirical +evaluations demonstrate the efficacy of our method across various domain +shifts, especially in scenarios where exact domain translation is challenging, +such as cross-morphology or cross-viewpoint settings. Our ablation studies +further reveal that multi-domain behavioral cloning implicitly contributes to +representation alignment alongside domain-adversarial regularization. + +
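As an illustration of the MMD regularization mentioned above, the sketch below computes a simple biased squared-MMD estimate with an RBF kernel between latent batches from two domains; the kernel bandwidth, the weighting, and the way it is added to the behavioral-cloning loss are assumptions, not the paper's exact setup.

```python
import torch

def mmd2(x, y, sigma=1.0):
    """Biased squared-MMD estimate with an RBF kernel between two batches."""
    def k(a, b):
        d2 = torch.cdist(a, b).pow(2)
        return torch.exp(-d2 / (2 * sigma ** 2))
    return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()

# illustrative training objective combining per-domain behavioral cloning with
# an MMD alignment term on the latent encodings z_a and z_b of the two domains:
# total_loss = bc_loss_domain_a + bc_loss_domain_b + lam * mmd2(z_a, z_b)
```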
+
+ comment: CoLLAs 2024 (Oral). Code: + https://github.com/hwatahiki/portable-latent-policy +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Model Editing ACL 2024 + + +
+ ROME and MEMIT are largely believed to be two different model editing +algorithms, with the major difference between them being the ability to perform +batched edits. In this paper, we unify these two algorithms under a single +conceptual umbrella, optimizing for the same goal, which we call the +preservation-memorization objective. ROME uses an equality constraint to +optimize this objective to perform one edit at a time, whereas MEMIT employs a +more flexible least-square constraint that allows for batched edits. We +generalize ROME and enable batched editing with equality constraint in the form +of EMMET - an Equality-constrained Mass Model Editing algorithm for +Transformers, a new batched memory-editing algorithm. EMMET can perform +batched-edits up to a batch-size of 10,000, with very similar performance to +MEMIT across multiple dimensions. With the introduction of EMMET, we truly +unify ROME and MEMIT and show that both algorithms are equivalent in terms of +their optimization objective, their abilities (singular and batched editing), +their model editing performance and their limitations. + +
+
+ comment: Under review. To appear as poster at KnowledgeableLM Workshop + co-located with ACL 2024 +
+
+
+
+
+ + ♻ ☆ Investigating Resource-efficient Neutron/Gamma Classification ML Models + Targeting eFPGAs + + +
+ There has been considerable interest and resulting progress in implementing +machine learning (ML) models in hardware over the last several years from the +particle and nuclear physics communities. A big driver has been the release of +the Python package, hls4ml, which has enabled porting models specified and +trained using Python ML libraries to register transfer level (RTL) code. So +far, the primary end targets have been commercial FPGAs or synthesized custom +blocks on ASICs. However, recent developments in open-source embedded FPGA +(eFPGA) frameworks now provide an alternate, more flexible pathway for +implementing ML models in hardware. These customized eFPGA fabrics can be +integrated as part of an overall chip design. In general, the decision between +a fully custom, eFPGA, or commercial FPGA ML implementation will depend on the +details of the end-use application. In this work, we explored the parameter +space for eFPGA implementations of fully-connected neural network (fcNN) and +boosted decision tree (BDT) models using the task of neutron/gamma +classification with a specific focus on resource efficiency. We used data +collected using an AmBe sealed source incident on Stilbene, which was optically +coupled to an OnSemi J-series SiPM to generate training and test data for this +study. We investigated relevant input features and the effects of +bit-resolution and sampling rate as well as trade-offs in hyperparameters for +both ML architectures while tracking total resource usage. The performance +metric used to track model performance was the calculated neutron efficiency at +a gamma leakage of 10$^{-3}$. The results of the study will be used to aid the +specification of an eFPGA fabric, which will be integrated as part of a test +chip. + +
+
+
+
+
+ + ♻ ☆ Solving Deep Reinforcement Learning Tasks with Evolution Strategies and + Linear Policy Networks + + +
+ Although deep reinforcement learning methods can learn effective policies for +challenging problems such as Atari games and robotics tasks, algorithms are +complex, and training times are often long. This study investigates how +Evolution Strategies perform compared to gradient-based deep reinforcement +learning methods. We use Evolution Strategies to optimize the weights of a +neural network via neuroevolution, performing direct policy search. We +benchmark both deep policy networks and networks consisting of a single linear +layer from observations to actions for three gradient-based methods, such as +Proximal Policy Optimization. These methods are evaluated against three +classical Evolution Strategies and Augmented Random Search, which all use +linear policy networks. Our results reveal that Evolution Strategies can find +effective linear policies for many reinforcement learning benchmark tasks, +unlike deep reinforcement learning methods that can only find successful +policies using much larger networks, suggesting that current benchmarks are +easier to solve than previously assumed. Interestingly, Evolution Strategies +also achieve results comparable to gradient-based deep reinforcement learning +algorithms for higher-complexity tasks. Furthermore, we find that by directly +accessing the memory state of the game, Evolution Strategies can find +successful policies in Atari that outperform the policies found by Deep +Q-Learning. Evolution Strategies also outperform Augmented Random Search in +most benchmarks, demonstrating superior sample efficiency and robustness in +training linear policy networks. + +
+
+
+
+
+ + ♻ ☆ Dissecting Language Models: Machine Unlearning via Selective Pruning + + +
+ Understanding and shaping the behaviour of Large Language Models (LLMs) is +increasingly important as applications become more powerful and more frequently +adopted. This paper introduces a machine unlearning method specifically +designed for LLMs. We introduce a selective pruning method for LLMs that +removes neurons based on their relative importance on a targeted capability +compared to overall network performance. This approach is a compute- and +data-efficient method for identifying and removing neurons that enable specific +behaviours. Our findings reveal that both feed-forward and attention neurons in +LLMs are specialized; that is, for specific tasks, certain neurons are more +crucial than others. Code from all experiments is available at +https://github.com/nickypro/selective-pruning + +
+
+
+
+
+ + ♻ ☆ Proof-of-Collaborative-Learning: A Multi-winner Federated Learning + Consensus Algorithm + + +
+ Regardless of their variations, blockchains require a consensus mechanism to +validate transactions, supervise added blocks, maintain network security, +synchronize the network state, and distribute incentives. Proof-of-Work (PoW), +one of the most influential implementations of consensus mechanisms, consumes +an extraordinary amount of energy for a task that lacks direct productive +output. In this paper, we propose Proof-of-Collaborative-Learning (PoCL), a +multi-winner federated learning validated consensus mechanism that redirects +the computation power of blockchains to train federated learning models. In +addition, we present a novel evaluation mechanism to ensure the efficiency of +the locally trained models of miners. We evaluated the security of our +evaluation mechanism by introducing and conducting probable attacks. Moreover, +we present a novel reward distribution mechanism to incentivize winning miners +fairly, and demonstrate that our reward system is fair both within and across +all rounds. + +
+
+ comment: 8 pages. Accepted at the 7th IEEE International Conference on + Blockchain (Blockchain 2024) +
+
+
+
+
+ + ♻ ☆ The Elements of Differentiable Programming + + +
+ Artificial intelligence has recently experienced remarkable advances, fueled +by large models, vast datasets, accelerated hardware, and, last but not least, +the transformative power of differentiable programming. This new programming +paradigm enables end-to-end differentiation of complex computer programs +(including those with control flows and data structures), making gradient-based +optimization of program parameters possible. As an emerging paradigm, +differentiable programming builds upon several areas of computer science and +applied mathematics, including automatic differentiation, graphical models, +optimization and statistics. This book presents a comprehensive review of the +fundamental concepts useful for differentiable programming. We adopt two main +perspectives, that of optimization and that of probability, with clear +analogies between the two. Differentiable programming is not merely the +differentiation of programs, but also the thoughtful design of programs +intended for differentiation. By making programs differentiable, we inherently +introduce probability distributions over their execution, providing a means to +quantify the uncertainty associated with program outputs. + +
+
+ comment: Draft version 2 +
+
+
+
+
+ + ♻ ☆ Efficient Unbiased Sparsification + + +
+ An unbiased $m$-sparsification of a vector $p\in \mathbb{R}^n$ is a random vector $Q\in \mathbb{R}^n$ with mean $p$ that has at most $m$ nonzero coordinates.
+
+
+
+
+ + ♻ ☆ Consent in Crisis: The Rapid Decline of the AI Data Commons + + +
+ General-purpose artificial intelligence (AI) systems are built on massive +swathes of public web data, assembled into corpora such as C4, RefinedWeb, and +Dolma. To our knowledge, we conduct the first, large-scale, longitudinal audit +of the consent protocols for the web domains underlying AI training corpora. +Our audit of 14,000 web domains provides an expansive view of crawlable web +data and how codified data use preferences are changing over time. We observe a +proliferation of AI-specific clauses to limit use, acute differences in +restrictions on AI developers, as well as general inconsistencies between +websites' expressed intentions in their Terms of Service and their robots.txt. +We diagnose these as symptoms of ineffective web protocols, not designed to +cope with the widespread re-purposing of the internet for AI. Our longitudinal +analyses show that in a single year (2023-2024) there has been a rapid +crescendo of data restrictions from web sources, rendering ~5%+ of all tokens +in C4, or 28%+ of the most actively maintained, critical sources in C4, fully +restricted from use. For Terms of Service crawling restrictions, a full 45% of +C4 is now restricted. If respected or enforced, these restrictions are rapidly +biasing the diversity, freshness, and scaling laws for general-purpose AI +systems. We hope to illustrate the emerging crises in data consent, for both +developers and creators. The foreclosure of much of the open web will impact +not only commercial AI, but also non-commercial AI and academic research. + +
+
+ comment: 41 pages (13 main), 5 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Self-driving lab discovers principles for steering spontaneous emission + + +
+ We developed an autonomous experimentation platform to accelerate +interpretable scientific discovery in ultrafast nanophotonics, targeting a +novel method to steer spontaneous emission from reconfigurable semiconductor +metasurfaces. Controlling spontaneous emission is crucial for clean-energy +solutions in illumination, thermal radiation engineering, and remote sensing. +Despite the potential of reconfigurable semiconductor metasurfaces with +embedded sources for spatiotemporal control, achieving arbitrary far-field +control remains challenging. Here, we present a self-driving lab (SDL) platform +that addresses this challenge by discovering the governing equations for +predicting the far-field emission profile from light-emitting metasurfaces. We +discover that both the spatial gradient (grating-like) and the curvature +(lens-like) of the local refractive index are key factors in steering +spontaneous emission. The SDL employs a machine-learning framework comprising: +(1) a variational autoencoder for generating complex spatial refractive index +profiles, (2) an active learning agent for guiding experiments with real-time +closed-loop feedback, and (3) a neural network-based equation learner to +uncover structure-property relationships. The SDL demonstrated a four-fold +enhancement in peak emission directivity (up to 77%) over a 72{\deg} field of +view within ~300 experiments. Our findings reveal that combinations of positive +gratings and lenses are as effective as negative lenses and gratings for all +emission angles, offering a novel strategy for controlling spontaneous emission +beyond conventional Fourier optics. + +
+
+ comment: 25 pages, 4 figures in main text, 5 figures in supplementary + information +
+
+
+
+
+ + ♻ ☆ MELTing point: Mobile Evaluation of Language Transformers + + +
+ Transformers have revolutionized the machine learning landscape, gradually making their way into everyday tasks and equipping our computers with "sparks of intelligence". However, their runtime requirements have prevented them from being broadly deployed on mobile. As personal devices become increasingly powerful and prompt privacy becomes an ever more pressing issue, we explore the current state of mobile execution of Large Language Models (LLMs). To achieve this, we have created our own automation infrastructure, MELT, which supports the headless execution and benchmarking of LLMs on device across different models, devices, and frameworks, including Android, iOS, and Nvidia Jetson devices. We evaluate popular instruction fine-tuned LLMs and leverage different frameworks to measure their end-to-end and granular performance, tracing their memory and energy requirements along the way. Our analysis is the first systematic study of on-device LLM execution, quantifying performance, energy efficiency, and accuracy across various state-of-the-art models, and showcases the state of on-device intelligence in the era of hyperscale models. Results highlight the performance heterogeneity across targets and corroborate that LLM inference is largely memory-bound. Quantization drastically reduces memory requirements and renders execution viable, but at a non-negligible accuracy cost. Drawing from its energy footprint and thermal behavior, the continuous execution of LLMs remains elusive, as both factors negatively affect user experience. Lastly, our experience shows that the ecosystem is still in its infancy, and algorithmic as well as hardware breakthroughs can significantly shift the execution cost. We expect NPU acceleration and framework-hardware co-design to be the biggest bets toward efficient standalone execution, with the alternative of offloading tailored toward edge deployments.
+
+ comment: Accepted at the 30th Annual International Conference On Mobile + Computing And Networking (MobiCom 2024) +
+
+
+
+
+ + ♻ ☆ Detecting Throat Cancer from Speech Signals using Machine Learning: A + Scoping Literature Review + + +
+ Introduction: Cases of throat cancer are rising worldwide. With survival decreasing significantly at later stages, early detection is vital. Artificial intelligence (AI) and machine learning (ML) have the potential to detect throat cancer from patient speech, facilitating earlier diagnosis and reducing the burden on overstretched healthcare systems. However, no comprehensive review has explored the use of AI and ML for detecting throat cancer from speech. This review aims to fill this gap by evaluating how these technologies perform and identifying issues that need to be addressed in future research. Materials and Methods: We conducted a scoping literature review across three databases: Scopus, Web of Science, and PubMed. We included articles that classified speech using machine learning and specified the inclusion of throat cancer patients in their data. Articles were categorized based on whether they performed binary or multi-class classification. Results: We found 27 articles fitting our inclusion criteria: 12 performing binary classification, 13 performing multi-class classification, and two performing both. The most common classification method used was neural networks, and the most frequently extracted feature was mel-spectrograms. We also documented pre-processing methods and classifier performance. We compared each article against the TRIPOD-AI checklist, which showed a significant lack of open science, with only one article sharing code and only three using open-access data. Conclusion: Open-source code is essential for external validation and further development in this field. Our review indicates that no single method or specific feature consistently outperforms others in detecting throat cancer from speech. Future research should focus on standardizing methodologies and improving the reproducibility of results.
+
+ comment: 15 pages, 10 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Causal Discovery over High-Dimensional Structured Hypothesis Spaces with + Causal Graph Partitioning + + +
+ The aim in many sciences is to understand the mechanisms that underlie the +observed distribution of variables, starting from a set of initial hypotheses. +Causal discovery allows us to infer mechanisms as sets of cause and effect +relationships in a generalized way -- without necessarily tailoring to a +specific domain. Causal discovery algorithms search over a structured +hypothesis space, defined by the set of directed acyclic graphs, to find the +graph that best explains the data. For high-dimensional problems, however, this +search becomes intractable and scalable algorithms for causal discovery are +needed to bridge the gap. In this paper, we define a novel causal graph +partition that allows for divide-and-conquer causal discovery with theoretical +guarantees. We leverage the idea of a superstructure -- a set of learned or +existing candidate hypotheses -- to partition the search space. We prove under +certain assumptions that learning with a causal graph partition always yields +the Markov Equivalence Class of the true causal graph. We show our algorithm +achieves comparable accuracy and a faster time to solution for +biologically-tuned synthetic networks and networks up to ${10^4}$ variables. +This makes our method applicable to gene regulatory network inference and other +domains with high-dimensional structured hypothesis spaces. + +
+
+
+
+
+ + ♻ ☆ Variation Spaces for Multi-Output Neural Networks: Insights on + Multi-Task Learning and Network Compression + + +
+ This paper introduces a novel theoretical framework for the analysis of +vector-valued neural networks through the development of vector-valued +variation spaces, a new class of reproducing kernel Banach spaces. These spaces +emerge from studying the regularization effect of weight decay in training +networks with activations like the rectified linear unit (ReLU). This framework +offers a deeper understanding of multi-output networks and their function-space +characteristics. A key contribution of this work is the development of a +representer theorem for the vector-valued variation spaces. This representer +theorem establishes that shallow vector-valued neural networks are the +solutions to data-fitting problems over these infinite-dimensional spaces, +where the network widths are bounded by the square of the number of training +data. This observation reveals that the norm associated with these +vector-valued variation spaces encourages the learning of features that are +useful for multiple tasks, shedding new light on multi-task learning with +neural networks. Finally, this paper develops a connection between weight-decay +regularization and the multi-task lasso problem. This connection leads to novel +bounds for layer widths in deep networks that depend on the intrinsic +dimensions of the training data representations. This insight not only deepens +the understanding of the deep network architectural requirements, but also +yields a simple convex optimization method for deep neural network compression. +The performance of this compression procedure is evaluated on various +architectures. + +
+
+ comment: Updated to version published in JMLR +
+
+
+
+
+ + ♻ ☆ An Experimental Study on the Rashomon Effect of Balancing Methods in + Imbalanced Classification + + +
+ Predictive models may generate biased predictions when classifying imbalanced +datasets. This happens when the model favors the majority class, leading to low +performance in accurately predicting the minority class. To address this issue, +balancing or resampling methods are critical data-centric AI approaches in the +modeling process to improve prediction performance. However, there have been +debates and questions about the functionality of these methods in recent years. +In particular, many candidate models may exhibit very similar predictive +performance, called the Rashomon effect, in model selection, and they may even +produce different predictions for the same observations. Selecting one of these +models without considering the predictive multiplicity -- which is the case of +yielding conflicting models' predictions for any sample -- can result in blind +selection. In this paper, the impact of balancing methods on predictive +multiplicity is examined using the Rashomon effect. It is crucial because the +blind model selection in data-centric AI is risky from a set of approximately +equally accurate models. This may lead to severe problems in model selection, +validation, and explanation. To tackle this matter, we conducted real dataset +experiments to observe the impact of balancing methods on predictive +multiplicity through the Rashomon effect by using a newly proposed metric +obscurity in addition to the existing ones: ambiguity and discrepancy. Our +findings showed that balancing methods inflate the predictive multiplicity and +yield varying results. To monitor the trade-off between the prediction +performance and predictive multiplicity for conducting the modeling process +responsibly, we proposed using the extended version of the performance-gain +plot when balancing the training data. + +
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Euler Characteristic Tools For Topological Data Analysis + + +
+ In this article, we study Euler characteristic techniques in topological data analysis. Pointwise computing the Euler characteristic of a family of simplicial complexes built from data gives rise to the so-called Euler characteristic profile. We show that this simple descriptor achieves state-of-the-art performance in supervised tasks at a very low computational cost. Inspired by signal analysis, we compute hybrid transforms of Euler characteristic profiles. These integral transforms mix Euler characteristic techniques with Lebesgue integration to provide highly efficient compressors of topological signals. As a consequence, they show remarkable performance in unsupervised settings. On the qualitative side, we provide numerous heuristics on the topological and geometric information captured by Euler profiles and their hybrid transforms. Finally, we prove stability results for these descriptors as well as asymptotic guarantees in random settings.
+
+ comment: 39 pages - Version accepted in JMLR +
+
+
+
+
+ + ♻ ☆ Description-Based Text Similarity + + +
+ Identifying texts with a given semantics is central to many information seeking scenarios. Similarity search over vector embeddings appears to be central to this ability, yet the similarity reflected in current text embeddings is corpus-driven, and is inconsistent and sub-optimal for many use cases. What, then, is a good notion of similarity for effective retrieval of text? We identify the need to search for texts based on abstract descriptions of their content, and the corresponding notion of \emph{description-based similarity}. We demonstrate the inadequacy of current text embeddings and propose an alternative model that significantly improves when used in standard nearest neighbor search. The model is trained using positive and negative pairs sourced through prompting an LLM, demonstrating how data from LLMs can be used for creating new capabilities not immediately possible using the original model.
+
+ comment: Accepted in COLM 2024 +
+
+
+
+
+ + ♻ ☆ QUACK: Quantum Aligned Centroid Kernel + + +
+ Quantum computing (QC) seems to show potential for application in machine learning (ML). In particular, quantum kernel methods (QKMs) exhibit promising properties for use in supervised ML tasks. However, a major disadvantage of kernel methods is their unfavorable quadratic scaling with the number of training samples. Together with the limits imposed by currently available quantum hardware (NISQ devices), with their low qubit coherence times, small number of qubits, and high error rates, the use of QC in ML at an industrially relevant scale is currently impossible. As a small step toward improving the potential applications of QKMs, we introduce QUACK, a quantum kernel algorithm whose time complexity scales linearly with the number of samples during training and is independent of the number of training samples in the inference stage. In the training process, only the kernel entries for the samples and the centers of the classes are calculated, i.e., the maximum shape of the kernel for n samples and c classes is (n, c). During training, the parameters of the quantum kernel and the positions of the centroids are optimized iteratively. In the inference stage, for every new sample the circuit is evaluated only for every centroid, i.e., c times. We show that the QUACK algorithm nevertheless provides satisfactory results and can perform at a similar level to classical kernel methods with quadratic scaling during training. In addition, our (simulated) algorithm is able to handle high-dimensional datasets such as MNIST with 784 features without any dimensionality reduction.
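To make the (n, c) kernel shape concrete, here is a classical stand-in that evaluates a kernel only between samples and per-class centroids, so the cost grows linearly in the number of samples; an RBF kernel replaces the quantum kernel purely for illustration, and the iterative optimization of kernel parameters and centroid positions is omitted.

```python
import numpy as np

def centroid_kernel(X, y, sigma=1.0):
    """Kernel entries between each sample and each class centroid only."""
    classes = np.unique(y)
    centroids = np.stack([X[y == c].mean(axis=0) for c in classes])
    d2 = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma ** 2))   # shape (n_samples, n_classes)

# inference only needs c kernel evaluations per new sample: the sample can be
# assigned to the class whose centroid yields the largest kernel value.
```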
+
+ comment: Accepted to IEEE International Conference on Quantum Computing and + Engineering (QCE) 2024 +
+
+
+
+
+ + ♻ ☆ Q-Sparse: All Large Language Models can be Fully Sparsely-Activated + + +
+ We introduce Q-Sparse, a simple yet effective approach to training sparsely-activated large language models (LLMs). Q-Sparse enables full sparsity of activations in LLMs, which can bring significant efficiency gains in inference. This is achieved by applying top-K sparsification to the activations and the straight-through estimator to the training. We also introduce Block Q-Sparse for batch training and inference. The key results from this work are: (1) Q-Sparse can achieve results comparable to those of baseline LLMs while being much more efficient at inference time; (2) we present an inference-optimal scaling law for sparsely-activated LLMs; (3) Q-Sparse is effective in different settings, including training-from-scratch, continued training of off-the-shelf LLMs, and fine-tuning; (4) Q-Sparse works for both full-precision and 1-bit LLMs (e.g., BitNet b1.58). In particular, the synergy of BitNet b1.58 and Q-Sparse (which can be equipped with MoE) provides the cornerstone and a clear path to revolutionizing the efficiency, including cost and energy consumption, of future LLMs.
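A small sketch of top-K activation sparsification with a straight-through estimator, in the spirit of the description above: the forward pass keeps only the K largest-magnitude activations per row, while the backward pass treats the operation as the identity. The value of K and the per-row application are illustrative, not Q-Sparse's exact configuration.

```python
import torch

class TopKSparsify(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, k):
        # keep only the k largest-magnitude activations along the last dimension
        mask = torch.zeros_like(x)
        idx = x.abs().topk(k, dim=-1).indices
        mask.scatter_(-1, idx, 1.0)
        return x * mask

    @staticmethod
    def backward(ctx, grad_out):
        # straight-through estimator: gradient passes through unchanged
        return grad_out, None

# example: sparsify hidden activations, keeping 128 entries per row
# sparse_hidden = TopKSparsify.apply(hidden_states, 128)
```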
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ High-Probability Convergence for Composite and Distributed Stochastic + Minimization and Variational Inequalities with Heavy-Tailed Noise ICML 2024 + + +
+ High-probability analysis of stochastic first-order optimization methods +under mild assumptions on the noise has been gaining a lot of attention in +recent years. Typically, gradient clipping is one of the key algorithmic +ingredients to derive good high-probability guarantees when the noise is +heavy-tailed. However, if implemented na\"ively, clipping can spoil the +convergence of the popular methods for composite and distributed optimization +(Prox-SGD/Parallel SGD) even in the absence of any noise. Due to this reason, +many works on high-probability analysis consider only unconstrained +non-distributed problems, and the existing results for composite/distributed +problems do not include some important special cases (like strongly convex +problems) and are not optimal. To address this issue, we propose new stochastic +methods for composite and distributed optimization based on the clipping of +stochastic gradient differences and prove tight high-probability convergence +results (including nearly optimal ones) for the new methods. Using similar +ideas, we also develop new methods for composite and distributed variational +inequalities and analyze the high-probability convergence of these methods. + +
+
+ comment: ICML 2024; changes in version 2: minor corrections (typos were fixed + and the structure was modified) +
+
+
+
+
+ + ♻ ☆ Learning from Graphs with Heterophily: Progress and Future + + +
+ Graphs are structured data that model complex relations between real-world +entities. Heterophilous graphs, where linked nodes tend to have +different labels or dissimilar features, have recently attracted significant +attention and found many applications. Meanwhile, increasing efforts have been +made to advance learning from heterophilous graphs. Although there exist +surveys on the relevant topic, they focus on heterophilous GNNs, which are only +a sub-topic of heterophilous graph learning. In this survey, we comprehensively +overview existing works on learning from graphs with heterophily. First, we +collect over 180 publications and introduce the development of this field. +Then, we systematically categorize existing methods based on a hierarchical +taxonomy including learning strategies, model architectures, and practical +applications. Finally, we discuss the primary challenges of existing studies +and highlight promising avenues for future research. More publication details +and corresponding open-source codes can be accessed and will be continuously +updated at our +repositories: https://github.com/gongchenghua/Papers-Graphs-with-Heterophily. + +
+
+
+
+
+ + ♻ ☆ Large Language Models as Topological Structure Enhancers for + Text-Attributed Graphs + + +
+ The latest advancements in large language models (LLMs) have revolutionized +the field of natural language processing (NLP). Inspired by the success of LLMs +in NLP tasks, some recent work has begun investigating the potential of +applying LLMs in graph learning tasks. However, most of the existing work +focuses on utilizing LLMs as powerful node feature augmenters, leaving +employing LLMs to enhance graph topological structures an understudied problem. +In this work, we explore how to leverage the information retrieval and text +generation capabilities of LLMs to refine/enhance the topological structure of +text-attributed graphs (TAGs) under the node classification setting. First, we +propose using LLMs to help remove unreliable edges and add reliable ones in the +TAG. Specifically, we first let the LLM output the semantic similarity between +node attributes through delicate prompt designs, and then perform edge deletion +and edge addition based on the similarity. Second, we propose using +pseudo-labels generated by the LLM to improve graph topology, that is, we +introduce the pseudo-label propagation as a regularization to guide the graph +neural network (GNN) in learning proper edge weights. Finally, we incorporate +the two aforementioned LLM-based methods for graph topological refinement into +the process of GNN training, and perform extensive experiments on four +real-world datasets. The experimental results demonstrate the effectiveness of +LLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain +on public benchmarks). + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ $Φ$-DVAE: Physics-Informed Dynamical Variational Autoencoders for + Unstructured Data Assimilation + + +
+ Incorporating unstructured data into physical models is a challenging problem +that is emerging in data assimilation. Traditional approaches focus on +well-defined observation operators whose functional forms are typically assumed +to be known. This prevents these methods from achieving a consistent model-data +synthesis in configurations where the mapping from data-space to model-space is +unknown. To address these shortcomings, in this paper we develop a +physics-informed dynamical variational autoencoder ($\Phi$-DVAE) to embed +diverse data streams into time-evolving physical systems described by +differential equations. Our approach combines a standard, possibly nonlinear, +filter for the latent state-space model and a VAE, to assimilate the +unstructured data into the latent dynamical system. Unstructured data, in our +example systems, comes in the form of video data and velocity field +measurements, however the methodology is suitably generic to allow for +arbitrary unknown observation operators. A variational Bayesian framework is +used for the joint estimation of the encoding, latent states, and unknown +system parameters. To demonstrate the method, we provide case studies with the +Lorenz-63 ordinary differential equation, and the advection and Korteweg-de +Vries partial differential equations. Our results, with synthetic data, show +that $\Phi$-DVAE provides a data efficient dynamics encoding methodology which +is competitive with standard approaches. Unknown parameters are recovered with +uncertainty quantification, and unseen data are accurately predicted. + +
+
+ comment: 29 pages, 9 figures, updated version +
+
+
+
+
+ + ♻ ☆ Physics-informed Information Field Theory for Modeling Physical Systems + with Uncertainty Quantification + + +
+ Data-driven approaches coupled with physical knowledge are powerful +techniques to model systems. The goal of such models is to efficiently solve +for the underlying field by combining measurements with known physical laws. As +many systems contain unknown elements, such as missing parameters, noisy data, +or incomplete physical laws, this is widely approached as an uncertainty +quantification problem. The common techniques to handle all the variables +typically depend on the numerical scheme used to approximate the posterior, and +it is desirable to have a method which is independent of any such +discretization. Information field theory (IFT) provides the tools necessary to +perform statistics over fields that are not necessarily Gaussian. We extend IFT +to physics-informed IFT (PIFT) by encoding the functional priors with +information about the physical laws which describe the field. The posteriors +derived from this PIFT remain independent of any numerical scheme and can +capture multiple modes, allowing for the solution of problems which are +ill-posed. We demonstrate our approach through an analytical example involving +the Klein-Gordon equation. We then develop a variant of stochastic gradient +Langevin dynamics to draw samples from the joint posterior over the field and +model parameters. We apply our method to numerical examples with various +degrees of model-form error and to inverse problems involving nonlinear +differential equations. As an addendum, the method is equipped with a metric +which allows the posterior to automatically quantify model-form uncertainty. +Because of this, our numerical experiments show that the method remains robust +to even an incorrect representation of the physics given sufficient data. We +numerically demonstrate that the method correctly identifies when the physics +cannot be trusted, in which case it automatically treats learning the field as +a regression problem. + +
+
+ comment: 32 pages, 8 figures. Published in Journal of Computational Physics +
+
+
+
+
+ + ♻ ☆ When Does Bottom-up Beat Top-down in Hierarchical Community Detection? + + +
+ Hierarchical clustering of networks consists in finding a tree of +communities, such that lower levels of the hierarchy reveal finer-grained +community structures. There are two main classes of algorithms tackling this +problem. Divisive ($\textit{top-down}$) algorithms recursively partition the +nodes into two communities, until a stopping rule indicates that no further +split is needed. In contrast, agglomerative ($\textit{bottom-up}$) algorithms +first identify the smallest community structure and then repeatedly merge the +communities using a $\textit{linkage}$ method. In this article, we establish +theoretical guarantees for the recovery of the hierarchical tree and community +structure of a Hierarchical Stochastic Block Model by a bottom-up algorithm. We +also establish that this bottom-up algorithm attains the information-theoretic +threshold for exact recovery at intermediate levels of the hierarchy. Notably, +these recovery conditions are less restrictive compared to those existing for +top-down algorithms. This shows that bottom-up algorithms extend the feasible +region for achieving exact recovery at intermediate levels. Numerical +experiments on both synthetic and real data sets confirm the superiority of +bottom-up algorithms over top-down algorithms. We also observe that top-down +algorithms can produce dendrograms with inversions. These findings contribute +to a better understanding of hierarchical clustering techniques and their +applications in network analysis. + +
+
+
+
+
+ + ♻ ☆ Arrows of Time for Large Language Models + + +
+ We study the probabilistic modeling performed by Autoregressive Large +Language Models (LLMs) through the angle of time directionality, addressing a +question first raised in (Shannon, 1951). For large enough models, we +empirically find a time asymmetry in their ability to learn natural language: a +difference in the average log-perplexity when trying to predict the next token +versus when trying to predict the previous one. This difference is at the same +time subtle and very consistent across various modalities (language, model +size, training time, ...). Theoretically, this is surprising: from an +information-theoretic point of view, there should be no such difference. We +provide a theoretical framework to explain how such an asymmetry can appear +from sparsity and computational complexity considerations, and outline a number +of perspectives opened by our results. + +
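The comparison itself is simple to state in code. Below is a toy, self-contained illustration of how forward and backward average negative log-likelihoods would be compared, using character bigram models as a stand-in for LLMs (for an MLE bigram model evaluated in-sample the two directions come out nearly equal, matching the information-theoretic intuition; the paper's finding is that large LLMs show a small but consistent gap).

```python
import math
from collections import Counter

def avg_nll(text, reverse=False):
    # fit a character bigram model on `text` (reversed if requested) and
    # report its average per-token negative log-likelihood in nats
    s = text[::-1] if reverse else text
    bigrams = Counter(zip(s, s[1:]))
    unigrams = Counter(s[:-1])
    nll = 0.0
    for (a, b), count in bigrams.items():
        nll -= count * math.log(count / unigrams[a])
    return nll / sum(bigrams.values())

corpus = "the quick brown fox jumps over the lazy dog " * 50
print("forward :", avg_nll(corpus))
print("backward:", avg_nll(corpus, reverse=True))
```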
+
+ comment: Corrected typos in Table 2. Added links. 12 figures, 20 pages +
+
+
+
+
+ + ♻ ☆ Adaptive Splitting of Reusable Temporal Monitors for Rare Traffic + Violations + + +
+ Autonomous Vehicles (AVs) are often tested in simulation to estimate the +probability they will violate safety specifications. Two common issues arise +when using existing techniques to produce this estimation: If violations occur +rarely, simple Monte-Carlo sampling techniques can fail to produce efficient +estimates; if simulation horizons are too long, importance sampling techniques +(which learn proposal distributions from past simulations) can fail to +converge. This paper addresses both issues by interleaving rare-event sampling +techniques with online specification monitoring algorithms. We use adaptive +multi-level splitting to decompose simulations into partial trajectories, then +calculate the distance of those partial trajectories to failure by leveraging +robustness metrics from Signal Temporal Logic (STL). By caching those partial +robustness metric values, we can efficiently re-use computations across +multiple sampling stages. Our experiments on an interstate lane-change scenario +show our method is viable for testing simulated AV-pipelines, efficiently +estimating failure probabilities for STL specifications based on real traffic +rules. We produce better estimates than Monte-Carlo and importance sampling in +fewer simulations. + +
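A small illustrative sketch (not the authors' implementation) of why partial-trajectory robustness values can be cached and re-used: for a simple STL safety property of the form "always (clearance >= d_min)", the robustness of a prefix is a running minimum, so extending a cached prefix only requires processing the newly simulated suffix.

```python
class AlwaysClearanceMonitor:
    """Robustness of G(clearance - d_min >= 0), cached per trajectory prefix."""

    def __init__(self, d_min):
        self.d_min = d_min
        self.cache = {}            # prefix id -> robustness of that prefix

    def robustness(self, prefix_id, parent_id, new_clearances):
        rob = self.cache.get(parent_id, float("inf"))
        for c in new_clearances:   # only the newly simulated suffix is processed
            rob = min(rob, c - self.d_min)
        self.cache[prefix_id] = rob
        return rob                  # distance to failure; <= 0 means a violation

monitor = AlwaysClearanceMonitor(d_min=2.0)
r1 = monitor.robustness("traj0:0-100", None, [5.1, 4.2, 3.3])
r2 = monitor.robustness("traj0:0-200", "traj0:0-100", [2.5, 1.4])  # re-uses the cached prefix
print(r1, r2)   # 1.3  -0.6
```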
+
+
+
+
+ + ♻ ☆ FreeCG: Free the Design Space of Clebsch-Gordan Transform for Machine + Learning Force Fields + + +
+ The Clebsch-Gordan Transform (CG transform) effectively encodes many-body +interactions. Many studies have proven its accuracy in depicting atomic +environments, although this comes with high computational needs. The +computational burden of this challenge is hard to reduce due to the need for +permutation equivariance, which limits the design space of the CG transform +layer. We show that, implementing the CG transform layer on +permutation-invariant inputs allows complete freedom in the design of this +layer without affecting symmetry. Developing further on this premise, our idea +is to create a CG transform layer that operates on permutation-invariant +abstract edges generated from real edge information. We bring in group CG +transform with sparse path, abstract edges shuffling, and attention enhancer to +form a powerful and efficient CG transform layer. Our method, known as FreeCG, +achieves State-of-The-Art (SoTA) results in force prediction for MD17, rMD17, +MD22, and property prediction in QM9 datasets with notable enhancement. The +extensibility to other models is also examined. Molecular dynamics simulations +are carried out on MD17 and other periodic systems, including water and LiPS, +showcasing the capacity for real-world applications of FreeCG. It introduces a +novel paradigm for carrying out efficient and expressive CG transform in future +geometric neural network designs. + +
+
+ comment: 29 pages, 8 tables, 10 figures +
+
+
+
+
+ + ♻ ☆ Tree-Planner: Efficient Close-loop Task Planning with Large Language + Models ICLR 2024 + + +
+ This paper studies closed-loop task planning, which refers to the process of +generating a sequence of skills (a plan) to accomplish a specific goal while +adapting the plan based on real-time observations. Recently, prompting Large +Language Models (LLMs) to generate actions iteratively has become a prevalent +paradigm due to its superior performance and user-friendliness. However, this +paradigm is plagued by two inefficiencies: high token consumption and redundant +error correction, both of which hinder its scalability for large-scale testing +and applications. To address these issues, we propose Tree-Planner, which +reframes task planning with LLMs into three distinct phases: plan sampling, +action tree construction, and grounded deciding. Tree-Planner starts by using +an LLM to sample a set of potential plans before execution, which are then +aggregated to form an action tree. Finally, the LLM performs a +top-down decision-making process on the tree, taking into account real-time +environmental information. Experiments show that Tree-Planner achieves +state-of-the-art performance while maintaining high efficiency. By decomposing +LLM queries into a single plan-sampling call and multiple grounded-deciding +calls, a considerable part of the prompt is less likely to be repeatedly +consumed. As a result, token consumption is reduced by 92.2% compared to the +previously best-performing model. Additionally, by enabling backtracking on the +action tree as needed, the correction process becomes more flexible, leading to +a 40.5% decrease in error corrections. + +
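A rough sketch of the three phases described above, with hypothetical helper names (score_fn standing in for the LLM's grounded deciding and is_executable for environment feedback); the real system prompts an LLM at each decision point, which is omitted here.

```python
from collections import defaultdict

def build_action_tree(plans):
    # each plan is a list of action strings; map each prefix to its candidate next actions
    tree = defaultdict(set)
    for plan in plans:
        prefix = ()
        for action in plan:
            tree[prefix].add(action)
            prefix = prefix + (action,)
    return tree

def grounded_decide(tree, score_fn, is_executable):
    # walk the tree top-down: score_fn ranks sibling actions, is_executable vetoes
    # actions that fail in the environment (backtracking/replanning would hook in here)
    plan, prefix = [], ()
    while tree[prefix]:
        candidates = sorted(tree[prefix], key=score_fn, reverse=True)
        chosen = next((a for a in candidates if is_executable(a)), None)
        if chosen is None:
            break
        plan.append(chosen)
        prefix = prefix + (chosen,)
    return plan

plans = [["goto kitchen", "open fridge", "grab milk"],
         ["goto kitchen", "open cabinet", "grab cup"]]
tree = build_action_tree(plans)
print(grounded_decide(tree, score_fn=len, is_executable=lambda a: a != "open cabinet"))
```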
+
+ comment: Published in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Copyright Protection in Generative AI: A Technical Perspective + + +
+ Generative AI has witnessed rapid advancement in recent years, expanding +its capabilities to create synthesized content such as text, images, audio, +and code. The high fidelity and authenticity of content generated by these +Deep Generative Models (DGMs) have sparked significant copyright concerns. +There have been various legal debates on how to effectively safeguard +copyrights in DGMs. This work delves into this issue by providing a +comprehensive overview of copyright protection from a technical perspective. We +examine the issue from two distinct viewpoints: the copyrights pertaining to the source +data held by the data owners and those of the generative models maintained by +the model builders. For data copyright, we delve into methods by which data owners can +protect their content and by which DGMs can be utilized without infringing upon these +rights. For model copyright, our discussion extends to strategies for +preventing model theft and identifying outputs generated by specific models. +Finally, we highlight the limitations of existing techniques and identify areas +that remain unexplored. Furthermore, we discuss prospective directions for the +future of copyright protection, underscoring its importance for the sustainable +and ethical development of Generative AI. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Discovering Dynamic Symbolic Policies with Genetic Programming + + +
+ Artificial intelligence techniques are increasingly being applied to solve +control problems, but often rely on black-box methods without transparent +output generation. To improve the interpretability and transparency of control +systems, models can be defined as white-box symbolic policies described by +mathematical expressions. While current approaches to learning symbolic policies +focus on static policies that directly map observations to control signals, +these may fail in partially observable and volatile environments. We instead +consider dynamic symbolic policies with memory, optimised with genetic +programming. The resulting policies are robust, and consist of easy-to-interpret +coupled differential equations. Our results show that dynamic +symbolic policies perform comparably to black-box policies on a variety of control +tasks. Furthermore, the benefit of the memory in dynamic policies is +demonstrated in experiments where static policies fall short. Overall, we +present a method for evolving high-performing symbolic policies that offer +the interpretability and transparency that black-box models lack. + +
+
+ comment: 19 pages including references and appendix, 5 figures, 1 algorithm, 5 + tables +
+
+
+
+
+ + ♻ ☆ Efficient Convex Optimization Requires Superlinear Memory + + +
+ We show that any memory-constrained, first-order algorithm which minimizes +$d$-dimensional, $1$-Lipschitz convex functions over the unit ball to +$1/\mathrm{poly}(d)$ accuracy using at most $d^{1.25 - \delta}$ bits of memory +must make at least $\tilde{\Omega}(d^{1 + (4/3)\delta})$ first-order queries +(for any constant $\delta \in [0, 1/4]$). Consequently, the performance of such +memory-constrained algorithms is a polynomial factor worse than the optimal +$\tilde{O}(d)$ query bound for this problem obtained by cutting plane methods +that use $\tilde{O}(d^2)$ memory. This resolves a COLT 2019 open problem of +Woodworth and Srebro. + +
+
+ comment: 33 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ On the Utility of Speech and Audio Foundation Models for Marmoset Call + Analysis + + +
+ Marmoset monkeys encode vital information in their calls and serve as a +surrogate model for neuro-biologists to understand the evolutionary origins of +human vocal communication. Traditionally analyzed with signal processing-based +features, recent approaches have utilized self-supervised models pre-trained on +human speech for feature extraction, capitalizing on their ability to learn a +signal's intrinsic structure independently of its acoustic domain. However, the +utility of such foundation models remains unclear for marmoset call analysis in +terms of multi-class classification, bandwidth, and pre-training domain. This +study assesses feature representations derived from speech and general audio +domains, across pre-training bandwidths of 4, 8, and 16 kHz for marmoset +call-type and caller classification tasks. Results show that models with higher +bandwidth improve performance, and pre-training on speech or general audio +yields comparable results, improving over a spectral baseline. + +
+
+ comment: Accepted at Interspeech 2024 satellite event (VIHAR 2024) +
+
+
+
+
+ + ♻ ☆ A Simulation Benchmark for Autonomous Racing with Large-Scale Human Data + + +
+ Despite the availability of international prize-money competitions, scaled +vehicles, and simulation environments, research on autonomous racing and the +control of sports cars operating close to the limit of handling has been +limited by the high costs of vehicle acquisition and management, as well as the +limited physics accuracy of open-source simulators. In this paper, we propose a +racing simulation platform based on the simulator Assetto Corsa to test, +validate, and benchmark autonomous driving algorithms, including reinforcement +learning (RL) and classical Model Predictive Control (MPC), in realistic and +challenging scenarios. Our contributions include the development of this +simulation platform, several state-of-the-art algorithms tailored to the racing +environment, and a comprehensive dataset collected from human drivers. +Additionally, we evaluate algorithms in the offline RL setting. All the +necessary code (including environment and benchmarks), working examples, +datasets, and videos are publicly released and can be found at: +https://assetto-corsa-gym.github.io + +
+
+ comment: Project page and code can be found at: + \url{https://assetto-corsa-gym.github.io/} +
+
+
+
+
+ + ♻ ☆ EXACT: How to Train Your Accuracy + + +
+ Classification tasks are usually evaluated in terms of accuracy. However, +accuracy is discontinuous and cannot be directly optimized using gradient +ascent. Popular methods minimize cross-entropy, hinge loss, or other surrogate +losses, which can lead to suboptimal results. In this paper, we propose a new +optimization framework by introducing stochasticity to a model's output and +optimizing expected accuracy, i.e. accuracy of the stochastic model. Extensive +experiments on linear models and deep image classification show that the +proposed optimization method is a powerful alternative to widely used +classification losses. + +
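As a concrete illustration of optimizing expected accuracy, consider the binary linear case with Gaussian noise added to the score (an assumed simplification for illustration, not the paper's general construction): with score s = w·x + eps and eps ~ N(0, sigma^2), the expected accuracy E[1{sign(s) = y}] has the closed form Phi(y (w·x) / sigma), which is smooth and can be ascended directly.

```python
import numpy as np
from scipy.stats import norm

def expected_accuracy(w, X, y, sigma=1.0):
    margins = y * (X @ w) / sigma            # y in {-1, +1}
    return norm.cdf(margins).mean()

def grad_expected_accuracy(w, X, y, sigma=1.0):
    margins = y * (X @ w) / sigma
    # d/dw Phi(m_i) = phi(m_i) * y_i * x_i / sigma, averaged over samples
    return (norm.pdf(margins)[:, None] * (y[:, None] * X) / sigma).mean(axis=0)

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
w_true = rng.normal(size=5)
y = np.sign(X @ w_true + 0.5 * rng.normal(size=200))

w = np.zeros(5)
for _ in range(300):                          # gradient ascent on expected accuracy
    w += 0.5 * grad_expected_accuracy(w, X, y)
print("expected accuracy:", expected_accuracy(w, X, y))
```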
+
+ comment: Pattern Recognition Letters (2024) +
+
+
+
+
+ + ♻ ☆ Towards Robust Continual Learning with Bayesian Adaptive Moment + Regularization + + +
+ The pursuit of long-term autonomy mandates that machine learning models must +continuously adapt to their changing environments and learn to solve new tasks. +Continual learning seeks to overcome the challenge of catastrophic forgetting, +where learning to solve new tasks causes a model to forget previously learnt +information. Prior-based continual learning methods are appealing as they are +computationally efficient and do not require auxiliary models or data storage. +However, prior-based approaches typically fail on important benchmarks and are +thus limited in their potential applications compared to their memory-based +counterparts. We introduce Bayesian adaptive moment regularization (BAdam), a +novel prior-based method that better constrains parameter growth, reducing +catastrophic forgetting. Our method boasts a range of desirable properties such +as being lightweight and task label-free, converging quickly, and offering +calibrated uncertainty that is important for safe real-world deployment. +Results show that BAdam achieves state-of-the-art performance for prior-based +methods on challenging single-headed class-incremental experiments such as +Split MNIST and Split FashionMNIST, and does so without relying on task labels +or discrete task boundaries. + +
+
+
+
+
+ + ♻ ☆ Logistic regression models for patient-level prediction based on massive + observational data: Do we need all data? + + +
+ Objective: Provide guidance on sample size considerations for developing +predictive models by empirically establishing the adequate sample size, which +balances the competing objectives of improving model performance and reducing +model complexity as well as computational requirements. + Materials and Methods: We empirically assess the effect of sample size on +prediction performance and model complexity by generating learning curves for +81 prediction problems (23 outcomes predicted in a depression cohort, 58 +outcomes predicted in a hypertension cohort) in three large observational +health databases, requiring training of 17,248 prediction models. The adequate +sample size was defined as the sample size for which the performance of a model +equalled the maximum model performance minus a small threshold value. + Results: The adequate sample size achieves a median reduction of the number +of observations of 9.5%, 37.3%, 58.5%, and 78.5% for the thresholds of 0.001, +0.005, 0.01, and 0.02, respectively. The median reduction of the number of +predictors in the models was 8.6%, 32.2%, 48.2%, and 68.3% for the thresholds +of 0.001, 0.005, 0.01, and 0.02, respectively. + Discussion: Based on our results a conservative, yet significant, reduction +in sample size and model complexity can be estimated for future prediction +work. Though, if a researcher is willing to generate a learning curve a much +larger reduction of the model complexity may be possible as suggested by a +large outcome-dependent variability. + Conclusion: Our results suggest that in most cases only a fraction of the +available data was sufficient to produce a model close to the performance of +one developed on the full data set, but with a substantially reduced model +complexity. + +
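A small sketch of the learning-curve procedure described above using scikit-learn (the synthetic dataset, metric, and threshold are illustrative): train on growing subsets, then take the smallest size whose performance is within a chosen threshold of the best observed performance.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=20000, n_features=50, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

sizes = [500, 1000, 2000, 4000, 8000, len(X_tr)]
aucs = []
for n in sizes:                                   # build the learning curve
    model = LogisticRegression(max_iter=1000).fit(X_tr[:n], y_tr[:n])
    aucs.append(roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))

threshold = 0.005                                 # performance tolerance
adequate = next(n for n, a in zip(sizes, aucs) if a >= max(aucs) - threshold)
print(list(zip(sizes, np.round(aucs, 4))), "adequate sample size:", adequate)
```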
+
+
+
+
+ + ♻ ☆ SAE: Single Architecture Ensemble Neural Networks BMVC'24 + + +
+ Ensembles of separate neural networks (NNs) have shown superior accuracy and +confidence calibration over a single NN across tasks. To improve the hardware +efficiency of ensembles of separate NNs, recent methods create ensembles within +a single network by adding early exits or adopting multi-input multi-output +approaches. However, it is unclear which of these methods is the most effective +for a given task, requiring a manual and separate search through each method. Our +novel Single Architecture Ensemble (SAE) framework enables an automatic and +joint search through the early-exit and multi-input multi-output configurations +and their previously unobserved in-between combinations. SAE consists of two +parts: a scalable search space that generalises the previous methods and their +in-between configurations, and an optimisation objective that allows learning +the optimal configuration for a given task. Our image classification and +regression experiments show that with SAE we can automatically find diverse +configurations that fit the task, achieving accuracy or confidence +calibration competitive with baselines while reducing the compute operations or parameter +count by up to $1.5{\sim}3.7\times$. + +
+
+ comment: Accepted at BMVC'24 +
+
+
+
+
+ + ♻ ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception is essential to enhance the efficiency and safety of +future transportation systems, requiring extensive data sharing among vehicles +on the road, which raises significant privacy concerns. Federated learning +offers a promising solution by enabling data privacy-preserving collaborative +enhancements in perception, decision-making, and planning among connected and +autonomous vehicles (CAVs). However, federated learning is impeded by +significant challenges arising from data heterogeneity across diverse clients, +potentially diminishing model accuracy and prolonging convergence periods. This +study introduces a specialized federated learning framework for CP, termed the +federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by +dynamic adjusting loss (DALoss) function. This framework employs dynamic client +weighting to direct model convergence and integrates a novel loss function that +utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental +effects of non-independently and identically distributed (Non-IID) and +unbalanced data. Utilizing the BEV transformer as the primary model, our +rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, +demonstrates significant improvements in the average intersection over union +(IoU). These results highlight the substantial potential of our federated +learning framework to address data heterogeneity challenges in CP, thereby +enhancing the accuracy of environmental perception models and facilitating more +robust and efficient collaborative learning solutions in the transportation +sector. + +
+
+
+
+
+ + ♻ ☆ Improved Random Features for Dot Product Kernels + + +
+ Dot product kernels, such as polynomial and exponential (softmax) kernels, +are among the most widely used kernels in machine learning, as they enable +modeling the interactions between input features, which is crucial in +applications like computer vision, natural language processing, and recommender +systems. We make several novel contributions for improving the efficiency of +random feature approximations for dot product kernels, to make these kernels +more useful in large scale learning. First, we present a generalization of +existing random feature approximations for polynomial kernels, such as +Rademacher and Gaussian sketches and TensorSRHT, using complex-valued random +features. We show empirically that the use of complex features can +significantly reduce the variances of these approximations. Second, we provide +a theoretical analysis for understanding the factors affecting the efficiency +of various random feature approximations, by deriving closed-form expressions +for their variances. These variance formulas elucidate conditions under which +certain approximations (e.g., TensorSRHT) achieve lower variances than others +(e.g., Rademacher sketches), and conditions under which the use of complex +features leads to lower variances than real features. Third, by using these +variance formulas, which can be evaluated in practice, we develop a data-driven +optimization approach to improve random feature approximations for general dot +product kernels, which is also applicable to the Gaussian kernel. We describe +the improvements brought by these contributions with extensive experiments on a +variety of tasks and datasets. + +
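One of the building blocks mentioned above, a Rademacher sketch for the homogeneous polynomial kernel (x·y)^p, is easy to illustrate (real-valued features only; the paper's complex-valued and data-driven variants are not shown): since E[(w·x)(w·y)] = x·y for a Rademacher vector w, products of p independent draws give an unbiased random-feature estimate of (x·y)^p.

```python
import numpy as np

def rademacher_poly_features(X, degree, n_features, seed=0):
    rng = np.random.default_rng(seed)
    d = X.shape[1]
    # one set of `degree` independent Rademacher vectors per random feature
    W = rng.choice([-1.0, 1.0], size=(n_features, degree, d))
    Z = np.prod(np.einsum("kpd,nd->nkp", W, X), axis=2)   # (n, n_features)
    return Z / np.sqrt(n_features)

rng = np.random.default_rng(1)
X = rng.normal(size=(5, 8))
Z = rademacher_poly_features(X, degree=3, n_features=20000)
approx = Z @ Z.T                      # random-feature approximation
exact = (X @ X.T) ** 3                # exact degree-3 polynomial kernel
print(np.abs(approx - exact).max())   # approximation error shrinks as n_features grows
```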
+
+
+
+
+ + ♻ ☆ Heterophily-Aware Fair Recommendation using Graph Convolutional Networks + + +
+ In recent years, graph neural networks (GNNs) have become a popular tool to +improve the accuracy and performance of recommender systems. Modern recommender +systems are not only designed to serve end users, but also to benefit other +participants, such as items and item providers. These participants may have +different or conflicting goals and interests, which raises the need for fairness +and popularity bias considerations. GNN-based recommendation methods also face +the challenges of unfairness and popularity bias, and their normalization and +aggregation processes suffer from these challenges. In this paper, we propose a +fair GNN-based recommender system, called HetroFair, to improve items' side +fairness. HetroFair uses two separate components to generate fairness-aware +embeddings: i) fairness-aware attention, which incorporates the dot product into the +normalization process of GNNs to decrease the effect of nodes' degrees, and +ii) heterophily feature weighting, to assign distinct weights to different +features during the aggregation process. In order to evaluate the effectiveness +of HetroFair, we conduct extensive experiments over six real-world datasets. +Our experimental results reveal that HetroFair not only alleviates the +unfairness and popularity bias on the items' side, but also achieves superior +accuracy on the users' side. Our implementation is publicly available at +https://github.com/NematGH/HetroFair. + +
+
+
+
+
+ + ♻ ☆ Learning a Patent-Informed Biomedical Knowledge Graph Reveals + Technological Potential of Drug Repositioning Candidates + + +
+ Drug repositioning-a promising strategy for discovering new therapeutic uses +for existing drugs-has been increasingly explored in the computational science +literature using biomedical databases. However, the technological potential of +drug repositioning candidates has often been overlooked. This study presents a +novel protocol to comprehensively analyse various sources such as +pharmaceutical patents and biomedical databases, and identify drug +repositioning candidates with both technological potential and scientific +evidence. To this end, first, we constructed a scientific biomedical knowledge +graph (s-BKG) comprising relationships between drugs, diseases, and genes +derived from biomedical databases. Our protocol involves identifying drugs that +exhibit limited association with the target disease but are closely located in +the s-BKG, as potential drug candidates. We constructed a patent-informed +biomedical knowledge graph (p-BKG) by adding pharmaceutical patent information. +Finally, we developed a graph embedding protocol to ascertain the structure of +the p-BKG, thereby calculating the relevance scores of those candidates with +target disease-related patents to evaluate their technological potential. Our +case study on Alzheimer's disease demonstrates its efficacy and feasibility, +while the quantitative outcomes and systematic methods are expected to bridge +the gap between computational discoveries and successful market applications in +drug repositioning research. + +
+
+ comment: We are sorry to withdraw this paper. We found some critical errors in + the introduction and results sections. Specifically, we found that the first + author have wrongly inserted citations on background works and he made + mistakes in the graph embedding methods and relevant results are wrongly + calculated. In this regard, we tried to revise this paper and withdraw the + current version. Thank you +
+
+
+
+
+ + ♻ ☆ MetaLLM: A High-performant and Cost-efficient Dynamic Framework for + Wrapping LLMs + + +
+ The rapid progress in machine learning (ML) has brought forth many large +language models (LLMs) that excel in various tasks and areas. These LLMs come +with different abilities and costs in terms of computation or pricing. Since +the demand for each query can vary, e.g., because of the queried domain or its +complexity, defaulting to one LLM in an application is not usually the best +choice, whether it is the biggest, priciest, or even the one with the best +average test performance. Consequently, picking the right LLM that is both +accurate and cost-effective for an application remains a challenge. In this +paper, we introduce MetaLLM, a framework that dynamically and intelligently +routes each query to the optimal LLM (among several available LLMs) for +classification tasks, achieving significantly improved accuracy and +cost-effectiveness. By framing the selection problem as a multi-armed bandit, +MetaLLM balances prediction accuracy and cost efficiency under uncertainty. Our +experiments, conducted on popular LLM platforms such as OpenAI's GPT models, +Amazon's Titan, Anthropic's Claude, and Meta's LLaMa, showcase MetaLLM's +efficacy in real-world scenarios, laying the groundwork for future extensions +beyond classification tasks. + +
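A generic sketch of bandit-style query routing in the spirit described above (a plain UCB1 policy with placeholder model names, costs, and reward definition; MetaLLM's actual formulation may differ):

```python
import math, random

class LLMRouterUCB:
    def __init__(self, arms, cost_weight=0.5):
        self.arms = arms                       # e.g. ["cheap-llm", "mid-llm", "big-llm"]
        self.cost_weight = cost_weight
        self.counts = {a: 0 for a in arms}
        self.values = {a: 0.0 for a in arms}   # running mean reward per arm
        self.t = 0

    def select(self):
        self.t += 1
        for a in self.arms:                    # play every arm once first
            if self.counts[a] == 0:
                return a
        return max(self.arms, key=lambda a: self.values[a]
                   + math.sqrt(2 * math.log(self.t) / self.counts[a]))

    def update(self, arm, correct, cost):
        reward = float(correct) - self.cost_weight * cost
        self.counts[arm] += 1
        self.values[arm] += (reward - self.values[arm]) / self.counts[arm]

router = LLMRouterUCB(["cheap-llm", "mid-llm", "big-llm"])
for _ in range(1000):
    arm = router.select()
    # simulated feedback: bigger models are more accurate but pricier
    acc = {"cheap-llm": 0.6, "mid-llm": 0.75, "big-llm": 0.85}[arm]
    cost = {"cheap-llm": 0.05, "mid-llm": 0.2, "big-llm": 0.6}[arm]
    router.update(arm, correct=random.random() < acc, cost=cost)
print(router.counts)   # pulls concentrate on the best accuracy-cost trade-off
```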
+
+
+
+
+ + ♻ ☆ Boosting Gradient Ascent for Continuous DR-submodular Maximization ICML + 2022 + + +
+ Projected Gradient Ascent (PGA) is the most commonly used optimization scheme +in machine learning and operations research areas. Nevertheless, numerous +studies and examples have shown that the PGA methods may fail to achieve the +tight approximation ratio for continuous DR-submodular maximization problems. +To address this challenge, we present a boosting technique in this paper, which +can efficiently improve the approximation guarantee of the standard PGA to +\emph{optimal} with only small modifications on the objective function. The +fundamental idea of our boosting technique is to exploit non-oblivious search +to derive a novel auxiliary function $F$, whose stationary points are excellent +approximations to the global maximum of the original DR-submodular objective +$f$. Specifically, when $f$ is monotone and $\gamma$-weakly DR-submodular, we +propose an auxiliary function $F$ whose stationary points can provide a better +$(1-e^{-\gamma})$-approximation than the +$(\gamma^2/(1+\gamma^2))$-approximation guaranteed by the stationary points of +$f$ itself. Similarly, for the non-monotone case, we devise another auxiliary +function $F$ whose stationary points can achieve an optimal +$\frac{1-\min_{\boldsymbol{x}\in\mathcal{C}}\|\boldsymbol{x}\|_{\infty}}{4}$-approximation +guarantee where $\mathcal{C}$ is a convex constraint set. In contrast, the +stationary points of the original non-monotone DR-submodular function can be +arbitrarily bad~\citep{chen2023continuous}. Furthermore, we demonstrate the +scalability of our boosting technique on four problems. In all of these four +problems, our resulting variants of boosting PGA algorithm beat the previous +standard PGA in several aspects such as approximation ratio and efficiency. +Finally, we corroborate our theoretical findings with numerical experiments, +which demonstrate the effectiveness of our boosting PGA methods. + +
+
+ comment: 74 pages, 6 figures and 9 tables. An extended version of Stochastic + Continuous Submodular Maximization: Boosting via Non-oblivious Function (ICML + 2022) +
+
+
+
+
+ + ♻ ☆ Surrogate Neural Networks Local Stability for Aircraft Predictive + Maintenance + + +
+ Surrogate Neural Networks are nowadays routinely used in industry as +substitutes for computationally demanding engineering simulations (e.g., in +structural analysis). They make it possible to generate faster predictions, and thus +faster analyses, in industrial applications, e.g., during product design, testing, or +monitoring phases. Due to their performance and time-efficiency, these +surrogate models are now being developed for use in safety-critical +applications. Neural network verification, and in particular the assessment of +their robustness (e.g., to perturbations), is the next critical step to allow +their inclusion in real-life applications and certification. We assess the +applicability and scalability of empirical and formal methods in the context of +aircraft predictive maintenance for surrogate neural networks designed to +predict the stress sustained by an aircraft part from external loads. The case +study covers a high-dimensional input and output space, and the verification +process thus accommodates multi-objective constraints. We explore the +complementarity of verification methods in assessing the local stability +property of such surrogate models to input noise. We showcase the effectiveness +of sequentially combining methods in one verification 'pipeline' and +demonstrate the subsequent gain in runtime required to assess the targeted +property. + +
+
+ comment: Peer-reviewed and accepted at the 29th International Conference on + Formal Methods for Industrial Critical Systems (FMICS 2024) - 15 pages +
+
+
+
+
+ + ♻ ☆ A spatiotemporal deep learning framework for prediction of crack + dynamics in heterogeneous solids: efficient mapping of concrete + microstructures to its fracture properties + + +
+ A spatiotemporal deep learning framework is proposed that is capable of 2D +full-field prediction of fracture in concrete mesostructures. This framework +not only predicts fractures but also captures the entire history of the +fracture process, from the crack initiation in the interfacial transition zone +to the subsequent propagation of the cracks in the mortar matrix. In addition, +a convolutional neural network is developed which can predict the averaged +stress-strain curve of the mesostructures. The UNet modeling framework, which +comprises an encoder-decoder section with skip connections, is used as the deep +learning surrogate model. Training and test data are generated from +high-fidelity fracture simulations of randomly generated concrete +mesostructures. These mesostructures include geometric variabilities such as +different aggregate particle geometrical features, spatial distribution, and +the total volume fraction of aggregates. The fracture simulations are carried +out in Abaqus, utilizing the cohesive phase-field fracture modeling technique +as the fracture modeling approach. In this work, to reduce the number of +training datasets, the spatial distribution of three sets of material +properties for three-phase concrete mesostructures, along with the spatial +phase-field damage index, are fed to the UNet to predict the corresponding +stress and spatial damage index at the subsequent step. It is shown that after +the training process using this methodology, the UNet model is capable of +accurately predicting damage on the unseen test dataset by using 470 datasets. +Moreover, another novel aspect of this work is the conversion of irregular +finite element data into regular grids using a developed pipeline. This +approach allows for the implementation of less complex UNet architecture and +facilitates the integration of phase-field fracture equations into surrogate +models for future developments. + +
+
+
+
+
+ + ♻ ☆ Knowledge-augmented Graph Machine Learning for Drug Discovery: A Survey + + +
+ The integration of Artificial Intelligence (AI) into the field of drug +discovery has been a growing area of interdisciplinary scientific research. +However, conventional AI models are heavily limited in handling complex +biomedical structures (such as 2D or 3D protein and molecule structures) and +providing interpretations for outputs, which hinders their practical +application. As of late, Graph Machine Learning (GML) has gained considerable +attention for its exceptional ability to model graph-structured biomedical data +and investigate their properties and functional relationships. Despite +extensive efforts, GML methods still suffer from several deficiencies, such as +the limited ability to handle supervision sparsity and provide interpretability +in learning and inference processes, and their ineffectiveness in utilising +relevant domain knowledge. In response, recent studies have proposed +integrating external biomedical knowledge into the GML pipeline to realise more +precise and interpretable drug discovery with limited training instances. +However, a systematic definition for this burgeoning research direction is yet +to be established. This survey presents a comprehensive overview of +long-standing drug discovery principles, provides the foundational concepts and +cutting-edge techniques for graph-structured data and knowledge databases, and +formally summarises Knowledge-augmented Graph Machine Learning (KaGML) for drug +discovery. We present a thorough review of related KaGML works, collected +following a carefully designed search methodology, and organise them into four +categories following a newly defined taxonomy. To facilitate research in this +rapidly emerging field, we also share collected practical resources that are +valuable for intelligent drug discovery and provide an in-depth discussion of +the potential avenues for future advancements. + +
+
+
+
+
+ + ♻ ☆ DCoM: Active Learning for All Learners + + +
+ Deep Active Learning (AL) techniques can be effective in reducing annotation +costs for training deep models. However, their effectiveness in low- and +high-budget scenarios seems to require different strategies, and achieving +optimal results across varying budget scenarios remains a challenge. In this +study, we introduce Dynamic Coverage & Margin mix (DCoM), a novel active +learning approach designed to bridge this gap. Unlike existing strategies, DCoM +dynamically adjusts its strategy, considering the competence of the current +model. Through theoretical analysis and empirical evaluations on diverse +datasets, including challenging computer vision tasks, we demonstrate DCoM's +ability to overcome the cold start problem and consistently improve results +across different budgetary constraints. Thus DCoM achieves state-of-the-art +performance in both low- and high-budget regimes. + +
+
+
+
+
+ + ♻ ☆ VAAD: Visual Attention Analysis Dashboard applied to e-Learning + + +
+ In this paper, we present an approach in the Multimodal Learning Analytics +field. Within this approach, we have developed a tool to visualize and analyze +eye movement data collected during learning sessions in online courses. The +tool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These +eye movement data have been gathered using an eye-tracker and subsequently +processed and visualized for interpretation. The purpose of the tool is to +conduct a descriptive analysis of the data by facilitating its visualization, +enabling the identification of differences and learning patterns among various +learner populations. Additionally, it integrates a predictive module capable of +anticipating learner activities during a learning session. Consequently, VAAD +holds the potential to offer valuable insights into online learning behaviors +from both descriptive and predictive perspectives. + +
+
+ comment: Accepted in CEDI 2024 (VII Congreso Espa\~nol de Inform\'atica), A + Coru\~na, Spain +
+
+
+
+
+ + ♻ ☆ Diversity-Preserving K-Armed Bandits, Revisited + + +
+ We consider the bandit-based framework for diversity-preserving +recommendations introduced by Celis et al. (2019), who approached it in the +case of a polytope mainly by a reduction to the setting of linear bandits. We +design a UCB algorithm using the specific structure of the setting and show +that it enjoys a bounded distribution-dependent regret in the natural cases +when the optimal mixed actions put some probability mass on all actions (i.e., +when diversity is desirable). The regret lower bounds provided show that +otherwise, at least when the model is mean-unbounded, a $\ln T$ regret is +suffered. We also discuss an example beyond the special case of polytopes. + +
+
+
+
+
+ + ♻ ☆ MovePose: A High-performance Human Pose Estimation Algorithm on Mobile + and Edge Devices ICANN 2024 + + +
+ We present MovePose, an optimized lightweight convolutional neural network +designed specifically for real-time body pose estimation on CPU-based mobile +devices. Current solutions do not provide satisfactory accuracy and speed +for human posture estimation, and MovePose addresses this gap. It aims to +maintain real-time performance while improving the accuracy of human posture +estimation for mobile devices. Our MovePose algorithm has attained a Mean +Average Precision (mAP) score of 68.0 on the COCO \cite{cocodata} validation +dataset. The MovePose algorithm displayed efficiency with a performance of 69+ +frames per second (fps) when run on an Intel i9-10920x CPU. Additionally, it +showcased an increased performance of 452+ fps on an NVIDIA RTX3090 GPU. On an +Android phone equipped with a Snapdragon 8 + 4G processor, the fps reached +above 11. To enhance accuracy, we incorporated three techniques: deconvolution, +large kernel convolution, and coordinate classification methods. Compared to +basic upsampling, deconvolution is trainable, improves model capacity, and +enhances the receptive field. Large kernel convolution strengthens these +properties at a decreased computational cost. In summary, MovePose provides +high accuracy and real-time performance, making it a potential tool for a +variety of applications, including those focused on mobile-side human posture +estimation. The code and models for this algorithm will be made publicly +accessible. + +
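A minimal PyTorch sketch of two of the three techniques mentioned, a trainable deconvolution head and a large-kernel convolution, with assumed layer sizes rather than MovePose's actual architecture:

```python
import torch
import torch.nn as nn

class KeypointHead(nn.Module):
    def __init__(self, in_ch=256, n_keypoints=17):
        super().__init__()
        self.up = nn.Sequential(
            # deconvolution: trainable upsampling, unlike plain interpolation
            nn.ConvTranspose2d(in_ch, 128, kernel_size=4, stride=2, padding=1),
            nn.ReLU(inplace=True),
            # large kernel widens the receptive field at modest cost
            nn.Conv2d(128, 128, kernel_size=7, padding=3),
            nn.ReLU(inplace=True),
        )
        self.heatmaps = nn.Conv2d(128, n_keypoints, kernel_size=1)

    def forward(self, feats):
        return self.heatmaps(self.up(feats))

head = KeypointHead()
print(head(torch.randn(1, 256, 16, 12)).shape)   # torch.Size([1, 17, 32, 24])
```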
+
+ comment: This paper has been accepted by ICANN 2024 and is an oral + presentation +
+
+
+
+
+ + ♻ ☆ Probing the Decision Boundaries of In-context Learning in Large Language + Models + + +
+ In-context learning is a key paradigm in large language models (LLMs) that +enables them to generalize to new tasks and domains by simply prompting these +models with a few exemplars without explicit parameter updates. Many attempts +have been made to understand in-context learning in LLMs as a function of model +scale, pretraining data, and other factors. In this work, we propose a new +mechanism to probe and understand in-context learning from the lens of decision +boundaries for in-context binary classification. Decision boundaries are +straightforward to visualize and provide important information about the +qualitative behavior of the inductive biases of standard classifiers. To our +surprise, we find that the decision boundaries learned by current LLMs in +simple binary classification tasks are often irregular and non-smooth, +regardless of linear separability in the underlying task. This paper +investigates the factors influencing these decision boundaries and explores +methods to enhance their generalizability. We assess various approaches, +including training-free and fine-tuning methods for LLMs, the impact of model +architecture, and the effectiveness of active prompting techniques for +smoothing decision boundaries in a data-efficient manner. Our findings provide +a deeper understanding of in-context learning dynamics and offer practical +improvements for enhancing robustness and generalizability of in-context +learning. + +
+
+ comment: 18 pages, code at https://github.com/siyan-zhao/ICL_decision_boundary +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Cross-View Contrastive Pre-training for Check-in + Sequence Representation Learning + + +
+ The rapid growth of location-based services (LBS) has yielded massive amounts +of data on human mobility. Effectively extracting meaningful representations +for user-generated check-in sequences is pivotal for facilitating various +downstream services. However, the user-generated check-in data are +simultaneously influenced by the surrounding objective circumstances and the +user's subjective intention. Specifically, the temporal uncertainty and spatial +diversity exhibited in check-in data make it difficult to capture the +macroscopic spatial-temporal patterns of users and to understand the semantics +of user mobility activities. Furthermore, the distinct characteristics of the +temporal and spatial information in check-in sequences call for an effective +fusion method to incorporate these two types of information. In this paper, we +propose a novel Spatial-Temporal Cross-view Contrastive Representation (STCCR) +framework for check-in sequence representation learning. Specifically, STCCR +addresses the above challenges by employing self-supervision from "spatial +topic" and "temporal intention" views, facilitating effective fusion of spatial +and temporal information at the semantic level. Besides, STCCR leverages +contrastive clustering to uncover users' shared spatial topics from diverse +mobility activities, while employing angular momentum contrast to mitigate the +impact of temporal uncertainty and noise. We extensively evaluate STCCR on +three real-world datasets and demonstrate its superior performance across three +downstream tasks. + +
+
+ comment: This paper has been accepted as a regular paper at IEEE TKDE +
+
+
+
+
+ + ♻ ☆ The Platonic Representation Hypothesis + + +
+ We argue that representations in AI models, particularly deep networks, are +converging. First, we survey many examples of convergence in the literature: +over time and across multiple domains, the ways by which different neural +networks represent data are becoming more aligned. Next, we demonstrate +convergence across data modalities: as vision models and language models get +larger, they measure distance between datapoints in a more and more alike way. +We hypothesize that this convergence is driving toward a shared statistical +model of reality, akin to Plato's concept of an ideal reality. We term such a +representation the platonic representation and discuss several possible +selective pressures toward it. Finally, we discuss the implications of these +trends, their limitations, and counterexamples to our analysis. + +
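A toy sketch of the kind of measurement this hypothesis suggests, comparing how two representations rank pairwise distances over the same datapoints (a simple Spearman-correlation probe; the paper defines its own alignment metrics):

```python
import numpy as np
from scipy.stats import spearmanr

def pairwise_dists(E):
    sq = (E ** 2).sum(axis=1)
    return np.sqrt(np.maximum(sq[:, None] + sq[None, :] - 2 * E @ E.T, 0.0))

def representational_alignment(E1, E2):
    # correlate the two models' pairwise-distance rankings over the same datapoints
    iu = np.triu_indices(E1.shape[0], k=1)
    return spearmanr(pairwise_dists(E1)[iu], pairwise_dists(E2)[iu]).correlation

rng = np.random.default_rng(0)
Z = rng.normal(size=(100, 16))                        # shared latent "reality"
E_vision = Z @ rng.normal(size=(16, 64))              # two models as different views of Z
E_text = np.tanh(Z @ rng.normal(size=(16, 32)))
print(representational_alignment(E_vision, E_text))   # high if both reflect the same geometry
```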
+
+ comment: Equal contributions. Project: https://phillipi.github.io/prh/ Code: + https://github.com/minyoungg/platonic-rep +
+
+
+
+
+ + ♻ ☆ Nonlinear Schrödinger Network + + +
+ Deep neural networks (DNNs) have achieved exceptional performance across +various fields by learning complex nonlinear mappings from large-scale +datasets. However, they encounter challenges such as high computational costs +and limited interpretability. To address these issues, hybrid approaches that +integrate physics with AI are gaining interest. This paper introduces a novel +physics-based AI model called the "Nonlinear Schr\"odinger Network", which +treats the Nonlinear Schr\"odinger Equation (NLSE) as a general-purpose +trainable model for learning complex patterns including nonlinear mappings and +memory effects from data. Existing physics-informed machine learning methods +use neural networks to approximate the solutions of partial differential +equations (PDEs). In contrast, our approach directly treats the PDE as a +trainable model to obtain general nonlinear mappings that would otherwise +require neural networks. As a type of physics-AI symbiosis, it offers a more +interpretable and parameter-efficient alternative to traditional black-box +neural networks, achieving comparable or better accuracy in some time series +classification tasks while significantly reducing the number of required +parameters. Notably, the trained Nonlinear Schr\"odinger Network is +interpretable, with all parameters having physical meanings as properties of a +virtual physical system that transforms the data to a more separable space. +This interpretability allows for insight into the underlying dynamics of the +data transformation process. Applications to time series forecasting have also +been explored. While our current implementation utilizes the NLSE, the proposed +method of using physics equations as trainable models to learn nonlinear +mappings from data is not limited to the NLSE and may be extended to other +master equations of physics. + +
+
+
+
+
+ + ♻ ☆ Fourier-MIONet: Fourier-enhanced multiple-input neural operators for + multiphase modeling of geological carbon sequestration + + +
+ Geologic carbon sequestration (GCS) is a safety-critical technology that aims +to reduce the amount of carbon dioxide in the atmosphere, which also places +high demands on reliability. Multiphase flow in porous media is essential to +understand CO$_2$ migration and pressure fields in the subsurface associated +with GCS. However, numerical simulation for such problems in 4D is +computationally challenging and expensive, due to the multiphysics and +multiscale nature of the highly nonlinear governing partial differential +equations (PDEs). It prevents us from considering multiple subsurface scenarios +and conducting real-time optimization. Here, we develop a Fourier-enhanced +multiple-input neural operator (Fourier-MIONet) to learn the solution operator +of the problem of multiphase flow in porous media. Fourier-MIONet utilizes the +recently developed framework of the multiple-input deep neural operators +(MIONet) and incorporates the Fourier neural operator (FNO) in the network +architecture. Once Fourier-MIONet is trained, it can predict the evolution of +saturation and pressure of the multiphase flow under various reservoir +conditions, such as permeability and porosity heterogeneity, anisotropy, +injection configurations, and multiphase flow properties. Compared to the +enhanced FNO (U-FNO), the proposed Fourier-MIONet has 90% fewer unknown +parameters, and it can be trained in significantly less time (about 3.5 times +faster) with much lower CPU memory ($<$ 15%) and GPU memory ($<$ 35%) +requirements, to achieve similar prediction accuracy. In addition to the lower +computational cost, Fourier-MIONet can be trained with only 6 snapshots of time +to predict the PDE solutions for 30 years. The excellent generalizability of +Fourier-MIONet is enabled by its adherence to the physical principle that the +solution to a PDE is continuous over time. + +
+
+
+
+
+ + ♻ ☆ Label-efficient Time Series Representation Learning: A Review + + +
+ Label-efficient time series representation learning, which aims to learn +effective representations with limited labeled data, is crucial for deploying +deep learning models in real-world applications. To address the scarcity of +labeled time series data, various strategies, e.g., transfer learning, +self-supervised learning, and semi-supervised learning, have been developed. In +this survey, we introduce a novel taxonomy for the first time, categorizing +existing approaches as in-domain or cross-domain, based on their reliance on +external data sources or not. Furthermore, we present a review of the recent +advances in each strategy, conclude the limitations of current methodologies, +and suggest future research directions that promise further improvements in the +field. + +
+
+ comment: Accepted in the IEEE Transactions on Artificial Intelligence (TAI) + https://ieeexplore.ieee.org/document/10601520 +
+
+
+
+
+ + ♻ ☆ CADC: Encoding User-Item Interactions for Compressing Recommendation + Model Training Data + + +
+ Deep learning recommendation models (DLRMs) are at the heart of the current +e-commerce industry. However, the amount of training data used to train these +large models is growing exponentially, leading to substantial training hurdles. +The training dataset contains two primary types of information: content-based +information (features of users and items) and collaborative information +(interactions between users and items). One approach to reduce the training +dataset is to remove user-item interactions. But that significantly diminishes +collaborative information, which is crucial for maintaining accuracy due to its +inclusion of interaction histories. This loss profoundly impacts DLRM +performance. + This paper makes an important observation that if one can capture the +user-item interaction history to enrich the user and item embeddings, then the +interaction history can be compressed without losing model accuracy. Thus, this +work, Collaborative Aware Data Compression (CADC), takes a two-step approach to +training dataset compression. In the first step, we use matrix factorization of +the user-item interaction matrix to create a novel embedding representation for +both the users and items. Once the user and item embeddings are enriched by the +interaction history information the approach then applies uniform random +sampling of the training dataset to drastically reduce the training dataset +size while minimizing model accuracy drop. The source code of CADC is available +at +\href{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}{https://anonymous.4open.science/r/DSS-RM-8C1D/README.md}. + +
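A hedged sketch of the two-step idea described above, with TruncatedSVD standing in for the matrix-factorization step and uniform sampling for the compression step (the actual CADC pipeline and hyperparameters may differ):

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

def cadc_compress(user_ids, item_ids, n_users, n_items, dim=32, keep=0.1, seed=0):
    # step 1: enrich user/item embeddings with the full interaction history
    R = csr_matrix((np.ones(len(user_ids)), (user_ids, item_ids)),
                   shape=(n_users, n_items))
    svd = TruncatedSVD(n_components=dim, random_state=seed)
    user_emb = svd.fit_transform(R)                 # (n_users, dim)
    item_emb = svd.components_.T                    # (n_items, dim)

    # step 2: uniformly subsample the interactions used for model training
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(user_ids), size=int(keep * len(user_ids)), replace=False)
    return user_emb, item_emb, idx

u = np.random.randint(0, 1000, size=50000)
i = np.random.randint(0, 500, size=50000)
user_emb, item_emb, keep_idx = cadc_compress(u, i, n_users=1000, n_items=500)
print(user_emb.shape, item_emb.shape, len(keep_idx))
```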
+
+
+
+
+ + ♻ ☆ On the Trade-offs between Adversarial Robustness and Actionable + Explanations AAAI + + +
+ As machine learning models are increasingly being employed in various +high-stakes settings, it becomes important to ensure that predictions of these +models are not only adversarially robust, but also readily explainable to +relevant stakeholders. However, it is unclear if these two notions can be +simultaneously achieved or if there exist trade-offs between them. In this +work, we make one of the first attempts at studying the impact of adversarially +robust models on actionable explanations which provide end users with a means +for recourse. We theoretically and empirically analyze the cost (ease of +implementation) and validity (probability of obtaining a positive model +prediction) of recourses output by state-of-the-art algorithms when the +underlying models are adversarially robust vs. non-robust. More specifically, +we derive theoretical bounds on the differences between the cost and the +validity of the recourses generated by state-of-the-art algorithms for +adversarially robust vs. non-robust linear and non-linear models. Our empirical +results with multiple real-world datasets validate our theoretical results and +show the impact of varying degrees of model robustness on the cost and validity +of the resulting recourses. Our analyses demonstrate that adversarially robust +models significantly increase the cost and reduce the validity of the resulting +recourses, thus shedding light on the inherent trade-offs between adversarial +robustness and actionable explanations. + +
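For a linear model, the recourse cost discussed above has a closed form: the cheapest change that flips a prediction is the point's distance to the decision boundary, |w·x + b| / ||w||. The snippet below uses that fact to compare costs under a non-robust model and a stand-in "robust" model whose boundary sits farther from the data; the weights are hypothetical and only illustrate the trade-off, not the paper's theoretical bounds.

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 5))                     # toy feature vectors of loan applicants

def recourse_cost(w, b, x):
    """Minimal L2 change needed to move x onto the hyperplane w.x + b = 0 (flip the decision)."""
    return abs(x @ w + b) / np.linalg.norm(w)

# Non-robust model vs. a stand-in for an adversarially trained one whose boundary sits
# farther from the data (hypothetical weights, chosen only to illustrate the trade-off).
w_std, b_std = np.array([1.0, -2.0, 0.5, 1.5, -1.0]), -0.5
w_rob, b_rob = np.array([1.0, -2.0, 0.5, 1.5, -1.0]), -2.5

cost_std = np.mean([recourse_cost(w_std, b_std, x) for x in X])
cost_rob = np.mean([recourse_cost(w_rob, b_rob, x) for x in X])
print(f"mean recourse cost  non-robust: {cost_std:.2f}   robust: {cost_rob:.2f}")
```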
+
+ comment: Accepted in the 7th AAAI Conference on AI, Ethics, and Society, 2024 +
+
+
+
+
+ + ♻ ☆ MINT-1T: Scaling Open-Source Multimodal Data by 10x: A Multimodal + Dataset with One Trillion Tokens + + +
+ Multimodal interleaved datasets featuring free-form interleaved sequences of +images and text are crucial for training frontier large multimodal models +(LMMs). Despite the rapid progression of open-source LMMs, there remains a +pronounced scarcity of large-scale, diverse open-source multimodal interleaved +datasets. In response, we introduce MINT-1T, the most extensive and diverse +open-source Multimodal INTerleaved dataset to date. MINT-1T comprises one +trillion text tokens and 3.4 billion images, a 10x scale-up from existing +open-source datasets. Additionally, we include previously untapped sources such +as PDFs and ArXiv papers. As scaling multimodal interleaved datasets requires +substantial engineering effort, sharing the data curation process and releasing +the dataset greatly benefits the community. Our experiments show that LMMs +trained on MINT-1T rival the performance of models trained on the previous +leading dataset, OBELICS. Our data and code will be released at +https://github.com/mlfoundations/MINT-1T. + +
+
+
+
+
+ + ♻ ☆ Optimizer's Information Criterion: Dissecting and Correcting Bias in + Data-Driven Optimization + + +
+ In data-driven optimization, the sample performance of the obtained decision +typically incurs an optimistic bias against the true performance, a phenomenon +commonly known as the Optimizer's Curse and intimately related to overfitting +in machine learning. Common techniques to correct this bias, such as +cross-validation, require repeatedly solving additional optimization problems +and are therefore computationally expensive. We develop a general bias +correction approach, building on what we call Optimizer's Information Criterion +(OIC), that directly approximates the first-order bias and does not require +solving any additional optimization problems. Our OIC generalizes the +celebrated Akaike Information Criterion to evaluate the objective performance +in data-driven optimization, which crucially involves not only model fitting +but also its interplay with the downstream optimization. As such it can be used +for decision selection instead of only model selection. We apply our approach +to a range of data-driven optimization formulations comprising empirical and +parametric models, their regularized counterparts, and furthermore contextual +optimization. Finally, we provide numerical validation on the superior +performance of our approach under synthetic and real-world datasets. + +
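The Optimizer's Curse referred to above is easy to reproduce: pick the decision with the best in-sample average, and its in-sample value will, on average, exceed its true value. The Monte Carlo below demonstrates that optimistic gap and contrasts it with a fresh-sample estimate; it illustrates the bias being corrected, not the OIC formula itself.

```python
import numpy as np

rng = np.random.default_rng(0)
true_means = np.zeros(20)          # 20 candidate decisions, all equally good in truth
n, trials = 50, 2000

gap_in_sample, gap_fresh = [], []
for _ in range(trials):
    sample = rng.normal(true_means, 1.0, size=(n, 20))
    best = sample.mean(axis=0).argmax()                        # data-driven decision
    gap_in_sample.append(sample.mean(axis=0)[best] - true_means[best])
    fresh = rng.normal(true_means, 1.0, size=(n, 20))          # held-out re-evaluation
    gap_fresh.append(fresh.mean(axis=0)[best] - true_means[best])

print(f"in-sample optimism: {np.mean(gap_in_sample):+.3f}")    # clearly positive (Optimizer's Curse)
print(f"fresh-sample check: {np.mean(gap_fresh):+.3f}")        # roughly zero
```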
+
+
+
+
+ + ♻ ☆ Multi-Convformer: Extending Conformer with Multiple Convolution Kernels INTERSPEECH 2024 + + +
+ Convolutions have become essential in state-of-the-art end-to-end Automatic +Speech Recognition~(ASR) systems due to their efficient modelling of local +context. Notably, its use in Conformers has led to superior performance +compared to vanilla Transformer-based ASR systems. While components other than +the convolution module in the Conformer have been reexamined, altering the +convolution module itself has been far less explored. Towards this, we +introduce Multi-Convformer that uses multiple convolution kernels within the +convolution module of the Conformer in conjunction with gating. This helps in +improved modeling of local dependencies at varying granularities. Our model +rivals existing Conformer variants such as CgMLP and E-Branchformer in +performance, while being more parameter efficient. We empirically compare our +approach with Conformer and its variants across four different datasets and +three different modelling paradigms and show up to 8% relative word error +rate~(WER) improvements. + +
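The core idea above, several convolution kernels of different widths inside the Conformer convolution module combined via gating, can be sketched roughly as below; the kernel sizes, the softmax gate, and the depthwise convolutions are assumptions for illustration rather than the released architecture.

```python
import torch
import torch.nn as nn

class MultiKernelConvModule(nn.Module):
    """Toy stand-in for a Conformer conv module that mixes several kernel widths via gating."""
    def __init__(self, dim=144, kernel_sizes=(3, 7, 15)):
        super().__init__()
        self.convs = nn.ModuleList(
            nn.Conv1d(dim, dim, k, padding=k // 2, groups=dim) for k in kernel_sizes
        )
        self.gate = nn.Linear(dim, len(kernel_sizes))   # per-frame soft weights over kernels
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):                               # x: (batch, time, dim)
        weights = torch.softmax(self.gate(x), dim=-1)   # (batch, time, n_kernels)
        outs = [conv(x.transpose(1, 2)).transpose(1, 2) for conv in self.convs]
        mixed = sum(w.unsqueeze(-1) * o for w, o in zip(weights.unbind(-1), outs))
        return self.proj(mixed)

module = MultiKernelConvModule()
print(module(torch.randn(2, 100, 144)).shape)           # torch.Size([2, 100, 144])
```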
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ♻ ☆ Enabling On-Device LLMs Personalization with Smartphone Sensing + + +
+ This demo presents a novel end-to-end framework that combines on-device large language models (LLMs) with smartphone sensing technologies to achieve context-aware and personalized services. The framework addresses critical limitations of current cloud-LLM-based personalization solutions, such as privacy concerns, latency, cost, and limited access to personal information. To achieve this, we propose deploying LLMs on smartphones and pairing them with multimodal sensor data through context-aware sensing and customized prompt engineering, ensuring privacy and enhancing personalization performance. A case study involving a university student demonstrated the capability of the framework to provide tailored recommendations. In addition, we show that the framework achieves the best trade-off between on-device and cloud LLMs in terms of privacy, performance, latency, cost, and battery and energy consumption. To the best of our knowledge, this is the first framework to provide on-device LLM personalization with smartphone sensing. Future work will incorporate more diverse sensor data and involve extensive user studies to enhance personalization. Our proposed framework has the potential to substantially improve user experiences across domains including healthcare, productivity, and entertainment.
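The "context-aware sensing and customized prompt engineering" step amounts to folding current sensor readings into the prompt handed to the local model. A minimal sketch follows, with an entirely made-up sensor schema and no real on-device LLM API:

```python
from datetime import datetime

def build_personalized_prompt(user_request: str, sensors: dict) -> str:
    """Assemble a prompt that gives the on-device LLM ambient context (hypothetical schema)."""
    context_lines = [
        f"- local time: {sensors.get('time', datetime.now().strftime('%H:%M'))}",
        f"- location type: {sensors.get('location', 'unknown')}",
        f"- activity: {sensors.get('activity', 'unknown')}",
        f"- ambient noise (dB): {sensors.get('noise_db', 'n/a')}",
    ]
    return (
        "You are a personal assistant running fully on the user's phone.\n"
        "Current context from on-device sensors:\n" + "\n".join(context_lines) +
        f"\n\nUser request: {user_request}\nRespond with a short, context-aware suggestion."
    )

prompt = build_personalized_prompt(
    "Where should I study for the next two hours?",
    {"time": "14:05", "location": "university library", "activity": "stationary", "noise_db": 42},
)
print(prompt)   # would be passed to a locally deployed LLM
```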
+
+ comment: 5 pages, 3 figures, conference demo paper +
+
+
+
+
+ + ♻ ☆ Universally Harmonizing Differential Privacy Mechanisms for Federated + Learning: Boosting Accuracy and Convergence + + +
+ Differentially private federated learning (DP-FL) is a promising technique for collaborative model training while ensuring provable privacy for clients. However, optimizing the tradeoff between privacy and accuracy remains a critical challenge. To the best of our knowledge, we propose the first DP-FL framework (namely UDP-FL) that universally harmonizes any randomization mechanism (e.g., an optimal one) with the Gaussian Moments Accountant (viz. DP-SGD) to significantly boost accuracy and convergence. Specifically, UDP-FL demonstrates enhanced model performance by mitigating the reliance on Gaussian noise. The key mediator variable in this transformation is the Rényi Differential Privacy notion, which is carefully used to harmonize privacy budgets. We also propose an innovative method to theoretically analyze the convergence of DP-FL (including our UDP-FL) based on mode connectivity analysis. Moreover, we evaluate our UDP-FL through extensive experiments benchmarked against state-of-the-art (SOTA) methods, demonstrating superiority in both privacy guarantees and model performance. Notably, UDP-FL exhibits substantial resilience against different inference attacks, indicating a significant advance in safeguarding sensitive data in federated learning environments.
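One concrete reading of "harmonizing privacy budgets via Rényi differential privacy" is to express each candidate noise mechanism by its RDP curve and match curves at a chosen order. For the Gaussian mechanism with L2 sensitivity Δ and noise scale σ, the RDP at order α is αΔ²/(2σ²), which lets one solve for the σ that meets a target budget. The snippet below performs only that standard calculation; it is not the UDP-FL algorithm.

```python
import math

def gaussian_rdp(alpha: float, sensitivity: float, sigma: float) -> float:
    """Renyi DP of the Gaussian mechanism at order alpha (standard closed form)."""
    return alpha * sensitivity ** 2 / (2 * sigma ** 2)

def sigma_for_budget(alpha: float, sensitivity: float, target_rdp: float) -> float:
    """Smallest Gaussian noise scale whose RDP at this order stays within the target budget."""
    return sensitivity * math.sqrt(alpha / (2 * target_rdp))

alpha, sensitivity = 8.0, 1.0
target = gaussian_rdp(alpha, sensitivity, sigma=1.2)     # budget implied by a reference mechanism
print(f"target RDP(alpha={alpha}) = {target:.3f}")
print(f"matching sigma            = {sigma_for_budget(alpha, sensitivity, target):.3f}")  # ~1.2
```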
+
+
+
+
+ + ♻ ☆ AI-Driven Guided Response for Security Operation Centers with Microsoft + Copilot for Security + + +
+ Security operation centers contend with a constant stream of security incidents, ranging from straightforward to highly complex. To address this, we developed Copilot Guided Response (CGR), an industry-scale ML architecture that guides security analysts across three key tasks -- (1) investigation, providing essential historical context by identifying similar incidents; (2) triaging to ascertain the nature of the incident -- whether it is a true positive, false positive, or benign positive; and (3) remediation, recommending tailored containment actions. CGR is integrated into the Microsoft Defender XDR product and deployed worldwide, generating millions of recommendations across thousands of customers. Our extensive evaluation, incorporating internal assessments, collaboration with security experts, and customer feedback, demonstrates that CGR delivers high-quality recommendations across all three tasks. We provide a comprehensive overview of the CGR architecture, setting a precedent as the first cybersecurity company to openly discuss these capabilities in such depth. Additionally, we release GUIDE, the largest public collection of real-world security incidents, spanning 13M pieces of evidence across 1M annotated incidents. By enabling researchers and practitioners to conduct research on real-world data, GUIDE advances the state of cybersecurity and supports the development of next-generation machine learning systems.
+
+
+
+
+ + ♻ ☆ SLADE: Detecting Dynamic Anomalies in Edge Streams without Labels via + Self-Supervised Learning KDD 2024 + + +
+ To detect anomalies in real-world graphs, such as social, email, and +financial networks, various approaches have been developed. While they +typically assume static input graphs, most real-world graphs grow over time, +naturally represented as edge streams. In this context, we aim to achieve three +goals: (a) instantly detecting anomalies as they occur, (b) adapting to +dynamically changing states, and (c) handling the scarcity of dynamic anomaly +labels. In this paper, we propose SLADE (Self-supervised Learning for Anomaly +Detection in Edge Streams) for rapid detection of dynamic anomalies in edge +streams, without relying on labels. SLADE detects the shifts of nodes into +abnormal states by observing deviations in their interaction patterns over +time. To this end, it trains a deep neural network to perform two +self-supervised tasks: (a) minimizing drift in node representations and (b) +generating long-term interaction patterns from short-term ones. Failure in +these tasks for a node signals its deviation from the norm. Notably, the neural +network and tasks are carefully designed so that all required operations can be +performed in constant time (w.r.t. the graph size) in response to each new edge +in the input stream. In dynamic anomaly detection across four real-world +datasets, SLADE outperforms nine competing methods, even those leveraging label +supervision. + +
+
+ comment: 15 pages, 6 figures, To Appear in KDD 2024 +
+
+
+
+
+ + ♻ ☆ A Survey on Hypergraph Neural Networks: An In-Depth and Step-By-Step + Guide KDD 2024 + + +
+ Higher-order interactions (HOIs) are ubiquitous in real-world complex systems +and applications. Investigation of deep learning for HOIs, thus, has become a +valuable agenda for the data mining and machine learning communities. As +networks of HOIs are expressed mathematically as hypergraphs, hypergraph neural +networks (HNNs) have emerged as a powerful tool for representation learning on +hypergraphs. Given the emerging trend, we present the first survey dedicated to +HNNs, with an in-depth and step-by-step guide. Broadly, the present survey +overviews HNN architectures, training strategies, and applications. First, we +break existing HNNs down into four design components: (i) input features, (ii) +input structures, (iii) message-passing schemes, and (iv) training strategies. +Second, we examine how HNNs address and learn HOIs with each of their +components. Third, we overview the recent applications of HNNs in +recommendation, bioinformatics and medical science, time series analysis, and +computer vision. Lastly, we conclude with a discussion on limitations and +future directions. + +
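The "message-passing schemes" component that the survey decomposes typically follows a two-stage pattern over the hypergraph incidence matrix: aggregate node features into each hyperedge, then aggregate hyperedge messages back into nodes. A bare-bones numpy version of that generic scheme (not any specific HNN from the survey) is:

```python
import numpy as np

rng = np.random.default_rng(0)
n_nodes, n_edges, dim = 6, 3, 4

# Incidence matrix H: H[v, e] = 1 if node v belongs to hyperedge e.
H = rng.integers(0, 2, size=(n_nodes, n_edges)).astype(float)
X = rng.normal(size=(n_nodes, dim))                            # node features

def hypergraph_message_pass(H, X):
    edge_deg = np.maximum(H.sum(axis=0, keepdims=True), 1.0)   # nodes per hyperedge
    node_deg = np.maximum(H.sum(axis=1, keepdims=True), 1.0)   # hyperedges per node
    edge_msg = (H.T @ X) / edge_deg.T          # stage 1: node -> hyperedge mean
    return (H @ edge_msg) / node_deg           # stage 2: hyperedge -> node mean

print(hypergraph_message_pass(H, X).shape)     # (6, 4)
```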
+
+ comment: To appear in KDD 2024 (survey paper) +
+
+
+
+
+ + ♻ ☆ Guaranteed Trajectory Tracking under Learned Dynamics with Contraction + Metrics and Disturbance Estimation + + +
+ This paper presents an approach to trajectory-centric learning control based on contraction metrics and disturbance estimation for nonlinear systems subject to matched uncertainties. The approach uses deep neural networks to learn uncertain dynamics while still providing guarantees of transient tracking performance throughout the learning phase. Within the proposed approach, a disturbance estimation law is adopted to estimate the pointwise value of the uncertainty, with pre-computable estimation error bounds (EEBs). The learned dynamics, the estimated disturbances, and the EEBs are then incorporated in a robust Riemann energy condition to compute the control law that guarantees exponential convergence of actual trajectories to desired ones throughout the learning phase, even when the learned model is poor. On the other hand, with improved accuracy, the learned model can help improve the robustness of the tracking controller, e.g., against input delays, and can be incorporated to plan better trajectories with improved performance, e.g., lower energy consumption and shorter travel time. The proposed framework is validated on a planar quadrotor example.
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ A Library of Mirrors: Deep Neural Nets in Low Dimensions are Convex + Lasso Models with Reflection Features + + +
+ We prove that training neural networks on 1-D data is equivalent to solving +convex Lasso problems with discrete, explicitly defined dictionary matrices. We +consider neural networks with piecewise linear activations and depths ranging +from 2 to an arbitrary but finite number of layers. We first show that +two-layer networks with piecewise linear activations are equivalent to Lasso +models using a discrete dictionary of ramp functions, with breakpoints +corresponding to the training data points. In certain general architectures +with absolute value or ReLU activations, a third layer surprisingly creates +features that reflect the training data about themselves. Additional layers +progressively generate reflections of these reflections. The Lasso +representation provides valuable insights into the analysis of globally optimal +networks, elucidating their solution landscapes and enabling closed-form +solutions in certain special cases. Numerical results show that reflections +also occur when optimizing standard deep networks using standard non-convex +optimizers. Additionally, we demonstrate our theory with autoregressive time +series models. + +
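For the two-layer case on 1-D data, the stated equivalence is concrete enough to sketch: build a dictionary of ramp (ReLU) features with breakpoints at the training points and fit a Lasso over it. The snippet below does exactly that with scikit-learn; the regularization strength and the inclusion of mirrored ramps are illustrative choices, not the paper's exact construction.

```python
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
x = np.sort(rng.uniform(-2, 2, size=30))
y = np.sin(2 * x) + 0.1 * rng.normal(size=30)          # toy 1-D regression target

def ramp_features(x, breakpoints):
    """Dictionary of ramps with breakpoints at the training points, plus a linear/constant term."""
    right = np.maximum(x[:, None] - breakpoints[None, :], 0.0)   # ReLU(x - x_i)
    left = np.maximum(breakpoints[None, :] - x[:, None], 0.0)    # ReLU(x_i - x)
    return np.hstack([right, left, x[:, None], np.ones_like(x[:, None])])

A = ramp_features(x, x)
lasso = Lasso(alpha=1e-3, fit_intercept=False, max_iter=50_000).fit(A, y)
active = np.count_nonzero(lasso.coef_)
print(f"active dictionary atoms: {active} / {A.shape[1]}")       # sparse combination of ramps
```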
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Revolutionizing Text-to-Image Retrieval as Autoregressive Token-to-Voken + Generation + + +
+ Text-to-image retrieval is a fundamental task in multimedia processing, +aiming to retrieve semantically relevant cross-modal content. Traditional +studies have typically approached this task as a discriminative problem, +matching the text and image via the cross-attention mechanism (one-tower +framework) or in a common embedding space (two-tower framework). Recently, +generative cross-modal retrieval has emerged as a new research line, which +assigns images with unique string identifiers and generates the target +identifier as the retrieval target. Despite its great potential, existing +generative approaches are limited due to the following issues: insufficient +visual information in identifiers, misalignment with high-level semantics, and +learning gap towards the retrieval target. To address the above issues, we +propose an autoregressive voken generation method, named AVG. AVG tokenizes +images into vokens, i.e., visual tokens, and innovatively formulates the +text-to-image retrieval task as a token-to-voken generation problem. AVG +discretizes an image into a sequence of vokens as the identifier of the image, +while maintaining the alignment with both the visual information and high-level +semantics of the image. Additionally, to bridge the learning gap between +generative training and the retrieval target, we incorporate discriminative +training to modify the learning direction during token-to-voken training. +Extensive experiments demonstrate that AVG achieves superior results in both +effectiveness and efficiency. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ The Sketchfab 3D Creative Commons Collection (S3D3C) + + +
+ The technology to capture, create, and use three-dimensional (3D) models has +become increasingly accessible in recent years. + With increasing numbers of use cases for 3D models and collections of rapidly +increasing size, better methods to analyze the content of 3D models are +required. + While previously proposed 3D model collections for research purposes exist, +these often contain only untextured geometry and are typically designed for a +specific application, which limits their use in quantitative evaluations of +modern 3D model analysis methods. + In this paper, we introduce the Sketchfab 3D Creative Commons Collection +(S3D3C), a new 3D model research collection consisting of 40,802 creative +commons licensed models downloaded from the 3D model platform Sketchfab. + By including popular freely available models with a wide variety of technical +properties, such as textures, materials, and animations, we enable its use in +the evaluation of state-of-the-art geometry-based and view-based 3D model +analysis and retrieval techniques. + +
+
+
+
+
+ + ☆ Enhancing Environmental Monitoring through Multispectral Imaging: The + WasteMS Dataset for Semantic Segmentation of Lakeside Waste + + +
+ Environmental monitoring of lakeside green areas is crucial for environmental +protection. Compared to manual inspections, computer vision technologies offer +a more efficient solution when deployed on-site. Multispectral imaging provides +diverse information about objects under different spectrums, aiding in the +differentiation between waste and lakeside lawn environments. This study +introduces WasteMS, the first multispectral dataset established for the +semantic segmentation of lakeside waste. WasteMS includes a diverse range of +waste types in lawn environments, captured under various lighting conditions. +We implemented a rigorous annotation process to label waste in images. +Representative semantic segmentation frameworks were used to evaluate +segmentation accuracy using WasteMS. Challenges encountered when using WasteMS +for segmenting waste on lakeside lawns were discussed. The WasteMS dataset is +available at https://github.com/zhuqinfeng1999/WasteMS. + +
+
+
+
+
+ + ☆ Selective Vision-Language Subspace Projection for Few-shot CLIP + + +
+ Vision-language models such as CLIP are capable of mapping data from different modalities into a unified feature space, enabling zero/few-shot inference by measuring the similarity of given images and texts. However, most existing methods overlook modality gaps in CLIP's encoded features, which manifest as text and image features lying far apart from each other and result in limited classification performance. To tackle this issue, we introduce a method called Selective Vision-Language Subspace Projection (SSP), which incorporates local image features and utilizes them as a bridge to enhance the alignment between image-text pairs. Specifically, our SSP framework comprises two parallel modules: a vision projector and a language projector. Both projectors utilize local image features to span the respective subspaces for images and texts, thereby projecting the image and text features into their respective subspaces to achieve alignment. Moreover, our approach entails only training-free matrix calculations and can be seamlessly integrated into advanced CLIP-based few-shot learning frameworks. Extensive experiments on 11 datasets have demonstrated SSP's superior text-image alignment capabilities, outperforming the state-of-the-art alignment methods. The code is available at https://github.com/zhuhsingyuu/SSP
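The "training-free matrix calculations" mentioned above boil down to building a projection matrix from a small basis of local image features and projecting both modalities through it before measuring similarity. A generic numpy sketch of that idea follows; the feature dimensions and the orthogonal-projection choice are assumptions, not the paper's exact operators.

```python
import numpy as np

rng = np.random.default_rng(0)
d, k, n_classes = 512, 8, 10

B = rng.normal(size=(d, k))                      # k local image features spanning the subspace
P = B @ np.linalg.pinv(B)                        # orthogonal projector onto span(B), shape (d, d)

image_feat = rng.normal(size=(d,))
text_feats = rng.normal(size=(n_classes, d))     # one text (class prompt) feature per class

def cosine(a, b):
    return (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

# Project both modalities into the shared subspace, then classify by cosine similarity.
img_p = P @ image_feat
scores = [cosine(img_p, P @ t) for t in text_feats]
print("predicted class:", int(np.argmax(scores)))
```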
+
+ comment: Accepted to ACM MultiMedia 2024 +
+
+
+
+
+ + ☆ Improved symbolic drum style classification with grammar-based + hierarchical representations + + +
+ Deep learning models have become a critical tool for analysis and +classification of musical data. These models operate either on the audio +signal, e.g. waveform or spectrogram, or on a symbolic representation, such as +MIDI. In the latter, musical information is often reduced to basic features, +i.e. durations, pitches and velocities. Most existing works then rely on +generic tokenization strategies from classical natural language processing, or +matrix representations, e.g. piano roll. In this work, we evaluate how enriched +representations of symbolic data can impact deep models, i.e. Transformers and +RNN, for music style classification. In particular, we examine representations +that explicitly incorporate musical information implicitly present in MIDI-like +encodings, such as rhythmic organization, and show that they outperform generic +tokenization strategies. We introduce a new tree-based representation of MIDI +data built upon a context-free musical grammar. We show that this grammar +representation accurately encodes high-level rhythmic information and +outperforms existing encodings on the GrooveMIDI Dataset for drumming style +classification, while being more compact and parameter-efficient. + +
+
+ comment: International Society for Music Information Retrieval Conference + 2024, Nov 2024, San Francisco, United States +
+
+
+
+
+ + ♻ ☆ A Bounding Box is Worth One Token: Interleaving Layout and Text in a + Large Language Model for Document Understanding + + +
+ Recently, many studies have demonstrated that exclusively incorporating OCR-derived text and spatial layouts with large language models (LLMs) can be highly effective for document understanding tasks. However, existing methods that integrate spatial layouts with text have limitations, such as producing overly long text sequences or failing to fully leverage the autoregressive traits of LLMs. In this work, we introduce Interleaving Layout and Text in a Large Language Model (LayTextLLM) for document understanding. In particular, LayTextLLM projects each bounding box to a single embedding and interleaves it with text, efficiently avoiding long sequence issues while leveraging autoregressive traits of LLMs. LayTextLLM not only streamlines the interaction of layout and textual data but also shows enhanced performance in Key Information Extraction (KIE) and Visual Question Answering (VQA). Comprehensive benchmark evaluations reveal significant improvements, with a 27.2% increase on KIE tasks and 12.0% on VQA tasks compared to previous state-of-the-art document understanding MLLMs, as well as a 15.1% improvement over other SOTA OCR-based LLMs on KIE tasks.
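The key mechanism above, projecting each OCR bounding box to a single embedding and interleaving it with the text-token embeddings, can be sketched as follows; the embedding width, coordinate normalization, and the simple alternating interleave are illustrative assumptions, not the released model.

```python
import torch
import torch.nn as nn

hidden = 256
box_proj = nn.Linear(4, hidden)                       # (x1, y1, x2, y2) -> one layout embedding
tok_emb = nn.Embedding(32_000, hidden)                # stand-in for the LLM's token embeddings

# One OCR segment = a normalized bounding box followed by its token ids.
segments = [
    (torch.tensor([0.10, 0.05, 0.42, 0.09]), torch.tensor([101, 2054, 2003])),
    (torch.tensor([0.10, 0.12, 0.80, 0.16]), torch.tensor([1996, 3815, 1029])),
]

interleaved = []
for box, token_ids in segments:
    interleaved.append(box_proj(box).unsqueeze(0))    # one embedding per box
    interleaved.append(tok_emb(token_ids))            # then the segment's text tokens
sequence = torch.cat(interleaved, dim=0)              # fed to the LLM as a single sequence
print(sequence.shape)                                 # torch.Size([8, 256]) = 2 boxes + 6 tokens
```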
+
+
+
+
+ + ♻ ☆ Spatiotemporal Graph Guided Multi-modal Network for Livestreaming + Product Retrieval + + +
+ With the rapid expansion of e-commerce, more consumers have become accustomed +to making purchases via livestreaming. Accurately identifying the products +being sold by salespeople, i.e., livestreaming product retrieval (LPR), poses a +fundamental and daunting challenge. The LPR task encompasses three primary +dilemmas in real-world scenarios: 1) the recognition of intended products from +distractor products present in the background; 2) the video-image heterogeneity +that the appearance of products showcased in live streams often deviates +substantially from standardized product images in stores; 3) there are numerous +confusing products with subtle visual nuances in the shop. To tackle these +challenges, we propose the Spatiotemporal Graphing Multi-modal Network (SGMN). +First, we employ a text-guided attention mechanism that leverages the spoken +content of salespeople to guide the model to focus toward intended products, +emphasizing their salience over cluttered background products. Second, a +long-range spatiotemporal graph network is further designed to achieve both +instance-level interaction and frame-level matching, solving the misalignment +caused by video-image heterogeneity. Third, we propose a multi-modal hard +example mining, assisting the model in distinguishing highly similar products +with fine-grained features across the video-image-text domain. Through +extensive quantitative and qualitative experiments, we demonstrate the superior +performance of our proposed SGMN model, surpassing the state-of-the-art methods +by a substantial margin. The code is available at +https://github.com/Huxiaowan/SGMN. + +
+
+ comment: 9 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ MicroEmo: Time-Sensitive Multimodal Emotion Recognition with + Micro-Expression Dynamics in Video Dialogues + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated remarkable +multimodal emotion recognition capabilities, integrating multimodal cues from +visual, acoustic, and linguistic contexts in the video to recognize human +emotional states. However, existing methods ignore capturing local facial +features of temporal dynamics of micro-expressions and do not leverage the +contextual dependencies of the utterance-aware temporal segments in the video, +thereby limiting their expected effectiveness to a certain extent. In this +work, we propose MicroEmo, a time-sensitive MLLM aimed at directing attention +to the local facial micro-expression dynamics and the contextual dependencies +of utterance-aware video clips. Our model incorporates two key architectural +contributions: (1) a global-local attention visual encoder that integrates +global frame-level timestamp-bound image features with local facial features of +temporal dynamics of micro-expressions; (2) an utterance-aware video Q-Former +that captures multi-scale and contextual dependencies by generating visual +token sequences for each utterance segment and for the entire video then +combining them. Preliminary qualitative experiments demonstrate that in a new +Explainable Multimodal Emotion Recognition (EMER) task that exploits +multi-modal and multi-faceted clues to predict emotions in an open-vocabulary +(OV) manner, MicroEmo demonstrates its effectiveness compared with the latest +methods. + +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`